diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,94414 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.0, + "eval_steps": 500, + "global_step": 47190, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 0.13037056401371955, + "epoch": 0.0011656370206317753, + "grad_norm": 5.8125, + "learning_rate": 1.230769230769231e-06, + "loss": 1.6988, + "mean_token_accuracy": 0.7972599864006042, + "num_tokens": 20665.0, + "step": 5 + }, + { + "entropy": 0.2026868939399719, + "epoch": 0.0023312740412635507, + "grad_norm": 18.75, + "learning_rate": 2.7692307692307697e-06, + "loss": 2.2638, + "mean_token_accuracy": 0.7449834465980529, + "num_tokens": 35523.0, + "step": 10 + }, + { + "entropy": 0.1481923870742321, + "epoch": 0.003496911061895326, + "grad_norm": 11.6875, + "learning_rate": 4.307692307692308e-06, + "loss": 1.774, + "mean_token_accuracy": 0.8239133059978485, + "num_tokens": 48005.0, + "step": 15 + }, + { + "entropy": 0.14728636629879474, + "epoch": 0.004662548082527101, + "grad_norm": 29.75, + "learning_rate": 5.846153846153847e-06, + "loss": 1.534, + "mean_token_accuracy": 0.8404697060585022, + "num_tokens": 66174.0, + "step": 20 + }, + { + "entropy": 0.1347280215471983, + "epoch": 0.005828185103158876, + "grad_norm": 13.8125, + "learning_rate": 7.384615384615386e-06, + "loss": 1.4182, + "mean_token_accuracy": 0.8635528087615967, + "num_tokens": 83447.0, + "step": 25 + }, + { + "entropy": 0.1397606860846281, + "epoch": 0.006993822123790652, + "grad_norm": 12.3125, + "learning_rate": 8.923076923076925e-06, + "loss": 1.264, + "mean_token_accuracy": 0.864665013551712, + "num_tokens": 97990.0, + "step": 30 + }, + { + "entropy": 0.11616570875048637, + "epoch": 0.008159459144422426, + "grad_norm": 10.0625, + "learning_rate": 1.0461538461538463e-05, + "loss": 0.7635, + "mean_token_accuracy": 0.8946437478065491, + "num_tokens": 127034.0, + "step": 35 + }, + { + "entropy": 0.2478984471410513, + "epoch": 0.009325096165054203, + "grad_norm": 20.75, + "learning_rate": 1.2e-05, + "loss": 1.3525, + "mean_token_accuracy": 0.8422249376773834, + "num_tokens": 149561.0, + "step": 40 + }, + { + "entropy": 0.23592497650533914, + "epoch": 0.010490733185685977, + "grad_norm": 2.171875, + "learning_rate": 1.353846153846154e-05, + "loss": 0.8752, + "mean_token_accuracy": 0.8744755148887634, + "num_tokens": 180020.0, + "step": 45 + }, + { + "entropy": 0.27611723467707633, + "epoch": 0.011656370206317752, + "grad_norm": 4.28125, + "learning_rate": 1.5076923076923078e-05, + "loss": 1.0946, + "mean_token_accuracy": 0.8549042701721191, + "num_tokens": 193617.0, + "step": 50 + }, + { + "entropy": 0.24654684234410523, + "epoch": 0.012822007226949528, + "grad_norm": 5.71875, + "learning_rate": 1.6615384615384618e-05, + "loss": 0.6624, + "mean_token_accuracy": 0.8910752832889557, + "num_tokens": 216657.0, + "step": 55 + }, + { + "entropy": 0.4486447751522064, + "epoch": 0.013987644247581303, + "grad_norm": 5.5, + "learning_rate": 1.8153846153846155e-05, + "loss": 1.2489, + "mean_token_accuracy": 0.8384927690029145, + "num_tokens": 225069.0, + "step": 60 + }, + { + "entropy": 0.33645918890833854, + "epoch": 0.015153281268213078, + "grad_norm": 2.75, + "learning_rate": 1.9692307692307696e-05, + "loss": 0.8006, + "mean_token_accuracy": 0.8728708684444427, + "num_tokens": 239949.0, + "step": 65 + }, + { + "entropy": 0.32945496812462804, + "epoch": 0.016318918288844853, + "grad_norm": 4.28125, + "learning_rate": 1.9999999976141594e-05, + "loss": 0.7551, + "mean_token_accuracy": 0.8722834944725036, + "num_tokens": 263424.0, + "step": 70 + }, + { + "entropy": 0.22601164653897285, + "epoch": 0.01748455530947663, + "grad_norm": 0.7734375, + "learning_rate": 1.9999999879216817e-05, + "loss": 0.3863, + "mean_token_accuracy": 0.9277985453605652, + "num_tokens": 282642.0, + "step": 75 + }, + { + "entropy": 0.3768581360578537, + "epoch": 0.018650192330108405, + "grad_norm": 2.09375, + "learning_rate": 1.9999999707734522e-05, + "loss": 0.9283, + "mean_token_accuracy": 0.8531839072704315, + "num_tokens": 293090.0, + "step": 80 + }, + { + "entropy": 0.3327511861920357, + "epoch": 0.01981582935074018, + "grad_norm": 9.125, + "learning_rate": 1.9999999461694714e-05, + "loss": 0.9839, + "mean_token_accuracy": 0.855659818649292, + "num_tokens": 309153.0, + "step": 85 + }, + { + "entropy": 0.27368163913488386, + "epoch": 0.020981466371371955, + "grad_norm": 0.8828125, + "learning_rate": 1.9999999141097392e-05, + "loss": 0.6371, + "mean_token_accuracy": 0.9080254793167114, + "num_tokens": 325081.0, + "step": 90 + }, + { + "entropy": 0.3098975282162428, + "epoch": 0.02214710339200373, + "grad_norm": 0.890625, + "learning_rate": 1.999999874594256e-05, + "loss": 0.5994, + "mean_token_accuracy": 0.8943244755268097, + "num_tokens": 345901.0, + "step": 95 + }, + { + "entropy": 0.3766414072364569, + "epoch": 0.023312740412635504, + "grad_norm": 2.375, + "learning_rate": 1.9999998276230227e-05, + "loss": 0.986, + "mean_token_accuracy": 0.8288512468338013, + "num_tokens": 383154.0, + "step": 100 + }, + { + "entropy": 0.32121687904000285, + "epoch": 0.024478377433267282, + "grad_norm": 1.5078125, + "learning_rate": 1.9999997731960398e-05, + "loss": 0.4956, + "mean_token_accuracy": 0.9077661991119385, + "num_tokens": 400284.0, + "step": 105 + }, + { + "entropy": 0.31962902620434763, + "epoch": 0.025644014453899057, + "grad_norm": 3.046875, + "learning_rate": 1.9999997113133085e-05, + "loss": 0.7156, + "mean_token_accuracy": 0.8738201320171356, + "num_tokens": 415485.0, + "step": 110 + }, + { + "entropy": 0.33004358410835266, + "epoch": 0.02680965147453083, + "grad_norm": 1.15625, + "learning_rate": 1.9999996419748292e-05, + "loss": 0.5986, + "mean_token_accuracy": 0.9044980466365814, + "num_tokens": 447579.0, + "step": 115 + }, + { + "entropy": 0.3824078649282455, + "epoch": 0.027975288495162606, + "grad_norm": 2.984375, + "learning_rate": 1.9999995651806034e-05, + "loss": 0.9191, + "mean_token_accuracy": 0.8670506238937378, + "num_tokens": 459128.0, + "step": 120 + }, + { + "entropy": 0.4574072495102882, + "epoch": 0.02914092551579438, + "grad_norm": 5.34375, + "learning_rate": 1.9999994809306322e-05, + "loss": 1.0716, + "mean_token_accuracy": 0.8488256156444549, + "num_tokens": 470027.0, + "step": 125 + }, + { + "entropy": 0.2947243496775627, + "epoch": 0.030306562536426156, + "grad_norm": 0.63671875, + "learning_rate": 1.9999993892249164e-05, + "loss": 0.6576, + "mean_token_accuracy": 0.9072929441928863, + "num_tokens": 499546.0, + "step": 130 + }, + { + "entropy": 0.3057884469628334, + "epoch": 0.031472199557057934, + "grad_norm": 3.796875, + "learning_rate": 1.9999992900634578e-05, + "loss": 0.5276, + "mean_token_accuracy": 0.8684212446212769, + "num_tokens": 536170.0, + "step": 135 + }, + { + "entropy": 0.3543548423796892, + "epoch": 0.032637836577689705, + "grad_norm": 1.1953125, + "learning_rate": 1.999999183446258e-05, + "loss": 0.6128, + "mean_token_accuracy": 0.8815208315849304, + "num_tokens": 573314.0, + "step": 140 + }, + { + "entropy": 0.38243546336889267, + "epoch": 0.03380347359832148, + "grad_norm": 0.7578125, + "learning_rate": 1.9999990693733178e-05, + "loss": 0.8458, + "mean_token_accuracy": 0.8730092644691467, + "num_tokens": 590469.0, + "step": 145 + }, + { + "entropy": 0.3092967767268419, + "epoch": 0.03496911061895326, + "grad_norm": 6.84375, + "learning_rate": 1.9999989478446396e-05, + "loss": 0.6054, + "mean_token_accuracy": 0.8881650030612945, + "num_tokens": 616301.0, + "step": 150 + }, + { + "entropy": 0.3355667755007744, + "epoch": 0.03613474763958503, + "grad_norm": 2.203125, + "learning_rate": 1.9999988188602252e-05, + "loss": 0.6318, + "mean_token_accuracy": 0.8999499917030335, + "num_tokens": 634344.0, + "step": 155 + }, + { + "entropy": 0.3264051340520382, + "epoch": 0.03730038466021681, + "grad_norm": 1.9453125, + "learning_rate": 1.9999986824200764e-05, + "loss": 0.6383, + "mean_token_accuracy": 0.894915622472763, + "num_tokens": 653730.0, + "step": 160 + }, + { + "entropy": 0.2963225370272994, + "epoch": 0.03846602168084858, + "grad_norm": 0.462890625, + "learning_rate": 1.999998538524195e-05, + "loss": 0.434, + "mean_token_accuracy": 0.8856126844882966, + "num_tokens": 682057.0, + "step": 165 + }, + { + "entropy": 0.3581529125571251, + "epoch": 0.03963165870148036, + "grad_norm": 2.046875, + "learning_rate": 1.9999983871725833e-05, + "loss": 0.5984, + "mean_token_accuracy": 0.9039258837699891, + "num_tokens": 697306.0, + "step": 170 + }, + { + "entropy": 0.3010772816836834, + "epoch": 0.04079729572211213, + "grad_norm": 1.1640625, + "learning_rate": 1.999998228365244e-05, + "loss": 0.6009, + "mean_token_accuracy": 0.898896187543869, + "num_tokens": 713213.0, + "step": 175 + }, + { + "entropy": 0.2990272644907236, + "epoch": 0.04196293274274391, + "grad_norm": 2.46875, + "learning_rate": 1.999998062102179e-05, + "loss": 0.5517, + "mean_token_accuracy": 0.917355763912201, + "num_tokens": 741039.0, + "step": 180 + }, + { + "entropy": 0.24269667863845826, + "epoch": 0.04312856976337569, + "grad_norm": 1.6953125, + "learning_rate": 1.9999978883833904e-05, + "loss": 0.3234, + "mean_token_accuracy": 0.9207581281661987, + "num_tokens": 767100.0, + "step": 185 + }, + { + "entropy": 0.2515395663678646, + "epoch": 0.04429420678400746, + "grad_norm": 0.416015625, + "learning_rate": 1.9999977072088815e-05, + "loss": 0.3565, + "mean_token_accuracy": 0.9244686782360076, + "num_tokens": 799222.0, + "step": 190 + }, + { + "entropy": 0.3098013773560524, + "epoch": 0.04545984380463924, + "grad_norm": 3.265625, + "learning_rate": 1.999997518578655e-05, + "loss": 0.842, + "mean_token_accuracy": 0.8814567267894745, + "num_tokens": 810222.0, + "step": 195 + }, + { + "entropy": 0.3232828423380852, + "epoch": 0.04662548082527101, + "grad_norm": 2.203125, + "learning_rate": 1.999997322492713e-05, + "loss": 0.743, + "mean_token_accuracy": 0.8766939163208007, + "num_tokens": 826444.0, + "step": 200 + }, + { + "entropy": 0.32065615206956866, + "epoch": 0.047791117845902786, + "grad_norm": 0.75390625, + "learning_rate": 1.9999971189510594e-05, + "loss": 0.5851, + "mean_token_accuracy": 0.9104382634162903, + "num_tokens": 855814.0, + "step": 205 + }, + { + "entropy": 0.40788267850875853, + "epoch": 0.048956754866534564, + "grad_norm": 1.7265625, + "learning_rate": 1.9999969079536963e-05, + "loss": 0.7793, + "mean_token_accuracy": 0.8514861643314362, + "num_tokens": 894196.0, + "step": 210 + }, + { + "entropy": 0.445697608217597, + "epoch": 0.050122391887166336, + "grad_norm": 2.390625, + "learning_rate": 1.9999966895006273e-05, + "loss": 0.6643, + "mean_token_accuracy": 0.8878465294837952, + "num_tokens": 912147.0, + "step": 215 + }, + { + "entropy": 0.29761432111263275, + "epoch": 0.051288028907798114, + "grad_norm": 4.96875, + "learning_rate": 1.9999964635918557e-05, + "loss": 0.4497, + "mean_token_accuracy": 0.9224862337112427, + "num_tokens": 942808.0, + "step": 220 + }, + { + "entropy": 0.3196300931274891, + "epoch": 0.052453665928429885, + "grad_norm": 1.28125, + "learning_rate": 1.999996230227385e-05, + "loss": 0.5977, + "mean_token_accuracy": 0.8641709625720978, + "num_tokens": 966122.0, + "step": 225 + }, + { + "entropy": 0.3358492150902748, + "epoch": 0.05361930294906166, + "grad_norm": 2.359375, + "learning_rate": 1.9999959894072183e-05, + "loss": 0.6385, + "mean_token_accuracy": 0.8833851218223572, + "num_tokens": 976774.0, + "step": 230 + }, + { + "entropy": 0.38430210798978803, + "epoch": 0.054784939969693434, + "grad_norm": 3.328125, + "learning_rate": 1.9999957411313592e-05, + "loss": 0.8489, + "mean_token_accuracy": 0.8765129625797272, + "num_tokens": 989123.0, + "step": 235 + }, + { + "entropy": 0.3497027777135372, + "epoch": 0.05595057699032521, + "grad_norm": 0.67578125, + "learning_rate": 1.999995485399812e-05, + "loss": 0.7954, + "mean_token_accuracy": 0.8668942451477051, + "num_tokens": 1017050.0, + "step": 240 + }, + { + "entropy": 0.3112874172627926, + "epoch": 0.05711621401095699, + "grad_norm": 1.140625, + "learning_rate": 1.9999952222125795e-05, + "loss": 0.5732, + "mean_token_accuracy": 0.8893404483795166, + "num_tokens": 1043083.0, + "step": 245 + }, + { + "entropy": 0.32975875288248063, + "epoch": 0.05828185103158876, + "grad_norm": 17.25, + "learning_rate": 1.9999949515696662e-05, + "loss": 0.8527, + "mean_token_accuracy": 0.8816092252731323, + "num_tokens": 1066737.0, + "step": 250 + }, + { + "entropy": 0.45641955733299255, + "epoch": 0.05944748805222054, + "grad_norm": 2.0625, + "learning_rate": 1.9999946734710768e-05, + "loss": 0.8614, + "mean_token_accuracy": 0.8740241050720214, + "num_tokens": 1095622.0, + "step": 255 + }, + { + "entropy": 0.19169552214443683, + "epoch": 0.06061312507285231, + "grad_norm": 0.58203125, + "learning_rate": 1.9999943879168143e-05, + "loss": 0.2542, + "mean_token_accuracy": 0.9506396472454071, + "num_tokens": 1130270.0, + "step": 260 + }, + { + "entropy": 0.29430868923664094, + "epoch": 0.06177876209348409, + "grad_norm": 1.5234375, + "learning_rate": 1.9999940949068837e-05, + "loss": 0.4795, + "mean_token_accuracy": 0.8999245047569275, + "num_tokens": 1152894.0, + "step": 265 + }, + { + "entropy": 0.33416991904377935, + "epoch": 0.06294439911411587, + "grad_norm": 2.015625, + "learning_rate": 1.999993794441289e-05, + "loss": 0.7509, + "mean_token_accuracy": 0.8660208523273468, + "num_tokens": 1165983.0, + "step": 270 + }, + { + "entropy": 0.2795898959040642, + "epoch": 0.06411003613474764, + "grad_norm": 3.203125, + "learning_rate": 1.9999934865200345e-05, + "loss": 0.7822, + "mean_token_accuracy": 0.8882498800754547, + "num_tokens": 1177442.0, + "step": 275 + }, + { + "entropy": 0.3017091706395149, + "epoch": 0.06527567315537941, + "grad_norm": 1.59375, + "learning_rate": 1.9999931711431256e-05, + "loss": 0.655, + "mean_token_accuracy": 0.8937211573123932, + "num_tokens": 1192885.0, + "step": 280 + }, + { + "entropy": 0.3526634112000465, + "epoch": 0.0664413101760112, + "grad_norm": 2.46875, + "learning_rate": 1.9999928483105663e-05, + "loss": 0.6486, + "mean_token_accuracy": 0.8815766870975494, + "num_tokens": 1213874.0, + "step": 285 + }, + { + "entropy": 0.40433951616287234, + "epoch": 0.06760694719664297, + "grad_norm": 5.875, + "learning_rate": 1.9999925180223613e-05, + "loss": 0.8081, + "mean_token_accuracy": 0.876986026763916, + "num_tokens": 1230434.0, + "step": 290 + }, + { + "entropy": 0.2815289504826069, + "epoch": 0.06877258421727474, + "grad_norm": 0.53125, + "learning_rate": 1.999992180278516e-05, + "loss": 0.5367, + "mean_token_accuracy": 0.9034562647342682, + "num_tokens": 1245116.0, + "step": 295 + }, + { + "entropy": 0.36914983466267587, + "epoch": 0.06993822123790652, + "grad_norm": 5.90625, + "learning_rate": 1.9999918350790354e-05, + "loss": 0.6114, + "mean_token_accuracy": 0.8818127453327179, + "num_tokens": 1259884.0, + "step": 300 + }, + { + "entropy": 0.27012285105884076, + "epoch": 0.0711038582585383, + "grad_norm": 0.279296875, + "learning_rate": 1.9999914824239243e-05, + "loss": 0.5234, + "mean_token_accuracy": 0.9039028882980347, + "num_tokens": 1280626.0, + "step": 305 + }, + { + "entropy": 0.27597851008176805, + "epoch": 0.07226949527917007, + "grad_norm": 1.8125, + "learning_rate": 1.9999911223131885e-05, + "loss": 0.7649, + "mean_token_accuracy": 0.8721263349056244, + "num_tokens": 1293685.0, + "step": 310 + }, + { + "entropy": 0.38573506474494934, + "epoch": 0.07343513229980184, + "grad_norm": 1.140625, + "learning_rate": 1.9999907547468328e-05, + "loss": 0.6097, + "mean_token_accuracy": 0.8858954429626464, + "num_tokens": 1322562.0, + "step": 315 + }, + { + "entropy": 0.3330282442271709, + "epoch": 0.07460076932043362, + "grad_norm": 2.359375, + "learning_rate": 1.999990379724863e-05, + "loss": 0.6147, + "mean_token_accuracy": 0.8772442162036895, + "num_tokens": 1345673.0, + "step": 320 + }, + { + "entropy": 0.3170699439942837, + "epoch": 0.07576640634106539, + "grad_norm": 3.296875, + "learning_rate": 1.9999899972472843e-05, + "loss": 0.458, + "mean_token_accuracy": 0.8802279114723206, + "num_tokens": 1383625.0, + "step": 325 + }, + { + "entropy": 0.46571023017168045, + "epoch": 0.07693204336169716, + "grad_norm": 1.71875, + "learning_rate": 1.999989607314103e-05, + "loss": 0.8476, + "mean_token_accuracy": 0.8469921767711639, + "num_tokens": 1409144.0, + "step": 330 + }, + { + "entropy": 0.41157747209072115, + "epoch": 0.07809768038232895, + "grad_norm": 4.375, + "learning_rate": 1.9999892099253247e-05, + "loss": 0.8684, + "mean_token_accuracy": 0.8459766566753387, + "num_tokens": 1431361.0, + "step": 335 + }, + { + "entropy": 0.39902357161045077, + "epoch": 0.07926331740296072, + "grad_norm": 3.5, + "learning_rate": 1.999988805080955e-05, + "loss": 0.9777, + "mean_token_accuracy": 0.8353796422481536, + "num_tokens": 1442281.0, + "step": 340 + }, + { + "entropy": 0.42176967263221743, + "epoch": 0.08042895442359249, + "grad_norm": 2.703125, + "learning_rate": 1.9999883927810005e-05, + "loss": 1.0149, + "mean_token_accuracy": 0.8583439648151397, + "num_tokens": 1450914.0, + "step": 345 + }, + { + "entropy": 0.2953450310975313, + "epoch": 0.08159459144422426, + "grad_norm": 1.203125, + "learning_rate": 1.999987973025467e-05, + "loss": 0.3853, + "mean_token_accuracy": 0.9233019709587097, + "num_tokens": 1474403.0, + "step": 350 + }, + { + "entropy": 0.33179600164294243, + "epoch": 0.08276022846485605, + "grad_norm": 2.625, + "learning_rate": 1.9999875458143604e-05, + "loss": 0.4703, + "mean_token_accuracy": 0.8944461524486542, + "num_tokens": 1509158.0, + "step": 355 + }, + { + "entropy": 0.3036386102437973, + "epoch": 0.08392586548548782, + "grad_norm": 1.125, + "learning_rate": 1.999987111147688e-05, + "loss": 0.6394, + "mean_token_accuracy": 0.8986095011234283, + "num_tokens": 1526440.0, + "step": 360 + }, + { + "entropy": 0.31046125665307045, + "epoch": 0.08509150250611959, + "grad_norm": 3.421875, + "learning_rate": 1.9999866690254554e-05, + "loss": 0.6663, + "mean_token_accuracy": 0.8590294003486634, + "num_tokens": 1549145.0, + "step": 365 + }, + { + "entropy": 0.3097700498998165, + "epoch": 0.08625713952675138, + "grad_norm": 2.09375, + "learning_rate": 1.99998621944767e-05, + "loss": 0.4658, + "mean_token_accuracy": 0.9011349081993103, + "num_tokens": 1583727.0, + "step": 370 + }, + { + "entropy": 0.3039284236729145, + "epoch": 0.08742277654738315, + "grad_norm": 3.03125, + "learning_rate": 1.9999857624143373e-05, + "loss": 0.6759, + "mean_token_accuracy": 0.8913724482059479, + "num_tokens": 1597894.0, + "step": 375 + }, + { + "entropy": 0.24622596129775048, + "epoch": 0.08858841356801492, + "grad_norm": 1.5703125, + "learning_rate": 1.9999852979254655e-05, + "loss": 0.4396, + "mean_token_accuracy": 0.9359685719013214, + "num_tokens": 1623602.0, + "step": 380 + }, + { + "entropy": 0.39024817757308483, + "epoch": 0.08975405058864669, + "grad_norm": 2.734375, + "learning_rate": 1.9999848259810605e-05, + "loss": 0.8393, + "mean_token_accuracy": 0.8547381460666656, + "num_tokens": 1636767.0, + "step": 385 + }, + { + "entropy": 0.3580179035663605, + "epoch": 0.09091968760927847, + "grad_norm": 0.875, + "learning_rate": 1.9999843465811297e-05, + "loss": 0.6393, + "mean_token_accuracy": 0.8810992777347565, + "num_tokens": 1655707.0, + "step": 390 + }, + { + "entropy": 0.37314921617507935, + "epoch": 0.09208532462991025, + "grad_norm": 2.15625, + "learning_rate": 1.9999838597256807e-05, + "loss": 0.4304, + "mean_token_accuracy": 0.8614090740680694, + "num_tokens": 1696414.0, + "step": 395 + }, + { + "entropy": 0.29948038049042225, + "epoch": 0.09325096165054202, + "grad_norm": 2.953125, + "learning_rate": 1.99998336541472e-05, + "loss": 0.5126, + "mean_token_accuracy": 0.9010177552700043, + "num_tokens": 1715229.0, + "step": 400 + }, + { + "entropy": 0.3523738864809275, + "epoch": 0.0944165986711738, + "grad_norm": 2.84375, + "learning_rate": 1.9999828636482553e-05, + "loss": 0.7424, + "mean_token_accuracy": 0.8713351786136627, + "num_tokens": 1732900.0, + "step": 405 + }, + { + "entropy": 0.305879208445549, + "epoch": 0.09558223569180557, + "grad_norm": 0.72265625, + "learning_rate": 1.9999823544262942e-05, + "loss": 0.5051, + "mean_token_accuracy": 0.9006154477596283, + "num_tokens": 1747602.0, + "step": 410 + }, + { + "entropy": 0.3407420488074422, + "epoch": 0.09674787271243734, + "grad_norm": 1.0546875, + "learning_rate": 1.9999818377488443e-05, + "loss": 0.2896, + "mean_token_accuracy": 0.9086630046367645, + "num_tokens": 1772100.0, + "step": 415 + }, + { + "entropy": 0.2643546022474766, + "epoch": 0.09791350973306913, + "grad_norm": 0.53125, + "learning_rate": 1.999981313615913e-05, + "loss": 0.4521, + "mean_token_accuracy": 0.9116932094097138, + "num_tokens": 1791608.0, + "step": 420 + }, + { + "entropy": 0.343147162348032, + "epoch": 0.0990791467537009, + "grad_norm": 2.15625, + "learning_rate": 1.9999807820275082e-05, + "loss": 0.5072, + "mean_token_accuracy": 0.8775932788848877, + "num_tokens": 1819571.0, + "step": 425 + }, + { + "entropy": 0.287718054279685, + "epoch": 0.10024478377433267, + "grad_norm": 1.4453125, + "learning_rate": 1.9999802429836383e-05, + "loss": 0.4765, + "mean_token_accuracy": 0.9125708997249603, + "num_tokens": 1849569.0, + "step": 430 + }, + { + "entropy": 0.40832418352365496, + "epoch": 0.10141042079496444, + "grad_norm": 2.1875, + "learning_rate": 1.9999796964843104e-05, + "loss": 0.9028, + "mean_token_accuracy": 0.8589731276035308, + "num_tokens": 1858952.0, + "step": 435 + }, + { + "entropy": 0.30324168093502524, + "epoch": 0.10257605781559623, + "grad_norm": 1.75, + "learning_rate": 1.9999791425295338e-05, + "loss": 0.5968, + "mean_token_accuracy": 0.8990670144557953, + "num_tokens": 1879066.0, + "step": 440 + }, + { + "entropy": 0.2882155541330576, + "epoch": 0.103741694836228, + "grad_norm": 0.271484375, + "learning_rate": 1.9999785811193154e-05, + "loss": 0.4322, + "mean_token_accuracy": 0.9170081973075866, + "num_tokens": 1903364.0, + "step": 445 + }, + { + "entropy": 0.2898620326071978, + "epoch": 0.10490733185685977, + "grad_norm": 0.9375, + "learning_rate": 1.999978012253665e-05, + "loss": 0.447, + "mean_token_accuracy": 0.922281152009964, + "num_tokens": 1922997.0, + "step": 450 + }, + { + "entropy": 0.3374782703816891, + "epoch": 0.10607296887749156, + "grad_norm": 2.6875, + "learning_rate": 1.9999774359325905e-05, + "loss": 0.7052, + "mean_token_accuracy": 0.897776848077774, + "num_tokens": 1950028.0, + "step": 455 + }, + { + "entropy": 0.354191205278039, + "epoch": 0.10723860589812333, + "grad_norm": 2.15625, + "learning_rate": 1.9999768521561002e-05, + "loss": 0.5536, + "mean_token_accuracy": 0.8853957891464234, + "num_tokens": 1976726.0, + "step": 460 + }, + { + "entropy": 0.43799355179071425, + "epoch": 0.1084042429187551, + "grad_norm": 2.09375, + "learning_rate": 1.9999762609242028e-05, + "loss": 0.8109, + "mean_token_accuracy": 0.8636049032211304, + "num_tokens": 1987708.0, + "step": 465 + }, + { + "entropy": 0.3960087105631828, + "epoch": 0.10956987993938687, + "grad_norm": 1.65625, + "learning_rate": 1.9999756622369077e-05, + "loss": 0.8937, + "mean_token_accuracy": 0.8690516471862793, + "num_tokens": 1996457.0, + "step": 470 + }, + { + "entropy": 0.3699250042438507, + "epoch": 0.11073551696001865, + "grad_norm": 6.4375, + "learning_rate": 1.9999750560942234e-05, + "loss": 0.8203, + "mean_token_accuracy": 0.8731843948364257, + "num_tokens": 2011481.0, + "step": 475 + }, + { + "entropy": 0.17613436691462994, + "epoch": 0.11190115398065043, + "grad_norm": 0.42578125, + "learning_rate": 1.9999744424961588e-05, + "loss": 0.2405, + "mean_token_accuracy": 0.9508815348148346, + "num_tokens": 2043874.0, + "step": 480 + }, + { + "entropy": 0.26431639343500135, + "epoch": 0.1130667910012822, + "grad_norm": 3.375, + "learning_rate": 1.9999738214427236e-05, + "loss": 0.5847, + "mean_token_accuracy": 0.9056618392467499, + "num_tokens": 2064771.0, + "step": 485 + }, + { + "entropy": 0.2693479511886835, + "epoch": 0.11423242802191398, + "grad_norm": 0.384765625, + "learning_rate": 1.9999731929339263e-05, + "loss": 0.4551, + "mean_token_accuracy": 0.9169823467731476, + "num_tokens": 2090213.0, + "step": 490 + }, + { + "entropy": 0.4294183999300003, + "epoch": 0.11539806504254575, + "grad_norm": 3.046875, + "learning_rate": 1.999972556969777e-05, + "loss": 0.7813, + "mean_token_accuracy": 0.8543537199497223, + "num_tokens": 2113168.0, + "step": 495 + }, + { + "entropy": 0.26536157727241516, + "epoch": 0.11656370206317752, + "grad_norm": 2.546875, + "learning_rate": 1.999971913550285e-05, + "loss": 0.3274, + "mean_token_accuracy": 0.9273768246173859, + "num_tokens": 2134370.0, + "step": 500 + }, + { + "entropy": 0.2530547440052032, + "epoch": 0.1177293390838093, + "grad_norm": 0.51171875, + "learning_rate": 1.9999712626754593e-05, + "loss": 0.3245, + "mean_token_accuracy": 0.929281085729599, + "num_tokens": 2172563.0, + "step": 505 + }, + { + "entropy": 0.26137659288942816, + "epoch": 0.11889497610444108, + "grad_norm": 0.419921875, + "learning_rate": 1.9999706043453103e-05, + "loss": 0.6226, + "mean_token_accuracy": 0.8935856699943543, + "num_tokens": 2193099.0, + "step": 510 + }, + { + "entropy": 0.25196021553128956, + "epoch": 0.12006061312507285, + "grad_norm": 0.419921875, + "learning_rate": 1.9999699385598476e-05, + "loss": 0.338, + "mean_token_accuracy": 0.9162642061710358, + "num_tokens": 2230359.0, + "step": 515 + }, + { + "entropy": 0.3314253244549036, + "epoch": 0.12122625014570462, + "grad_norm": 1.390625, + "learning_rate": 1.999969265319081e-05, + "loss": 0.4916, + "mean_token_accuracy": 0.8800876617431641, + "num_tokens": 2256112.0, + "step": 520 + }, + { + "entropy": 0.2865926086902618, + "epoch": 0.12239188716633641, + "grad_norm": 0.640625, + "learning_rate": 1.999968584623021e-05, + "loss": 0.5819, + "mean_token_accuracy": 0.9126100778579712, + "num_tokens": 2269650.0, + "step": 525 + }, + { + "entropy": 0.2737023938447237, + "epoch": 0.12355752418696818, + "grad_norm": 1.609375, + "learning_rate": 1.999967896471677e-05, + "loss": 0.462, + "mean_token_accuracy": 0.910278183221817, + "num_tokens": 2294847.0, + "step": 530 + }, + { + "entropy": 0.250915889441967, + "epoch": 0.12472316120759995, + "grad_norm": 2.140625, + "learning_rate": 1.9999672008650603e-05, + "loss": 0.5603, + "mean_token_accuracy": 0.8969495117664337, + "num_tokens": 2310263.0, + "step": 535 + }, + { + "entropy": 0.31160888969898226, + "epoch": 0.12588879822823174, + "grad_norm": 0.890625, + "learning_rate": 1.99996649780318e-05, + "loss": 0.3346, + "mean_token_accuracy": 0.9062823116779327, + "num_tokens": 2328242.0, + "step": 540 + }, + { + "entropy": 0.42439975365996363, + "epoch": 0.1270544352488635, + "grad_norm": 4.1875, + "learning_rate": 1.9999657872860476e-05, + "loss": 0.8226, + "mean_token_accuracy": 0.8711081981658936, + "num_tokens": 2349683.0, + "step": 545 + }, + { + "entropy": 0.2684651080518961, + "epoch": 0.12822007226949528, + "grad_norm": 1.6875, + "learning_rate": 1.999965069313673e-05, + "loss": 0.3773, + "mean_token_accuracy": 0.8972374260425567, + "num_tokens": 2372089.0, + "step": 550 + }, + { + "entropy": 0.18653137236833572, + "epoch": 0.12938570929012705, + "grad_norm": 2.390625, + "learning_rate": 1.9999643438860674e-05, + "loss": 0.5196, + "mean_token_accuracy": 0.9188428163528443, + "num_tokens": 2404211.0, + "step": 555 + }, + { + "entropy": 0.2632068574428558, + "epoch": 0.13055134631075882, + "grad_norm": 0.8984375, + "learning_rate": 1.9999636110032415e-05, + "loss": 0.4882, + "mean_token_accuracy": 0.9154849767684936, + "num_tokens": 2419959.0, + "step": 560 + }, + { + "entropy": 0.25248654522001746, + "epoch": 0.1317169833313906, + "grad_norm": 3.03125, + "learning_rate": 1.999962870665206e-05, + "loss": 0.4536, + "mean_token_accuracy": 0.9148861825466156, + "num_tokens": 2442409.0, + "step": 565 + }, + { + "entropy": 0.2907357782125473, + "epoch": 0.1328826203520224, + "grad_norm": 0.69921875, + "learning_rate": 1.9999621228719724e-05, + "loss": 0.6041, + "mean_token_accuracy": 0.8970059275627136, + "num_tokens": 2458082.0, + "step": 570 + }, + { + "entropy": 0.275528746843338, + "epoch": 0.13404825737265416, + "grad_norm": 2.375, + "learning_rate": 1.9999613676235512e-05, + "loss": 0.6932, + "mean_token_accuracy": 0.8919620513916016, + "num_tokens": 2470546.0, + "step": 575 + }, + { + "entropy": 0.3516815423965454, + "epoch": 0.13521389439328593, + "grad_norm": 3.03125, + "learning_rate": 1.9999606049199543e-05, + "loss": 0.656, + "mean_token_accuracy": 0.8738880634307862, + "num_tokens": 2495063.0, + "step": 580 + }, + { + "entropy": 0.29452326260507106, + "epoch": 0.1363795314139177, + "grad_norm": 1.5078125, + "learning_rate": 1.999959834761193e-05, + "loss": 0.6642, + "mean_token_accuracy": 0.8788491487503052, + "num_tokens": 2511349.0, + "step": 585 + }, + { + "entropy": 0.2358495132997632, + "epoch": 0.13754516843454948, + "grad_norm": 0.21875, + "learning_rate": 1.999959057147278e-05, + "loss": 0.2328, + "mean_token_accuracy": 0.9249357283115387, + "num_tokens": 2549642.0, + "step": 590 + }, + { + "entropy": 0.2459204986691475, + "epoch": 0.13871080545518125, + "grad_norm": 1.765625, + "learning_rate": 1.9999582720782217e-05, + "loss": 0.5451, + "mean_token_accuracy": 0.9135986566543579, + "num_tokens": 2570037.0, + "step": 595 + }, + { + "entropy": 0.27237192876636984, + "epoch": 0.13987644247581305, + "grad_norm": 0.1796875, + "learning_rate": 1.9999574795540357e-05, + "loss": 0.4804, + "mean_token_accuracy": 0.9119515061378479, + "num_tokens": 2590600.0, + "step": 600 + }, + { + "entropy": 0.3028899788856506, + "epoch": 0.14104207949644482, + "grad_norm": 0.65625, + "learning_rate": 1.9999566795747316e-05, + "loss": 0.3404, + "mean_token_accuracy": 0.9131358981132507, + "num_tokens": 2620638.0, + "step": 605 + }, + { + "entropy": 0.2697331115603447, + "epoch": 0.1422077165170766, + "grad_norm": 1.421875, + "learning_rate": 1.9999558721403215e-05, + "loss": 0.4712, + "mean_token_accuracy": 0.9166036427021027, + "num_tokens": 2640375.0, + "step": 610 + }, + { + "entropy": 0.37615430131554606, + "epoch": 0.14337335353770836, + "grad_norm": 1.578125, + "learning_rate": 1.9999550572508174e-05, + "loss": 0.653, + "mean_token_accuracy": 0.8842727184295655, + "num_tokens": 2651304.0, + "step": 615 + }, + { + "entropy": 0.2938199445605278, + "epoch": 0.14453899055834013, + "grad_norm": 0.427734375, + "learning_rate": 1.9999542349062314e-05, + "loss": 0.679, + "mean_token_accuracy": 0.8845574855804443, + "num_tokens": 2672520.0, + "step": 620 + }, + { + "entropy": 0.391637334227562, + "epoch": 0.1457046275789719, + "grad_norm": 0.98828125, + "learning_rate": 1.9999534051065757e-05, + "loss": 0.7966, + "mean_token_accuracy": 0.845041885972023, + "num_tokens": 2694477.0, + "step": 625 + }, + { + "entropy": 0.3956002026796341, + "epoch": 0.14687026459960367, + "grad_norm": 2.953125, + "learning_rate": 1.9999525678518628e-05, + "loss": 0.9271, + "mean_token_accuracy": 0.8404446899890899, + "num_tokens": 2708270.0, + "step": 630 + }, + { + "entropy": 0.3675729542970657, + "epoch": 0.14803590162023547, + "grad_norm": 0.87890625, + "learning_rate": 1.9999517231421053e-05, + "loss": 0.5063, + "mean_token_accuracy": 0.8811340808868409, + "num_tokens": 2722600.0, + "step": 635 + }, + { + "entropy": 0.25438366681337354, + "epoch": 0.14920153864086724, + "grad_norm": 1.21875, + "learning_rate": 1.9999508709773155e-05, + "loss": 0.571, + "mean_token_accuracy": 0.9114357829093933, + "num_tokens": 2741534.0, + "step": 640 + }, + { + "entropy": 0.29461836684495213, + "epoch": 0.15036717566149901, + "grad_norm": 0.2041015625, + "learning_rate": 1.999950011357506e-05, + "loss": 0.508, + "mean_token_accuracy": 0.9003230154514312, + "num_tokens": 2766169.0, + "step": 645 + }, + { + "entropy": 0.36445762515068053, + "epoch": 0.15153281268213079, + "grad_norm": 1.6328125, + "learning_rate": 1.9999491442826903e-05, + "loss": 0.7562, + "mean_token_accuracy": 0.8814541339874268, + "num_tokens": 2775925.0, + "step": 650 + }, + { + "entropy": 0.2811070531606674, + "epoch": 0.15269844970276256, + "grad_norm": 1.78125, + "learning_rate": 1.9999482697528808e-05, + "loss": 0.4886, + "mean_token_accuracy": 0.9111315429210662, + "num_tokens": 2796279.0, + "step": 655 + }, + { + "entropy": 0.25250407978892325, + "epoch": 0.15386408672339433, + "grad_norm": 0.470703125, + "learning_rate": 1.9999473877680903e-05, + "loss": 0.371, + "mean_token_accuracy": 0.9198955953121185, + "num_tokens": 2829382.0, + "step": 660 + }, + { + "entropy": 0.35067772716283796, + "epoch": 0.1550297237440261, + "grad_norm": 1.328125, + "learning_rate": 1.9999464983283325e-05, + "loss": 0.5332, + "mean_token_accuracy": 0.8845549583435058, + "num_tokens": 2843150.0, + "step": 665 + }, + { + "entropy": 0.24629681333899497, + "epoch": 0.1561953607646579, + "grad_norm": 2.1875, + "learning_rate": 1.9999456014336206e-05, + "loss": 0.4008, + "mean_token_accuracy": 0.9258623898029328, + "num_tokens": 2868976.0, + "step": 670 + }, + { + "entropy": 0.3138530794531107, + "epoch": 0.15736099778528967, + "grad_norm": 2.8125, + "learning_rate": 1.9999446970839677e-05, + "loss": 0.6616, + "mean_token_accuracy": 0.8881543815135956, + "num_tokens": 2884671.0, + "step": 675 + }, + { + "entropy": 0.38320747911930086, + "epoch": 0.15852663480592144, + "grad_norm": 1.3515625, + "learning_rate": 1.9999437852793874e-05, + "loss": 0.7154, + "mean_token_accuracy": 0.8834590017795563, + "num_tokens": 2895630.0, + "step": 680 + }, + { + "entropy": 0.33303392827510836, + "epoch": 0.1596922718265532, + "grad_norm": 2.453125, + "learning_rate": 1.9999428660198933e-05, + "loss": 0.8552, + "mean_token_accuracy": 0.865819638967514, + "num_tokens": 2905209.0, + "step": 685 + }, + { + "entropy": 0.31202927231788635, + "epoch": 0.16085790884718498, + "grad_norm": 2.046875, + "learning_rate": 1.999941939305499e-05, + "loss": 0.5953, + "mean_token_accuracy": 0.8999407291412354, + "num_tokens": 2917687.0, + "step": 690 + }, + { + "entropy": 0.3931444585323334, + "epoch": 0.16202354586781675, + "grad_norm": 3.15625, + "learning_rate": 1.9999410051362185e-05, + "loss": 0.6859, + "mean_token_accuracy": 0.8783853471279144, + "num_tokens": 2935177.0, + "step": 695 + }, + { + "entropy": 0.4097077568992972, + "epoch": 0.16318918288844853, + "grad_norm": 0.5703125, + "learning_rate": 1.9999400635120656e-05, + "loss": 0.7382, + "mean_token_accuracy": 0.8664321959018707, + "num_tokens": 2962275.0, + "step": 700 + }, + { + "entropy": 0.3893901389092207, + "epoch": 0.16435481990908032, + "grad_norm": 3.0625, + "learning_rate": 1.9999391144330547e-05, + "loss": 0.6817, + "mean_token_accuracy": 0.8475090980529785, + "num_tokens": 2985130.0, + "step": 705 + }, + { + "entropy": 0.3332042768597603, + "epoch": 0.1655204569297121, + "grad_norm": 0.88671875, + "learning_rate": 1.9999381578991995e-05, + "loss": 0.5577, + "mean_token_accuracy": 0.8721989512443542, + "num_tokens": 3012620.0, + "step": 710 + }, + { + "entropy": 0.3115752834826708, + "epoch": 0.16668609395034387, + "grad_norm": 2.34375, + "learning_rate": 1.999937193910514e-05, + "loss": 0.7448, + "mean_token_accuracy": 0.8875145256519318, + "num_tokens": 3026831.0, + "step": 715 + }, + { + "entropy": 0.4049858648329973, + "epoch": 0.16785173097097564, + "grad_norm": 3.734375, + "learning_rate": 1.9999362224670136e-05, + "loss": 0.771, + "mean_token_accuracy": 0.8793026149272919, + "num_tokens": 3043491.0, + "step": 720 + }, + { + "entropy": 0.3188063256442547, + "epoch": 0.1690173679916074, + "grad_norm": 0.58203125, + "learning_rate": 1.999935243568712e-05, + "loss": 0.5652, + "mean_token_accuracy": 0.8942893028259278, + "num_tokens": 3068104.0, + "step": 725 + }, + { + "entropy": 0.4390459656715393, + "epoch": 0.17018300501223918, + "grad_norm": 2.9375, + "learning_rate": 1.9999342572156236e-05, + "loss": 0.993, + "mean_token_accuracy": 0.8510157585144043, + "num_tokens": 3076457.0, + "step": 730 + }, + { + "entropy": 0.2523434393107891, + "epoch": 0.17134864203287095, + "grad_norm": 0.279296875, + "learning_rate": 1.999933263407764e-05, + "loss": 0.4319, + "mean_token_accuracy": 0.91023069024086, + "num_tokens": 3102744.0, + "step": 735 + }, + { + "entropy": 0.2924702726304531, + "epoch": 0.17251427905350275, + "grad_norm": 0.66015625, + "learning_rate": 1.9999322621451472e-05, + "loss": 0.6122, + "mean_token_accuracy": 0.8960310935974121, + "num_tokens": 3116362.0, + "step": 740 + }, + { + "entropy": 0.34593453593552115, + "epoch": 0.17367991607413452, + "grad_norm": 4.75, + "learning_rate": 1.9999312534277886e-05, + "loss": 0.5829, + "mean_token_accuracy": 0.8776205480098724, + "num_tokens": 3142420.0, + "step": 745 + }, + { + "entropy": 0.2968858815729618, + "epoch": 0.1748455530947663, + "grad_norm": 2.015625, + "learning_rate": 1.999930237255703e-05, + "loss": 0.3671, + "mean_token_accuracy": 0.9223788380622864, + "num_tokens": 3163109.0, + "step": 750 + }, + { + "entropy": 0.28595022670924664, + "epoch": 0.17601119011539806, + "grad_norm": 0.953125, + "learning_rate": 1.9999292136289056e-05, + "loss": 0.4699, + "mean_token_accuracy": 0.8934225618839264, + "num_tokens": 3193584.0, + "step": 755 + }, + { + "entropy": 0.354030817002058, + "epoch": 0.17717682713602984, + "grad_norm": 2.0, + "learning_rate": 1.9999281825474117e-05, + "loss": 0.5145, + "mean_token_accuracy": 0.8838940739631653, + "num_tokens": 3214263.0, + "step": 760 + }, + { + "entropy": 0.43382971435785295, + "epoch": 0.1783424641566616, + "grad_norm": 2.28125, + "learning_rate": 1.9999271440112367e-05, + "loss": 0.8561, + "mean_token_accuracy": 0.8413860857486725, + "num_tokens": 3241698.0, + "step": 765 + }, + { + "entropy": 0.30166892930865286, + "epoch": 0.17950810117729338, + "grad_norm": 1.53125, + "learning_rate": 1.999926098020396e-05, + "loss": 0.4246, + "mean_token_accuracy": 0.9079195559024811, + "num_tokens": 3257616.0, + "step": 770 + }, + { + "entropy": 0.405256550014019, + "epoch": 0.18067373819792518, + "grad_norm": 2.359375, + "learning_rate": 1.9999250445749052e-05, + "loss": 0.5895, + "mean_token_accuracy": 0.8911011576652527, + "num_tokens": 3285955.0, + "step": 775 + }, + { + "entropy": 0.34804592877626417, + "epoch": 0.18183937521855695, + "grad_norm": 3.375, + "learning_rate": 1.9999239836747802e-05, + "loss": 0.6435, + "mean_token_accuracy": 0.8680084943771362, + "num_tokens": 3302017.0, + "step": 780 + }, + { + "entropy": 0.2602914243936539, + "epoch": 0.18300501223918872, + "grad_norm": 1.6640625, + "learning_rate": 1.9999229153200365e-05, + "loss": 0.3589, + "mean_token_accuracy": 0.930289226770401, + "num_tokens": 3321016.0, + "step": 785 + }, + { + "entropy": 0.31426837891340254, + "epoch": 0.1841706492598205, + "grad_norm": 1.9609375, + "learning_rate": 1.9999218395106906e-05, + "loss": 0.4683, + "mean_token_accuracy": 0.8997460782527924, + "num_tokens": 3348461.0, + "step": 790 + }, + { + "entropy": 0.22893422991037368, + "epoch": 0.18533628628045226, + "grad_norm": 1.3359375, + "learning_rate": 1.999920756246758e-05, + "loss": 0.2776, + "mean_token_accuracy": 0.9370344758033753, + "num_tokens": 3390799.0, + "step": 795 + }, + { + "entropy": 0.2902509465813637, + "epoch": 0.18650192330108403, + "grad_norm": 1.6796875, + "learning_rate": 1.9999196655282546e-05, + "loss": 0.4329, + "mean_token_accuracy": 0.9119967639446258, + "num_tokens": 3409234.0, + "step": 800 + }, + { + "entropy": 0.27368216067552564, + "epoch": 0.1876675603217158, + "grad_norm": 0.404296875, + "learning_rate": 1.9999185673551972e-05, + "loss": 0.4364, + "mean_token_accuracy": 0.9043579339981079, + "num_tokens": 3433336.0, + "step": 805 + }, + { + "entropy": 0.4705745026469231, + "epoch": 0.1888331973423476, + "grad_norm": 1.96875, + "learning_rate": 1.999917461727602e-05, + "loss": 0.8298, + "mean_token_accuracy": 0.8345303326845169, + "num_tokens": 3448349.0, + "step": 810 + }, + { + "entropy": 0.34988665878772734, + "epoch": 0.18999883436297937, + "grad_norm": 2.859375, + "learning_rate": 1.999916348645486e-05, + "loss": 0.7454, + "mean_token_accuracy": 0.892153388261795, + "num_tokens": 3466389.0, + "step": 815 + }, + { + "entropy": 0.44057891592383386, + "epoch": 0.19116447138361115, + "grad_norm": 3.40625, + "learning_rate": 1.999915228108865e-05, + "loss": 0.7607, + "mean_token_accuracy": 0.8726061344146728, + "num_tokens": 3478097.0, + "step": 820 + }, + { + "entropy": 0.3719429075717926, + "epoch": 0.19233010840424292, + "grad_norm": 3.578125, + "learning_rate": 1.999914100117756e-05, + "loss": 0.8634, + "mean_token_accuracy": 0.8735311985015869, + "num_tokens": 3492951.0, + "step": 825 + }, + { + "entropy": 0.34916748106479645, + "epoch": 0.1934957454248747, + "grad_norm": 1.8046875, + "learning_rate": 1.9999129646721757e-05, + "loss": 0.8812, + "mean_token_accuracy": 0.8656373739242553, + "num_tokens": 3503828.0, + "step": 830 + }, + { + "entropy": 0.4179299771785736, + "epoch": 0.19466138244550646, + "grad_norm": 4.09375, + "learning_rate": 1.9999118217721415e-05, + "loss": 0.8812, + "mean_token_accuracy": 0.8484634220600128, + "num_tokens": 3516704.0, + "step": 835 + }, + { + "entropy": 0.4467567354440689, + "epoch": 0.19582701946613826, + "grad_norm": 1.53125, + "learning_rate": 1.99991067141767e-05, + "loss": 0.9907, + "mean_token_accuracy": 0.8312698066234588, + "num_tokens": 3525728.0, + "step": 840 + }, + { + "entropy": 0.46012266874313357, + "epoch": 0.19699265648677003, + "grad_norm": 7.8125, + "learning_rate": 1.9999095136087785e-05, + "loss": 0.7815, + "mean_token_accuracy": 0.8484793066978454, + "num_tokens": 3549324.0, + "step": 845 + }, + { + "entropy": 0.226668768748641, + "epoch": 0.1981582935074018, + "grad_norm": 1.390625, + "learning_rate": 1.9999083483454842e-05, + "loss": 0.4126, + "mean_token_accuracy": 0.9244813799858094, + "num_tokens": 3568736.0, + "step": 850 + }, + { + "entropy": 0.31615454629063605, + "epoch": 0.19932393052803357, + "grad_norm": 0.87890625, + "learning_rate": 1.9999071756278046e-05, + "loss": 0.6654, + "mean_token_accuracy": 0.9037168562412262, + "num_tokens": 3583029.0, + "step": 855 + }, + { + "entropy": 0.3806022718548775, + "epoch": 0.20048956754866534, + "grad_norm": 1.6796875, + "learning_rate": 1.999905995455757e-05, + "loss": 0.8238, + "mean_token_accuracy": 0.8618193626403808, + "num_tokens": 3592622.0, + "step": 860 + }, + { + "entropy": 0.38668873235583306, + "epoch": 0.20165520456929711, + "grad_norm": 2.265625, + "learning_rate": 1.9999048078293594e-05, + "loss": 0.7331, + "mean_token_accuracy": 0.8647488534450531, + "num_tokens": 3612682.0, + "step": 865 + }, + { + "entropy": 0.3393211871385574, + "epoch": 0.20282084158992889, + "grad_norm": 1.6875, + "learning_rate": 1.999903612748629e-05, + "loss": 0.7762, + "mean_token_accuracy": 0.8786431312561035, + "num_tokens": 3622844.0, + "step": 870 + }, + { + "entropy": 0.3805833376944065, + "epoch": 0.20398647861056068, + "grad_norm": 1.2578125, + "learning_rate": 1.9999024102135838e-05, + "loss": 0.5378, + "mean_token_accuracy": 0.8691211700439453, + "num_tokens": 3650860.0, + "step": 875 + }, + { + "entropy": 0.21699289083480836, + "epoch": 0.20515211563119246, + "grad_norm": 0.494140625, + "learning_rate": 1.9999012002242417e-05, + "loss": 0.4605, + "mean_token_accuracy": 0.9262413024902344, + "num_tokens": 3669625.0, + "step": 880 + }, + { + "entropy": 0.28322131410241125, + "epoch": 0.20631775265182423, + "grad_norm": 0.56640625, + "learning_rate": 1.999899982780621e-05, + "loss": 0.718, + "mean_token_accuracy": 0.8780093967914582, + "num_tokens": 3688004.0, + "step": 885 + }, + { + "entropy": 0.2387631695717573, + "epoch": 0.207483389672456, + "grad_norm": 0.412109375, + "learning_rate": 1.9998987578827392e-05, + "loss": 0.4813, + "mean_token_accuracy": 0.9062504410743714, + "num_tokens": 3713401.0, + "step": 890 + }, + { + "entropy": 0.33120530694723127, + "epoch": 0.20864902669308777, + "grad_norm": 2.28125, + "learning_rate": 1.9998975255306157e-05, + "loss": 0.6927, + "mean_token_accuracy": 0.8789681971073151, + "num_tokens": 3725464.0, + "step": 895 + }, + { + "entropy": 0.2995867744088173, + "epoch": 0.20981466371371954, + "grad_norm": 0.4921875, + "learning_rate": 1.9998962857242678e-05, + "loss": 0.6636, + "mean_token_accuracy": 0.898937052488327, + "num_tokens": 3739091.0, + "step": 900 + }, + { + "entropy": 0.29027860462665556, + "epoch": 0.2109803007343513, + "grad_norm": 1.890625, + "learning_rate": 1.9998950384637146e-05, + "loss": 0.4248, + "mean_token_accuracy": 0.9191015899181366, + "num_tokens": 3756692.0, + "step": 905 + }, + { + "entropy": 0.3700431428849697, + "epoch": 0.2121459377549831, + "grad_norm": 5.875, + "learning_rate": 1.9998937837489746e-05, + "loss": 0.7384, + "mean_token_accuracy": 0.8679684460163116, + "num_tokens": 3779933.0, + "step": 910 + }, + { + "entropy": 0.2644999146461487, + "epoch": 0.21331157477561488, + "grad_norm": 0.44921875, + "learning_rate": 1.999892521580066e-05, + "loss": 0.1845, + "mean_token_accuracy": 0.9258367955684662, + "num_tokens": 3814612.0, + "step": 915 + }, + { + "entropy": 0.30614554286003115, + "epoch": 0.21447721179624665, + "grad_norm": 1.71875, + "learning_rate": 1.9998912519570083e-05, + "loss": 0.6013, + "mean_token_accuracy": 0.8913537681102752, + "num_tokens": 3828380.0, + "step": 920 + }, + { + "entropy": 0.36284519638866186, + "epoch": 0.21564284881687842, + "grad_norm": 1.40625, + "learning_rate": 1.99988997487982e-05, + "loss": 0.6176, + "mean_token_accuracy": 0.88633993268013, + "num_tokens": 3853807.0, + "step": 925 + }, + { + "entropy": 0.4497147111222148, + "epoch": 0.2168084858375102, + "grad_norm": 2.890625, + "learning_rate": 1.9998886903485204e-05, + "loss": 0.773, + "mean_token_accuracy": 0.8610835254192353, + "num_tokens": 3883069.0, + "step": 930 + }, + { + "entropy": 0.33425854444503783, + "epoch": 0.21797412285814197, + "grad_norm": 1.4609375, + "learning_rate": 1.9998873983631283e-05, + "loss": 0.7184, + "mean_token_accuracy": 0.8913797974586487, + "num_tokens": 3892539.0, + "step": 935 + }, + { + "entropy": 0.37412596940994264, + "epoch": 0.21913975987877374, + "grad_norm": 0.9296875, + "learning_rate": 1.9998860989236636e-05, + "loss": 0.6036, + "mean_token_accuracy": 0.8902391850948334, + "num_tokens": 3905637.0, + "step": 940 + }, + { + "entropy": 0.28550264425575733, + "epoch": 0.22030539689940554, + "grad_norm": 0.53515625, + "learning_rate": 1.999884792030145e-05, + "loss": 0.6622, + "mean_token_accuracy": 0.8766939222812653, + "num_tokens": 3926668.0, + "step": 945 + }, + { + "entropy": 0.4102290324866772, + "epoch": 0.2214710339200373, + "grad_norm": 3.328125, + "learning_rate": 1.9998834776825926e-05, + "loss": 0.7183, + "mean_token_accuracy": 0.8621988534927368, + "num_tokens": 3943647.0, + "step": 950 + }, + { + "entropy": 0.29528709389269353, + "epoch": 0.22263667094066908, + "grad_norm": 3.265625, + "learning_rate": 1.9998821558810254e-05, + "loss": 0.6308, + "mean_token_accuracy": 0.8999729871749877, + "num_tokens": 3971668.0, + "step": 955 + }, + { + "entropy": 0.26888910457491877, + "epoch": 0.22380230796130085, + "grad_norm": 1.6484375, + "learning_rate": 1.9998808266254633e-05, + "loss": 0.3323, + "mean_token_accuracy": 0.903475534915924, + "num_tokens": 3994987.0, + "step": 960 + }, + { + "entropy": 0.3866387724876404, + "epoch": 0.22496794498193262, + "grad_norm": 3.015625, + "learning_rate": 1.9998794899159266e-05, + "loss": 0.8279, + "mean_token_accuracy": 0.8675558984279632, + "num_tokens": 4012144.0, + "step": 965 + }, + { + "entropy": 0.31260843873023986, + "epoch": 0.2261335820025644, + "grad_norm": 1.3515625, + "learning_rate": 1.9998781457524345e-05, + "loss": 0.4928, + "mean_token_accuracy": 0.9094948828220367, + "num_tokens": 4030145.0, + "step": 970 + }, + { + "entropy": 0.26560680121183394, + "epoch": 0.22729921902319616, + "grad_norm": 2.796875, + "learning_rate": 1.9998767941350078e-05, + "loss": 0.5832, + "mean_token_accuracy": 0.9050080478191376, + "num_tokens": 4051878.0, + "step": 975 + }, + { + "entropy": 0.2385837372392416, + "epoch": 0.22846485604382796, + "grad_norm": 0.3203125, + "learning_rate": 1.999875435063666e-05, + "loss": 0.2669, + "mean_token_accuracy": 0.943767887353897, + "num_tokens": 4082619.0, + "step": 980 + }, + { + "entropy": 0.2860884163528681, + "epoch": 0.22963049306445973, + "grad_norm": 1.578125, + "learning_rate": 1.9998740685384293e-05, + "loss": 0.4171, + "mean_token_accuracy": 0.9251432895660401, + "num_tokens": 4102940.0, + "step": 985 + }, + { + "entropy": 0.2541156569495797, + "epoch": 0.2307961300850915, + "grad_norm": 0.5078125, + "learning_rate": 1.9998726945593186e-05, + "loss": 0.6771, + "mean_token_accuracy": 0.8742362856864929, + "num_tokens": 4132989.0, + "step": 990 + }, + { + "entropy": 0.3671549305319786, + "epoch": 0.23196176710572328, + "grad_norm": 3.5, + "learning_rate": 1.9998713131263545e-05, + "loss": 0.8071, + "mean_token_accuracy": 0.8727976560592652, + "num_tokens": 4143129.0, + "step": 995 + }, + { + "entropy": 0.331473582983017, + "epoch": 0.23312740412635505, + "grad_norm": 1.0546875, + "learning_rate": 1.999869924239557e-05, + "loss": 0.4618, + "mean_token_accuracy": 0.8977300941944122, + "num_tokens": 4161622.0, + "step": 1000 + }, + { + "entropy": 0.27616468332707883, + "epoch": 0.23429304114698682, + "grad_norm": 3.375, + "learning_rate": 1.9998685278989472e-05, + "loss": 0.485, + "mean_token_accuracy": 0.9088415265083313, + "num_tokens": 4180399.0, + "step": 1005 + }, + { + "entropy": 0.33261996433138846, + "epoch": 0.2354586781676186, + "grad_norm": 3.78125, + "learning_rate": 1.9998671241045454e-05, + "loss": 0.7578, + "mean_token_accuracy": 0.888950502872467, + "num_tokens": 4192069.0, + "step": 1010 + }, + { + "entropy": 0.24764457009732724, + "epoch": 0.2366243151882504, + "grad_norm": 0.734375, + "learning_rate": 1.9998657128563736e-05, + "loss": 0.1616, + "mean_token_accuracy": 0.9366363644599914, + "num_tokens": 4228810.0, + "step": 1015 + }, + { + "entropy": 0.2843809101730585, + "epoch": 0.23778995220888216, + "grad_norm": 0.431640625, + "learning_rate": 1.9998642941544518e-05, + "loss": 0.5265, + "mean_token_accuracy": 0.8996910214424133, + "num_tokens": 4254374.0, + "step": 1020 + }, + { + "entropy": 0.304288499802351, + "epoch": 0.23895558922951393, + "grad_norm": 0.9375, + "learning_rate": 1.9998628679988013e-05, + "loss": 0.6136, + "mean_token_accuracy": 0.8884929776191711, + "num_tokens": 4269516.0, + "step": 1025 + }, + { + "entropy": 0.3449162319302559, + "epoch": 0.2401212262501457, + "grad_norm": 1.9609375, + "learning_rate": 1.9998614343894438e-05, + "loss": 0.8248, + "mean_token_accuracy": 0.8706211626529694, + "num_tokens": 4279381.0, + "step": 1030 + }, + { + "entropy": 0.2974201008677483, + "epoch": 0.24128686327077747, + "grad_norm": 3.109375, + "learning_rate": 1.9998599933264007e-05, + "loss": 0.5335, + "mean_token_accuracy": 0.9134840488433837, + "num_tokens": 4299652.0, + "step": 1035 + }, + { + "entropy": 0.2299369264394045, + "epoch": 0.24245250029140925, + "grad_norm": 3.375, + "learning_rate": 1.999858544809693e-05, + "loss": 0.4326, + "mean_token_accuracy": 0.9048404097557068, + "num_tokens": 4326102.0, + "step": 1040 + }, + { + "entropy": 0.5418604515492916, + "epoch": 0.24361813731204104, + "grad_norm": 2.09375, + "learning_rate": 1.9998570888393427e-05, + "loss": 0.8301, + "mean_token_accuracy": 0.820774444937706, + "num_tokens": 4349589.0, + "step": 1045 + }, + { + "entropy": 0.34275285750627515, + "epoch": 0.24478377433267282, + "grad_norm": 1.4140625, + "learning_rate": 1.9998556254153715e-05, + "loss": 0.6571, + "mean_token_accuracy": 0.8875455260276794, + "num_tokens": 4363787.0, + "step": 1050 + }, + { + "entropy": 0.27274183109402655, + "epoch": 0.2459494113533046, + "grad_norm": 0.45703125, + "learning_rate": 1.9998541545378007e-05, + "loss": 0.4852, + "mean_token_accuracy": 0.9113678991794586, + "num_tokens": 4387489.0, + "step": 1055 + }, + { + "entropy": 0.35988821983337405, + "epoch": 0.24711504837393636, + "grad_norm": 2.859375, + "learning_rate": 1.999852676206653e-05, + "loss": 0.6556, + "mean_token_accuracy": 0.8755202710628509, + "num_tokens": 4403253.0, + "step": 1060 + }, + { + "entropy": 0.2266766732558608, + "epoch": 0.24828068539456813, + "grad_norm": 1.4609375, + "learning_rate": 1.99985119042195e-05, + "loss": 0.3077, + "mean_token_accuracy": 0.9274795413017273, + "num_tokens": 4434817.0, + "step": 1065 + }, + { + "entropy": 0.35136873573064803, + "epoch": 0.2494463224151999, + "grad_norm": 5.5, + "learning_rate": 1.9998496971837137e-05, + "loss": 1.0309, + "mean_token_accuracy": 0.8499768733978271, + "num_tokens": 4444556.0, + "step": 1070 + }, + { + "entropy": 0.2657496578991413, + "epoch": 0.2506119594358317, + "grad_norm": 1.3203125, + "learning_rate": 1.999848196491967e-05, + "loss": 0.4618, + "mean_token_accuracy": 0.9091532528400421, + "num_tokens": 4470876.0, + "step": 1075 + }, + { + "entropy": 0.2768456295132637, + "epoch": 0.25177759645646347, + "grad_norm": 3.0625, + "learning_rate": 1.9998466883467316e-05, + "loss": 0.7123, + "mean_token_accuracy": 0.8955871105194092, + "num_tokens": 4482427.0, + "step": 1080 + }, + { + "entropy": 0.20596542172133922, + "epoch": 0.25294323347709524, + "grad_norm": 2.578125, + "learning_rate": 1.9998451727480302e-05, + "loss": 0.3453, + "mean_token_accuracy": 0.9377319753170014, + "num_tokens": 4504480.0, + "step": 1085 + }, + { + "entropy": 0.3682212561368942, + "epoch": 0.254108870497727, + "grad_norm": 3.21875, + "learning_rate": 1.999843649695886e-05, + "loss": 0.7303, + "mean_token_accuracy": 0.8806298255920411, + "num_tokens": 4513098.0, + "step": 1090 + }, + { + "entropy": 0.20294536799192428, + "epoch": 0.2552745075183588, + "grad_norm": 0.373046875, + "learning_rate": 1.9998421191903204e-05, + "loss": 0.4142, + "mean_token_accuracy": 0.9349018275737763, + "num_tokens": 4538371.0, + "step": 1095 + }, + { + "entropy": 0.2562996305525303, + "epoch": 0.25644014453899056, + "grad_norm": 1.5625, + "learning_rate": 1.9998405812313573e-05, + "loss": 0.5281, + "mean_token_accuracy": 0.9155136287212372, + "num_tokens": 4552309.0, + "step": 1100 + }, + { + "entropy": 0.38680734038352965, + "epoch": 0.2576057815596223, + "grad_norm": 4.125, + "learning_rate": 1.9998390358190197e-05, + "loss": 0.6167, + "mean_token_accuracy": 0.8845182538032532, + "num_tokens": 4579283.0, + "step": 1105 + }, + { + "entropy": 0.39438874665647744, + "epoch": 0.2587714185802541, + "grad_norm": 0.6484375, + "learning_rate": 1.9998374829533298e-05, + "loss": 0.6791, + "mean_token_accuracy": 0.8696678400039672, + "num_tokens": 4597972.0, + "step": 1110 + }, + { + "entropy": 0.29664220958948134, + "epoch": 0.25993705560088587, + "grad_norm": 1.21875, + "learning_rate": 1.9998359226343113e-05, + "loss": 0.4791, + "mean_token_accuracy": 0.9082686901092529, + "num_tokens": 4615795.0, + "step": 1115 + }, + { + "entropy": 0.2910390578210354, + "epoch": 0.26110269262151764, + "grad_norm": 0.73828125, + "learning_rate": 1.9998343548619878e-05, + "loss": 0.5356, + "mean_token_accuracy": 0.8995784521102905, + "num_tokens": 4630562.0, + "step": 1120 + }, + { + "entropy": 0.3535892143845558, + "epoch": 0.2622683296421494, + "grad_norm": 2.421875, + "learning_rate": 1.9998327796363818e-05, + "loss": 0.7542, + "mean_token_accuracy": 0.8523446142673492, + "num_tokens": 4644011.0, + "step": 1125 + }, + { + "entropy": 0.31830633580684664, + "epoch": 0.2634339666627812, + "grad_norm": 1.578125, + "learning_rate": 1.9998311969575174e-05, + "loss": 0.5886, + "mean_token_accuracy": 0.8993684887886048, + "num_tokens": 4656443.0, + "step": 1130 + }, + { + "entropy": 0.19877025708556176, + "epoch": 0.264599603683413, + "grad_norm": 1.4296875, + "learning_rate": 1.9998296068254183e-05, + "loss": 0.3187, + "mean_token_accuracy": 0.9322415173053742, + "num_tokens": 4700790.0, + "step": 1135 + }, + { + "entropy": 0.3790184512734413, + "epoch": 0.2657652407040448, + "grad_norm": 2.75, + "learning_rate": 1.9998280092401076e-05, + "loss": 0.7684, + "mean_token_accuracy": 0.8741475522518158, + "num_tokens": 4711374.0, + "step": 1140 + }, + { + "entropy": 0.4036370933055878, + "epoch": 0.26693087772467655, + "grad_norm": 4.28125, + "learning_rate": 1.9998264042016096e-05, + "loss": 0.8014, + "mean_token_accuracy": 0.8721184432506561, + "num_tokens": 4720625.0, + "step": 1145 + }, + { + "entropy": 0.31758620887994765, + "epoch": 0.2680965147453083, + "grad_norm": 1.2265625, + "learning_rate": 1.9998247917099482e-05, + "loss": 0.8177, + "mean_token_accuracy": 0.8595401406288147, + "num_tokens": 4731984.0, + "step": 1150 + }, + { + "entropy": 0.3104529224336147, + "epoch": 0.2692621517659401, + "grad_norm": 2.265625, + "learning_rate": 1.9998231717651476e-05, + "loss": 0.5926, + "mean_token_accuracy": 0.8918087661266327, + "num_tokens": 4754962.0, + "step": 1155 + }, + { + "entropy": 0.4066608652472496, + "epoch": 0.27042778878657187, + "grad_norm": 1.8046875, + "learning_rate": 1.9998215443672316e-05, + "loss": 0.8178, + "mean_token_accuracy": 0.8719375371932984, + "num_tokens": 4765334.0, + "step": 1160 + }, + { + "entropy": 0.34720696657896044, + "epoch": 0.27159342580720364, + "grad_norm": 3.046875, + "learning_rate": 1.9998199095162242e-05, + "loss": 0.7829, + "mean_token_accuracy": 0.869851005077362, + "num_tokens": 4778433.0, + "step": 1165 + }, + { + "entropy": 0.3552531830966473, + "epoch": 0.2727590628278354, + "grad_norm": 0.75390625, + "learning_rate": 1.9998182672121506e-05, + "loss": 0.5921, + "mean_token_accuracy": 0.8870375871658325, + "num_tokens": 4812441.0, + "step": 1170 + }, + { + "entropy": 0.28509389981627464, + "epoch": 0.2739246998484672, + "grad_norm": 1.4296875, + "learning_rate": 1.9998166174550348e-05, + "loss": 0.4108, + "mean_token_accuracy": 0.9073728084564209, + "num_tokens": 4836739.0, + "step": 1175 + }, + { + "entropy": 0.4455971851944923, + "epoch": 0.27509033686909895, + "grad_norm": 2.90625, + "learning_rate": 1.9998149602449014e-05, + "loss": 0.9755, + "mean_token_accuracy": 0.8507632434368133, + "num_tokens": 4844095.0, + "step": 1180 + }, + { + "entropy": 0.15860873758792876, + "epoch": 0.2762559738897307, + "grad_norm": 0.55859375, + "learning_rate": 1.9998132955817753e-05, + "loss": 0.2743, + "mean_token_accuracy": 0.9363688051700592, + "num_tokens": 4874147.0, + "step": 1185 + }, + { + "entropy": 0.3147427745163441, + "epoch": 0.2774216109103625, + "grad_norm": 0.365234375, + "learning_rate": 1.999811623465681e-05, + "loss": 0.5888, + "mean_token_accuracy": 0.9021156907081604, + "num_tokens": 4891157.0, + "step": 1190 + }, + { + "entropy": 0.20595242828130722, + "epoch": 0.27858724793099426, + "grad_norm": 1.28125, + "learning_rate": 1.9998099438966437e-05, + "loss": 0.2696, + "mean_token_accuracy": 0.9339096426963807, + "num_tokens": 4933168.0, + "step": 1195 + }, + { + "entropy": 0.33057937026023865, + "epoch": 0.2797528849516261, + "grad_norm": 2.15625, + "learning_rate": 1.9998082568746888e-05, + "loss": 0.6004, + "mean_token_accuracy": 0.8792477488517761, + "num_tokens": 4955692.0, + "step": 1200 + }, + { + "entropy": 0.3509062934666872, + "epoch": 0.28091852197225786, + "grad_norm": 1.4453125, + "learning_rate": 1.9998065623998403e-05, + "loss": 0.5698, + "mean_token_accuracy": 0.8836005508899689, + "num_tokens": 4971106.0, + "step": 1205 + }, + { + "entropy": 0.3051776558160782, + "epoch": 0.28208415899288963, + "grad_norm": 0.412109375, + "learning_rate": 1.9998048604721248e-05, + "loss": 0.4699, + "mean_token_accuracy": 0.9227140247821808, + "num_tokens": 5008881.0, + "step": 1210 + }, + { + "entropy": 0.38542407751083374, + "epoch": 0.2832497960135214, + "grad_norm": 2.984375, + "learning_rate": 1.9998031510915666e-05, + "loss": 0.6204, + "mean_token_accuracy": 0.8672150433063507, + "num_tokens": 5028439.0, + "step": 1215 + }, + { + "entropy": 0.3344720020890236, + "epoch": 0.2844154330341532, + "grad_norm": 3.21875, + "learning_rate": 1.9998014342581922e-05, + "loss": 0.6178, + "mean_token_accuracy": 0.8967957139015198, + "num_tokens": 5042219.0, + "step": 1220 + }, + { + "entropy": 0.26223552152514457, + "epoch": 0.28558107005478495, + "grad_norm": 0.87890625, + "learning_rate": 1.9997997099720263e-05, + "loss": 0.1843, + "mean_token_accuracy": 0.9286192059516907, + "num_tokens": 5070919.0, + "step": 1225 + }, + { + "entropy": 0.3363313525915146, + "epoch": 0.2867467070754167, + "grad_norm": 8.875, + "learning_rate": 1.9997979782330953e-05, + "loss": 0.8308, + "mean_token_accuracy": 0.8689966082572937, + "num_tokens": 5081678.0, + "step": 1230 + }, + { + "entropy": 0.3352774230763316, + "epoch": 0.2879123440960485, + "grad_norm": 2.90625, + "learning_rate": 1.9997962390414243e-05, + "loss": 0.7084, + "mean_token_accuracy": 0.8848056674003602, + "num_tokens": 5103156.0, + "step": 1235 + }, + { + "entropy": 0.2327698018401861, + "epoch": 0.28907798111668026, + "grad_norm": 1.25, + "learning_rate": 1.9997944923970397e-05, + "loss": 0.3922, + "mean_token_accuracy": 0.92334885597229, + "num_tokens": 5123337.0, + "step": 1240 + }, + { + "entropy": 0.27879791483283045, + "epoch": 0.29024361813731203, + "grad_norm": 2.328125, + "learning_rate": 1.9997927382999677e-05, + "loss": 0.445, + "mean_token_accuracy": 0.9180491745471955, + "num_tokens": 5148142.0, + "step": 1245 + }, + { + "entropy": 0.4868896633386612, + "epoch": 0.2914092551579438, + "grad_norm": 2.65625, + "learning_rate": 1.9997909767502342e-05, + "loss": 0.6209, + "mean_token_accuracy": 0.8842796742916107, + "num_tokens": 5164243.0, + "step": 1250 + }, + { + "entropy": 0.2981738731265068, + "epoch": 0.2925748921785756, + "grad_norm": 6.03125, + "learning_rate": 1.9997892077478654e-05, + "loss": 0.5485, + "mean_token_accuracy": 0.9020951867103577, + "num_tokens": 5186669.0, + "step": 1255 + }, + { + "entropy": 0.3072267949581146, + "epoch": 0.29374052919920735, + "grad_norm": 0.8828125, + "learning_rate": 1.9997874312928878e-05, + "loss": 0.4802, + "mean_token_accuracy": 0.8995555102825165, + "num_tokens": 5209969.0, + "step": 1260 + }, + { + "entropy": 0.2941398710012436, + "epoch": 0.2949061662198391, + "grad_norm": 0.349609375, + "learning_rate": 1.999785647385328e-05, + "loss": 0.3772, + "mean_token_accuracy": 0.9175294041633606, + "num_tokens": 5236167.0, + "step": 1265 + }, + { + "entropy": 0.420187583938241, + "epoch": 0.29607180324047094, + "grad_norm": 2.765625, + "learning_rate": 1.9997838560252122e-05, + "loss": 0.6751, + "mean_token_accuracy": 0.8618550479412079, + "num_tokens": 5253019.0, + "step": 1270 + }, + { + "entropy": 0.30597032606601715, + "epoch": 0.2972374402611027, + "grad_norm": 4.0625, + "learning_rate": 1.999782057212568e-05, + "loss": 0.7702, + "mean_token_accuracy": 0.8824066698551178, + "num_tokens": 5272393.0, + "step": 1275 + }, + { + "entropy": 0.32190938405692576, + "epoch": 0.2984030772817345, + "grad_norm": 0.94140625, + "learning_rate": 1.999780250947421e-05, + "loss": 0.5561, + "mean_token_accuracy": 0.8893412470817565, + "num_tokens": 5287540.0, + "step": 1280 + }, + { + "entropy": 0.25939189046621325, + "epoch": 0.29956871430236626, + "grad_norm": 5.09375, + "learning_rate": 1.9997784372297987e-05, + "loss": 0.5246, + "mean_token_accuracy": 0.9045629918575286, + "num_tokens": 5306321.0, + "step": 1285 + }, + { + "entropy": 0.3444667488336563, + "epoch": 0.30073435132299803, + "grad_norm": 3.28125, + "learning_rate": 1.9997766160597285e-05, + "loss": 0.7877, + "mean_token_accuracy": 0.866803640127182, + "num_tokens": 5323357.0, + "step": 1290 + }, + { + "entropy": 0.2251311082392931, + "epoch": 0.3018999883436298, + "grad_norm": 1.953125, + "learning_rate": 1.9997747874372374e-05, + "loss": 0.4677, + "mean_token_accuracy": 0.9120672941207886, + "num_tokens": 5342328.0, + "step": 1295 + }, + { + "entropy": 0.4307561069726944, + "epoch": 0.30306562536426157, + "grad_norm": 1.671875, + "learning_rate": 1.9997729513623523e-05, + "loss": 0.7848, + "mean_token_accuracy": 0.8459433108568192, + "num_tokens": 5357527.0, + "step": 1300 + }, + { + "entropy": 0.22801509127020836, + "epoch": 0.30423126238489334, + "grad_norm": 2.25, + "learning_rate": 1.999771107835101e-05, + "loss": 0.5102, + "mean_token_accuracy": 0.904511559009552, + "num_tokens": 5372786.0, + "step": 1305 + }, + { + "entropy": 0.38460721224546435, + "epoch": 0.3053968994055251, + "grad_norm": 4.46875, + "learning_rate": 1.9997692568555102e-05, + "loss": 0.7097, + "mean_token_accuracy": 0.8811452388763428, + "num_tokens": 5383589.0, + "step": 1310 + }, + { + "entropy": 0.34651473462581633, + "epoch": 0.3065625364261569, + "grad_norm": 2.0, + "learning_rate": 1.9997673984236085e-05, + "loss": 0.7524, + "mean_token_accuracy": 0.8840455830097198, + "num_tokens": 5394471.0, + "step": 1315 + }, + { + "entropy": 0.25617843568325044, + "epoch": 0.30772817344678866, + "grad_norm": 2.53125, + "learning_rate": 1.9997655325394232e-05, + "loss": 0.6068, + "mean_token_accuracy": 0.910620141029358, + "num_tokens": 5408393.0, + "step": 1320 + }, + { + "entropy": 0.2709501812234521, + "epoch": 0.3088938104674204, + "grad_norm": 2.46875, + "learning_rate": 1.999763659202982e-05, + "loss": 0.4298, + "mean_token_accuracy": 0.8932905435562134, + "num_tokens": 5437024.0, + "step": 1325 + }, + { + "entropy": 0.28042895793914796, + "epoch": 0.3100594474880522, + "grad_norm": 0.5234375, + "learning_rate": 1.9997617784143132e-05, + "loss": 0.5373, + "mean_token_accuracy": 0.883009546995163, + "num_tokens": 5466512.0, + "step": 1330 + }, + { + "entropy": 0.29205573797225953, + "epoch": 0.31122508450868397, + "grad_norm": 1.7265625, + "learning_rate": 1.9997598901734444e-05, + "loss": 0.5968, + "mean_token_accuracy": 0.8985353469848633, + "num_tokens": 5480158.0, + "step": 1335 + }, + { + "entropy": 0.28621186055243014, + "epoch": 0.3123907215293158, + "grad_norm": 1.65625, + "learning_rate": 1.9997579944804038e-05, + "loss": 0.4702, + "mean_token_accuracy": 0.9099491775035858, + "num_tokens": 5496242.0, + "step": 1340 + }, + { + "entropy": 0.388295142352581, + "epoch": 0.31355635854994757, + "grad_norm": 2.546875, + "learning_rate": 1.9997560913352202e-05, + "loss": 0.7143, + "mean_token_accuracy": 0.8692742168903351, + "num_tokens": 5521237.0, + "step": 1345 + }, + { + "entropy": 0.26313092596828935, + "epoch": 0.31472199557057934, + "grad_norm": 0.35546875, + "learning_rate": 1.9997541807379213e-05, + "loss": 0.5295, + "mean_token_accuracy": 0.9129705667495728, + "num_tokens": 5543355.0, + "step": 1350 + }, + { + "entropy": 0.2963793471455574, + "epoch": 0.3158876325912111, + "grad_norm": 4.75, + "learning_rate": 1.999752262688536e-05, + "loss": 0.5275, + "mean_token_accuracy": 0.8948480784893036, + "num_tokens": 5567454.0, + "step": 1355 + }, + { + "entropy": 0.23987912703305483, + "epoch": 0.3170532696118429, + "grad_norm": 2.921875, + "learning_rate": 1.999750337187093e-05, + "loss": 0.5735, + "mean_token_accuracy": 0.8970786690711975, + "num_tokens": 5591399.0, + "step": 1360 + }, + { + "entropy": 0.44506589621305465, + "epoch": 0.31821890663247465, + "grad_norm": 3.625, + "learning_rate": 1.9997484042336207e-05, + "loss": 0.7905, + "mean_token_accuracy": 0.8698137998580933, + "num_tokens": 5606132.0, + "step": 1365 + }, + { + "entropy": 0.2002415034919977, + "epoch": 0.3193845436531064, + "grad_norm": 0.265625, + "learning_rate": 1.9997464638281475e-05, + "loss": 0.1768, + "mean_token_accuracy": 0.9350118517875672, + "num_tokens": 5645491.0, + "step": 1370 + }, + { + "entropy": 0.233334668725729, + "epoch": 0.3205501806737382, + "grad_norm": 0.8828125, + "learning_rate": 1.9997445159707035e-05, + "loss": 0.2001, + "mean_token_accuracy": 0.9283820569515229, + "num_tokens": 5677361.0, + "step": 1375 + }, + { + "entropy": 0.28531249314546586, + "epoch": 0.32171581769436997, + "grad_norm": 0.859375, + "learning_rate": 1.999742560661317e-05, + "loss": 0.8321, + "mean_token_accuracy": 0.8798725843429566, + "num_tokens": 5691533.0, + "step": 1380 + }, + { + "entropy": 0.33057908453047274, + "epoch": 0.32288145471500174, + "grad_norm": 2.234375, + "learning_rate": 1.9997405979000172e-05, + "loss": 0.6412, + "mean_token_accuracy": 0.8915965855121613, + "num_tokens": 5706375.0, + "step": 1385 + }, + { + "entropy": 0.33576981276273726, + "epoch": 0.3240470917356335, + "grad_norm": 3.125, + "learning_rate": 1.9997386276868332e-05, + "loss": 0.6126, + "mean_token_accuracy": 0.8828125953674316, + "num_tokens": 5723036.0, + "step": 1390 + }, + { + "entropy": 0.4078744914382696, + "epoch": 0.3252127287562653, + "grad_norm": 3.046875, + "learning_rate": 1.999736650021795e-05, + "loss": 0.573, + "mean_token_accuracy": 0.8879797518253326, + "num_tokens": 5745820.0, + "step": 1395 + }, + { + "entropy": 0.40988370552659037, + "epoch": 0.32637836577689705, + "grad_norm": 2.421875, + "learning_rate": 1.9997346649049314e-05, + "loss": 0.6882, + "mean_token_accuracy": 0.8822323262691498, + "num_tokens": 5773501.0, + "step": 1400 + }, + { + "entropy": 0.2149012926965952, + "epoch": 0.3275440027975289, + "grad_norm": 0.62890625, + "learning_rate": 1.9997326723362725e-05, + "loss": 0.2689, + "mean_token_accuracy": 0.9297385811805725, + "num_tokens": 5800329.0, + "step": 1405 + }, + { + "entropy": 0.38212478160858154, + "epoch": 0.32870963981816065, + "grad_norm": 4.28125, + "learning_rate": 1.9997306723158477e-05, + "loss": 0.7987, + "mean_token_accuracy": 0.8482033610343933, + "num_tokens": 5809536.0, + "step": 1410 + }, + { + "entropy": 0.33134956248104575, + "epoch": 0.3298752768387924, + "grad_norm": 2.84375, + "learning_rate": 1.999728664843687e-05, + "loss": 0.5945, + "mean_token_accuracy": 0.8864220082759857, + "num_tokens": 5826701.0, + "step": 1415 + }, + { + "entropy": 0.34533427506685255, + "epoch": 0.3310409138594242, + "grad_norm": 3.6875, + "learning_rate": 1.9997266499198203e-05, + "loss": 0.7109, + "mean_token_accuracy": 0.8641054213047028, + "num_tokens": 5840942.0, + "step": 1420 + }, + { + "entropy": 0.3226647362112999, + "epoch": 0.33220655088005596, + "grad_norm": 2.0625, + "learning_rate": 1.9997246275442776e-05, + "loss": 0.7447, + "mean_token_accuracy": 0.8617903709411621, + "num_tokens": 5854515.0, + "step": 1425 + }, + { + "entropy": 0.300795790925622, + "epoch": 0.33337218790068773, + "grad_norm": 1.21875, + "learning_rate": 1.999722597717089e-05, + "loss": 0.7193, + "mean_token_accuracy": 0.8860871911048889, + "num_tokens": 5870124.0, + "step": 1430 + }, + { + "entropy": 0.27445814162492754, + "epoch": 0.3345378249213195, + "grad_norm": 0.94921875, + "learning_rate": 1.999720560438285e-05, + "loss": 0.4695, + "mean_token_accuracy": 0.9113145053386689, + "num_tokens": 5894072.0, + "step": 1435 + }, + { + "entropy": 0.2666664507240057, + "epoch": 0.3357034619419513, + "grad_norm": 1.109375, + "learning_rate": 1.9997185157078958e-05, + "loss": 0.5023, + "mean_token_accuracy": 0.9125133752822876, + "num_tokens": 5908980.0, + "step": 1440 + }, + { + "entropy": 0.2709391973912716, + "epoch": 0.33686909896258305, + "grad_norm": 0.90234375, + "learning_rate": 1.9997164635259515e-05, + "loss": 0.6362, + "mean_token_accuracy": 0.9022528111934662, + "num_tokens": 5930796.0, + "step": 1445 + }, + { + "entropy": 0.31120129972696303, + "epoch": 0.3380347359832148, + "grad_norm": 1.890625, + "learning_rate": 1.9997144038924836e-05, + "loss": 0.6915, + "mean_token_accuracy": 0.8787729322910309, + "num_tokens": 5942512.0, + "step": 1450 + }, + { + "entropy": 0.41584598571062087, + "epoch": 0.3392003730038466, + "grad_norm": 4.09375, + "learning_rate": 1.999712336807522e-05, + "loss": 0.8791, + "mean_token_accuracy": 0.8504727482795715, + "num_tokens": 5954320.0, + "step": 1455 + }, + { + "entropy": 0.3380214340984821, + "epoch": 0.34036601002447836, + "grad_norm": 2.421875, + "learning_rate": 1.9997102622710983e-05, + "loss": 0.5363, + "mean_token_accuracy": 0.8888375997543335, + "num_tokens": 5981457.0, + "step": 1460 + }, + { + "entropy": 0.2891572520136833, + "epoch": 0.34153164704511013, + "grad_norm": 0.2890625, + "learning_rate": 1.9997081802832427e-05, + "loss": 0.6988, + "mean_token_accuracy": 0.8861670017242431, + "num_tokens": 6004171.0, + "step": 1465 + }, + { + "entropy": 0.3689094468951225, + "epoch": 0.3426972840657419, + "grad_norm": 1.6953125, + "learning_rate": 1.9997060908439864e-05, + "loss": 0.8413, + "mean_token_accuracy": 0.8352766156196594, + "num_tokens": 6013575.0, + "step": 1470 + }, + { + "entropy": 0.262055104598403, + "epoch": 0.34386292108637373, + "grad_norm": 2.703125, + "learning_rate": 1.999703993953361e-05, + "loss": 0.4639, + "mean_token_accuracy": 0.9029669284820556, + "num_tokens": 6033804.0, + "step": 1475 + }, + { + "entropy": 0.2753362640738487, + "epoch": 0.3450285581070055, + "grad_norm": 2.28125, + "learning_rate": 1.999701889611397e-05, + "loss": 0.4095, + "mean_token_accuracy": 0.9083203911781311, + "num_tokens": 6055032.0, + "step": 1480 + }, + { + "entropy": 0.32363075017929077, + "epoch": 0.34619419512763727, + "grad_norm": 5.0, + "learning_rate": 1.9996997778181268e-05, + "loss": 0.8892, + "mean_token_accuracy": 0.8496161878108979, + "num_tokens": 6065287.0, + "step": 1485 + }, + { + "entropy": 0.27738819643855095, + "epoch": 0.34735983214826904, + "grad_norm": 2.171875, + "learning_rate": 1.999697658573581e-05, + "loss": 0.5262, + "mean_token_accuracy": 0.904751843214035, + "num_tokens": 6080176.0, + "step": 1490 + }, + { + "entropy": 0.29178103134036065, + "epoch": 0.3485254691689008, + "grad_norm": 1.40625, + "learning_rate": 1.9996955318777914e-05, + "loss": 0.6819, + "mean_token_accuracy": 0.8912020802497864, + "num_tokens": 6104331.0, + "step": 1495 + }, + { + "entropy": 0.30065750852227213, + "epoch": 0.3496911061895326, + "grad_norm": 4.78125, + "learning_rate": 1.99969339773079e-05, + "loss": 0.4326, + "mean_token_accuracy": 0.8998681366443634, + "num_tokens": 6120457.0, + "step": 1500 + }, + { + "entropy": 0.38931316807866095, + "epoch": 0.35085674321016436, + "grad_norm": 0.240234375, + "learning_rate": 1.9996912561326082e-05, + "loss": 0.7776, + "mean_token_accuracy": 0.8692282378673554, + "num_tokens": 6142385.0, + "step": 1505 + }, + { + "entropy": 0.2787280652672052, + "epoch": 0.35202238023079613, + "grad_norm": 5.71875, + "learning_rate": 1.9996891070832785e-05, + "loss": 0.5146, + "mean_token_accuracy": 0.8925648927688599, + "num_tokens": 6167164.0, + "step": 1510 + }, + { + "entropy": 0.38545300289988516, + "epoch": 0.3531880172514279, + "grad_norm": 0.38671875, + "learning_rate": 1.9996869505828327e-05, + "loss": 0.6809, + "mean_token_accuracy": 0.873318862915039, + "num_tokens": 6181075.0, + "step": 1515 + }, + { + "entropy": 0.4982053153216839, + "epoch": 0.35435365427205967, + "grad_norm": 2.453125, + "learning_rate": 1.9996847866313026e-05, + "loss": 0.8632, + "mean_token_accuracy": 0.8616338968276978, + "num_tokens": 6192075.0, + "step": 1520 + }, + { + "entropy": 0.29975649788975717, + "epoch": 0.35551929129269144, + "grad_norm": 1.203125, + "learning_rate": 1.999682615228721e-05, + "loss": 0.4278, + "mean_token_accuracy": 0.8942910373210907, + "num_tokens": 6211561.0, + "step": 1525 + }, + { + "entropy": 0.35748309791088106, + "epoch": 0.3566849283133232, + "grad_norm": 1.546875, + "learning_rate": 1.99968043637512e-05, + "loss": 0.8788, + "mean_token_accuracy": 0.8682412981987, + "num_tokens": 6222311.0, + "step": 1530 + }, + { + "entropy": 0.30189107209444044, + "epoch": 0.357850565333955, + "grad_norm": 2.65625, + "learning_rate": 1.999678250070532e-05, + "loss": 0.6564, + "mean_token_accuracy": 0.8856256902217865, + "num_tokens": 6232548.0, + "step": 1535 + }, + { + "entropy": 0.30567995868623254, + "epoch": 0.35901620235458676, + "grad_norm": 0.57421875, + "learning_rate": 1.9996760563149898e-05, + "loss": 0.6377, + "mean_token_accuracy": 0.8897257745265961, + "num_tokens": 6256342.0, + "step": 1540 + }, + { + "entropy": 0.3873858168721199, + "epoch": 0.3601818393752186, + "grad_norm": 4.5625, + "learning_rate": 1.999673855108526e-05, + "loss": 0.7463, + "mean_token_accuracy": 0.8492149412631989, + "num_tokens": 6269451.0, + "step": 1545 + }, + { + "entropy": 0.333609714359045, + "epoch": 0.36134747639585035, + "grad_norm": 5.0625, + "learning_rate": 1.9996716464511735e-05, + "loss": 0.7832, + "mean_token_accuracy": 0.8789271175861358, + "num_tokens": 6279773.0, + "step": 1550 + }, + { + "entropy": 0.27410609275102615, + "epoch": 0.3625131134164821, + "grad_norm": 2.703125, + "learning_rate": 1.9996694303429653e-05, + "loss": 0.6224, + "mean_token_accuracy": 0.9015877664089202, + "num_tokens": 6298073.0, + "step": 1555 + }, + { + "entropy": 0.2983283132314682, + "epoch": 0.3636787504371139, + "grad_norm": 0.8359375, + "learning_rate": 1.9996672067839344e-05, + "loss": 0.541, + "mean_token_accuracy": 0.8865351200103759, + "num_tokens": 6328075.0, + "step": 1560 + }, + { + "entropy": 0.3246372312307358, + "epoch": 0.36484438745774567, + "grad_norm": 0.828125, + "learning_rate": 1.9996649757741138e-05, + "loss": 0.5547, + "mean_token_accuracy": 0.8967100918293, + "num_tokens": 6356726.0, + "step": 1565 + }, + { + "entropy": 0.26282679475843906, + "epoch": 0.36601002447837744, + "grad_norm": 1.9609375, + "learning_rate": 1.999662737313537e-05, + "loss": 0.3613, + "mean_token_accuracy": 0.917979234457016, + "num_tokens": 6380621.0, + "step": 1570 + }, + { + "entropy": 0.3043300576508045, + "epoch": 0.3671756614990092, + "grad_norm": 1.2734375, + "learning_rate": 1.999660491402237e-05, + "loss": 0.3796, + "mean_token_accuracy": 0.9121894776821137, + "num_tokens": 6405319.0, + "step": 1575 + }, + { + "entropy": 0.2572882641106844, + "epoch": 0.368341298519641, + "grad_norm": 3.28125, + "learning_rate": 1.9996582380402475e-05, + "loss": 0.6378, + "mean_token_accuracy": 0.8814368367195129, + "num_tokens": 6424737.0, + "step": 1580 + }, + { + "entropy": 0.21875108852982522, + "epoch": 0.36950693554027275, + "grad_norm": 0.87890625, + "learning_rate": 1.9996559772276024e-05, + "loss": 0.3042, + "mean_token_accuracy": 0.9384470582008362, + "num_tokens": 6459703.0, + "step": 1585 + }, + { + "entropy": 0.3576366286724806, + "epoch": 0.3706725725609045, + "grad_norm": 1.0234375, + "learning_rate": 1.9996537089643352e-05, + "loss": 0.5118, + "mean_token_accuracy": 0.8810594260692597, + "num_tokens": 6486737.0, + "step": 1590 + }, + { + "entropy": 0.22341410033404827, + "epoch": 0.3718382095815363, + "grad_norm": 0.9375, + "learning_rate": 1.9996514332504795e-05, + "loss": 0.3055, + "mean_token_accuracy": 0.9157830238342285, + "num_tokens": 6522014.0, + "step": 1595 + }, + { + "entropy": 0.3736105814576149, + "epoch": 0.37300384660216807, + "grad_norm": 1.09375, + "learning_rate": 1.999649150086069e-05, + "loss": 0.854, + "mean_token_accuracy": 0.8709351003170014, + "num_tokens": 6530820.0, + "step": 1600 + }, + { + "entropy": 0.3235956601798534, + "epoch": 0.37416948362279984, + "grad_norm": 2.828125, + "learning_rate": 1.999646859471139e-05, + "loss": 0.5949, + "mean_token_accuracy": 0.8829461574554444, + "num_tokens": 6545363.0, + "step": 1605 + }, + { + "entropy": 0.2963440466672182, + "epoch": 0.3753351206434316, + "grad_norm": 1.8984375, + "learning_rate": 1.9996445614057227e-05, + "loss": 0.5785, + "mean_token_accuracy": 0.9058303475379944, + "num_tokens": 6561658.0, + "step": 1610 + }, + { + "entropy": 0.34780505336821077, + "epoch": 0.37650075766406343, + "grad_norm": 0.3125, + "learning_rate": 1.999642255889854e-05, + "loss": 0.7008, + "mean_token_accuracy": 0.8779121518135071, + "num_tokens": 6586862.0, + "step": 1615 + }, + { + "entropy": 0.31498599648475645, + "epoch": 0.3776663946846952, + "grad_norm": 6.125, + "learning_rate": 1.999639942923568e-05, + "loss": 0.7251, + "mean_token_accuracy": 0.8791738033294678, + "num_tokens": 6600831.0, + "step": 1620 + }, + { + "entropy": 0.4721190705895424, + "epoch": 0.378832031705327, + "grad_norm": 2.015625, + "learning_rate": 1.999637622506899e-05, + "loss": 0.8876, + "mean_token_accuracy": 0.8171874463558197, + "num_tokens": 6627349.0, + "step": 1625 + }, + { + "entropy": 0.24465088918805122, + "epoch": 0.37999766872595875, + "grad_norm": 1.3671875, + "learning_rate": 1.9996352946398812e-05, + "loss": 0.5261, + "mean_token_accuracy": 0.9152758538722991, + "num_tokens": 6640587.0, + "step": 1630 + }, + { + "entropy": 0.19936469215899705, + "epoch": 0.3811633057465905, + "grad_norm": 1.421875, + "learning_rate": 1.99963295932255e-05, + "loss": 0.1646, + "mean_token_accuracy": 0.9334083795547485, + "num_tokens": 6683779.0, + "step": 1635 + }, + { + "entropy": 0.2622247666120529, + "epoch": 0.3823289427672223, + "grad_norm": 0.55078125, + "learning_rate": 1.9996306165549398e-05, + "loss": 0.5497, + "mean_token_accuracy": 0.9146570563316345, + "num_tokens": 6701209.0, + "step": 1640 + }, + { + "entropy": 0.2674731440842152, + "epoch": 0.38349457978785406, + "grad_norm": 1.4140625, + "learning_rate": 1.9996282663370855e-05, + "loss": 0.5693, + "mean_token_accuracy": 0.9051762282848358, + "num_tokens": 6720783.0, + "step": 1645 + }, + { + "entropy": 0.26045389622449877, + "epoch": 0.38466021680848583, + "grad_norm": 1.390625, + "learning_rate": 1.9996259086690225e-05, + "loss": 0.551, + "mean_token_accuracy": 0.907971054315567, + "num_tokens": 6732408.0, + "step": 1650 + }, + { + "entropy": 0.19415582679212093, + "epoch": 0.3858258538291176, + "grad_norm": 2.09375, + "learning_rate": 1.9996235435507857e-05, + "loss": 0.2055, + "mean_token_accuracy": 0.9295479655265808, + "num_tokens": 6762843.0, + "step": 1655 + }, + { + "entropy": 0.2839368298649788, + "epoch": 0.3869914908497494, + "grad_norm": 2.734375, + "learning_rate": 1.9996211709824103e-05, + "loss": 0.602, + "mean_token_accuracy": 0.8842163443565368, + "num_tokens": 6774026.0, + "step": 1660 + }, + { + "entropy": 0.277092095091939, + "epoch": 0.38815712787038115, + "grad_norm": 2.578125, + "learning_rate": 1.999618790963932e-05, + "loss": 0.4622, + "mean_token_accuracy": 0.8996878266334534, + "num_tokens": 6791014.0, + "step": 1665 + }, + { + "entropy": 0.30382088720798495, + "epoch": 0.3893227648910129, + "grad_norm": 1.6640625, + "learning_rate": 1.9996164034953856e-05, + "loss": 0.6885, + "mean_token_accuracy": 0.8841532289981842, + "num_tokens": 6803386.0, + "step": 1670 + }, + { + "entropy": 0.26629671454429626, + "epoch": 0.3904884019116447, + "grad_norm": 1.2890625, + "learning_rate": 1.9996140085768075e-05, + "loss": 0.2775, + "mean_token_accuracy": 0.918737781047821, + "num_tokens": 6834927.0, + "step": 1675 + }, + { + "entropy": 0.2534227319061756, + "epoch": 0.3916540389322765, + "grad_norm": 1.3203125, + "learning_rate": 1.9996116062082328e-05, + "loss": 0.4968, + "mean_token_accuracy": 0.9031592726707458, + "num_tokens": 6853376.0, + "step": 1680 + }, + { + "entropy": 0.466543810069561, + "epoch": 0.3928196759529083, + "grad_norm": 3.078125, + "learning_rate": 1.9996091963896977e-05, + "loss": 1.0411, + "mean_token_accuracy": 0.8225615203380585, + "num_tokens": 6863019.0, + "step": 1685 + }, + { + "entropy": 0.41575510799884796, + "epoch": 0.39398531297354006, + "grad_norm": 6.5, + "learning_rate": 1.999606779121238e-05, + "loss": 0.7638, + "mean_token_accuracy": 0.8457536458969116, + "num_tokens": 6877920.0, + "step": 1690 + }, + { + "entropy": 0.26502939909696577, + "epoch": 0.39515094999417183, + "grad_norm": 5.5625, + "learning_rate": 1.99960435440289e-05, + "loss": 0.5983, + "mean_token_accuracy": 0.8899563789367676, + "num_tokens": 6892767.0, + "step": 1695 + }, + { + "entropy": 0.3662456482648849, + "epoch": 0.3963165870148036, + "grad_norm": 2.75, + "learning_rate": 1.9996019222346896e-05, + "loss": 0.7134, + "mean_token_accuracy": 0.8851879596710205, + "num_tokens": 6909505.0, + "step": 1700 + }, + { + "entropy": 0.24746759831905366, + "epoch": 0.39748222403543537, + "grad_norm": 0.298828125, + "learning_rate": 1.9995994826166728e-05, + "loss": 0.3404, + "mean_token_accuracy": 0.9107642292976379, + "num_tokens": 6929397.0, + "step": 1705 + }, + { + "entropy": 0.3495868884027004, + "epoch": 0.39864786105606714, + "grad_norm": 2.375, + "learning_rate": 1.9995970355488765e-05, + "loss": 0.7069, + "mean_token_accuracy": 0.865958285331726, + "num_tokens": 6945542.0, + "step": 1710 + }, + { + "entropy": 0.2592170963063836, + "epoch": 0.3998134980766989, + "grad_norm": 0.328125, + "learning_rate": 1.999594581031337e-05, + "loss": 0.5571, + "mean_token_accuracy": 0.8999188899993896, + "num_tokens": 6971445.0, + "step": 1715 + }, + { + "entropy": 0.29373019393533467, + "epoch": 0.4009791350973307, + "grad_norm": 0.158203125, + "learning_rate": 1.999592119064091e-05, + "loss": 0.4141, + "mean_token_accuracy": 0.9174761772155762, + "num_tokens": 6999820.0, + "step": 1720 + }, + { + "entropy": 0.2551829546689987, + "epoch": 0.40214477211796246, + "grad_norm": 2.515625, + "learning_rate": 1.999589649647175e-05, + "loss": 0.6868, + "mean_token_accuracy": 0.8982496917247772, + "num_tokens": 7015487.0, + "step": 1725 + }, + { + "entropy": 0.3725998356938362, + "epoch": 0.40331040913859423, + "grad_norm": 1.0703125, + "learning_rate": 1.9995871727806257e-05, + "loss": 0.6217, + "mean_token_accuracy": 0.8710966169834137, + "num_tokens": 7047747.0, + "step": 1730 + }, + { + "entropy": 0.3806193895637989, + "epoch": 0.404476046159226, + "grad_norm": 1.359375, + "learning_rate": 1.99958468846448e-05, + "loss": 0.717, + "mean_token_accuracy": 0.8549498856067658, + "num_tokens": 7075138.0, + "step": 1735 + }, + { + "entropy": 0.5255543380975723, + "epoch": 0.40564168317985777, + "grad_norm": 3.828125, + "learning_rate": 1.9995821966987754e-05, + "loss": 1.1207, + "mean_token_accuracy": 0.8527864456176758, + "num_tokens": 7093547.0, + "step": 1740 + }, + { + "entropy": 0.28944441452622416, + "epoch": 0.40680732020048954, + "grad_norm": 0.796875, + "learning_rate": 1.9995796974835492e-05, + "loss": 0.4932, + "mean_token_accuracy": 0.9065792560577393, + "num_tokens": 7113040.0, + "step": 1745 + }, + { + "entropy": 0.341873537003994, + "epoch": 0.40797295722112137, + "grad_norm": 2.546875, + "learning_rate": 1.9995771908188377e-05, + "loss": 0.8112, + "mean_token_accuracy": 0.8654444336891174, + "num_tokens": 7138138.0, + "step": 1750 + }, + { + "entropy": 0.23857189230620862, + "epoch": 0.40913859424175314, + "grad_norm": 1.609375, + "learning_rate": 1.9995746767046794e-05, + "loss": 0.4241, + "mean_token_accuracy": 0.9137138366699219, + "num_tokens": 7162249.0, + "step": 1755 + }, + { + "entropy": 0.2880829580128193, + "epoch": 0.4103042312623849, + "grad_norm": 2.078125, + "learning_rate": 1.999572155141111e-05, + "loss": 0.6309, + "mean_token_accuracy": 0.8943787276744842, + "num_tokens": 7177792.0, + "step": 1760 + }, + { + "entropy": 0.3655803993344307, + "epoch": 0.4114698682830167, + "grad_norm": 0.3515625, + "learning_rate": 1.9995696261281703e-05, + "loss": 0.6261, + "mean_token_accuracy": 0.8850533306598664, + "num_tokens": 7192528.0, + "step": 1765 + }, + { + "entropy": 0.2609210178256035, + "epoch": 0.41263550530364845, + "grad_norm": 1.015625, + "learning_rate": 1.999567089665895e-05, + "loss": 0.3491, + "mean_token_accuracy": 0.9026422142982483, + "num_tokens": 7216693.0, + "step": 1770 + }, + { + "entropy": 0.3143754366785288, + "epoch": 0.4138011423242802, + "grad_norm": 3.453125, + "learning_rate": 1.9995645457543232e-05, + "loss": 0.5448, + "mean_token_accuracy": 0.8901515245437622, + "num_tokens": 7236689.0, + "step": 1775 + }, + { + "entropy": 0.33804006576538087, + "epoch": 0.414966779344912, + "grad_norm": 3.859375, + "learning_rate": 1.999561994393493e-05, + "loss": 0.8207, + "mean_token_accuracy": 0.8533077538013458, + "num_tokens": 7246576.0, + "step": 1780 + }, + { + "entropy": 0.19590835236012935, + "epoch": 0.41613241636554377, + "grad_norm": 1.5, + "learning_rate": 1.9995594355834417e-05, + "loss": 0.3561, + "mean_token_accuracy": 0.9268556416034699, + "num_tokens": 7274075.0, + "step": 1785 + }, + { + "entropy": 0.41329563707113265, + "epoch": 0.41729805338617554, + "grad_norm": 6.3125, + "learning_rate": 1.999556869324208e-05, + "loss": 0.9415, + "mean_token_accuracy": 0.8555647194385528, + "num_tokens": 7291078.0, + "step": 1790 + }, + { + "entropy": 0.23999712951481342, + "epoch": 0.4184636904068073, + "grad_norm": 2.140625, + "learning_rate": 1.9995542956158296e-05, + "loss": 0.4092, + "mean_token_accuracy": 0.9112877190113068, + "num_tokens": 7332326.0, + "step": 1795 + }, + { + "entropy": 0.2772911600768566, + "epoch": 0.4196293274274391, + "grad_norm": 0.482421875, + "learning_rate": 1.9995517144583454e-05, + "loss": 0.579, + "mean_token_accuracy": 0.9155928194522858, + "num_tokens": 7348012.0, + "step": 1800 + }, + { + "entropy": 0.3538199070841074, + "epoch": 0.42079496444807085, + "grad_norm": 1.09375, + "learning_rate": 1.999549125851794e-05, + "loss": 0.8149, + "mean_token_accuracy": 0.8583323359489441, + "num_tokens": 7361301.0, + "step": 1805 + }, + { + "entropy": 0.4551861360669136, + "epoch": 0.4219606014687026, + "grad_norm": 4.5, + "learning_rate": 1.9995465297962138e-05, + "loss": 1.06, + "mean_token_accuracy": 0.8336413264274597, + "num_tokens": 7368604.0, + "step": 1810 + }, + { + "entropy": 0.28974622786045073, + "epoch": 0.4231262384893344, + "grad_norm": 0.5625, + "learning_rate": 1.9995439262916433e-05, + "loss": 0.5063, + "mean_token_accuracy": 0.9072389006614685, + "num_tokens": 7392166.0, + "step": 1815 + }, + { + "entropy": 0.35121317878365516, + "epoch": 0.4242918755099662, + "grad_norm": 1.34375, + "learning_rate": 1.9995413153381215e-05, + "loss": 0.6501, + "mean_token_accuracy": 0.8880411267280579, + "num_tokens": 7409022.0, + "step": 1820 + }, + { + "entropy": 0.3598515219986439, + "epoch": 0.425457512530598, + "grad_norm": 0.7265625, + "learning_rate": 1.9995386969356875e-05, + "loss": 0.6913, + "mean_token_accuracy": 0.8816120862960816, + "num_tokens": 7431789.0, + "step": 1825 + }, + { + "entropy": 0.5655100494623184, + "epoch": 0.42662314955122976, + "grad_norm": 2.953125, + "learning_rate": 1.9995360710843797e-05, + "loss": 1.1957, + "mean_token_accuracy": 0.8307225584983826, + "num_tokens": 7454374.0, + "step": 1830 + }, + { + "entropy": 0.2977691352367401, + "epoch": 0.42778878657186153, + "grad_norm": 0.5703125, + "learning_rate": 1.9995334377842385e-05, + "loss": 0.6536, + "mean_token_accuracy": 0.8897909998893738, + "num_tokens": 7473775.0, + "step": 1835 + }, + { + "entropy": 0.255497844517231, + "epoch": 0.4289544235924933, + "grad_norm": 2.765625, + "learning_rate": 1.999530797035302e-05, + "loss": 0.4676, + "mean_token_accuracy": 0.9138294994831085, + "num_tokens": 7487979.0, + "step": 1840 + }, + { + "entropy": 0.368525767326355, + "epoch": 0.4301200606131251, + "grad_norm": 4.90625, + "learning_rate": 1.9995281488376097e-05, + "loss": 0.652, + "mean_token_accuracy": 0.8939944744110108, + "num_tokens": 7500991.0, + "step": 1845 + }, + { + "entropy": 0.3272201903164387, + "epoch": 0.43128569763375685, + "grad_norm": 2.359375, + "learning_rate": 1.9995254931912017e-05, + "loss": 0.5455, + "mean_token_accuracy": 0.885904461145401, + "num_tokens": 7514349.0, + "step": 1850 + }, + { + "entropy": 0.2522231835871935, + "epoch": 0.4324513346543886, + "grad_norm": 0.40625, + "learning_rate": 1.9995228300961175e-05, + "loss": 0.2802, + "mean_token_accuracy": 0.917109364271164, + "num_tokens": 7550584.0, + "step": 1855 + }, + { + "entropy": 0.38599798902869226, + "epoch": 0.4336169716750204, + "grad_norm": 4.09375, + "learning_rate": 1.999520159552396e-05, + "loss": 0.5278, + "mean_token_accuracy": 0.8643686592578887, + "num_tokens": 7582607.0, + "step": 1860 + }, + { + "entropy": 0.3535305127501488, + "epoch": 0.43478260869565216, + "grad_norm": 2.109375, + "learning_rate": 1.999517481560078e-05, + "loss": 1.0017, + "mean_token_accuracy": 0.8477705299854279, + "num_tokens": 7591089.0, + "step": 1865 + }, + { + "entropy": 0.2962531797587872, + "epoch": 0.43594824571628393, + "grad_norm": 0.75390625, + "learning_rate": 1.999514796119203e-05, + "loss": 0.4272, + "mean_token_accuracy": 0.9027734518051147, + "num_tokens": 7619933.0, + "step": 1870 + }, + { + "entropy": 0.23546839468181133, + "epoch": 0.4371138827369157, + "grad_norm": 0.42578125, + "learning_rate": 1.9995121032298107e-05, + "loss": 0.3185, + "mean_token_accuracy": 0.9401553213596344, + "num_tokens": 7639102.0, + "step": 1875 + }, + { + "entropy": 0.2960228305310011, + "epoch": 0.4382795197575475, + "grad_norm": 0.306640625, + "learning_rate": 1.999509402891942e-05, + "loss": 0.3647, + "mean_token_accuracy": 0.9110294997692108, + "num_tokens": 7662303.0, + "step": 1880 + }, + { + "entropy": 0.28797239661216734, + "epoch": 0.4394451567781793, + "grad_norm": 1.28125, + "learning_rate": 1.999506695105637e-05, + "loss": 0.3928, + "mean_token_accuracy": 0.9123729705810547, + "num_tokens": 7690576.0, + "step": 1885 + }, + { + "entropy": 0.4960401579737663, + "epoch": 0.4406107937988111, + "grad_norm": 2.453125, + "learning_rate": 1.9995039798709356e-05, + "loss": 0.8772, + "mean_token_accuracy": 0.8687143921852112, + "num_tokens": 7697941.0, + "step": 1890 + }, + { + "entropy": 0.3485689952969551, + "epoch": 0.44177643081944284, + "grad_norm": 1.828125, + "learning_rate": 1.9995012571878784e-05, + "loss": 0.7626, + "mean_token_accuracy": 0.8469592273235321, + "num_tokens": 7717516.0, + "step": 1895 + }, + { + "entropy": 0.3791528955101967, + "epoch": 0.4429420678400746, + "grad_norm": 3.0625, + "learning_rate": 1.9994985270565068e-05, + "loss": 0.7173, + "mean_token_accuracy": 0.8701053142547608, + "num_tokens": 7736106.0, + "step": 1900 + }, + { + "entropy": 0.25195520669221877, + "epoch": 0.4441077048607064, + "grad_norm": 1.78125, + "learning_rate": 1.99949578947686e-05, + "loss": 0.5638, + "mean_token_accuracy": 0.9047086894512176, + "num_tokens": 7748913.0, + "step": 1905 + }, + { + "entropy": 0.3371236763894558, + "epoch": 0.44527334188133816, + "grad_norm": 0.359375, + "learning_rate": 1.99949304444898e-05, + "loss": 0.5597, + "mean_token_accuracy": 0.8984895050525665, + "num_tokens": 7766235.0, + "step": 1910 + }, + { + "entropy": 0.305242264457047, + "epoch": 0.44643897890196993, + "grad_norm": 4.40625, + "learning_rate": 1.999490291972908e-05, + "loss": 0.4301, + "mean_token_accuracy": 0.9092363059520722, + "num_tokens": 7794204.0, + "step": 1915 + }, + { + "entropy": 0.2959679692983627, + "epoch": 0.4476046159226017, + "grad_norm": 2.546875, + "learning_rate": 1.9994875320486837e-05, + "loss": 0.6583, + "mean_token_accuracy": 0.9009743809700013, + "num_tokens": 7809446.0, + "step": 1920 + }, + { + "entropy": 0.24514828026294708, + "epoch": 0.44877025294323347, + "grad_norm": 2.484375, + "learning_rate": 1.9994847646763495e-05, + "loss": 0.6453, + "mean_token_accuracy": 0.8898572206497193, + "num_tokens": 7821972.0, + "step": 1925 + }, + { + "entropy": 0.40760628655552866, + "epoch": 0.44993588996386524, + "grad_norm": 1.8828125, + "learning_rate": 1.9994819898559458e-05, + "loss": 0.809, + "mean_token_accuracy": 0.8585144937038421, + "num_tokens": 7841586.0, + "step": 1930 + }, + { + "entropy": 0.44150213301181795, + "epoch": 0.451101526984497, + "grad_norm": 3.265625, + "learning_rate": 1.9994792075875147e-05, + "loss": 0.9715, + "mean_token_accuracy": 0.853536581993103, + "num_tokens": 7854407.0, + "step": 1935 + }, + { + "entropy": 0.49898901283741, + "epoch": 0.4522671640051288, + "grad_norm": 2.328125, + "learning_rate": 1.9994764178710974e-05, + "loss": 0.9198, + "mean_token_accuracy": 0.8207449436187744, + "num_tokens": 7883591.0, + "step": 1940 + }, + { + "entropy": 0.30882971081882715, + "epoch": 0.45343280102576056, + "grad_norm": 2.71875, + "learning_rate": 1.999473620706735e-05, + "loss": 0.645, + "mean_token_accuracy": 0.8867849349975586, + "num_tokens": 7903835.0, + "step": 1945 + }, + { + "entropy": 0.3548956707119942, + "epoch": 0.45459843804639233, + "grad_norm": 1.7421875, + "learning_rate": 1.9994708160944702e-05, + "loss": 0.7981, + "mean_token_accuracy": 0.85789794921875, + "num_tokens": 7914330.0, + "step": 1950 + }, + { + "entropy": 0.3712115705013275, + "epoch": 0.45576407506702415, + "grad_norm": 2.34375, + "learning_rate": 1.999468004034344e-05, + "loss": 0.6738, + "mean_token_accuracy": 0.8534205198287964, + "num_tokens": 7935519.0, + "step": 1955 + }, + { + "entropy": 0.22481756322085858, + "epoch": 0.4569297120876559, + "grad_norm": 0.58984375, + "learning_rate": 1.9994651845263986e-05, + "loss": 0.2575, + "mean_token_accuracy": 0.946908849477768, + "num_tokens": 7966658.0, + "step": 1960 + }, + { + "entropy": 0.26531825475394727, + "epoch": 0.4580953491082877, + "grad_norm": 0.3515625, + "learning_rate": 1.9994623575706762e-05, + "loss": 0.3248, + "mean_token_accuracy": 0.9264973163604736, + "num_tokens": 7987994.0, + "step": 1965 + }, + { + "entropy": 0.27761743124574423, + "epoch": 0.45926098612891947, + "grad_norm": 1.625, + "learning_rate": 1.9994595231672188e-05, + "loss": 0.5992, + "mean_token_accuracy": 0.8765576720237732, + "num_tokens": 8009731.0, + "step": 1970 + }, + { + "entropy": 0.30080183520913123, + "epoch": 0.46042662314955124, + "grad_norm": 0.546875, + "learning_rate": 1.9994566813160686e-05, + "loss": 0.6036, + "mean_token_accuracy": 0.878866708278656, + "num_tokens": 8025321.0, + "step": 1975 + }, + { + "entropy": 0.36402922198176385, + "epoch": 0.461592260170183, + "grad_norm": 5.21875, + "learning_rate": 1.999453832017268e-05, + "loss": 0.6552, + "mean_token_accuracy": 0.8646969377994538, + "num_tokens": 8053893.0, + "step": 1980 + }, + { + "entropy": 0.28284838795661926, + "epoch": 0.4627578971908148, + "grad_norm": 1.4296875, + "learning_rate": 1.9994509752708596e-05, + "loss": 0.4469, + "mean_token_accuracy": 0.9201807320117951, + "num_tokens": 8076731.0, + "step": 1985 + }, + { + "entropy": 0.37472383230924605, + "epoch": 0.46392353421144655, + "grad_norm": 2.21875, + "learning_rate": 1.999448111076886e-05, + "loss": 0.7793, + "mean_token_accuracy": 0.8679640293121338, + "num_tokens": 8085669.0, + "step": 1990 + }, + { + "entropy": 0.21981629952788354, + "epoch": 0.4650891712320783, + "grad_norm": 6.46875, + "learning_rate": 1.99944523943539e-05, + "loss": 0.4973, + "mean_token_accuracy": 0.9069263756275177, + "num_tokens": 8101729.0, + "step": 1995 + }, + { + "entropy": 0.48663657903671265, + "epoch": 0.4662548082527101, + "grad_norm": 0.9375, + "learning_rate": 1.999442360346414e-05, + "loss": 0.3556, + "mean_token_accuracy": 0.9074055433273316, + "num_tokens": 8128873.0, + "step": 2000 + }, + { + "entropy": 0.36330757662653923, + "epoch": 0.46742044527334187, + "grad_norm": 3.46875, + "learning_rate": 1.9994394738100014e-05, + "loss": 0.689, + "mean_token_accuracy": 0.8651768624782562, + "num_tokens": 8156836.0, + "step": 2005 + }, + { + "entropy": 0.29512323513627053, + "epoch": 0.46858608229397364, + "grad_norm": 1.1796875, + "learning_rate": 1.999436579826195e-05, + "loss": 0.5125, + "mean_token_accuracy": 0.8990586638450623, + "num_tokens": 8176283.0, + "step": 2010 + }, + { + "entropy": 0.3674930900335312, + "epoch": 0.4697517193146054, + "grad_norm": 1.390625, + "learning_rate": 1.999433678395038e-05, + "loss": 0.7055, + "mean_token_accuracy": 0.8752959787845611, + "num_tokens": 8211887.0, + "step": 2015 + }, + { + "entropy": 0.3832183495163918, + "epoch": 0.4709173563352372, + "grad_norm": 4.53125, + "learning_rate": 1.9994307695165732e-05, + "loss": 0.7355, + "mean_token_accuracy": 0.8708587050437927, + "num_tokens": 8231249.0, + "step": 2020 + }, + { + "entropy": 0.32334342747926714, + "epoch": 0.472082993355869, + "grad_norm": 1.1953125, + "learning_rate": 1.999427853190845e-05, + "loss": 0.4626, + "mean_token_accuracy": 0.8914056956768036, + "num_tokens": 8254247.0, + "step": 2025 + }, + { + "entropy": 0.39032787531614305, + "epoch": 0.4732486303765008, + "grad_norm": 2.609375, + "learning_rate": 1.9994249294178964e-05, + "loss": 0.6039, + "mean_token_accuracy": 0.8988476514816284, + "num_tokens": 8271475.0, + "step": 2030 + }, + { + "entropy": 0.2363220054656267, + "epoch": 0.47441426739713255, + "grad_norm": 3.453125, + "learning_rate": 1.9994219981977704e-05, + "loss": 0.3266, + "mean_token_accuracy": 0.9166711091995239, + "num_tokens": 8298232.0, + "step": 2035 + }, + { + "entropy": 0.2717366095632315, + "epoch": 0.4755799044177643, + "grad_norm": 1.96875, + "learning_rate": 1.9994190595305115e-05, + "loss": 0.4732, + "mean_token_accuracy": 0.8963268280029297, + "num_tokens": 8335594.0, + "step": 2040 + }, + { + "entropy": 0.26531027555465697, + "epoch": 0.4767455414383961, + "grad_norm": 0.384765625, + "learning_rate": 1.9994161134161632e-05, + "loss": 0.5162, + "mean_token_accuracy": 0.9053733050823212, + "num_tokens": 8353409.0, + "step": 2045 + }, + { + "entropy": 0.2092266406863928, + "epoch": 0.47791117845902786, + "grad_norm": 2.21875, + "learning_rate": 1.9994131598547698e-05, + "loss": 0.3521, + "mean_token_accuracy": 0.9272272467613221, + "num_tokens": 8391098.0, + "step": 2050 + }, + { + "entropy": 0.26025751419365406, + "epoch": 0.47907681547965963, + "grad_norm": 0.546875, + "learning_rate": 1.9994101988463748e-05, + "loss": 0.588, + "mean_token_accuracy": 0.8997723400592804, + "num_tokens": 8410541.0, + "step": 2055 + }, + { + "entropy": 0.23223126903176308, + "epoch": 0.4802424525002914, + "grad_norm": 0.59375, + "learning_rate": 1.9994072303910226e-05, + "loss": 0.2832, + "mean_token_accuracy": 0.9168312430381775, + "num_tokens": 8443139.0, + "step": 2060 + }, + { + "entropy": 0.29509652592241764, + "epoch": 0.4814080895209232, + "grad_norm": 3.078125, + "learning_rate": 1.9994042544887574e-05, + "loss": 0.3704, + "mean_token_accuracy": 0.9128056645393372, + "num_tokens": 8473193.0, + "step": 2065 + }, + { + "entropy": 0.33343763947486876, + "epoch": 0.48257372654155495, + "grad_norm": 2.75, + "learning_rate": 1.9994012711396235e-05, + "loss": 0.5006, + "mean_token_accuracy": 0.8973432779312134, + "num_tokens": 8507250.0, + "step": 2070 + }, + { + "entropy": 0.25346662774682044, + "epoch": 0.4837393635621867, + "grad_norm": 5.0, + "learning_rate": 1.999398280343666e-05, + "loss": 0.3977, + "mean_token_accuracy": 0.9170755028724671, + "num_tokens": 8527345.0, + "step": 2075 + }, + { + "entropy": 0.27076412811875344, + "epoch": 0.4849050005828185, + "grad_norm": 1.296875, + "learning_rate": 1.9993952821009284e-05, + "loss": 0.3173, + "mean_token_accuracy": 0.9087885797023774, + "num_tokens": 8553539.0, + "step": 2080 + }, + { + "entropy": 0.2886177830398083, + "epoch": 0.48607063760345026, + "grad_norm": 1.0546875, + "learning_rate": 1.9993922764114563e-05, + "loss": 0.487, + "mean_token_accuracy": 0.8957848846912384, + "num_tokens": 8571399.0, + "step": 2085 + }, + { + "entropy": 0.27993311583995817, + "epoch": 0.4872362746240821, + "grad_norm": 0.42578125, + "learning_rate": 1.9993892632752944e-05, + "loss": 0.6064, + "mean_token_accuracy": 0.9026101946830749, + "num_tokens": 8586558.0, + "step": 2090 + }, + { + "entropy": 0.3958744205534458, + "epoch": 0.48840191164471386, + "grad_norm": 3.046875, + "learning_rate": 1.999386242692487e-05, + "loss": 0.8093, + "mean_token_accuracy": 0.8745874762535095, + "num_tokens": 8595053.0, + "step": 2095 + }, + { + "entropy": 0.24650852084159852, + "epoch": 0.48956754866534563, + "grad_norm": 1.5703125, + "learning_rate": 1.9993832146630798e-05, + "loss": 0.5389, + "mean_token_accuracy": 0.911456036567688, + "num_tokens": 8615637.0, + "step": 2100 + }, + { + "entropy": 0.42095171473920345, + "epoch": 0.4907331856859774, + "grad_norm": 0.6015625, + "learning_rate": 1.9993801791871178e-05, + "loss": 0.5099, + "mean_token_accuracy": 0.8906022787094117, + "num_tokens": 8634768.0, + "step": 2105 + }, + { + "entropy": 0.21017308831214904, + "epoch": 0.4918988227066092, + "grad_norm": 2.265625, + "learning_rate": 1.9993771362646462e-05, + "loss": 0.3232, + "mean_token_accuracy": 0.9302698016166687, + "num_tokens": 8660305.0, + "step": 2110 + }, + { + "entropy": 0.3113011471927166, + "epoch": 0.49306445972724094, + "grad_norm": 2.125, + "learning_rate": 1.999374085895711e-05, + "loss": 0.5919, + "mean_token_accuracy": 0.8718327701091766, + "num_tokens": 8688826.0, + "step": 2115 + }, + { + "entropy": 0.49419403113424776, + "epoch": 0.4942300967478727, + "grad_norm": 2.28125, + "learning_rate": 1.999371028080356e-05, + "loss": 0.8276, + "mean_token_accuracy": 0.8368937015533447, + "num_tokens": 8716357.0, + "step": 2120 + }, + { + "entropy": 0.3417753532528877, + "epoch": 0.4953957337685045, + "grad_norm": 1.609375, + "learning_rate": 1.9993679628186285e-05, + "loss": 0.7675, + "mean_token_accuracy": 0.8862788140773773, + "num_tokens": 8725375.0, + "step": 2125 + }, + { + "entropy": 0.30906201936304567, + "epoch": 0.49656137078913626, + "grad_norm": 2.578125, + "learning_rate": 1.9993648901105734e-05, + "loss": 0.4319, + "mean_token_accuracy": 0.8808507740497589, + "num_tokens": 8769781.0, + "step": 2130 + }, + { + "entropy": 0.38649284690618513, + "epoch": 0.49772700780976803, + "grad_norm": 1.4921875, + "learning_rate": 1.9993618099562367e-05, + "loss": 0.6999, + "mean_token_accuracy": 0.8772247135639191, + "num_tokens": 8782281.0, + "step": 2135 + }, + { + "entropy": 0.31261972039937974, + "epoch": 0.4988926448303998, + "grad_norm": 3.8125, + "learning_rate": 1.9993587223556646e-05, + "loss": 0.6166, + "mean_token_accuracy": 0.8909159898757935, + "num_tokens": 8793835.0, + "step": 2140 + }, + { + "entropy": 0.33045312389731407, + "epoch": 0.5000582818510316, + "grad_norm": 0.6953125, + "learning_rate": 1.9993556273089027e-05, + "loss": 0.4532, + "mean_token_accuracy": 0.9167261719703674, + "num_tokens": 8825829.0, + "step": 2145 + }, + { + "entropy": 0.31448785960674286, + "epoch": 0.5012239188716634, + "grad_norm": 0.7265625, + "learning_rate": 1.999352524815997e-05, + "loss": 0.4656, + "mean_token_accuracy": 0.8962031364440918, + "num_tokens": 8846276.0, + "step": 2150 + }, + { + "entropy": 0.3556225474923849, + "epoch": 0.5023895558922952, + "grad_norm": 0.796875, + "learning_rate": 1.9993494148769944e-05, + "loss": 0.3641, + "mean_token_accuracy": 0.8676750302314759, + "num_tokens": 8882856.0, + "step": 2155 + }, + { + "entropy": 0.38040348403155805, + "epoch": 0.5035551929129269, + "grad_norm": 3.96875, + "learning_rate": 1.9993462974919412e-05, + "loss": 0.5104, + "mean_token_accuracy": 0.8853009760379791, + "num_tokens": 8900856.0, + "step": 2160 + }, + { + "entropy": 0.24543123692274094, + "epoch": 0.5047208299335587, + "grad_norm": 0.53125, + "learning_rate": 1.9993431726608832e-05, + "loss": 0.3641, + "mean_token_accuracy": 0.9188506543636322, + "num_tokens": 8939209.0, + "step": 2165 + }, + { + "entropy": 0.38197765350341795, + "epoch": 0.5058864669541905, + "grad_norm": 1.1328125, + "learning_rate": 1.9993400403838676e-05, + "loss": 0.6432, + "mean_token_accuracy": 0.8996619880199432, + "num_tokens": 8965669.0, + "step": 2170 + }, + { + "entropy": 0.21522373408079148, + "epoch": 0.5070521039748223, + "grad_norm": 1.1171875, + "learning_rate": 1.999336900660941e-05, + "loss": 0.285, + "mean_token_accuracy": 0.9250701904296875, + "num_tokens": 8995150.0, + "step": 2175 + }, + { + "entropy": 0.24849132001399993, + "epoch": 0.508217740995454, + "grad_norm": 0.94921875, + "learning_rate": 1.99933375349215e-05, + "loss": 0.2888, + "mean_token_accuracy": 0.8985641777515412, + "num_tokens": 9025083.0, + "step": 2180 + }, + { + "entropy": 0.3158140107989311, + "epoch": 0.5093833780160858, + "grad_norm": 1.7109375, + "learning_rate": 1.999330598877542e-05, + "loss": 0.7789, + "mean_token_accuracy": 0.8875580310821534, + "num_tokens": 9044814.0, + "step": 2185 + }, + { + "entropy": 0.29365613460540774, + "epoch": 0.5105490150367176, + "grad_norm": 2.1875, + "learning_rate": 1.9993274368171635e-05, + "loss": 0.489, + "mean_token_accuracy": 0.9092134416103363, + "num_tokens": 9059605.0, + "step": 2190 + }, + { + "entropy": 0.3623622298240662, + "epoch": 0.5117146520573493, + "grad_norm": 3.015625, + "learning_rate": 1.999324267311062e-05, + "loss": 0.7613, + "mean_token_accuracy": 0.8859258830547333, + "num_tokens": 9068729.0, + "step": 2195 + }, + { + "entropy": 0.2239295145496726, + "epoch": 0.5128802890779811, + "grad_norm": 1.875, + "learning_rate": 1.9993210903592845e-05, + "loss": 0.3056, + "mean_token_accuracy": 0.9294314622879029, + "num_tokens": 9104084.0, + "step": 2200 + }, + { + "entropy": 0.3030864965170622, + "epoch": 0.5140459260986129, + "grad_norm": 2.734375, + "learning_rate": 1.9993179059618786e-05, + "loss": 0.4594, + "mean_token_accuracy": 0.8959613561630249, + "num_tokens": 9130649.0, + "step": 2205 + }, + { + "entropy": 0.36943227648735044, + "epoch": 0.5152115631192447, + "grad_norm": 1.203125, + "learning_rate": 1.999314714118892e-05, + "loss": 0.7231, + "mean_token_accuracy": 0.8850698232650757, + "num_tokens": 9144581.0, + "step": 2210 + }, + { + "entropy": 0.32705827206373217, + "epoch": 0.5163772001398764, + "grad_norm": 2.0, + "learning_rate": 1.9993115148303713e-05, + "loss": 0.5517, + "mean_token_accuracy": 0.879249781370163, + "num_tokens": 9159448.0, + "step": 2215 + }, + { + "entropy": 0.3374114118516445, + "epoch": 0.5175428371605082, + "grad_norm": 1.2578125, + "learning_rate": 1.9993083080963655e-05, + "loss": 0.5958, + "mean_token_accuracy": 0.8918022990226746, + "num_tokens": 9182436.0, + "step": 2220 + }, + { + "entropy": 0.2363934338092804, + "epoch": 0.51870847418114, + "grad_norm": 1.703125, + "learning_rate": 1.9993050939169217e-05, + "loss": 0.4725, + "mean_token_accuracy": 0.9231416463851929, + "num_tokens": 9200503.0, + "step": 2225 + }, + { + "entropy": 0.21474622301757335, + "epoch": 0.5198741112017717, + "grad_norm": 1.828125, + "learning_rate": 1.999301872292088e-05, + "loss": 0.3416, + "mean_token_accuracy": 0.9220646381378174, + "num_tokens": 9229185.0, + "step": 2230 + }, + { + "entropy": 0.33478106260299684, + "epoch": 0.5210397482224035, + "grad_norm": 4.34375, + "learning_rate": 1.9992986432219122e-05, + "loss": 0.7656, + "mean_token_accuracy": 0.8595859110355377, + "num_tokens": 9239938.0, + "step": 2235 + }, + { + "entropy": 0.2523132786154747, + "epoch": 0.5222053852430353, + "grad_norm": 1.7109375, + "learning_rate": 1.999295406706443e-05, + "loss": 0.3842, + "mean_token_accuracy": 0.8974947988986969, + "num_tokens": 9262129.0, + "step": 2240 + }, + { + "entropy": 0.3062707144767046, + "epoch": 0.523371022263667, + "grad_norm": 2.859375, + "learning_rate": 1.9992921627457278e-05, + "loss": 0.6809, + "mean_token_accuracy": 0.8839954435825348, + "num_tokens": 9278003.0, + "step": 2245 + }, + { + "entropy": 0.2711375970393419, + "epoch": 0.5245366592842988, + "grad_norm": 2.90625, + "learning_rate": 1.9992889113398158e-05, + "loss": 0.5016, + "mean_token_accuracy": 0.9179113209247589, + "num_tokens": 9292655.0, + "step": 2250 + }, + { + "entropy": 0.27142874896526337, + "epoch": 0.5257022963049306, + "grad_norm": 1.1328125, + "learning_rate": 1.9992856524887553e-05, + "loss": 0.6119, + "mean_token_accuracy": 0.892951512336731, + "num_tokens": 9303397.0, + "step": 2255 + }, + { + "entropy": 0.24010205119848252, + "epoch": 0.5268679333255624, + "grad_norm": 1.8671875, + "learning_rate": 1.9992823861925944e-05, + "loss": 0.5484, + "mean_token_accuracy": 0.9073122262954711, + "num_tokens": 9315706.0, + "step": 2260 + }, + { + "entropy": 0.2968090422451496, + "epoch": 0.5280335703461942, + "grad_norm": 0.55859375, + "learning_rate": 1.999279112451382e-05, + "loss": 0.5365, + "mean_token_accuracy": 0.8851703405380249, + "num_tokens": 9328933.0, + "step": 2265 + }, + { + "entropy": 0.25726330243051054, + "epoch": 0.529199207366826, + "grad_norm": 0.345703125, + "learning_rate": 1.9992758312651673e-05, + "loss": 0.5031, + "mean_token_accuracy": 0.9195921361446381, + "num_tokens": 9353825.0, + "step": 2270 + }, + { + "entropy": 0.2275122195482254, + "epoch": 0.5303648443874578, + "grad_norm": 0.4375, + "learning_rate": 1.9992725426339995e-05, + "loss": 0.2281, + "mean_token_accuracy": 0.9220161437988281, + "num_tokens": 9392817.0, + "step": 2275 + }, + { + "entropy": 0.33882615994662046, + "epoch": 0.5315304814080896, + "grad_norm": 1.46875, + "learning_rate": 1.9992692465579266e-05, + "loss": 0.5083, + "mean_token_accuracy": 0.8918252646923065, + "num_tokens": 9418700.0, + "step": 2280 + }, + { + "entropy": 0.2928564824163914, + "epoch": 0.5326961184287213, + "grad_norm": 1.2890625, + "learning_rate": 1.9992659430369984e-05, + "loss": 0.4381, + "mean_token_accuracy": 0.9111608743667603, + "num_tokens": 9439423.0, + "step": 2285 + }, + { + "entropy": 0.2320990853011608, + "epoch": 0.5338617554493531, + "grad_norm": 1.5, + "learning_rate": 1.999262632071264e-05, + "loss": 0.4826, + "mean_token_accuracy": 0.9189637899398804, + "num_tokens": 9461479.0, + "step": 2290 + }, + { + "entropy": 0.26314267963171006, + "epoch": 0.5350273924699849, + "grad_norm": 1.0, + "learning_rate": 1.999259313660773e-05, + "loss": 0.4041, + "mean_token_accuracy": 0.9190383732318879, + "num_tokens": 9491437.0, + "step": 2295 + }, + { + "entropy": 0.24241818580776453, + "epoch": 0.5361930294906166, + "grad_norm": 1.8125, + "learning_rate": 1.9992559878055743e-05, + "loss": 0.382, + "mean_token_accuracy": 0.9189897537231445, + "num_tokens": 9514680.0, + "step": 2300 + }, + { + "entropy": 0.2492776945233345, + "epoch": 0.5373586665112484, + "grad_norm": 3.28125, + "learning_rate": 1.9992526545057184e-05, + "loss": 0.6233, + "mean_token_accuracy": 0.8866626501083374, + "num_tokens": 9535986.0, + "step": 2305 + }, + { + "entropy": 0.26340335980057716, + "epoch": 0.5385243035318802, + "grad_norm": 1.0859375, + "learning_rate": 1.9992493137612543e-05, + "loss": 0.6176, + "mean_token_accuracy": 0.8974877238273621, + "num_tokens": 9548719.0, + "step": 2310 + }, + { + "entropy": 0.44216432273387907, + "epoch": 0.539689940552512, + "grad_norm": 0.27734375, + "learning_rate": 1.999245965572232e-05, + "loss": 0.7069, + "mean_token_accuracy": 0.8899427652359009, + "num_tokens": 9585119.0, + "step": 2315 + }, + { + "entropy": 0.28867107182741164, + "epoch": 0.5408555775731437, + "grad_norm": 5.625, + "learning_rate": 1.9992426099387014e-05, + "loss": 0.5598, + "mean_token_accuracy": 0.8989390969276428, + "num_tokens": 9597896.0, + "step": 2320 + }, + { + "entropy": 0.27894147783517836, + "epoch": 0.5420212145937755, + "grad_norm": 0.400390625, + "learning_rate": 1.9992392468607127e-05, + "loss": 0.5465, + "mean_token_accuracy": 0.8931815326213837, + "num_tokens": 9619759.0, + "step": 2325 + }, + { + "entropy": 0.2875699769705534, + "epoch": 0.5431868516144073, + "grad_norm": 1.3671875, + "learning_rate": 1.999235876338316e-05, + "loss": 0.4764, + "mean_token_accuracy": 0.8938539862632752, + "num_tokens": 9639899.0, + "step": 2330 + }, + { + "entropy": 0.33880536295473573, + "epoch": 0.544352488635039, + "grad_norm": 0.27734375, + "learning_rate": 1.9992324983715612e-05, + "loss": 0.5468, + "mean_token_accuracy": 0.8840242743492126, + "num_tokens": 9683774.0, + "step": 2335 + }, + { + "entropy": 0.19824768621474503, + "epoch": 0.5455181256556708, + "grad_norm": 0.4765625, + "learning_rate": 1.999229112960499e-05, + "loss": 0.3, + "mean_token_accuracy": 0.9403502106666565, + "num_tokens": 9715282.0, + "step": 2340 + }, + { + "entropy": 0.4543208494782448, + "epoch": 0.5466837626763026, + "grad_norm": 0.734375, + "learning_rate": 1.9992257201051802e-05, + "loss": 0.6336, + "mean_token_accuracy": 0.8709604918956757, + "num_tokens": 9729682.0, + "step": 2345 + }, + { + "entropy": 0.3436585277318954, + "epoch": 0.5478493996969344, + "grad_norm": 2.234375, + "learning_rate": 1.9992223198056545e-05, + "loss": 0.6982, + "mean_token_accuracy": 0.8596957445144653, + "num_tokens": 9755689.0, + "step": 2350 + }, + { + "entropy": 0.36073665916919706, + "epoch": 0.5490150367175661, + "grad_norm": 2.984375, + "learning_rate": 1.9992189120619736e-05, + "loss": 0.9042, + "mean_token_accuracy": 0.8605527520179749, + "num_tokens": 9763825.0, + "step": 2355 + }, + { + "entropy": 0.3805040195584297, + "epoch": 0.5501806737381979, + "grad_norm": 2.390625, + "learning_rate": 1.9992154968741877e-05, + "loss": 0.7636, + "mean_token_accuracy": 0.8586589694023132, + "num_tokens": 9776348.0, + "step": 2360 + }, + { + "entropy": 0.20938777849078177, + "epoch": 0.5513463107588297, + "grad_norm": 1.609375, + "learning_rate": 1.9992120742423476e-05, + "loss": 0.4021, + "mean_token_accuracy": 0.9294235825538635, + "num_tokens": 9794244.0, + "step": 2365 + }, + { + "entropy": 0.3471638299524784, + "epoch": 0.5525119477794614, + "grad_norm": 2.65625, + "learning_rate": 1.9992086441665052e-05, + "loss": 0.5431, + "mean_token_accuracy": 0.8725887358188629, + "num_tokens": 9808138.0, + "step": 2370 + }, + { + "entropy": 0.33833655789494516, + "epoch": 0.5536775848000932, + "grad_norm": 2.734375, + "learning_rate": 1.9992052066467106e-05, + "loss": 0.6093, + "mean_token_accuracy": 0.8938002645969391, + "num_tokens": 9824435.0, + "step": 2375 + }, + { + "entropy": 0.29400101453065874, + "epoch": 0.554843221820725, + "grad_norm": 4.125, + "learning_rate": 1.9992017616830156e-05, + "loss": 0.6356, + "mean_token_accuracy": 0.8934900283813476, + "num_tokens": 9838139.0, + "step": 2380 + }, + { + "entropy": 0.30893067717552186, + "epoch": 0.5560088588413568, + "grad_norm": 1.640625, + "learning_rate": 1.9991983092754717e-05, + "loss": 0.631, + "mean_token_accuracy": 0.8898307919502259, + "num_tokens": 9855668.0, + "step": 2385 + }, + { + "entropy": 0.28324234634637835, + "epoch": 0.5571744958619885, + "grad_norm": 1.3515625, + "learning_rate": 1.99919484942413e-05, + "loss": 0.4305, + "mean_token_accuracy": 0.9174308836460113, + "num_tokens": 9870495.0, + "step": 2390 + }, + { + "entropy": 0.29785450994968415, + "epoch": 0.5583401328826203, + "grad_norm": 3.5, + "learning_rate": 1.9991913821290423e-05, + "loss": 0.7319, + "mean_token_accuracy": 0.8782787978649139, + "num_tokens": 9881271.0, + "step": 2395 + }, + { + "entropy": 0.29459096789360045, + "epoch": 0.5595057699032522, + "grad_norm": 3.03125, + "learning_rate": 1.99918790739026e-05, + "loss": 0.4413, + "mean_token_accuracy": 0.9159037947654725, + "num_tokens": 9897443.0, + "step": 2400 + }, + { + "entropy": 0.2606068912893534, + "epoch": 0.560671406923884, + "grad_norm": 1.8046875, + "learning_rate": 1.9991844252078355e-05, + "loss": 0.5554, + "mean_token_accuracy": 0.9029782056808472, + "num_tokens": 9915846.0, + "step": 2405 + }, + { + "entropy": 0.3030439902096987, + "epoch": 0.5618370439445157, + "grad_norm": 1.9296875, + "learning_rate": 1.9991809355818207e-05, + "loss": 0.4412, + "mean_token_accuracy": 0.89673712849617, + "num_tokens": 9941287.0, + "step": 2410 + }, + { + "entropy": 0.2712526571005583, + "epoch": 0.5630026809651475, + "grad_norm": 1.7109375, + "learning_rate": 1.9991774385122665e-05, + "loss": 0.3325, + "mean_token_accuracy": 0.9228278815746307, + "num_tokens": 9960373.0, + "step": 2415 + }, + { + "entropy": 0.32818218022584916, + "epoch": 0.5641683179857793, + "grad_norm": 3.28125, + "learning_rate": 1.9991739339992266e-05, + "loss": 0.6853, + "mean_token_accuracy": 0.869037389755249, + "num_tokens": 9979671.0, + "step": 2420 + }, + { + "entropy": 0.3927590001374483, + "epoch": 0.565333955006411, + "grad_norm": 0.51953125, + "learning_rate": 1.9991704220427522e-05, + "loss": 0.5787, + "mean_token_accuracy": 0.8790532112121582, + "num_tokens": 10000556.0, + "step": 2425 + }, + { + "entropy": 0.37120668292045594, + "epoch": 0.5664995920270428, + "grad_norm": 1.9609375, + "learning_rate": 1.9991669026428965e-05, + "loss": 0.8423, + "mean_token_accuracy": 0.879127699136734, + "num_tokens": 10008605.0, + "step": 2430 + }, + { + "entropy": 0.3100695013999939, + "epoch": 0.5676652290476746, + "grad_norm": 2.125, + "learning_rate": 1.999163375799711e-05, + "loss": 0.347, + "mean_token_accuracy": 0.8831665992736817, + "num_tokens": 10033245.0, + "step": 2435 + }, + { + "entropy": 0.39593042582273485, + "epoch": 0.5688308660683064, + "grad_norm": 1.9296875, + "learning_rate": 1.999159841513249e-05, + "loss": 0.6707, + "mean_token_accuracy": 0.8801019787788391, + "num_tokens": 10044394.0, + "step": 2440 + }, + { + "entropy": 0.36338729187846186, + "epoch": 0.5699965030889381, + "grad_norm": 2.125, + "learning_rate": 1.9991562997835626e-05, + "loss": 0.5301, + "mean_token_accuracy": 0.8973929762840271, + "num_tokens": 10055701.0, + "step": 2445 + }, + { + "entropy": 0.3218478888273239, + "epoch": 0.5711621401095699, + "grad_norm": 2.65625, + "learning_rate": 1.9991527506107052e-05, + "loss": 0.6329, + "mean_token_accuracy": 0.871570247411728, + "num_tokens": 10077584.0, + "step": 2450 + }, + { + "entropy": 0.2377581749111414, + "epoch": 0.5723277771302017, + "grad_norm": 4.375, + "learning_rate": 1.99914919399473e-05, + "loss": 0.5299, + "mean_token_accuracy": 0.9120466470718384, + "num_tokens": 10097728.0, + "step": 2455 + }, + { + "entropy": 0.3393100008368492, + "epoch": 0.5734934141508334, + "grad_norm": 0.94921875, + "learning_rate": 1.999145629935689e-05, + "loss": 0.4899, + "mean_token_accuracy": 0.9051253616809845, + "num_tokens": 10109022.0, + "step": 2460 + }, + { + "entropy": 0.1963107619434595, + "epoch": 0.5746590511714652, + "grad_norm": 1.25, + "learning_rate": 1.999142058433636e-05, + "loss": 0.4243, + "mean_token_accuracy": 0.9210612416267395, + "num_tokens": 10132061.0, + "step": 2465 + }, + { + "entropy": 0.32366092354059217, + "epoch": 0.575824688192097, + "grad_norm": 3.96875, + "learning_rate": 1.999138479488624e-05, + "loss": 0.68, + "mean_token_accuracy": 0.8752106666564942, + "num_tokens": 10150067.0, + "step": 2470 + }, + { + "entropy": 0.2483457863330841, + "epoch": 0.5769903252127287, + "grad_norm": 2.234375, + "learning_rate": 1.999134893100707e-05, + "loss": 0.6091, + "mean_token_accuracy": 0.9089722335338593, + "num_tokens": 10161991.0, + "step": 2475 + }, + { + "entropy": 0.32887863293290137, + "epoch": 0.5781559622333605, + "grad_norm": 2.796875, + "learning_rate": 1.9991312992699377e-05, + "loss": 0.4883, + "mean_token_accuracy": 0.8847136437892914, + "num_tokens": 10176161.0, + "step": 2480 + }, + { + "entropy": 0.3010684911161661, + "epoch": 0.5793215992539923, + "grad_norm": 1.1328125, + "learning_rate": 1.99912769799637e-05, + "loss": 0.4217, + "mean_token_accuracy": 0.893145751953125, + "num_tokens": 10206309.0, + "step": 2485 + }, + { + "entropy": 0.27251421064138415, + "epoch": 0.5804872362746241, + "grad_norm": 0.458984375, + "learning_rate": 1.9991240892800576e-05, + "loss": 0.4683, + "mean_token_accuracy": 0.9054334104061127, + "num_tokens": 10231252.0, + "step": 2490 + }, + { + "entropy": 0.3114484429359436, + "epoch": 0.5816528732952558, + "grad_norm": 1.6796875, + "learning_rate": 1.9991204731210543e-05, + "loss": 0.5862, + "mean_token_accuracy": 0.8936571300029754, + "num_tokens": 10249983.0, + "step": 2495 + }, + { + "entropy": 0.24855418205261232, + "epoch": 0.5828185103158876, + "grad_norm": 2.40625, + "learning_rate": 1.9991168495194138e-05, + "loss": 0.5488, + "mean_token_accuracy": 0.9062022984027862, + "num_tokens": 10261309.0, + "step": 2500 + }, + { + "entropy": 0.22968003787100316, + "epoch": 0.5839841473365194, + "grad_norm": 1.453125, + "learning_rate": 1.999113218475191e-05, + "loss": 0.4208, + "mean_token_accuracy": 0.9215640425682068, + "num_tokens": 10279292.0, + "step": 2505 + }, + { + "entropy": 0.3279329985380173, + "epoch": 0.5851497843571511, + "grad_norm": 3.375, + "learning_rate": 1.9991095799884392e-05, + "loss": 0.5601, + "mean_token_accuracy": 0.8970583975315094, + "num_tokens": 10297929.0, + "step": 2510 + }, + { + "entropy": 0.2907501269131899, + "epoch": 0.5863154213777829, + "grad_norm": 2.171875, + "learning_rate": 1.9991059340592125e-05, + "loss": 0.3888, + "mean_token_accuracy": 0.9076000213623047, + "num_tokens": 10329976.0, + "step": 2515 + }, + { + "entropy": 0.3549231544137001, + "epoch": 0.5874810583984147, + "grad_norm": 2.96875, + "learning_rate": 1.9991022806875656e-05, + "loss": 0.6519, + "mean_token_accuracy": 0.8874382436275482, + "num_tokens": 10347544.0, + "step": 2520 + }, + { + "entropy": 0.3080939695239067, + "epoch": 0.5886466954190465, + "grad_norm": 3.6875, + "learning_rate": 1.9990986198735533e-05, + "loss": 0.5544, + "mean_token_accuracy": 0.8929486870765686, + "num_tokens": 10366002.0, + "step": 2525 + }, + { + "entropy": 0.3876295104622841, + "epoch": 0.5898123324396782, + "grad_norm": 1.9921875, + "learning_rate": 1.99909495161723e-05, + "loss": 0.7927, + "mean_token_accuracy": 0.8583209037780761, + "num_tokens": 10378268.0, + "step": 2530 + }, + { + "entropy": 0.41035100668668745, + "epoch": 0.59097796946031, + "grad_norm": 2.1875, + "learning_rate": 1.9990912759186498e-05, + "loss": 0.8022, + "mean_token_accuracy": 0.8597595751285553, + "num_tokens": 10390596.0, + "step": 2535 + }, + { + "entropy": 0.24308755919337272, + "epoch": 0.5921436064809419, + "grad_norm": 1.703125, + "learning_rate": 1.9990875927778684e-05, + "loss": 0.3967, + "mean_token_accuracy": 0.9243945956230164, + "num_tokens": 10413960.0, + "step": 2540 + }, + { + "entropy": 0.21663215793669224, + "epoch": 0.5933092435015737, + "grad_norm": 0.1796875, + "learning_rate": 1.99908390219494e-05, + "loss": 0.2111, + "mean_token_accuracy": 0.9359542548656463, + "num_tokens": 10437663.0, + "step": 2545 + }, + { + "entropy": 0.408858510479331, + "epoch": 0.5944748805222054, + "grad_norm": 0.67578125, + "learning_rate": 1.9990802041699206e-05, + "loss": 0.6026, + "mean_token_accuracy": 0.8689159452915192, + "num_tokens": 10455699.0, + "step": 2550 + }, + { + "entropy": 0.29723052904009817, + "epoch": 0.5956405175428372, + "grad_norm": 3.390625, + "learning_rate": 1.9990764987028642e-05, + "loss": 0.5624, + "mean_token_accuracy": 0.8806465566158295, + "num_tokens": 10474849.0, + "step": 2555 + }, + { + "entropy": 0.312545171380043, + "epoch": 0.596806154563469, + "grad_norm": 6.09375, + "learning_rate": 1.9990727857938265e-05, + "loss": 0.6364, + "mean_token_accuracy": 0.8904350399971008, + "num_tokens": 10484376.0, + "step": 2560 + }, + { + "entropy": 0.2090074449777603, + "epoch": 0.5979717915841007, + "grad_norm": 2.15625, + "learning_rate": 1.9990690654428627e-05, + "loss": 0.3618, + "mean_token_accuracy": 0.9283179759979248, + "num_tokens": 10510343.0, + "step": 2565 + }, + { + "entropy": 0.24909560680389403, + "epoch": 0.5991374286047325, + "grad_norm": 1.2578125, + "learning_rate": 1.999065337650029e-05, + "loss": 0.5294, + "mean_token_accuracy": 0.9161015510559082, + "num_tokens": 10522462.0, + "step": 2570 + }, + { + "entropy": 0.436211097240448, + "epoch": 0.6003030656253643, + "grad_norm": 0.357421875, + "learning_rate": 1.9990616024153804e-05, + "loss": 0.7157, + "mean_token_accuracy": 0.8679668664932251, + "num_tokens": 10555004.0, + "step": 2575 + }, + { + "entropy": 0.3197683900594711, + "epoch": 0.6014687026459961, + "grad_norm": 1.796875, + "learning_rate": 1.9990578597389726e-05, + "loss": 0.6512, + "mean_token_accuracy": 0.8892592430114746, + "num_tokens": 10564631.0, + "step": 2580 + }, + { + "entropy": 0.3422405393794179, + "epoch": 0.6026343396666278, + "grad_norm": 1.5234375, + "learning_rate": 1.9990541096208614e-05, + "loss": 0.5619, + "mean_token_accuracy": 0.8856872081756592, + "num_tokens": 10601595.0, + "step": 2585 + }, + { + "entropy": 0.3777252405881882, + "epoch": 0.6037999766872596, + "grad_norm": 0.5078125, + "learning_rate": 1.999050352061103e-05, + "loss": 0.4936, + "mean_token_accuracy": 0.8734250783920288, + "num_tokens": 10621546.0, + "step": 2590 + }, + { + "entropy": 0.32311970740556717, + "epoch": 0.6049656137078914, + "grad_norm": 2.9375, + "learning_rate": 1.9990465870597528e-05, + "loss": 0.7566, + "mean_token_accuracy": 0.8670526087284088, + "num_tokens": 10632893.0, + "step": 2595 + }, + { + "entropy": 0.3077698152512312, + "epoch": 0.6061312507285231, + "grad_norm": 2.328125, + "learning_rate": 1.999042814616868e-05, + "loss": 0.4922, + "mean_token_accuracy": 0.913305151462555, + "num_tokens": 10660453.0, + "step": 2600 + }, + { + "entropy": 0.40748190470039847, + "epoch": 0.6072968877491549, + "grad_norm": 1.953125, + "learning_rate": 1.9990390347325037e-05, + "loss": 0.5576, + "mean_token_accuracy": 0.8577117413282395, + "num_tokens": 10701815.0, + "step": 2605 + }, + { + "entropy": 0.27749653831124305, + "epoch": 0.6084625247697867, + "grad_norm": 0.71875, + "learning_rate": 1.9990352474067173e-05, + "loss": 0.4882, + "mean_token_accuracy": 0.9072460174560547, + "num_tokens": 10720236.0, + "step": 2610 + }, + { + "entropy": 0.3385494023561478, + "epoch": 0.6096281617904185, + "grad_norm": 2.578125, + "learning_rate": 1.999031452639564e-05, + "loss": 0.6127, + "mean_token_accuracy": 0.8825340390205383, + "num_tokens": 10741518.0, + "step": 2615 + }, + { + "entropy": 0.2643695339560509, + "epoch": 0.6107937988110502, + "grad_norm": 3.3125, + "learning_rate": 1.9990276504311018e-05, + "loss": 0.4195, + "mean_token_accuracy": 0.9126249015331268, + "num_tokens": 10762998.0, + "step": 2620 + }, + { + "entropy": 0.3385010756552219, + "epoch": 0.611959435831682, + "grad_norm": 3.71875, + "learning_rate": 1.9990238407813866e-05, + "loss": 0.6874, + "mean_token_accuracy": 0.8858413577079773, + "num_tokens": 10778292.0, + "step": 2625 + }, + { + "entropy": 0.25794244520366194, + "epoch": 0.6131250728523138, + "grad_norm": 0.578125, + "learning_rate": 1.999020023690475e-05, + "loss": 0.3079, + "mean_token_accuracy": 0.9154360055923462, + "num_tokens": 10825350.0, + "step": 2630 + }, + { + "entropy": 0.2832651875913143, + "epoch": 0.6142907098729455, + "grad_norm": 0.28125, + "learning_rate": 1.9990161991584253e-05, + "loss": 0.3404, + "mean_token_accuracy": 0.8956524491310119, + "num_tokens": 10849280.0, + "step": 2635 + }, + { + "entropy": 0.2753490924835205, + "epoch": 0.6154563468935773, + "grad_norm": 3.078125, + "learning_rate": 1.9990123671852927e-05, + "loss": 0.6067, + "mean_token_accuracy": 0.9064594686031342, + "num_tokens": 10863035.0, + "step": 2640 + }, + { + "entropy": 0.31272173710167406, + "epoch": 0.6166219839142091, + "grad_norm": 0.294921875, + "learning_rate": 1.9990085277711352e-05, + "loss": 0.47, + "mean_token_accuracy": 0.8716696619987487, + "num_tokens": 10888521.0, + "step": 2645 + }, + { + "entropy": 0.2767592526972294, + "epoch": 0.6177876209348409, + "grad_norm": 0.7734375, + "learning_rate": 1.99900468091601e-05, + "loss": 0.4332, + "mean_token_accuracy": 0.8956589341163635, + "num_tokens": 10918722.0, + "step": 2650 + }, + { + "entropy": 0.1582114040851593, + "epoch": 0.6189532579554726, + "grad_norm": 1.7734375, + "learning_rate": 1.9990008266199747e-05, + "loss": 0.2572, + "mean_token_accuracy": 0.9478217661380768, + "num_tokens": 10950005.0, + "step": 2655 + }, + { + "entropy": 0.3565956801176071, + "epoch": 0.6201188949761044, + "grad_norm": 1.8515625, + "learning_rate": 1.998996964883086e-05, + "loss": 0.5539, + "mean_token_accuracy": 0.8841744959354401, + "num_tokens": 10971433.0, + "step": 2660 + }, + { + "entropy": 0.2978044833987951, + "epoch": 0.6212845319967362, + "grad_norm": 3.59375, + "learning_rate": 1.9989930957054028e-05, + "loss": 0.5773, + "mean_token_accuracy": 0.9038799822330474, + "num_tokens": 10987337.0, + "step": 2665 + }, + { + "entropy": 0.3411064647138119, + "epoch": 0.6224501690173679, + "grad_norm": 0.154296875, + "learning_rate": 1.9989892190869816e-05, + "loss": 0.6672, + "mean_token_accuracy": 0.8761695384979248, + "num_tokens": 11004420.0, + "step": 2670 + }, + { + "entropy": 0.31689151339232924, + "epoch": 0.6236158060379998, + "grad_norm": 6.0, + "learning_rate": 1.9989853350278804e-05, + "loss": 0.7847, + "mean_token_accuracy": 0.8588998973369598, + "num_tokens": 11022152.0, + "step": 2675 + }, + { + "entropy": 0.3129567734897137, + "epoch": 0.6247814430586316, + "grad_norm": 0.625, + "learning_rate": 1.9989814435281576e-05, + "loss": 0.5403, + "mean_token_accuracy": 0.890529477596283, + "num_tokens": 11060882.0, + "step": 2680 + }, + { + "entropy": 0.26123612076044084, + "epoch": 0.6259470800792634, + "grad_norm": 1.234375, + "learning_rate": 1.998977544587871e-05, + "loss": 0.3597, + "mean_token_accuracy": 0.9167574405670166, + "num_tokens": 11078695.0, + "step": 2685 + }, + { + "entropy": 0.3337280437350273, + "epoch": 0.6271127170998951, + "grad_norm": 0.59375, + "learning_rate": 1.9989736382070787e-05, + "loss": 0.5563, + "mean_token_accuracy": 0.8964059770107269, + "num_tokens": 11104248.0, + "step": 2690 + }, + { + "entropy": 0.20731370337307453, + "epoch": 0.6282783541205269, + "grad_norm": 1.8984375, + "learning_rate": 1.9989697243858388e-05, + "loss": 0.3084, + "mean_token_accuracy": 0.9278476953506469, + "num_tokens": 11130600.0, + "step": 2695 + }, + { + "entropy": 0.3032124895602465, + "epoch": 0.6294439911411587, + "grad_norm": 3.0625, + "learning_rate": 1.99896580312421e-05, + "loss": 0.6284, + "mean_token_accuracy": 0.8884129106998444, + "num_tokens": 11148703.0, + "step": 2700 + }, + { + "entropy": 0.22488728277385234, + "epoch": 0.6306096281617904, + "grad_norm": 1.8046875, + "learning_rate": 1.9989618744222506e-05, + "loss": 0.4241, + "mean_token_accuracy": 0.9202291667461395, + "num_tokens": 11169931.0, + "step": 2705 + }, + { + "entropy": 0.25353550985455514, + "epoch": 0.6317752651824222, + "grad_norm": 0.57421875, + "learning_rate": 1.998957938280019e-05, + "loss": 0.3501, + "mean_token_accuracy": 0.9135826289653778, + "num_tokens": 11207948.0, + "step": 2710 + }, + { + "entropy": 0.27142262272536755, + "epoch": 0.632940902203054, + "grad_norm": 0.6796875, + "learning_rate": 1.998953994697574e-05, + "loss": 0.5832, + "mean_token_accuracy": 0.8925474107265472, + "num_tokens": 11225668.0, + "step": 2715 + }, + { + "entropy": 0.2797514094039798, + "epoch": 0.6341065392236858, + "grad_norm": 1.2578125, + "learning_rate": 1.9989500436749746e-05, + "loss": 0.4563, + "mean_token_accuracy": 0.9044257879257203, + "num_tokens": 11253386.0, + "step": 2720 + }, + { + "entropy": 0.3232995979487896, + "epoch": 0.6352721762443175, + "grad_norm": 2.625, + "learning_rate": 1.9989460852122798e-05, + "loss": 0.6513, + "mean_token_accuracy": 0.8970522165298462, + "num_tokens": 11264773.0, + "step": 2725 + }, + { + "entropy": 0.32866962999105453, + "epoch": 0.6364378132649493, + "grad_norm": 1.0625, + "learning_rate": 1.998942119309548e-05, + "loss": 0.5501, + "mean_token_accuracy": 0.8841784775257111, + "num_tokens": 11293142.0, + "step": 2730 + }, + { + "entropy": 0.3042488098144531, + "epoch": 0.6376034502855811, + "grad_norm": 1.7890625, + "learning_rate": 1.9989381459668392e-05, + "loss": 0.6684, + "mean_token_accuracy": 0.8961536705493927, + "num_tokens": 11303088.0, + "step": 2735 + }, + { + "entropy": 0.3178449098020792, + "epoch": 0.6387690873062128, + "grad_norm": 0.8203125, + "learning_rate": 1.9989341651842117e-05, + "loss": 0.3019, + "mean_token_accuracy": 0.9054975390434266, + "num_tokens": 11329525.0, + "step": 2740 + }, + { + "entropy": 0.27289107590913775, + "epoch": 0.6399347243268446, + "grad_norm": 3.828125, + "learning_rate": 1.9989301769617258e-05, + "loss": 0.5949, + "mean_token_accuracy": 0.899017083644867, + "num_tokens": 11341596.0, + "step": 2745 + }, + { + "entropy": 0.3231787905097008, + "epoch": 0.6411003613474764, + "grad_norm": 1.7578125, + "learning_rate": 1.99892618129944e-05, + "loss": 0.7994, + "mean_token_accuracy": 0.8684586107730865, + "num_tokens": 11350540.0, + "step": 2750 + }, + { + "entropy": 0.29003828167915346, + "epoch": 0.6422659983681082, + "grad_norm": 0.486328125, + "learning_rate": 1.998922178197415e-05, + "loss": 0.3397, + "mean_token_accuracy": 0.9051499962806702, + "num_tokens": 11389605.0, + "step": 2755 + }, + { + "entropy": 0.29986562281847, + "epoch": 0.6434316353887399, + "grad_norm": 0.302734375, + "learning_rate": 1.9989181676557097e-05, + "loss": 0.6057, + "mean_token_accuracy": 0.8865071713924408, + "num_tokens": 11411773.0, + "step": 2760 + }, + { + "entropy": 0.2845982387661934, + "epoch": 0.6445972724093717, + "grad_norm": 0.44921875, + "learning_rate": 1.998914149674384e-05, + "loss": 0.4129, + "mean_token_accuracy": 0.9082658767700196, + "num_tokens": 11432083.0, + "step": 2765 + }, + { + "entropy": 0.4338554725050926, + "epoch": 0.6457629094300035, + "grad_norm": 1.5546875, + "learning_rate": 1.998910124253498e-05, + "loss": 0.7722, + "mean_token_accuracy": 0.8590993523597718, + "num_tokens": 11447520.0, + "step": 2770 + }, + { + "entropy": 0.3870300319045782, + "epoch": 0.6469285464506352, + "grad_norm": 2.765625, + "learning_rate": 1.9989060913931117e-05, + "loss": 0.6756, + "mean_token_accuracy": 0.8583252966403961, + "num_tokens": 11471002.0, + "step": 2775 + }, + { + "entropy": 0.23096679151058197, + "epoch": 0.648094183471267, + "grad_norm": 2.984375, + "learning_rate": 1.998902051093285e-05, + "loss": 0.4019, + "mean_token_accuracy": 0.9051249861717224, + "num_tokens": 11496091.0, + "step": 2780 + }, + { + "entropy": 0.401183944940567, + "epoch": 0.6492598204918988, + "grad_norm": 1.8828125, + "learning_rate": 1.9988980033540787e-05, + "loss": 0.6471, + "mean_token_accuracy": 0.856478476524353, + "num_tokens": 11528974.0, + "step": 2785 + }, + { + "entropy": 0.2962135147303343, + "epoch": 0.6504254575125306, + "grad_norm": 6.5, + "learning_rate": 1.9988939481755523e-05, + "loss": 0.6008, + "mean_token_accuracy": 0.8978751420974731, + "num_tokens": 11552542.0, + "step": 2790 + }, + { + "entropy": 0.42872381433844564, + "epoch": 0.6515910945331623, + "grad_norm": 2.6875, + "learning_rate": 1.998889885557767e-05, + "loss": 0.6457, + "mean_token_accuracy": 0.8847284972667694, + "num_tokens": 11563152.0, + "step": 2795 + }, + { + "entropy": 0.307503542304039, + "epoch": 0.6527567315537941, + "grad_norm": 0.7734375, + "learning_rate": 1.9988858155007832e-05, + "loss": 0.4287, + "mean_token_accuracy": 0.9065074861049652, + "num_tokens": 11587159.0, + "step": 2800 + }, + { + "entropy": 0.3470982015132904, + "epoch": 0.6539223685744259, + "grad_norm": 3.515625, + "learning_rate": 1.9988817380046615e-05, + "loss": 0.7755, + "mean_token_accuracy": 0.8728764653205872, + "num_tokens": 11609631.0, + "step": 2805 + }, + { + "entropy": 0.2724352993071079, + "epoch": 0.6550880055950578, + "grad_norm": 2.609375, + "learning_rate": 1.9988776530694624e-05, + "loss": 0.3999, + "mean_token_accuracy": 0.925034087896347, + "num_tokens": 11626566.0, + "step": 2810 + }, + { + "entropy": 0.19500094205141066, + "epoch": 0.6562536426156895, + "grad_norm": 0.46484375, + "learning_rate": 1.9988735606952473e-05, + "loss": 0.229, + "mean_token_accuracy": 0.9306256473064423, + "num_tokens": 11652736.0, + "step": 2815 + }, + { + "entropy": 0.29740233570337293, + "epoch": 0.6574192796363213, + "grad_norm": 1.375, + "learning_rate": 1.9988694608820775e-05, + "loss": 0.567, + "mean_token_accuracy": 0.891633152961731, + "num_tokens": 11669815.0, + "step": 2820 + }, + { + "entropy": 0.21288203895092012, + "epoch": 0.6585849166569531, + "grad_norm": 0.734375, + "learning_rate": 1.9988653536300132e-05, + "loss": 0.351, + "mean_token_accuracy": 0.9191200852394104, + "num_tokens": 11693951.0, + "step": 2825 + }, + { + "entropy": 0.3570013351738453, + "epoch": 0.6597505536775848, + "grad_norm": 12.0, + "learning_rate": 1.9988612389391163e-05, + "loss": 0.7257, + "mean_token_accuracy": 0.9116259157657624, + "num_tokens": 11735005.0, + "step": 2830 + }, + { + "entropy": 0.3212395556271076, + "epoch": 0.6609161906982166, + "grad_norm": 2.015625, + "learning_rate": 1.998857116809448e-05, + "loss": 0.7315, + "mean_token_accuracy": 0.8878246247768402, + "num_tokens": 11747768.0, + "step": 2835 + }, + { + "entropy": 0.3063099093735218, + "epoch": 0.6620818277188484, + "grad_norm": 0.9765625, + "learning_rate": 1.9988529872410698e-05, + "loss": 0.5298, + "mean_token_accuracy": 0.8972079992294312, + "num_tokens": 11765542.0, + "step": 2840 + }, + { + "entropy": 0.23337812647223471, + "epoch": 0.6632474647394802, + "grad_norm": 0.5703125, + "learning_rate": 1.9988488502340432e-05, + "loss": 0.2289, + "mean_token_accuracy": 0.932334941625595, + "num_tokens": 11800468.0, + "step": 2845 + }, + { + "entropy": 0.43383695781230924, + "epoch": 0.6644131017601119, + "grad_norm": 2.0625, + "learning_rate": 1.99884470578843e-05, + "loss": 0.6844, + "mean_token_accuracy": 0.8706629991531372, + "num_tokens": 11818230.0, + "step": 2850 + }, + { + "entropy": 0.31050637289881705, + "epoch": 0.6655787387807437, + "grad_norm": 0.486328125, + "learning_rate": 1.9988405539042918e-05, + "loss": 0.4206, + "mean_token_accuracy": 0.9051454186439514, + "num_tokens": 11837491.0, + "step": 2855 + }, + { + "entropy": 0.3793765440583229, + "epoch": 0.6667443758013755, + "grad_norm": 5.21875, + "learning_rate": 1.9988363945816906e-05, + "loss": 0.7112, + "mean_token_accuracy": 0.8710412621498108, + "num_tokens": 11865376.0, + "step": 2860 + }, + { + "entropy": 0.29791102930903435, + "epoch": 0.6679100128220072, + "grad_norm": 5.3125, + "learning_rate": 1.9988322278206887e-05, + "loss": 0.5251, + "mean_token_accuracy": 0.9106599807739257, + "num_tokens": 11880384.0, + "step": 2865 + }, + { + "entropy": 0.25474109277129175, + "epoch": 0.669075649842639, + "grad_norm": 3.6875, + "learning_rate": 1.9988280536213477e-05, + "loss": 0.5121, + "mean_token_accuracy": 0.906974321603775, + "num_tokens": 11896588.0, + "step": 2870 + }, + { + "entropy": 0.22310059182345868, + "epoch": 0.6702412868632708, + "grad_norm": 0.4140625, + "learning_rate": 1.9988238719837306e-05, + "loss": 0.3435, + "mean_token_accuracy": 0.922565633058548, + "num_tokens": 11932465.0, + "step": 2875 + }, + { + "entropy": 0.29564819037914275, + "epoch": 0.6714069238839026, + "grad_norm": 2.5625, + "learning_rate": 1.9988196829078988e-05, + "loss": 0.6536, + "mean_token_accuracy": 0.894516795873642, + "num_tokens": 11951000.0, + "step": 2880 + }, + { + "entropy": 0.25558542367070913, + "epoch": 0.6725725609045343, + "grad_norm": 0.984375, + "learning_rate": 1.9988154863939156e-05, + "loss": 0.3796, + "mean_token_accuracy": 0.9356054246425629, + "num_tokens": 11973901.0, + "step": 2885 + }, + { + "entropy": 0.26891485378146174, + "epoch": 0.6737381979251661, + "grad_norm": 2.25, + "learning_rate": 1.998811282441843e-05, + "loss": 0.4838, + "mean_token_accuracy": 0.9037762641906738, + "num_tokens": 11997570.0, + "step": 2890 + }, + { + "entropy": 0.23099879249930383, + "epoch": 0.6749038349457979, + "grad_norm": 2.484375, + "learning_rate": 1.998807071051744e-05, + "loss": 0.3147, + "mean_token_accuracy": 0.9223685622215271, + "num_tokens": 12023241.0, + "step": 2895 + }, + { + "entropy": 0.3084828436374664, + "epoch": 0.6760694719664296, + "grad_norm": 1.65625, + "learning_rate": 1.9988028522236814e-05, + "loss": 0.5966, + "mean_token_accuracy": 0.8926734149456024, + "num_tokens": 12042716.0, + "step": 2900 + }, + { + "entropy": 0.22942945584654809, + "epoch": 0.6772351089870614, + "grad_norm": 1.7734375, + "learning_rate": 1.9987986259577178e-05, + "loss": 0.3961, + "mean_token_accuracy": 0.934667432308197, + "num_tokens": 12066142.0, + "step": 2905 + }, + { + "entropy": 0.23905463069677352, + "epoch": 0.6784007460076932, + "grad_norm": 0.55078125, + "learning_rate": 1.9987943922539168e-05, + "loss": 0.35, + "mean_token_accuracy": 0.9203935861587524, + "num_tokens": 12091703.0, + "step": 2910 + }, + { + "entropy": 0.3106345657259226, + "epoch": 0.679566383028325, + "grad_norm": 1.2890625, + "learning_rate": 1.9987901511123412e-05, + "loss": 0.7652, + "mean_token_accuracy": 0.8731933474540711, + "num_tokens": 12108139.0, + "step": 2915 + }, + { + "entropy": 0.3008536197245121, + "epoch": 0.6807320200489567, + "grad_norm": 2.359375, + "learning_rate": 1.9987859025330537e-05, + "loss": 0.4116, + "mean_token_accuracy": 0.9075113773345947, + "num_tokens": 12139750.0, + "step": 2920 + }, + { + "entropy": 0.2751265406608582, + "epoch": 0.6818976570695885, + "grad_norm": 2.46875, + "learning_rate": 1.9987816465161186e-05, + "loss": 0.429, + "mean_token_accuracy": 0.9219581127166748, + "num_tokens": 12154831.0, + "step": 2925 + }, + { + "entropy": 0.2258956879377365, + "epoch": 0.6830632940902203, + "grad_norm": 0.9609375, + "learning_rate": 1.998777383061599e-05, + "loss": 0.4781, + "mean_token_accuracy": 0.9113677144050598, + "num_tokens": 12176507.0, + "step": 2930 + }, + { + "entropy": 0.3542913876473904, + "epoch": 0.684228931110852, + "grad_norm": 0.322265625, + "learning_rate": 1.998773112169558e-05, + "loss": 0.4442, + "mean_token_accuracy": 0.8889063417911529, + "num_tokens": 12204987.0, + "step": 2935 + }, + { + "entropy": 0.26746199689805505, + "epoch": 0.6853945681314838, + "grad_norm": 2.546875, + "learning_rate": 1.99876883384006e-05, + "loss": 0.7624, + "mean_token_accuracy": 0.8774264693260193, + "num_tokens": 12219026.0, + "step": 2940 + }, + { + "entropy": 0.2980736643075943, + "epoch": 0.6865602051521156, + "grad_norm": 1.359375, + "learning_rate": 1.9987645480731687e-05, + "loss": 0.5191, + "mean_token_accuracy": 0.9098766207695007, + "num_tokens": 12229932.0, + "step": 2945 + }, + { + "entropy": 0.27693705186247825, + "epoch": 0.6877258421727475, + "grad_norm": 2.4375, + "learning_rate": 1.9987602548689475e-05, + "loss": 0.5249, + "mean_token_accuracy": 0.9166098952293396, + "num_tokens": 12252411.0, + "step": 2950 + }, + { + "entropy": 0.28829210847616193, + "epoch": 0.6888914791933792, + "grad_norm": 2.375, + "learning_rate": 1.998755954227461e-05, + "loss": 0.6741, + "mean_token_accuracy": 0.890589964389801, + "num_tokens": 12262482.0, + "step": 2955 + }, + { + "entropy": 0.26005504429340365, + "epoch": 0.690057116214011, + "grad_norm": 0.5, + "learning_rate": 1.9987516461487726e-05, + "loss": 0.3838, + "mean_token_accuracy": 0.8957611501216889, + "num_tokens": 12285838.0, + "step": 2960 + }, + { + "entropy": 0.28918950296938417, + "epoch": 0.6912227532346428, + "grad_norm": 1.15625, + "learning_rate": 1.9987473306329473e-05, + "loss": 0.3924, + "mean_token_accuracy": 0.9049074351787567, + "num_tokens": 12304686.0, + "step": 2965 + }, + { + "entropy": 0.3453959345817566, + "epoch": 0.6923883902552745, + "grad_norm": 4.8125, + "learning_rate": 1.998743007680049e-05, + "loss": 0.9069, + "mean_token_accuracy": 0.8660809576511384, + "num_tokens": 12313370.0, + "step": 2970 + }, + { + "entropy": 0.3491327941417694, + "epoch": 0.6935540272759063, + "grad_norm": 1.3359375, + "learning_rate": 1.9987386772901426e-05, + "loss": 0.7424, + "mean_token_accuracy": 0.8711540699005127, + "num_tokens": 12326798.0, + "step": 2975 + }, + { + "entropy": 0.46053214073181153, + "epoch": 0.6947196642965381, + "grad_norm": 2.359375, + "learning_rate": 1.998734339463292e-05, + "loss": 0.9168, + "mean_token_accuracy": 0.8372248828411102, + "num_tokens": 12336458.0, + "step": 2980 + }, + { + "entropy": 0.359159516915679, + "epoch": 0.6958853013171699, + "grad_norm": 2.21875, + "learning_rate": 1.9987299941995625e-05, + "loss": 0.5595, + "mean_token_accuracy": 0.8771745800971985, + "num_tokens": 12354223.0, + "step": 2985 + }, + { + "entropy": 0.3393791884183884, + "epoch": 0.6970509383378016, + "grad_norm": 3.078125, + "learning_rate": 1.9987256414990183e-05, + "loss": 0.69, + "mean_token_accuracy": 0.8888447761535645, + "num_tokens": 12374416.0, + "step": 2990 + }, + { + "entropy": 0.2696688212454319, + "epoch": 0.6982165753584334, + "grad_norm": 0.91015625, + "learning_rate": 1.998721281361725e-05, + "loss": 0.3425, + "mean_token_accuracy": 0.9043423593044281, + "num_tokens": 12399187.0, + "step": 2995 + }, + { + "entropy": 0.2706952027976513, + "epoch": 0.6993822123790652, + "grad_norm": 0.79296875, + "learning_rate": 1.9987169137877474e-05, + "loss": 0.467, + "mean_token_accuracy": 0.9108605206012725, + "num_tokens": 12416919.0, + "step": 3000 + }, + { + "entropy": 0.32974872663617133, + "epoch": 0.7005478493996969, + "grad_norm": 0.5625, + "learning_rate": 1.9987125387771502e-05, + "loss": 0.4047, + "mean_token_accuracy": 0.9061824440956116, + "num_tokens": 12441006.0, + "step": 3005 + }, + { + "entropy": 0.2989892097190022, + "epoch": 0.7017134864203287, + "grad_norm": 0.498046875, + "learning_rate": 1.9987081563299992e-05, + "loss": 0.3412, + "mean_token_accuracy": 0.8854645431041718, + "num_tokens": 12476290.0, + "step": 3010 + }, + { + "entropy": 0.3127955436706543, + "epoch": 0.7028791234409605, + "grad_norm": 1.09375, + "learning_rate": 1.9987037664463593e-05, + "loss": 0.706, + "mean_token_accuracy": 0.8708417236804962, + "num_tokens": 12495372.0, + "step": 3015 + }, + { + "entropy": 0.22802112326025964, + "epoch": 0.7040447604615923, + "grad_norm": 0.44921875, + "learning_rate": 1.9986993691262963e-05, + "loss": 0.2649, + "mean_token_accuracy": 0.9163160860538483, + "num_tokens": 12521478.0, + "step": 3020 + }, + { + "entropy": 0.29017159678041937, + "epoch": 0.705210397482224, + "grad_norm": 0.185546875, + "learning_rate": 1.9986949643698752e-05, + "loss": 0.4145, + "mean_token_accuracy": 0.9037138223648071, + "num_tokens": 12542998.0, + "step": 3025 + }, + { + "entropy": 0.2329173669219017, + "epoch": 0.7063760345028558, + "grad_norm": 1.125, + "learning_rate": 1.9986905521771625e-05, + "loss": 0.3699, + "mean_token_accuracy": 0.9263135194778442, + "num_tokens": 12562045.0, + "step": 3030 + }, + { + "entropy": 0.3430146735161543, + "epoch": 0.7075416715234876, + "grad_norm": 0.42578125, + "learning_rate": 1.9986861325482236e-05, + "loss": 0.4286, + "mean_token_accuracy": 0.8931098103523254, + "num_tokens": 12598642.0, + "step": 3035 + }, + { + "entropy": 0.296509512513876, + "epoch": 0.7087073085441193, + "grad_norm": 1.1640625, + "learning_rate": 1.998681705483124e-05, + "loss": 0.6135, + "mean_token_accuracy": 0.8931901514530182, + "num_tokens": 12610992.0, + "step": 3040 + }, + { + "entropy": 0.36168722808361053, + "epoch": 0.7098729455647511, + "grad_norm": 1.75, + "learning_rate": 1.9986772709819305e-05, + "loss": 0.8423, + "mean_token_accuracy": 0.8605839788913727, + "num_tokens": 12620250.0, + "step": 3045 + }, + { + "entropy": 0.29431844148784875, + "epoch": 0.7110385825853829, + "grad_norm": 0.384765625, + "learning_rate": 1.998672829044709e-05, + "loss": 0.5822, + "mean_token_accuracy": 0.9037684023380279, + "num_tokens": 12638767.0, + "step": 3050 + }, + { + "entropy": 0.324118447676301, + "epoch": 0.7122042196060147, + "grad_norm": 0.486328125, + "learning_rate": 1.9986683796715253e-05, + "loss": 0.6017, + "mean_token_accuracy": 0.90307257771492, + "num_tokens": 12655374.0, + "step": 3055 + }, + { + "entropy": 0.2938187211751938, + "epoch": 0.7133698566266464, + "grad_norm": 0.9140625, + "learning_rate": 1.998663922862446e-05, + "loss": 0.5992, + "mean_token_accuracy": 0.9050024807453155, + "num_tokens": 12666178.0, + "step": 3060 + }, + { + "entropy": 0.32323597818613053, + "epoch": 0.7145354936472782, + "grad_norm": 3.953125, + "learning_rate": 1.9986594586175375e-05, + "loss": 0.6051, + "mean_token_accuracy": 0.8777613401412964, + "num_tokens": 12685326.0, + "step": 3065 + }, + { + "entropy": 0.28426281437277795, + "epoch": 0.71570113066791, + "grad_norm": 0.20703125, + "learning_rate": 1.9986549869368667e-05, + "loss": 0.271, + "mean_token_accuracy": 0.892994499206543, + "num_tokens": 12722638.0, + "step": 3070 + }, + { + "entropy": 0.31557350754737856, + "epoch": 0.7168667676885417, + "grad_norm": 0.5859375, + "learning_rate": 1.9986505078205e-05, + "loss": 0.3999, + "mean_token_accuracy": 0.9035743296146392, + "num_tokens": 12749198.0, + "step": 3075 + }, + { + "entropy": 0.371865002810955, + "epoch": 0.7180324047091735, + "grad_norm": 2.78125, + "learning_rate": 1.998646021268504e-05, + "loss": 0.8639, + "mean_token_accuracy": 0.8539581596851349, + "num_tokens": 12759500.0, + "step": 3080 + }, + { + "entropy": 0.26819879561662674, + "epoch": 0.7191980417298054, + "grad_norm": 0.9765625, + "learning_rate": 1.9986415272809458e-05, + "loss": 0.3749, + "mean_token_accuracy": 0.9239216327667237, + "num_tokens": 12783515.0, + "step": 3085 + }, + { + "entropy": 0.25302067399024963, + "epoch": 0.7203636787504372, + "grad_norm": 0.46484375, + "learning_rate": 1.9986370258578925e-05, + "loss": 0.4213, + "mean_token_accuracy": 0.9127112746238708, + "num_tokens": 12806426.0, + "step": 3090 + }, + { + "entropy": 0.3763828493654728, + "epoch": 0.7215293157710689, + "grad_norm": 4.34375, + "learning_rate": 1.9986325169994116e-05, + "loss": 0.7125, + "mean_token_accuracy": 0.8624271988868714, + "num_tokens": 12826732.0, + "step": 3095 + }, + { + "entropy": 0.43457455970346925, + "epoch": 0.7226949527917007, + "grad_norm": 0.51171875, + "learning_rate": 1.998628000705569e-05, + "loss": 0.791, + "mean_token_accuracy": 0.8754983246326447, + "num_tokens": 12855074.0, + "step": 3100 + }, + { + "entropy": 0.3388949878513813, + "epoch": 0.7238605898123325, + "grad_norm": 0.76953125, + "learning_rate": 1.9986234769764337e-05, + "loss": 0.8004, + "mean_token_accuracy": 0.8680715382099151, + "num_tokens": 12865296.0, + "step": 3105 + }, + { + "entropy": 0.24326059743762016, + "epoch": 0.7250262268329642, + "grad_norm": 0.81640625, + "learning_rate": 1.9986189458120722e-05, + "loss": 0.3808, + "mean_token_accuracy": 0.9010769128799438, + "num_tokens": 12895080.0, + "step": 3110 + }, + { + "entropy": 0.33171582967042923, + "epoch": 0.726191863853596, + "grad_norm": 1.0, + "learning_rate": 1.998614407212552e-05, + "loss": 0.5749, + "mean_token_accuracy": 0.9047894716262818, + "num_tokens": 12905648.0, + "step": 3115 + }, + { + "entropy": 0.2634651020169258, + "epoch": 0.7273575008742278, + "grad_norm": 1.171875, + "learning_rate": 1.9986098611779412e-05, + "loss": 0.4922, + "mean_token_accuracy": 0.9093749225139618, + "num_tokens": 12921502.0, + "step": 3120 + }, + { + "entropy": 0.3110058598220348, + "epoch": 0.7285231378948596, + "grad_norm": 2.375, + "learning_rate": 1.9986053077083074e-05, + "loss": 0.8057, + "mean_token_accuracy": 0.8743705987930298, + "num_tokens": 12934872.0, + "step": 3125 + }, + { + "entropy": 0.3678535770624876, + "epoch": 0.7296887749154913, + "grad_norm": 2.015625, + "learning_rate": 1.9986007468037187e-05, + "loss": 0.6753, + "mean_token_accuracy": 0.874816483259201, + "num_tokens": 12946749.0, + "step": 3130 + }, + { + "entropy": 0.2128050871193409, + "epoch": 0.7308544119361231, + "grad_norm": 0.66015625, + "learning_rate": 1.9985961784642426e-05, + "loss": 0.2591, + "mean_token_accuracy": 0.9357932329177856, + "num_tokens": 12974482.0, + "step": 3135 + }, + { + "entropy": 0.31313026919960973, + "epoch": 0.7320200489567549, + "grad_norm": 5.125, + "learning_rate": 1.9985916026899478e-05, + "loss": 0.5509, + "mean_token_accuracy": 0.8914876878261566, + "num_tokens": 12994951.0, + "step": 3140 + }, + { + "entropy": 0.2906319335103035, + "epoch": 0.7331856859773866, + "grad_norm": 3.359375, + "learning_rate": 1.9985870194809022e-05, + "loss": 0.7678, + "mean_token_accuracy": 0.8900413334369659, + "num_tokens": 13005139.0, + "step": 3145 + }, + { + "entropy": 0.43225277215242386, + "epoch": 0.7343513229980184, + "grad_norm": 1.328125, + "learning_rate": 1.9985824288371743e-05, + "loss": 0.6923, + "mean_token_accuracy": 0.8420580059289933, + "num_tokens": 13032822.0, + "step": 3150 + }, + { + "entropy": 0.23028683103621006, + "epoch": 0.7355169600186502, + "grad_norm": 2.625, + "learning_rate": 1.9985778307588323e-05, + "loss": 0.2613, + "mean_token_accuracy": 0.9250756561756134, + "num_tokens": 13069596.0, + "step": 3155 + }, + { + "entropy": 0.2778568729758263, + "epoch": 0.736682597039282, + "grad_norm": 1.6484375, + "learning_rate": 1.998573225245945e-05, + "loss": 0.4831, + "mean_token_accuracy": 0.918241810798645, + "num_tokens": 13084383.0, + "step": 3160 + }, + { + "entropy": 0.24663490243256092, + "epoch": 0.7378482340599137, + "grad_norm": 0.66796875, + "learning_rate": 1.998568612298581e-05, + "loss": 0.2513, + "mean_token_accuracy": 0.9311227321624755, + "num_tokens": 13114417.0, + "step": 3165 + }, + { + "entropy": 0.28456913605332373, + "epoch": 0.7390138710805455, + "grad_norm": 1.453125, + "learning_rate": 1.9985639919168093e-05, + "loss": 0.5558, + "mean_token_accuracy": 0.8984531760215759, + "num_tokens": 13132970.0, + "step": 3170 + }, + { + "entropy": 0.2570043332874775, + "epoch": 0.7401795081011773, + "grad_norm": 8.625, + "learning_rate": 1.9985593641006984e-05, + "loss": 0.5133, + "mean_token_accuracy": 0.9061603724956513, + "num_tokens": 13156883.0, + "step": 3175 + }, + { + "entropy": 0.2510953940451145, + "epoch": 0.741345145121809, + "grad_norm": 3.609375, + "learning_rate": 1.9985547288503175e-05, + "loss": 0.3524, + "mean_token_accuracy": 0.9195565164089203, + "num_tokens": 13191055.0, + "step": 3180 + }, + { + "entropy": 0.2396312952041626, + "epoch": 0.7425107821424408, + "grad_norm": 1.5703125, + "learning_rate": 1.9985500861657358e-05, + "loss": 0.353, + "mean_token_accuracy": 0.9258275687694549, + "num_tokens": 13216177.0, + "step": 3185 + }, + { + "entropy": 0.32750511094927787, + "epoch": 0.7436764191630726, + "grad_norm": 2.609375, + "learning_rate": 1.9985454360470224e-05, + "loss": 0.4645, + "mean_token_accuracy": 0.9065626561641693, + "num_tokens": 13229869.0, + "step": 3190 + }, + { + "entropy": 0.33407930880784986, + "epoch": 0.7448420561837044, + "grad_norm": 5.0625, + "learning_rate": 1.9985407784942467e-05, + "loss": 0.5446, + "mean_token_accuracy": 0.8834915876388549, + "num_tokens": 13247975.0, + "step": 3195 + }, + { + "entropy": 0.35299213230609894, + "epoch": 0.7460076932043361, + "grad_norm": 1.703125, + "learning_rate": 1.9985361135074782e-05, + "loss": 0.6986, + "mean_token_accuracy": 0.870141988992691, + "num_tokens": 13260189.0, + "step": 3200 + }, + { + "entropy": 0.2146347463130951, + "epoch": 0.7471733302249679, + "grad_norm": 2.03125, + "learning_rate": 1.998531441086786e-05, + "loss": 0.3993, + "mean_token_accuracy": 0.9297842562198639, + "num_tokens": 13284011.0, + "step": 3205 + }, + { + "entropy": 0.31019294261932373, + "epoch": 0.7483389672455997, + "grad_norm": 1.3359375, + "learning_rate": 1.9985267612322408e-05, + "loss": 0.5904, + "mean_token_accuracy": 0.8879059374332428, + "num_tokens": 13301433.0, + "step": 3210 + }, + { + "entropy": 0.31197711527347566, + "epoch": 0.7495046042662314, + "grad_norm": 0.640625, + "learning_rate": 1.9985220739439117e-05, + "loss": 0.5727, + "mean_token_accuracy": 0.8965538263320922, + "num_tokens": 13314835.0, + "step": 3215 + }, + { + "entropy": 0.3941266030073166, + "epoch": 0.7506702412868632, + "grad_norm": 1.8984375, + "learning_rate": 1.9985173792218683e-05, + "loss": 0.9058, + "mean_token_accuracy": 0.8483995854854584, + "num_tokens": 13323087.0, + "step": 3220 + }, + { + "entropy": 0.2891694024205208, + "epoch": 0.7518358783074951, + "grad_norm": 2.1875, + "learning_rate": 1.998512677066181e-05, + "loss": 0.5224, + "mean_token_accuracy": 0.9001904428005219, + "num_tokens": 13345448.0, + "step": 3225 + }, + { + "entropy": 0.3195288822054863, + "epoch": 0.7530015153281269, + "grad_norm": 3.34375, + "learning_rate": 1.9985079674769203e-05, + "loss": 0.6119, + "mean_token_accuracy": 0.892349797487259, + "num_tokens": 13357508.0, + "step": 3230 + }, + { + "entropy": 0.2566119972616434, + "epoch": 0.7541671523487586, + "grad_norm": 2.265625, + "learning_rate": 1.9985032504541555e-05, + "loss": 0.4236, + "mean_token_accuracy": 0.9038833737373352, + "num_tokens": 13375653.0, + "step": 3235 + }, + { + "entropy": 0.36683848649263384, + "epoch": 0.7553327893693904, + "grad_norm": 2.640625, + "learning_rate": 1.9984985259979577e-05, + "loss": 0.7721, + "mean_token_accuracy": 0.8706809699535369, + "num_tokens": 13384215.0, + "step": 3240 + }, + { + "entropy": 0.23739213198423387, + "epoch": 0.7564984263900222, + "grad_norm": 0.80859375, + "learning_rate": 1.998493794108397e-05, + "loss": 0.4994, + "mean_token_accuracy": 0.9105100572109223, + "num_tokens": 13398419.0, + "step": 3245 + }, + { + "entropy": 0.3068238809704781, + "epoch": 0.757664063410654, + "grad_norm": 2.828125, + "learning_rate": 1.9984890547855444e-05, + "loss": 0.6933, + "mean_token_accuracy": 0.8871238231658936, + "num_tokens": 13410075.0, + "step": 3250 + }, + { + "entropy": 0.23343345075845717, + "epoch": 0.7588297004312857, + "grad_norm": 0.8359375, + "learning_rate": 1.9984843080294695e-05, + "loss": 0.4127, + "mean_token_accuracy": 0.9250391602516175, + "num_tokens": 13431831.0, + "step": 3255 + }, + { + "entropy": 0.35735142379999163, + "epoch": 0.7599953374519175, + "grad_norm": 0.67578125, + "learning_rate": 1.9984795538402444e-05, + "loss": 0.5364, + "mean_token_accuracy": 0.8816571056842804, + "num_tokens": 13453700.0, + "step": 3260 + }, + { + "entropy": 0.2540307299233973, + "epoch": 0.7611609744725493, + "grad_norm": 0.73828125, + "learning_rate": 1.9984747922179393e-05, + "loss": 0.2662, + "mean_token_accuracy": 0.9232360005378724, + "num_tokens": 13481739.0, + "step": 3265 + }, + { + "entropy": 0.3812587969005108, + "epoch": 0.762326611493181, + "grad_norm": 5.71875, + "learning_rate": 1.998470023162625e-05, + "loss": 0.809, + "mean_token_accuracy": 0.8781751275062561, + "num_tokens": 13494741.0, + "step": 3270 + }, + { + "entropy": 0.3789962977170944, + "epoch": 0.7634922485138128, + "grad_norm": 1.34375, + "learning_rate": 1.9984652466743733e-05, + "loss": 0.905, + "mean_token_accuracy": 0.8371272504329681, + "num_tokens": 13508467.0, + "step": 3275 + }, + { + "entropy": 0.2287605084478855, + "epoch": 0.7646578855344446, + "grad_norm": 3.46875, + "learning_rate": 1.9984604627532547e-05, + "loss": 0.4125, + "mean_token_accuracy": 0.9276242315769195, + "num_tokens": 13532941.0, + "step": 3280 + }, + { + "entropy": 0.23304533641785383, + "epoch": 0.7658235225550764, + "grad_norm": 0.5, + "learning_rate": 1.9984556713993414e-05, + "loss": 0.3236, + "mean_token_accuracy": 0.9229490995407105, + "num_tokens": 13555509.0, + "step": 3285 + }, + { + "entropy": 0.35275202319025994, + "epoch": 0.7669891595757081, + "grad_norm": 3.375, + "learning_rate": 1.9984508726127038e-05, + "loss": 0.6199, + "mean_token_accuracy": 0.8747760951519012, + "num_tokens": 13585680.0, + "step": 3290 + }, + { + "entropy": 0.24191362485289575, + "epoch": 0.7681547965963399, + "grad_norm": 0.58984375, + "learning_rate": 1.9984460663934143e-05, + "loss": 0.5734, + "mean_token_accuracy": 0.9015963733196258, + "num_tokens": 13609809.0, + "step": 3295 + }, + { + "entropy": 0.29589404761791227, + "epoch": 0.7693204336169717, + "grad_norm": 2.328125, + "learning_rate": 1.998441252741544e-05, + "loss": 0.5253, + "mean_token_accuracy": 0.8925094544887543, + "num_tokens": 13628829.0, + "step": 3300 + }, + { + "entropy": 0.20487794056534767, + "epoch": 0.7704860706376034, + "grad_norm": 1.046875, + "learning_rate": 1.9984364316571652e-05, + "loss": 0.2499, + "mean_token_accuracy": 0.9276638627052307, + "num_tokens": 13650807.0, + "step": 3305 + }, + { + "entropy": 0.2848545204848051, + "epoch": 0.7716517076582352, + "grad_norm": 1.6796875, + "learning_rate": 1.9984316031403494e-05, + "loss": 0.547, + "mean_token_accuracy": 0.8881556272506714, + "num_tokens": 13665337.0, + "step": 3310 + }, + { + "entropy": 0.2990790454670787, + "epoch": 0.772817344678867, + "grad_norm": 0.59375, + "learning_rate": 1.9984267671911685e-05, + "loss": 0.4545, + "mean_token_accuracy": 0.9087825000286103, + "num_tokens": 13686946.0, + "step": 3315 + }, + { + "entropy": 0.31242423579096795, + "epoch": 0.7739829816994988, + "grad_norm": 2.875, + "learning_rate": 1.998421923809695e-05, + "loss": 0.7536, + "mean_token_accuracy": 0.8559371948242187, + "num_tokens": 13699043.0, + "step": 3320 + }, + { + "entropy": 0.3204685363918543, + "epoch": 0.7751486187201305, + "grad_norm": 2.5, + "learning_rate": 1.998417072996001e-05, + "loss": 0.6342, + "mean_token_accuracy": 0.880757862329483, + "num_tokens": 13715301.0, + "step": 3325 + }, + { + "entropy": 0.28653821013867853, + "epoch": 0.7763142557407623, + "grad_norm": 0.466796875, + "learning_rate": 1.9984122147501586e-05, + "loss": 0.2364, + "mean_token_accuracy": 0.916357421875, + "num_tokens": 13739522.0, + "step": 3330 + }, + { + "entropy": 0.3267530560493469, + "epoch": 0.7774798927613941, + "grad_norm": 0.94921875, + "learning_rate": 1.9984073490722406e-05, + "loss": 0.4487, + "mean_token_accuracy": 0.8852592051029206, + "num_tokens": 13757223.0, + "step": 3335 + }, + { + "entropy": 0.2911109760403633, + "epoch": 0.7786455297820258, + "grad_norm": 1.125, + "learning_rate": 1.9984024759623192e-05, + "loss": 0.397, + "mean_token_accuracy": 0.9045413374900818, + "num_tokens": 13775390.0, + "step": 3340 + }, + { + "entropy": 0.28941122740507125, + "epoch": 0.7798111668026576, + "grad_norm": 2.234375, + "learning_rate": 1.9983975954204674e-05, + "loss": 0.6316, + "mean_token_accuracy": 0.8857159197330475, + "num_tokens": 13788002.0, + "step": 3345 + }, + { + "entropy": 0.2598882310092449, + "epoch": 0.7809768038232894, + "grad_norm": 1.515625, + "learning_rate": 1.9983927074467577e-05, + "loss": 0.4756, + "mean_token_accuracy": 0.9066189110279084, + "num_tokens": 13826780.0, + "step": 3350 + }, + { + "entropy": 0.33761544786393644, + "epoch": 0.7821424408439211, + "grad_norm": 1.4921875, + "learning_rate": 1.9983878120412632e-05, + "loss": 0.566, + "mean_token_accuracy": 0.8932851791381836, + "num_tokens": 13851290.0, + "step": 3355 + }, + { + "entropy": 0.2210379447788, + "epoch": 0.783308077864553, + "grad_norm": 1.21875, + "learning_rate": 1.9983829092040568e-05, + "loss": 0.2929, + "mean_token_accuracy": 0.9251502156257629, + "num_tokens": 13876976.0, + "step": 3360 + }, + { + "entropy": 0.20661963215097784, + "epoch": 0.7844737148851848, + "grad_norm": 0.353515625, + "learning_rate": 1.9983779989352113e-05, + "loss": 0.2915, + "mean_token_accuracy": 0.9164006829261779, + "num_tokens": 13916136.0, + "step": 3365 + }, + { + "entropy": 0.27498682774603367, + "epoch": 0.7856393519058166, + "grad_norm": 0.85546875, + "learning_rate": 1.9983730812348007e-05, + "loss": 0.4304, + "mean_token_accuracy": 0.8965947449207305, + "num_tokens": 13936758.0, + "step": 3370 + }, + { + "entropy": 0.3657851852476597, + "epoch": 0.7868049889264483, + "grad_norm": 6.5625, + "learning_rate": 1.9983681561028977e-05, + "loss": 0.5509, + "mean_token_accuracy": 0.9021944999694824, + "num_tokens": 13967866.0, + "step": 3375 + }, + { + "entropy": 0.32351127788424494, + "epoch": 0.7879706259470801, + "grad_norm": 0.7265625, + "learning_rate": 1.998363223539576e-05, + "loss": 0.7277, + "mean_token_accuracy": 0.8853053629398346, + "num_tokens": 13979343.0, + "step": 3380 + }, + { + "entropy": 0.32861794382333753, + "epoch": 0.7891362629677119, + "grad_norm": 0.74609375, + "learning_rate": 1.9983582835449088e-05, + "loss": 0.7126, + "mean_token_accuracy": 0.8623862028121948, + "num_tokens": 13997112.0, + "step": 3385 + }, + { + "entropy": 0.2560902625322342, + "epoch": 0.7903018999883437, + "grad_norm": 0.365234375, + "learning_rate": 1.9983533361189702e-05, + "loss": 0.5358, + "mean_token_accuracy": 0.8984552204608918, + "num_tokens": 14020986.0, + "step": 3390 + }, + { + "entropy": 0.38792429491877556, + "epoch": 0.7914675370089754, + "grad_norm": 2.25, + "learning_rate": 1.9983483812618337e-05, + "loss": 0.7052, + "mean_token_accuracy": 0.8550672352313995, + "num_tokens": 14044564.0, + "step": 3395 + }, + { + "entropy": 0.3164536517113447, + "epoch": 0.7926331740296072, + "grad_norm": 1.3984375, + "learning_rate": 1.9983434189735735e-05, + "loss": 0.4811, + "mean_token_accuracy": 0.9013772666454315, + "num_tokens": 14060290.0, + "step": 3400 + }, + { + "entropy": 0.33436234965920447, + "epoch": 0.793798811050239, + "grad_norm": 0.28515625, + "learning_rate": 1.9983384492542634e-05, + "loss": 0.6164, + "mean_token_accuracy": 0.9005577623844147, + "num_tokens": 14076258.0, + "step": 3405 + }, + { + "entropy": 0.4063506111502647, + "epoch": 0.7949644480708707, + "grad_norm": 8.625, + "learning_rate": 1.998333472103977e-05, + "loss": 0.8561, + "mean_token_accuracy": 0.8626307547092438, + "num_tokens": 14106977.0, + "step": 3410 + }, + { + "entropy": 0.40561274290084837, + "epoch": 0.7961300850915025, + "grad_norm": 1.3984375, + "learning_rate": 1.9983284875227894e-05, + "loss": 0.4162, + "mean_token_accuracy": 0.8764112651348114, + "num_tokens": 14133864.0, + "step": 3415 + }, + { + "entropy": 0.335993605107069, + "epoch": 0.7972957221121343, + "grad_norm": 1.515625, + "learning_rate": 1.9983234955107743e-05, + "loss": 0.5466, + "mean_token_accuracy": 0.8508065283298493, + "num_tokens": 14147764.0, + "step": 3420 + }, + { + "entropy": 0.23777510970830917, + "epoch": 0.7984613591327661, + "grad_norm": 1.53125, + "learning_rate": 1.9983184960680068e-05, + "loss": 0.2813, + "mean_token_accuracy": 0.9156764447689056, + "num_tokens": 14169080.0, + "step": 3425 + }, + { + "entropy": 0.3019415006041527, + "epoch": 0.7996269961533978, + "grad_norm": 2.046875, + "learning_rate": 1.9983134891945604e-05, + "loss": 0.6884, + "mean_token_accuracy": 0.8911664128303528, + "num_tokens": 14178988.0, + "step": 3430 + }, + { + "entropy": 0.34377400428056715, + "epoch": 0.8007926331740296, + "grad_norm": 1.3515625, + "learning_rate": 1.9983084748905107e-05, + "loss": 0.4009, + "mean_token_accuracy": 0.8842729508876801, + "num_tokens": 14213845.0, + "step": 3435 + }, + { + "entropy": 0.3087873324751854, + "epoch": 0.8019582701946614, + "grad_norm": 3.671875, + "learning_rate": 1.998303453155932e-05, + "loss": 0.7759, + "mean_token_accuracy": 0.8876574337482452, + "num_tokens": 14223321.0, + "step": 3440 + }, + { + "entropy": 0.2858346672728658, + "epoch": 0.8031239072152931, + "grad_norm": 1.65625, + "learning_rate": 1.9982984239908995e-05, + "loss": 0.6156, + "mean_token_accuracy": 0.890706866979599, + "num_tokens": 14242105.0, + "step": 3445 + }, + { + "entropy": 0.37374798357486727, + "epoch": 0.8042895442359249, + "grad_norm": 2.734375, + "learning_rate": 1.9982933873954878e-05, + "loss": 0.8822, + "mean_token_accuracy": 0.86474609375, + "num_tokens": 14252615.0, + "step": 3450 + }, + { + "entropy": 0.41026286482810975, + "epoch": 0.8054551812565567, + "grad_norm": 3.125, + "learning_rate": 1.9982883433697723e-05, + "loss": 0.8252, + "mean_token_accuracy": 0.8614952623844147, + "num_tokens": 14269831.0, + "step": 3455 + }, + { + "entropy": 0.30025038607418536, + "epoch": 0.8066208182771885, + "grad_norm": 0.390625, + "learning_rate": 1.9982832919138286e-05, + "loss": 0.1509, + "mean_token_accuracy": 0.903667026758194, + "num_tokens": 14306269.0, + "step": 3460 + }, + { + "entropy": 0.3967276046052575, + "epoch": 0.8077864552978202, + "grad_norm": 0.447265625, + "learning_rate": 1.9982782330277308e-05, + "loss": 0.621, + "mean_token_accuracy": 0.8603341758251191, + "num_tokens": 14335201.0, + "step": 3465 + }, + { + "entropy": 0.26602720804512503, + "epoch": 0.808952092318452, + "grad_norm": 1.5546875, + "learning_rate": 1.9982731667115556e-05, + "loss": 0.4676, + "mean_token_accuracy": 0.8931980729103088, + "num_tokens": 14355603.0, + "step": 3470 + }, + { + "entropy": 0.30935631804168223, + "epoch": 0.8101177293390838, + "grad_norm": 0.6328125, + "learning_rate": 1.9982680929653777e-05, + "loss": 0.441, + "mean_token_accuracy": 0.9137549519538879, + "num_tokens": 14374468.0, + "step": 3475 + }, + { + "entropy": 0.31746798753738403, + "epoch": 0.8112833663597155, + "grad_norm": 1.0390625, + "learning_rate": 1.9982630117892735e-05, + "loss": 0.5511, + "mean_token_accuracy": 0.8939504265785218, + "num_tokens": 14387992.0, + "step": 3480 + }, + { + "entropy": 0.20791075341403484, + "epoch": 0.8124490033803473, + "grad_norm": 0.224609375, + "learning_rate": 1.998257923183318e-05, + "loss": 0.2899, + "mean_token_accuracy": 0.9380963206291199, + "num_tokens": 14412238.0, + "step": 3485 + }, + { + "entropy": 0.22601248007267713, + "epoch": 0.8136146404009791, + "grad_norm": 0.84765625, + "learning_rate": 1.9982528271475876e-05, + "loss": 0.3418, + "mean_token_accuracy": 0.9155470192432403, + "num_tokens": 14435236.0, + "step": 3490 + }, + { + "entropy": 0.2866682179272175, + "epoch": 0.814780277421611, + "grad_norm": 0.5390625, + "learning_rate": 1.998247723682158e-05, + "loss": 0.436, + "mean_token_accuracy": 0.912602162361145, + "num_tokens": 14456632.0, + "step": 3495 + }, + { + "entropy": 0.23919593989849092, + "epoch": 0.8159459144422427, + "grad_norm": 2.59375, + "learning_rate": 1.9982426127871056e-05, + "loss": 0.3145, + "mean_token_accuracy": 0.9271677911281586, + "num_tokens": 14478160.0, + "step": 3500 + }, + { + "entropy": 0.264223488420248, + "epoch": 0.8171115514628745, + "grad_norm": 0.67578125, + "learning_rate": 1.9982374944625064e-05, + "loss": 0.612, + "mean_token_accuracy": 0.8922808647155762, + "num_tokens": 14494151.0, + "step": 3505 + }, + { + "entropy": 0.2899939067661762, + "epoch": 0.8182771884835063, + "grad_norm": 0.44140625, + "learning_rate": 1.9982323687084365e-05, + "loss": 0.5561, + "mean_token_accuracy": 0.9069446921348572, + "num_tokens": 14516496.0, + "step": 3510 + }, + { + "entropy": 0.24367965012788773, + "epoch": 0.819442825504138, + "grad_norm": 0.89453125, + "learning_rate": 1.998227235524973e-05, + "loss": 0.4233, + "mean_token_accuracy": 0.9186331748962402, + "num_tokens": 14538180.0, + "step": 3515 + }, + { + "entropy": 0.3140773274004459, + "epoch": 0.8206084625247698, + "grad_norm": 0.490234375, + "learning_rate": 1.998222094912192e-05, + "loss": 0.4142, + "mean_token_accuracy": 0.9007182121276855, + "num_tokens": 14571081.0, + "step": 3520 + }, + { + "entropy": 0.33965519815683365, + "epoch": 0.8217740995454016, + "grad_norm": 0.28515625, + "learning_rate": 1.9982169468701702e-05, + "loss": 0.501, + "mean_token_accuracy": 0.8900741517543793, + "num_tokens": 14589550.0, + "step": 3525 + }, + { + "entropy": 0.3125692706555128, + "epoch": 0.8229397365660334, + "grad_norm": 2.546875, + "learning_rate": 1.9982117913989844e-05, + "loss": 0.7301, + "mean_token_accuracy": 0.882424396276474, + "num_tokens": 14605935.0, + "step": 3530 + }, + { + "entropy": 0.2235667049884796, + "epoch": 0.8241053735866651, + "grad_norm": 0.73046875, + "learning_rate": 1.9982066284987108e-05, + "loss": 0.3922, + "mean_token_accuracy": 0.9213648438453674, + "num_tokens": 14629378.0, + "step": 3535 + }, + { + "entropy": 0.3533320169895887, + "epoch": 0.8252710106072969, + "grad_norm": 0.765625, + "learning_rate": 1.9982014581694277e-05, + "loss": 0.5098, + "mean_token_accuracy": 0.8809333443641663, + "num_tokens": 14659124.0, + "step": 3540 + }, + { + "entropy": 0.28666834309697153, + "epoch": 0.8264366476279287, + "grad_norm": 1.2265625, + "learning_rate": 1.9981962804112113e-05, + "loss": 0.6463, + "mean_token_accuracy": 0.8949851632118225, + "num_tokens": 14673210.0, + "step": 3545 + }, + { + "entropy": 0.27381937131285666, + "epoch": 0.8276022846485604, + "grad_norm": 3.078125, + "learning_rate": 1.998191095224139e-05, + "loss": 0.5301, + "mean_token_accuracy": 0.9156894981861115, + "num_tokens": 14688514.0, + "step": 3550 + }, + { + "entropy": 0.27920118868350985, + "epoch": 0.8287679216691922, + "grad_norm": 4.53125, + "learning_rate": 1.9981859026082882e-05, + "loss": 0.558, + "mean_token_accuracy": 0.8992727637290955, + "num_tokens": 14705520.0, + "step": 3555 + }, + { + "entropy": 0.3166303887963295, + "epoch": 0.829933558689824, + "grad_norm": 1.625, + "learning_rate": 1.9981807025637365e-05, + "loss": 0.4082, + "mean_token_accuracy": 0.9166463553905487, + "num_tokens": 14728999.0, + "step": 3560 + }, + { + "entropy": 0.26104464530944826, + "epoch": 0.8310991957104558, + "grad_norm": 0.259765625, + "learning_rate": 1.998175495090561e-05, + "loss": 0.493, + "mean_token_accuracy": 0.9059361159801483, + "num_tokens": 14766556.0, + "step": 3565 + }, + { + "entropy": 0.31263031214475634, + "epoch": 0.8322648327310875, + "grad_norm": 2.796875, + "learning_rate": 1.9981702801888393e-05, + "loss": 0.5876, + "mean_token_accuracy": 0.8922380983829499, + "num_tokens": 14786649.0, + "step": 3570 + }, + { + "entropy": 0.17848571315407752, + "epoch": 0.8334304697517193, + "grad_norm": 0.3359375, + "learning_rate": 1.99816505785865e-05, + "loss": 0.2014, + "mean_token_accuracy": 0.947855943441391, + "num_tokens": 14831989.0, + "step": 3575 + }, + { + "entropy": 0.3408513143658638, + "epoch": 0.8345961067723511, + "grad_norm": 2.84375, + "learning_rate": 1.99815982810007e-05, + "loss": 0.5118, + "mean_token_accuracy": 0.8933568835258484, + "num_tokens": 14849088.0, + "step": 3580 + }, + { + "entropy": 0.2799061857163906, + "epoch": 0.8357617437929828, + "grad_norm": 2.3125, + "learning_rate": 1.998154590913178e-05, + "loss": 0.5424, + "mean_token_accuracy": 0.9027321338653564, + "num_tokens": 14864188.0, + "step": 3585 + }, + { + "entropy": 0.2542919620871544, + "epoch": 0.8369273808136146, + "grad_norm": 1.484375, + "learning_rate": 1.9981493462980514e-05, + "loss": 0.424, + "mean_token_accuracy": 0.9119072020053863, + "num_tokens": 14881774.0, + "step": 3590 + }, + { + "entropy": 0.28481525033712385, + "epoch": 0.8380930178342464, + "grad_norm": 1.6875, + "learning_rate": 1.998144094254769e-05, + "loss": 0.4691, + "mean_token_accuracy": 0.9085649073123931, + "num_tokens": 14896673.0, + "step": 3595 + }, + { + "entropy": 0.28367489129304885, + "epoch": 0.8392586548548782, + "grad_norm": 1.546875, + "learning_rate": 1.998138834783409e-05, + "loss": 0.5003, + "mean_token_accuracy": 0.9036788463592529, + "num_tokens": 14910997.0, + "step": 3600 + }, + { + "entropy": 0.24263509139418601, + "epoch": 0.8404242918755099, + "grad_norm": 2.21875, + "learning_rate": 1.9981335678840495e-05, + "loss": 0.3992, + "mean_token_accuracy": 0.9190425634384155, + "num_tokens": 14935090.0, + "step": 3605 + }, + { + "entropy": 0.24624939411878585, + "epoch": 0.8415899288961417, + "grad_norm": 0.490234375, + "learning_rate": 1.9981282935567693e-05, + "loss": 0.4746, + "mean_token_accuracy": 0.9015482604503632, + "num_tokens": 14949307.0, + "step": 3610 + }, + { + "entropy": 0.2743656687438488, + "epoch": 0.8427555659167735, + "grad_norm": 0.427734375, + "learning_rate": 1.998123011801647e-05, + "loss": 0.2832, + "mean_token_accuracy": 0.9231635749340057, + "num_tokens": 14979284.0, + "step": 3615 + }, + { + "entropy": 0.18736706282943488, + "epoch": 0.8439212029374052, + "grad_norm": 0.87109375, + "learning_rate": 1.9981177226187617e-05, + "loss": 0.3228, + "mean_token_accuracy": 0.9420149087905884, + "num_tokens": 15012270.0, + "step": 3620 + }, + { + "entropy": 0.356676259636879, + "epoch": 0.845086839958037, + "grad_norm": 2.09375, + "learning_rate": 1.9981124260081917e-05, + "loss": 0.7459, + "mean_token_accuracy": 0.8781390905380249, + "num_tokens": 15023574.0, + "step": 3625 + }, + { + "entropy": 0.24901985973119736, + "epoch": 0.8462524769786688, + "grad_norm": 0.4375, + "learning_rate": 1.998107121970016e-05, + "loss": 0.3729, + "mean_token_accuracy": 0.9155802965164185, + "num_tokens": 15053734.0, + "step": 3630 + }, + { + "entropy": 0.26287381947040556, + "epoch": 0.8474181139993007, + "grad_norm": 2.234375, + "learning_rate": 1.9981018105043144e-05, + "loss": 0.5193, + "mean_token_accuracy": 0.9083257913589478, + "num_tokens": 15072791.0, + "step": 3635 + }, + { + "entropy": 0.3157116275280714, + "epoch": 0.8485837510199324, + "grad_norm": 1.0625, + "learning_rate": 1.9980964916111654e-05, + "loss": 0.3004, + "mean_token_accuracy": 0.903801566362381, + "num_tokens": 15107653.0, + "step": 3640 + }, + { + "entropy": 0.28021684251725676, + "epoch": 0.8497493880405642, + "grad_norm": 0.50390625, + "learning_rate": 1.9980911652906484e-05, + "loss": 0.4549, + "mean_token_accuracy": 0.8997021973133087, + "num_tokens": 15132932.0, + "step": 3645 + }, + { + "entropy": 0.25191900655627253, + "epoch": 0.850915025061196, + "grad_norm": 1.5859375, + "learning_rate": 1.998085831542843e-05, + "loss": 0.4318, + "mean_token_accuracy": 0.9206875085830688, + "num_tokens": 15146894.0, + "step": 3650 + }, + { + "entropy": 0.3430237350985408, + "epoch": 0.8520806620818278, + "grad_norm": 4.5, + "learning_rate": 1.9980804903678287e-05, + "loss": 0.5912, + "mean_token_accuracy": 0.88895942568779, + "num_tokens": 15174424.0, + "step": 3655 + }, + { + "entropy": 0.31596364453434944, + "epoch": 0.8532462991024595, + "grad_norm": 1.6640625, + "learning_rate": 1.998075141765685e-05, + "loss": 0.3543, + "mean_token_accuracy": 0.8607090711593628, + "num_tokens": 15204731.0, + "step": 3660 + }, + { + "entropy": 0.3275330767035484, + "epoch": 0.8544119361230913, + "grad_norm": 3.59375, + "learning_rate": 1.998069785736492e-05, + "loss": 0.5245, + "mean_token_accuracy": 0.907353276014328, + "num_tokens": 15232691.0, + "step": 3665 + }, + { + "entropy": 0.27450884664431213, + "epoch": 0.8555775731437231, + "grad_norm": 0.162109375, + "learning_rate": 1.9980644222803296e-05, + "loss": 0.4254, + "mean_token_accuracy": 0.8911545634269714, + "num_tokens": 15270176.0, + "step": 3670 + }, + { + "entropy": 0.2559219378978014, + "epoch": 0.8567432101643548, + "grad_norm": 0.73046875, + "learning_rate": 1.9980590513972775e-05, + "loss": 0.5514, + "mean_token_accuracy": 0.911538553237915, + "num_tokens": 15289751.0, + "step": 3675 + }, + { + "entropy": 0.24207828417420388, + "epoch": 0.8579088471849866, + "grad_norm": 1.515625, + "learning_rate": 1.9980536730874154e-05, + "loss": 0.2878, + "mean_token_accuracy": 0.9238324701786041, + "num_tokens": 15314355.0, + "step": 3680 + }, + { + "entropy": 0.44712048918008807, + "epoch": 0.8590744842056184, + "grad_norm": 1.640625, + "learning_rate": 1.998048287350824e-05, + "loss": 0.5822, + "mean_token_accuracy": 0.8792366266250611, + "num_tokens": 15325754.0, + "step": 3685 + }, + { + "entropy": 0.31567412763834, + "epoch": 0.8602401212262502, + "grad_norm": 1.9375, + "learning_rate": 1.9980428941875835e-05, + "loss": 0.4949, + "mean_token_accuracy": 0.914052402973175, + "num_tokens": 15353841.0, + "step": 3690 + }, + { + "entropy": 0.26079447716474535, + "epoch": 0.8614057582468819, + "grad_norm": 2.453125, + "learning_rate": 1.9980374935977747e-05, + "loss": 0.53, + "mean_token_accuracy": 0.9045489251613616, + "num_tokens": 15376130.0, + "step": 3695 + }, + { + "entropy": 0.41225661505013705, + "epoch": 0.8625713952675137, + "grad_norm": 1.2265625, + "learning_rate": 1.9980320855814775e-05, + "loss": 0.5735, + "mean_token_accuracy": 0.8939766168594361, + "num_tokens": 15407430.0, + "step": 3700 + }, + { + "entropy": 0.30904234796762464, + "epoch": 0.8637370322881455, + "grad_norm": 2.796875, + "learning_rate": 1.998026670138773e-05, + "loss": 0.6142, + "mean_token_accuracy": 0.8968602418899536, + "num_tokens": 15418732.0, + "step": 3705 + }, + { + "entropy": 0.2660053789615631, + "epoch": 0.8649026693087772, + "grad_norm": 0.376953125, + "learning_rate": 1.9980212472697414e-05, + "loss": 0.283, + "mean_token_accuracy": 0.9082744538784027, + "num_tokens": 15458633.0, + "step": 3710 + }, + { + "entropy": 0.2127044014632702, + "epoch": 0.866068306329409, + "grad_norm": 3.171875, + "learning_rate": 1.9980158169744644e-05, + "loss": 0.2172, + "mean_token_accuracy": 0.9319159090518951, + "num_tokens": 15493223.0, + "step": 3715 + }, + { + "entropy": 0.3829865030944347, + "epoch": 0.8672339433500408, + "grad_norm": 3.171875, + "learning_rate": 1.998010379253022e-05, + "loss": 0.8015, + "mean_token_accuracy": 0.8439773738384246, + "num_tokens": 15505228.0, + "step": 3720 + }, + { + "entropy": 0.36416123397648337, + "epoch": 0.8683995803706726, + "grad_norm": 2.0625, + "learning_rate": 1.9980049341054963e-05, + "loss": 0.6867, + "mean_token_accuracy": 0.8739733338356018, + "num_tokens": 15520660.0, + "step": 3725 + }, + { + "entropy": 0.2619575455784798, + "epoch": 0.8695652173913043, + "grad_norm": 0.41796875, + "learning_rate": 1.9979994815319677e-05, + "loss": 0.4769, + "mean_token_accuracy": 0.9197808086872101, + "num_tokens": 15550128.0, + "step": 3730 + }, + { + "entropy": 0.27543322481215, + "epoch": 0.8707308544119361, + "grad_norm": 2.5625, + "learning_rate": 1.9979940215325178e-05, + "loss": 0.5064, + "mean_token_accuracy": 0.9059760510921478, + "num_tokens": 15571541.0, + "step": 3735 + }, + { + "entropy": 0.25470650345087054, + "epoch": 0.8718964914325679, + "grad_norm": 0.53515625, + "learning_rate": 1.997988554107228e-05, + "loss": 0.3501, + "mean_token_accuracy": 0.9108909010887146, + "num_tokens": 15608560.0, + "step": 3740 + }, + { + "entropy": 0.24583430550992488, + "epoch": 0.8730621284531996, + "grad_norm": 0.8671875, + "learning_rate": 1.99798307925618e-05, + "loss": 0.382, + "mean_token_accuracy": 0.9136410057544708, + "num_tokens": 15630666.0, + "step": 3745 + }, + { + "entropy": 0.1376958515495062, + "epoch": 0.8742277654738314, + "grad_norm": 0.9296875, + "learning_rate": 1.9979775969794553e-05, + "loss": 0.1491, + "mean_token_accuracy": 0.9549042522907257, + "num_tokens": 15671696.0, + "step": 3750 + }, + { + "entropy": 0.22299788594245912, + "epoch": 0.8753934024944632, + "grad_norm": 0.61328125, + "learning_rate": 1.9979721072771357e-05, + "loss": 0.4353, + "mean_token_accuracy": 0.9072445154190063, + "num_tokens": 15703582.0, + "step": 3755 + }, + { + "entropy": 0.24687078446149827, + "epoch": 0.876559039515095, + "grad_norm": 1.265625, + "learning_rate": 1.9979666101493027e-05, + "loss": 0.592, + "mean_token_accuracy": 0.891139543056488, + "num_tokens": 15722485.0, + "step": 3760 + }, + { + "entropy": 0.3579208765178919, + "epoch": 0.8777246765357267, + "grad_norm": 3.34375, + "learning_rate": 1.9979611055960385e-05, + "loss": 0.8938, + "mean_token_accuracy": 0.8332218408584595, + "num_tokens": 15746229.0, + "step": 3765 + }, + { + "entropy": 0.2589207597076893, + "epoch": 0.8788903135563586, + "grad_norm": 0.36328125, + "learning_rate": 1.997955593617426e-05, + "loss": 0.5105, + "mean_token_accuracy": 0.9086531221866607, + "num_tokens": 15786496.0, + "step": 3770 + }, + { + "entropy": 0.22225482761859894, + "epoch": 0.8800559505769904, + "grad_norm": 2.1875, + "learning_rate": 1.9979500742135456e-05, + "loss": 0.4195, + "mean_token_accuracy": 0.9165923833847046, + "num_tokens": 15805106.0, + "step": 3775 + }, + { + "entropy": 0.3877521827816963, + "epoch": 0.8812215875976221, + "grad_norm": 1.34375, + "learning_rate": 1.9979445473844813e-05, + "loss": 0.5482, + "mean_token_accuracy": 0.8581431388854981, + "num_tokens": 15825199.0, + "step": 3780 + }, + { + "entropy": 0.24894805401563644, + "epoch": 0.8823872246182539, + "grad_norm": 2.171875, + "learning_rate": 1.9979390131303144e-05, + "loss": 0.4555, + "mean_token_accuracy": 0.9111175835132599, + "num_tokens": 15845149.0, + "step": 3785 + }, + { + "entropy": 0.40299307107925414, + "epoch": 0.8835528616388857, + "grad_norm": 3.0625, + "learning_rate": 1.997933471451128e-05, + "loss": 0.8629, + "mean_token_accuracy": 0.8512665927410126, + "num_tokens": 15862956.0, + "step": 3790 + }, + { + "entropy": 0.28117614462971685, + "epoch": 0.8847184986595175, + "grad_norm": 2.09375, + "learning_rate": 1.997927922347005e-05, + "loss": 0.4772, + "mean_token_accuracy": 0.9130080938339233, + "num_tokens": 15876637.0, + "step": 3795 + }, + { + "entropy": 0.5891709521412849, + "epoch": 0.8858841356801492, + "grad_norm": 0.73046875, + "learning_rate": 1.9979223658180273e-05, + "loss": 0.8647, + "mean_token_accuracy": 0.7845417231321334, + "num_tokens": 15909938.0, + "step": 3800 + }, + { + "entropy": 0.3107435509562492, + "epoch": 0.887049772700781, + "grad_norm": 0.37890625, + "learning_rate": 1.9979168018642784e-05, + "loss": 0.5805, + "mean_token_accuracy": 0.8911435663700104, + "num_tokens": 15937554.0, + "step": 3805 + }, + { + "entropy": 0.2665897116065025, + "epoch": 0.8882154097214128, + "grad_norm": 3.84375, + "learning_rate": 1.997911230485841e-05, + "loss": 0.7637, + "mean_token_accuracy": 0.8828855872154235, + "num_tokens": 15947609.0, + "step": 3810 + }, + { + "entropy": 0.3165190897881985, + "epoch": 0.8893810467420445, + "grad_norm": 2.921875, + "learning_rate": 1.9979056516827984e-05, + "loss": 0.6888, + "mean_token_accuracy": 0.8773676693439484, + "num_tokens": 15960074.0, + "step": 3815 + }, + { + "entropy": 0.2786505434662104, + "epoch": 0.8905466837626763, + "grad_norm": 6.0, + "learning_rate": 1.9979000654552336e-05, + "loss": 0.6122, + "mean_token_accuracy": 0.8958684921264648, + "num_tokens": 15977495.0, + "step": 3820 + }, + { + "entropy": 0.28286024890840056, + "epoch": 0.8917123207833081, + "grad_norm": 6.28125, + "learning_rate": 1.99789447180323e-05, + "loss": 0.6977, + "mean_token_accuracy": 0.8930557310581207, + "num_tokens": 15992438.0, + "step": 3825 + }, + { + "entropy": 0.27810373045504094, + "epoch": 0.8928779578039399, + "grad_norm": 0.314453125, + "learning_rate": 1.9978888707268706e-05, + "loss": 0.4654, + "mean_token_accuracy": 0.8919014155864715, + "num_tokens": 16020386.0, + "step": 3830 + }, + { + "entropy": 0.4190558884292841, + "epoch": 0.8940435948245716, + "grad_norm": 0.2421875, + "learning_rate": 1.9978832622262397e-05, + "loss": 0.6651, + "mean_token_accuracy": 0.8513571143150329, + "num_tokens": 16039241.0, + "step": 3835 + }, + { + "entropy": 0.25377435609698296, + "epoch": 0.8952092318452034, + "grad_norm": 0.8359375, + "learning_rate": 1.9978776463014203e-05, + "loss": 0.5405, + "mean_token_accuracy": 0.9124003052711487, + "num_tokens": 16052979.0, + "step": 3840 + }, + { + "entropy": 0.384748013317585, + "epoch": 0.8963748688658352, + "grad_norm": 0.55078125, + "learning_rate": 1.9978720229524963e-05, + "loss": 0.6201, + "mean_token_accuracy": 0.8712624669075012, + "num_tokens": 16071177.0, + "step": 3845 + }, + { + "entropy": 0.2742875192314386, + "epoch": 0.8975405058864669, + "grad_norm": 0.81640625, + "learning_rate": 1.9978663921795515e-05, + "loss": 0.5037, + "mean_token_accuracy": 0.8969690918922424, + "num_tokens": 16098544.0, + "step": 3850 + }, + { + "entropy": 0.3160762920975685, + "epoch": 0.8987061429070987, + "grad_norm": 3.375, + "learning_rate": 1.9978607539826702e-05, + "loss": 0.5482, + "mean_token_accuracy": 0.8990673005580903, + "num_tokens": 16112610.0, + "step": 3855 + }, + { + "entropy": 0.32029368802905084, + "epoch": 0.8998717799277305, + "grad_norm": 0.90234375, + "learning_rate": 1.997855108361936e-05, + "loss": 0.41, + "mean_token_accuracy": 0.9168787181377411, + "num_tokens": 16135541.0, + "step": 3860 + }, + { + "entropy": 0.24988686461001636, + "epoch": 0.9010374169483623, + "grad_norm": 0.3984375, + "learning_rate": 1.9978494553174337e-05, + "loss": 0.3174, + "mean_token_accuracy": 0.9324497878551483, + "num_tokens": 16157098.0, + "step": 3865 + }, + { + "entropy": 0.40392661690711973, + "epoch": 0.902203053968994, + "grad_norm": 2.328125, + "learning_rate": 1.997843794849247e-05, + "loss": 1.0247, + "mean_token_accuracy": 0.8482144296169281, + "num_tokens": 16164815.0, + "step": 3870 + }, + { + "entropy": 0.26116530895233153, + "epoch": 0.9033686909896258, + "grad_norm": 2.9375, + "learning_rate": 1.9978381269574605e-05, + "loss": 0.6495, + "mean_token_accuracy": 0.9009441614151001, + "num_tokens": 16184481.0, + "step": 3875 + }, + { + "entropy": 0.21197875253856183, + "epoch": 0.9045343280102576, + "grad_norm": 0.390625, + "learning_rate": 1.997832451642159e-05, + "loss": 0.2791, + "mean_token_accuracy": 0.9292280495166778, + "num_tokens": 16216813.0, + "step": 3880 + }, + { + "entropy": 0.3333045765757561, + "epoch": 0.9056999650308893, + "grad_norm": 2.34375, + "learning_rate": 1.9978267689034263e-05, + "loss": 0.8819, + "mean_token_accuracy": 0.855522632598877, + "num_tokens": 16234546.0, + "step": 3885 + }, + { + "entropy": 0.37700636237859725, + "epoch": 0.9068656020515211, + "grad_norm": 0.38671875, + "learning_rate": 1.9978210787413478e-05, + "loss": 0.7481, + "mean_token_accuracy": 0.8536148369312286, + "num_tokens": 16251924.0, + "step": 3890 + }, + { + "entropy": 0.31856413632631303, + "epoch": 0.9080312390721529, + "grad_norm": 0.71484375, + "learning_rate": 1.9978153811560084e-05, + "loss": 0.6041, + "mean_token_accuracy": 0.8982816457748413, + "num_tokens": 16263609.0, + "step": 3895 + }, + { + "entropy": 0.3457029201090336, + "epoch": 0.9091968760927847, + "grad_norm": 2.015625, + "learning_rate": 1.9978096761474932e-05, + "loss": 0.4611, + "mean_token_accuracy": 0.9150507569313049, + "num_tokens": 16284607.0, + "step": 3900 + }, + { + "entropy": 0.28251473493874074, + "epoch": 0.9103625131134164, + "grad_norm": 2.078125, + "learning_rate": 1.9978039637158865e-05, + "loss": 0.6731, + "mean_token_accuracy": 0.8842589139938355, + "num_tokens": 16302675.0, + "step": 3905 + }, + { + "entropy": 0.23792364951223136, + "epoch": 0.9115281501340483, + "grad_norm": 0.734375, + "learning_rate": 1.9977982438612744e-05, + "loss": 0.3204, + "mean_token_accuracy": 0.9141345858573914, + "num_tokens": 16339004.0, + "step": 3910 + }, + { + "entropy": 0.23666626550257205, + "epoch": 0.9126937871546801, + "grad_norm": 0.9453125, + "learning_rate": 1.9977925165837415e-05, + "loss": 0.3407, + "mean_token_accuracy": 0.9296033978462219, + "num_tokens": 16359569.0, + "step": 3915 + }, + { + "entropy": 0.22190179247409106, + "epoch": 0.9138594241753119, + "grad_norm": 2.28125, + "learning_rate": 1.9977867818833737e-05, + "loss": 0.2375, + "mean_token_accuracy": 0.915921813249588, + "num_tokens": 16393658.0, + "step": 3920 + }, + { + "entropy": 0.3424635842442513, + "epoch": 0.9150250611959436, + "grad_norm": 2.140625, + "learning_rate": 1.997781039760256e-05, + "loss": 0.4036, + "mean_token_accuracy": 0.8847531020641327, + "num_tokens": 16426979.0, + "step": 3925 + }, + { + "entropy": 0.2571659699082375, + "epoch": 0.9161906982165754, + "grad_norm": 2.484375, + "learning_rate": 1.9977752902144742e-05, + "loss": 0.5223, + "mean_token_accuracy": 0.9112405836582184, + "num_tokens": 16438542.0, + "step": 3930 + }, + { + "entropy": 0.3213506378233433, + "epoch": 0.9173563352372072, + "grad_norm": 0.46875, + "learning_rate": 1.9977695332461143e-05, + "loss": 0.5068, + "mean_token_accuracy": 0.8930616676807404, + "num_tokens": 16453504.0, + "step": 3935 + }, + { + "entropy": 0.19305912591516972, + "epoch": 0.9185219722578389, + "grad_norm": 0.96875, + "learning_rate": 1.997763768855262e-05, + "loss": 0.2318, + "mean_token_accuracy": 0.932563591003418, + "num_tokens": 16485783.0, + "step": 3940 + }, + { + "entropy": 0.22780342772603035, + "epoch": 0.9196876092784707, + "grad_norm": 1.0078125, + "learning_rate": 1.9977579970420033e-05, + "loss": 0.4052, + "mean_token_accuracy": 0.9049673736095428, + "num_tokens": 16511421.0, + "step": 3945 + }, + { + "entropy": 0.17885006498545408, + "epoch": 0.9208532462991025, + "grad_norm": 0.671875, + "learning_rate": 1.9977522178064242e-05, + "loss": 0.3811, + "mean_token_accuracy": 0.9320909857749939, + "num_tokens": 16542554.0, + "step": 3950 + }, + { + "entropy": 0.2305964458733797, + "epoch": 0.9220188833197343, + "grad_norm": 0.265625, + "learning_rate": 1.9977464311486105e-05, + "loss": 0.293, + "mean_token_accuracy": 0.9366983592510223, + "num_tokens": 16577722.0, + "step": 3955 + }, + { + "entropy": 0.22702017351984977, + "epoch": 0.923184520340366, + "grad_norm": 2.265625, + "learning_rate": 1.9977406370686493e-05, + "loss": 0.4052, + "mean_token_accuracy": 0.9182761132717132, + "num_tokens": 16605331.0, + "step": 3960 + }, + { + "entropy": 0.33075036108493805, + "epoch": 0.9243501573609978, + "grad_norm": 0.82421875, + "learning_rate": 1.9977348355666265e-05, + "loss": 0.6821, + "mean_token_accuracy": 0.8687184751033783, + "num_tokens": 16616854.0, + "step": 3965 + }, + { + "entropy": 0.2858414173126221, + "epoch": 0.9255157943816296, + "grad_norm": 0.99609375, + "learning_rate": 1.9977290266426284e-05, + "loss": 0.4689, + "mean_token_accuracy": 0.91506507396698, + "num_tokens": 16636421.0, + "step": 3970 + }, + { + "entropy": 0.31540155187249186, + "epoch": 0.9266814314022613, + "grad_norm": 0.45703125, + "learning_rate": 1.9977232102967422e-05, + "loss": 0.3668, + "mean_token_accuracy": 0.9111514151096344, + "num_tokens": 16659301.0, + "step": 3975 + }, + { + "entropy": 0.31216612458229065, + "epoch": 0.9278470684228931, + "grad_norm": 2.53125, + "learning_rate": 1.997717386529054e-05, + "loss": 0.7358, + "mean_token_accuracy": 0.8802648484706879, + "num_tokens": 16672943.0, + "step": 3980 + }, + { + "entropy": 0.2625709608197212, + "epoch": 0.9290127054435249, + "grad_norm": 1.2578125, + "learning_rate": 1.997711555339651e-05, + "loss": 0.3855, + "mean_token_accuracy": 0.9172736048698426, + "num_tokens": 16695486.0, + "step": 3985 + }, + { + "entropy": 0.3358628749847412, + "epoch": 0.9301783424641566, + "grad_norm": 1.265625, + "learning_rate": 1.9977057167286203e-05, + "loss": 0.7025, + "mean_token_accuracy": 0.888811719417572, + "num_tokens": 16704825.0, + "step": 3990 + }, + { + "entropy": 0.24518522769212722, + "epoch": 0.9313439794847884, + "grad_norm": 1.6328125, + "learning_rate": 1.9976998706960488e-05, + "loss": 0.4656, + "mean_token_accuracy": 0.9169709384441376, + "num_tokens": 16724234.0, + "step": 3995 + }, + { + "entropy": 0.25113632045686246, + "epoch": 0.9325096165054202, + "grad_norm": 0.419921875, + "learning_rate": 1.9976940172420232e-05, + "loss": 0.3557, + "mean_token_accuracy": 0.9113317489624023, + "num_tokens": 16757652.0, + "step": 4000 + }, + { + "entropy": 0.3050874337553978, + "epoch": 0.933675253526052, + "grad_norm": 1.3203125, + "learning_rate": 1.9976881563666318e-05, + "loss": 0.4791, + "mean_token_accuracy": 0.8894456505775452, + "num_tokens": 16779558.0, + "step": 4005 + }, + { + "entropy": 0.354495170339942, + "epoch": 0.9348408905466837, + "grad_norm": 1.4296875, + "learning_rate": 1.9976822880699612e-05, + "loss": 0.5045, + "mean_token_accuracy": 0.8878437876701355, + "num_tokens": 16805757.0, + "step": 4010 + }, + { + "entropy": 0.24523208253085613, + "epoch": 0.9360065275673155, + "grad_norm": 0.64453125, + "learning_rate": 1.997676412352099e-05, + "loss": 0.5218, + "mean_token_accuracy": 0.9126578509807587, + "num_tokens": 16824284.0, + "step": 4015 + }, + { + "entropy": 0.19175314232707025, + "epoch": 0.9371721645879473, + "grad_norm": 0.79296875, + "learning_rate": 1.997670529213133e-05, + "loss": 0.3261, + "mean_token_accuracy": 0.9379496335983276, + "num_tokens": 16847471.0, + "step": 4020 + }, + { + "entropy": 0.2729227438569069, + "epoch": 0.938337801608579, + "grad_norm": 0.6875, + "learning_rate": 1.997664638653151e-05, + "loss": 0.5073, + "mean_token_accuracy": 0.9096316695213318, + "num_tokens": 16869398.0, + "step": 4025 + }, + { + "entropy": 0.3152917675673962, + "epoch": 0.9395034386292108, + "grad_norm": 1.3046875, + "learning_rate": 1.9976587406722404e-05, + "loss": 0.5437, + "mean_token_accuracy": 0.9035122036933899, + "num_tokens": 16881743.0, + "step": 4030 + }, + { + "entropy": 0.4029148206114769, + "epoch": 0.9406690756498426, + "grad_norm": 1.9609375, + "learning_rate": 1.99765283527049e-05, + "loss": 0.6399, + "mean_token_accuracy": 0.8744793474674225, + "num_tokens": 16896559.0, + "step": 4035 + }, + { + "entropy": 0.4212800972163677, + "epoch": 0.9418347126704744, + "grad_norm": 2.6875, + "learning_rate": 1.9976469224479868e-05, + "loss": 0.6903, + "mean_token_accuracy": 0.8723967254161835, + "num_tokens": 16909529.0, + "step": 4040 + }, + { + "entropy": 0.33663665801286696, + "epoch": 0.9430003496911062, + "grad_norm": 1.21875, + "learning_rate": 1.9976410022048198e-05, + "loss": 0.7401, + "mean_token_accuracy": 0.8718845307826996, + "num_tokens": 16920073.0, + "step": 4045 + }, + { + "entropy": 0.24011497870087622, + "epoch": 0.944165986711738, + "grad_norm": 0.50390625, + "learning_rate": 1.997635074541077e-05, + "loss": 0.4172, + "mean_token_accuracy": 0.9157796263694763, + "num_tokens": 16948944.0, + "step": 4050 + }, + { + "entropy": 0.2570995256304741, + "epoch": 0.9453316237323698, + "grad_norm": 0.73046875, + "learning_rate": 1.9976291394568465e-05, + "loss": 0.3138, + "mean_token_accuracy": 0.92692631483078, + "num_tokens": 16973131.0, + "step": 4055 + }, + { + "entropy": 0.4393061429262161, + "epoch": 0.9464972607530016, + "grad_norm": 2.40625, + "learning_rate": 1.997623196952217e-05, + "loss": 0.8668, + "mean_token_accuracy": 0.8413293391466141, + "num_tokens": 16992622.0, + "step": 4060 + }, + { + "entropy": 0.34109273105859755, + "epoch": 0.9476628977736333, + "grad_norm": 0.69921875, + "learning_rate": 1.9976172470272774e-05, + "loss": 0.5098, + "mean_token_accuracy": 0.9031398296356201, + "num_tokens": 17013703.0, + "step": 4065 + }, + { + "entropy": 0.5193141900002957, + "epoch": 0.9488285347942651, + "grad_norm": 0.79296875, + "learning_rate": 1.9976112896821164e-05, + "loss": 0.7514, + "mean_token_accuracy": 0.8595295757055282, + "num_tokens": 17046681.0, + "step": 4070 + }, + { + "entropy": 0.29651210978627207, + "epoch": 0.9499941718148969, + "grad_norm": 0.34375, + "learning_rate": 1.997605324916822e-05, + "loss": 0.6418, + "mean_token_accuracy": 0.8880113184452056, + "num_tokens": 17068445.0, + "step": 4075 + }, + { + "entropy": 0.2569761071354151, + "epoch": 0.9511598088355286, + "grad_norm": 0.8515625, + "learning_rate": 1.997599352731484e-05, + "loss": 0.5029, + "mean_token_accuracy": 0.9025446593761444, + "num_tokens": 17085586.0, + "step": 4080 + }, + { + "entropy": 0.30053408779203894, + "epoch": 0.9523254458561604, + "grad_norm": 2.40625, + "learning_rate": 1.9975933731261917e-05, + "loss": 0.5302, + "mean_token_accuracy": 0.8973315000534058, + "num_tokens": 17108799.0, + "step": 4085 + }, + { + "entropy": 0.2752389371395111, + "epoch": 0.9534910828767922, + "grad_norm": 0.462890625, + "learning_rate": 1.997587386101033e-05, + "loss": 0.5593, + "mean_token_accuracy": 0.900017648935318, + "num_tokens": 17120763.0, + "step": 4090 + }, + { + "entropy": 0.19511011131107808, + "epoch": 0.954656719897424, + "grad_norm": 0.8984375, + "learning_rate": 1.9975813916560988e-05, + "loss": 0.169, + "mean_token_accuracy": 0.9473197340965271, + "num_tokens": 17147695.0, + "step": 4095 + }, + { + "entropy": 0.2630501437932253, + "epoch": 0.9558223569180557, + "grad_norm": 2.75, + "learning_rate": 1.997575389791477e-05, + "loss": 0.5802, + "mean_token_accuracy": 0.895086270570755, + "num_tokens": 17164836.0, + "step": 4100 + }, + { + "entropy": 0.3251684829592705, + "epoch": 0.9569879939386875, + "grad_norm": 0.36328125, + "learning_rate": 1.997569380507258e-05, + "loss": 0.628, + "mean_token_accuracy": 0.8754357278347016, + "num_tokens": 17193063.0, + "step": 4105 + }, + { + "entropy": 0.26151609756052496, + "epoch": 0.9581536309593193, + "grad_norm": 0.21484375, + "learning_rate": 1.997563363803531e-05, + "loss": 0.4691, + "mean_token_accuracy": 0.9026021063327789, + "num_tokens": 17220094.0, + "step": 4110 + }, + { + "entropy": 0.20274873673915864, + "epoch": 0.959319267979951, + "grad_norm": 1.3984375, + "learning_rate": 1.997557339680386e-05, + "loss": 0.5498, + "mean_token_accuracy": 0.910257762670517, + "num_tokens": 17234695.0, + "step": 4115 + }, + { + "entropy": 0.2564930349588394, + "epoch": 0.9604849050005828, + "grad_norm": 1.8046875, + "learning_rate": 1.9975513081379125e-05, + "loss": 0.4044, + "mean_token_accuracy": 0.9224636614322662, + "num_tokens": 17251544.0, + "step": 4120 + }, + { + "entropy": 0.2453150164335966, + "epoch": 0.9616505420212146, + "grad_norm": 1.65625, + "learning_rate": 1.997545269176201e-05, + "loss": 0.3385, + "mean_token_accuracy": 0.9272057890892029, + "num_tokens": 17277767.0, + "step": 4125 + }, + { + "entropy": 0.27376823760569097, + "epoch": 0.9628161790418464, + "grad_norm": 4.125, + "learning_rate": 1.997539222795341e-05, + "loss": 0.5076, + "mean_token_accuracy": 0.8964590787887573, + "num_tokens": 17299605.0, + "step": 4130 + }, + { + "entropy": 0.2696050863713026, + "epoch": 0.9639818160624781, + "grad_norm": 1.953125, + "learning_rate": 1.9975331689954228e-05, + "loss": 0.3222, + "mean_token_accuracy": 0.9068039536476136, + "num_tokens": 17337810.0, + "step": 4135 + }, + { + "entropy": 0.27581446021795275, + "epoch": 0.9651474530831099, + "grad_norm": 0.921875, + "learning_rate": 1.9975271077765365e-05, + "loss": 0.569, + "mean_token_accuracy": 0.893970274925232, + "num_tokens": 17350006.0, + "step": 4140 + }, + { + "entropy": 0.2386571519076824, + "epoch": 0.9663130901037417, + "grad_norm": 0.94140625, + "learning_rate": 1.997521039138773e-05, + "loss": 0.4095, + "mean_token_accuracy": 0.9206796944141388, + "num_tokens": 17364775.0, + "step": 4145 + }, + { + "entropy": 0.19428746849298478, + "epoch": 0.9674787271243734, + "grad_norm": 0.65234375, + "learning_rate": 1.9975149630822226e-05, + "loss": 0.2389, + "mean_token_accuracy": 0.9308637917041779, + "num_tokens": 17399499.0, + "step": 4150 + }, + { + "entropy": 0.32295563369989394, + "epoch": 0.9686443641450052, + "grad_norm": 1.140625, + "learning_rate": 1.9975088796069758e-05, + "loss": 0.5525, + "mean_token_accuracy": 0.8863936126232147, + "num_tokens": 17412882.0, + "step": 4155 + }, + { + "entropy": 0.2559451676905155, + "epoch": 0.969810001165637, + "grad_norm": 2.546875, + "learning_rate": 1.9975027887131228e-05, + "loss": 0.541, + "mean_token_accuracy": 0.9047481536865234, + "num_tokens": 17430157.0, + "step": 4160 + }, + { + "entropy": 0.2594328373670578, + "epoch": 0.9709756381862688, + "grad_norm": 4.09375, + "learning_rate": 1.9974966904007553e-05, + "loss": 0.4485, + "mean_token_accuracy": 0.9149899840354919, + "num_tokens": 17443985.0, + "step": 4165 + }, + { + "entropy": 0.4551702942699194, + "epoch": 0.9721412752069005, + "grad_norm": 0.72265625, + "learning_rate": 1.997490584669964e-05, + "loss": 0.7002, + "mean_token_accuracy": 0.8617740720510483, + "num_tokens": 17476885.0, + "step": 4170 + }, + { + "entropy": 0.3077906858175993, + "epoch": 0.9733069122275323, + "grad_norm": 4.5, + "learning_rate": 1.9974844715208397e-05, + "loss": 0.648, + "mean_token_accuracy": 0.893233060836792, + "num_tokens": 17492268.0, + "step": 4175 + }, + { + "entropy": 0.32362317144870756, + "epoch": 0.9744725492481642, + "grad_norm": 3.203125, + "learning_rate": 1.9974783509534737e-05, + "loss": 0.6601, + "mean_token_accuracy": 0.8944636046886444, + "num_tokens": 17514877.0, + "step": 4180 + }, + { + "entropy": 0.3098685838282108, + "epoch": 0.975638186268796, + "grad_norm": 1.015625, + "learning_rate": 1.997472222967957e-05, + "loss": 0.5523, + "mean_token_accuracy": 0.8969592690467835, + "num_tokens": 17526680.0, + "step": 4185 + }, + { + "entropy": 0.3358676999807358, + "epoch": 0.9768038232894277, + "grad_norm": 3.0, + "learning_rate": 1.9974660875643814e-05, + "loss": 0.7514, + "mean_token_accuracy": 0.8687438666820526, + "num_tokens": 17537720.0, + "step": 4190 + }, + { + "entropy": 0.31152590177953243, + "epoch": 0.9779694603100595, + "grad_norm": 1.015625, + "learning_rate": 1.997459944742838e-05, + "loss": 0.4645, + "mean_token_accuracy": 0.892284095287323, + "num_tokens": 17559300.0, + "step": 4195 + }, + { + "entropy": 0.35799047742038964, + "epoch": 0.9791350973306913, + "grad_norm": 3.5625, + "learning_rate": 1.997453794503419e-05, + "loss": 0.5022, + "mean_token_accuracy": 0.8763210058212281, + "num_tokens": 17592705.0, + "step": 4200 + }, + { + "entropy": 0.2442564606666565, + "epoch": 0.980300734351323, + "grad_norm": 0.88671875, + "learning_rate": 1.9974476368462155e-05, + "loss": 0.3056, + "mean_token_accuracy": 0.9137523889541626, + "num_tokens": 17614714.0, + "step": 4205 + }, + { + "entropy": 0.21334104053676128, + "epoch": 0.9814663713719548, + "grad_norm": 0.59765625, + "learning_rate": 1.9974414717713196e-05, + "loss": 0.3557, + "mean_token_accuracy": 0.929449713230133, + "num_tokens": 17634380.0, + "step": 4210 + }, + { + "entropy": 0.3569655314087868, + "epoch": 0.9826320083925866, + "grad_norm": 0.671875, + "learning_rate": 1.997435299278823e-05, + "loss": 0.3218, + "mean_token_accuracy": 0.8842150449752808, + "num_tokens": 17677414.0, + "step": 4215 + }, + { + "entropy": 0.42681182771921156, + "epoch": 0.9837976454132183, + "grad_norm": 3.90625, + "learning_rate": 1.997429119368818e-05, + "loss": 0.9208, + "mean_token_accuracy": 0.8420846164226532, + "num_tokens": 17688253.0, + "step": 4220 + }, + { + "entropy": 0.3741662811487913, + "epoch": 0.9849632824338501, + "grad_norm": 0.33203125, + "learning_rate": 1.997422932041397e-05, + "loss": 0.7088, + "mean_token_accuracy": 0.8799990713596344, + "num_tokens": 17705105.0, + "step": 4225 + }, + { + "entropy": 0.2380964808166027, + "epoch": 0.9861289194544819, + "grad_norm": 3.078125, + "learning_rate": 1.9974167372966512e-05, + "loss": 0.3689, + "mean_token_accuracy": 0.9005732059478759, + "num_tokens": 17731972.0, + "step": 4230 + }, + { + "entropy": 0.3463113531470299, + "epoch": 0.9872945564751137, + "grad_norm": 2.0, + "learning_rate": 1.9974105351346742e-05, + "loss": 0.4693, + "mean_token_accuracy": 0.8790521025657654, + "num_tokens": 17753540.0, + "step": 4235 + }, + { + "entropy": 0.3185799553990364, + "epoch": 0.9884601934957454, + "grad_norm": 0.91015625, + "learning_rate": 1.9974043255555576e-05, + "loss": 0.6741, + "mean_token_accuracy": 0.8678267776966095, + "num_tokens": 17768594.0, + "step": 4240 + }, + { + "entropy": 0.3051718398928642, + "epoch": 0.9896258305163772, + "grad_norm": 0.490234375, + "learning_rate": 1.9973981085593947e-05, + "loss": 0.5239, + "mean_token_accuracy": 0.8959280610084533, + "num_tokens": 17787716.0, + "step": 4245 + }, + { + "entropy": 0.26151602268218993, + "epoch": 0.990791467537009, + "grad_norm": 1.8203125, + "learning_rate": 1.9973918841462782e-05, + "loss": 0.3758, + "mean_token_accuracy": 0.9223517417907715, + "num_tokens": 17803913.0, + "step": 4250 + }, + { + "entropy": 0.3868280317634344, + "epoch": 0.9919571045576407, + "grad_norm": 3.828125, + "learning_rate": 1.9973856523162996e-05, + "loss": 0.6642, + "mean_token_accuracy": 0.8484514772891998, + "num_tokens": 17832350.0, + "step": 4255 + }, + { + "entropy": 0.2370097152888775, + "epoch": 0.9931227415782725, + "grad_norm": 1.4921875, + "learning_rate": 1.9973794130695536e-05, + "loss": 0.4053, + "mean_token_accuracy": 0.9227615237236023, + "num_tokens": 17845252.0, + "step": 4260 + }, + { + "entropy": 0.27250404432415964, + "epoch": 0.9942883785989043, + "grad_norm": 3.21875, + "learning_rate": 1.997373166406132e-05, + "loss": 0.5611, + "mean_token_accuracy": 0.9023528039455414, + "num_tokens": 17862577.0, + "step": 4265 + }, + { + "entropy": 0.31106444634497166, + "epoch": 0.9954540156195361, + "grad_norm": 4.0, + "learning_rate": 1.997366912326129e-05, + "loss": 0.5155, + "mean_token_accuracy": 0.9105746448040009, + "num_tokens": 17881250.0, + "step": 4270 + }, + { + "entropy": 0.23939795568585395, + "epoch": 0.9966196526401678, + "grad_norm": 0.3203125, + "learning_rate": 1.997360650829637e-05, + "loss": 0.3377, + "mean_token_accuracy": 0.9150448620319367, + "num_tokens": 17906259.0, + "step": 4275 + }, + { + "entropy": 0.33422945328056813, + "epoch": 0.9977852896607996, + "grad_norm": 1.328125, + "learning_rate": 1.9973543819167493e-05, + "loss": 0.6478, + "mean_token_accuracy": 0.8795804023742676, + "num_tokens": 17933066.0, + "step": 4280 + }, + { + "entropy": 0.27011500149965284, + "epoch": 0.9989509266814314, + "grad_norm": 0.82421875, + "learning_rate": 1.9973481055875598e-05, + "loss": 0.4263, + "mean_token_accuracy": 0.9082007288932801, + "num_tokens": 17963474.0, + "step": 4285 + }, + { + "entropy": 0.20046385543213951, + "epoch": 1.0, + "grad_norm": 2.859375, + "learning_rate": 1.997341821842162e-05, + "loss": 0.3831, + "mean_token_accuracy": 0.9208313756518893, + "num_tokens": 17980750.0, + "step": 4290 + }, + { + "entropy": 0.35562875419855117, + "epoch": 1.0011656370206319, + "grad_norm": 0.86328125, + "learning_rate": 1.99733553068065e-05, + "loss": 0.7468, + "mean_token_accuracy": 0.8761965453624725, + "num_tokens": 17990376.0, + "step": 4295 + }, + { + "entropy": 0.26873988620936873, + "epoch": 1.0023312740412635, + "grad_norm": 1.75, + "learning_rate": 1.9973292321031168e-05, + "loss": 0.5668, + "mean_token_accuracy": 0.9023180723190307, + "num_tokens": 18019003.0, + "step": 4300 + }, + { + "entropy": 0.22322348281741142, + "epoch": 1.0034969110618954, + "grad_norm": 2.96875, + "learning_rate": 1.9973229261096567e-05, + "loss": 0.4842, + "mean_token_accuracy": 0.9101031005382538, + "num_tokens": 18033478.0, + "step": 4305 + }, + { + "entropy": 0.27951807379722593, + "epoch": 1.004662548082527, + "grad_norm": 2.5625, + "learning_rate": 1.997316612700364e-05, + "loss": 0.5195, + "mean_token_accuracy": 0.8981912076473236, + "num_tokens": 18052997.0, + "step": 4310 + }, + { + "entropy": 0.16746344231069088, + "epoch": 1.005828185103159, + "grad_norm": 3.625, + "learning_rate": 1.9973102918753323e-05, + "loss": 0.3076, + "mean_token_accuracy": 0.9257374405860901, + "num_tokens": 18088486.0, + "step": 4315 + }, + { + "entropy": 0.33466513007879256, + "epoch": 1.0069938221237906, + "grad_norm": 2.078125, + "learning_rate": 1.9973039636346566e-05, + "loss": 0.8306, + "mean_token_accuracy": 0.8667209804058075, + "num_tokens": 18097090.0, + "step": 4320 + }, + { + "entropy": 0.282178159058094, + "epoch": 1.0081594591444225, + "grad_norm": 0.4296875, + "learning_rate": 1.9972976279784304e-05, + "loss": 0.4186, + "mean_token_accuracy": 0.9172792494297027, + "num_tokens": 18117837.0, + "step": 4325 + }, + { + "entropy": 0.48776160553097725, + "epoch": 1.0093250961650542, + "grad_norm": 1.4765625, + "learning_rate": 1.9972912849067486e-05, + "loss": 0.9886, + "mean_token_accuracy": 0.8563101321458817, + "num_tokens": 18140432.0, + "step": 4330 + }, + { + "entropy": 0.2983349785208702, + "epoch": 1.010490733185686, + "grad_norm": 1.6484375, + "learning_rate": 1.997284934419706e-05, + "loss": 0.4383, + "mean_token_accuracy": 0.900026822090149, + "num_tokens": 18162642.0, + "step": 4335 + }, + { + "entropy": 0.28172708451747897, + "epoch": 1.0116563702063177, + "grad_norm": 0.609375, + "learning_rate": 1.997278576517397e-05, + "loss": 0.6158, + "mean_token_accuracy": 0.8869329035282135, + "num_tokens": 18176449.0, + "step": 4340 + }, + { + "entropy": 0.2790716167539358, + "epoch": 1.0128220072269496, + "grad_norm": 0.9296875, + "learning_rate": 1.9972722111999165e-05, + "loss": 0.3866, + "mean_token_accuracy": 0.8965228259563446, + "num_tokens": 18204501.0, + "step": 4345 + }, + { + "entropy": 0.23924841061234475, + "epoch": 1.0139876442475813, + "grad_norm": 1.4453125, + "learning_rate": 1.9972658384673594e-05, + "loss": 0.4254, + "mean_token_accuracy": 0.9159099042415619, + "num_tokens": 18217738.0, + "step": 4350 + }, + { + "entropy": 0.2642524816095829, + "epoch": 1.0151532812682131, + "grad_norm": 1.453125, + "learning_rate": 1.9972594583198206e-05, + "loss": 0.4372, + "mean_token_accuracy": 0.9113417148590088, + "num_tokens": 18231551.0, + "step": 4355 + }, + { + "entropy": 0.2554112922400236, + "epoch": 1.0163189182888448, + "grad_norm": 0.6328125, + "learning_rate": 1.9972530707573954e-05, + "loss": 0.4601, + "mean_token_accuracy": 0.9215417444705963, + "num_tokens": 18251896.0, + "step": 4360 + }, + { + "entropy": 0.35437683314085006, + "epoch": 1.0174845553094767, + "grad_norm": 2.15625, + "learning_rate": 1.997246675780179e-05, + "loss": 0.7178, + "mean_token_accuracy": 0.8783478498458862, + "num_tokens": 18266899.0, + "step": 4365 + }, + { + "entropy": 0.13722085095942022, + "epoch": 1.0186501923301083, + "grad_norm": 2.46875, + "learning_rate": 1.997240273388267e-05, + "loss": 0.217, + "mean_token_accuracy": 0.961503392457962, + "num_tokens": 18303207.0, + "step": 4370 + }, + { + "entropy": 0.24477976867929102, + "epoch": 1.0198158293507402, + "grad_norm": 0.244140625, + "learning_rate": 1.9972338635817542e-05, + "loss": 0.473, + "mean_token_accuracy": 0.9170270323753357, + "num_tokens": 18326728.0, + "step": 4375 + }, + { + "entropy": 0.19941614847630262, + "epoch": 1.0209814663713719, + "grad_norm": 0.185546875, + "learning_rate": 1.9972274463607367e-05, + "loss": 0.3927, + "mean_token_accuracy": 0.9253010809421539, + "num_tokens": 18348890.0, + "step": 4380 + }, + { + "entropy": 0.38385615646839144, + "epoch": 1.0221471033920038, + "grad_norm": 2.328125, + "learning_rate": 1.9972210217253105e-05, + "loss": 0.8185, + "mean_token_accuracy": 0.8647119581699372, + "num_tokens": 18356656.0, + "step": 4385 + }, + { + "entropy": 0.20581890493631363, + "epoch": 1.0233127404126354, + "grad_norm": 3.390625, + "learning_rate": 1.9972145896755707e-05, + "loss": 0.5018, + "mean_token_accuracy": 0.9183601677417755, + "num_tokens": 18387230.0, + "step": 4390 + }, + { + "entropy": 0.3187251575291157, + "epoch": 1.0244783774332673, + "grad_norm": 1.40625, + "learning_rate": 1.9972081502116133e-05, + "loss": 0.5653, + "mean_token_accuracy": 0.9041484355926513, + "num_tokens": 18404146.0, + "step": 4395 + }, + { + "entropy": 0.2324225589632988, + "epoch": 1.025644014453899, + "grad_norm": 1.2578125, + "learning_rate": 1.997201703333535e-05, + "loss": 0.5529, + "mean_token_accuracy": 0.9044724762439728, + "num_tokens": 18424272.0, + "step": 4400 + }, + { + "entropy": 0.2801080636680126, + "epoch": 1.0268096514745308, + "grad_norm": 0.54296875, + "learning_rate": 1.9971952490414312e-05, + "loss": 0.6135, + "mean_token_accuracy": 0.9049952983856201, + "num_tokens": 18436748.0, + "step": 4405 + }, + { + "entropy": 0.24056704565882683, + "epoch": 1.0279752884951625, + "grad_norm": 1.15625, + "learning_rate": 1.9971887873353983e-05, + "loss": 0.3592, + "mean_token_accuracy": 0.9270002782344818, + "num_tokens": 18461229.0, + "step": 4410 + }, + { + "entropy": 0.2713818594813347, + "epoch": 1.0291409255157944, + "grad_norm": 3.234375, + "learning_rate": 1.997182318215533e-05, + "loss": 0.5206, + "mean_token_accuracy": 0.9101826786994934, + "num_tokens": 18473547.0, + "step": 4415 + }, + { + "entropy": 0.2699232131242752, + "epoch": 1.030306562536426, + "grad_norm": 4.40625, + "learning_rate": 1.9971758416819312e-05, + "loss": 0.602, + "mean_token_accuracy": 0.8996315360069275, + "num_tokens": 18485691.0, + "step": 4420 + }, + { + "entropy": 0.27017700783908366, + "epoch": 1.031472199557058, + "grad_norm": 1.796875, + "learning_rate": 1.9971693577346904e-05, + "loss": 0.2799, + "mean_token_accuracy": 0.9148701131343842, + "num_tokens": 18510802.0, + "step": 4425 + }, + { + "entropy": 0.29673150181770325, + "epoch": 1.0326378365776896, + "grad_norm": 3.5625, + "learning_rate": 1.9971628663739063e-05, + "loss": 0.4561, + "mean_token_accuracy": 0.8998429119586945, + "num_tokens": 18531646.0, + "step": 4430 + }, + { + "entropy": 0.23062185496091842, + "epoch": 1.0338034735983215, + "grad_norm": 1.515625, + "learning_rate": 1.997156367599676e-05, + "loss": 0.3979, + "mean_token_accuracy": 0.9245606422424316, + "num_tokens": 18543407.0, + "step": 4435 + }, + { + "entropy": 0.2646921593695879, + "epoch": 1.0349691106189534, + "grad_norm": 0.37109375, + "learning_rate": 1.997149861412097e-05, + "loss": 0.6075, + "mean_token_accuracy": 0.9036493420600891, + "num_tokens": 18558389.0, + "step": 4440 + }, + { + "entropy": 0.25541475638747213, + "epoch": 1.036134747639585, + "grad_norm": 0.6875, + "learning_rate": 1.9971433478112653e-05, + "loss": 0.4424, + "mean_token_accuracy": 0.9157603502273559, + "num_tokens": 18575353.0, + "step": 4445 + }, + { + "entropy": 0.2649271070957184, + "epoch": 1.037300384660217, + "grad_norm": 3.375, + "learning_rate": 1.9971368267972787e-05, + "loss": 0.5452, + "mean_token_accuracy": 0.9030423641204834, + "num_tokens": 18585986.0, + "step": 4450 + }, + { + "entropy": 0.3601708263158798, + "epoch": 1.0384660216808486, + "grad_norm": 6.0, + "learning_rate": 1.9971302983702342e-05, + "loss": 0.8623, + "mean_token_accuracy": 0.8600324988365173, + "num_tokens": 18598197.0, + "step": 4455 + }, + { + "entropy": 0.24812965393066405, + "epoch": 1.0396316587014804, + "grad_norm": 1.9140625, + "learning_rate": 1.9971237625302294e-05, + "loss": 0.4411, + "mean_token_accuracy": 0.9120156764984131, + "num_tokens": 18621661.0, + "step": 4460 + }, + { + "entropy": 0.45811365395784376, + "epoch": 1.040797295722112, + "grad_norm": 1.171875, + "learning_rate": 1.9971172192773612e-05, + "loss": 0.6904, + "mean_token_accuracy": 0.8586636424064636, + "num_tokens": 18643842.0, + "step": 4465 + }, + { + "entropy": 0.2506137236952782, + "epoch": 1.041962932742744, + "grad_norm": 2.734375, + "learning_rate": 1.997110668611728e-05, + "loss": 0.3648, + "mean_token_accuracy": 0.9153142750263215, + "num_tokens": 18668595.0, + "step": 4470 + }, + { + "entropy": 0.2909720130264759, + "epoch": 1.0431285697633756, + "grad_norm": 4.25, + "learning_rate": 1.9971041105334267e-05, + "loss": 0.7996, + "mean_token_accuracy": 0.8663729548454284, + "num_tokens": 18679674.0, + "step": 4475 + }, + { + "entropy": 0.33627558797597884, + "epoch": 1.0442942067840075, + "grad_norm": 2.59375, + "learning_rate": 1.997097545042556e-05, + "loss": 0.6941, + "mean_token_accuracy": 0.8793821096420288, + "num_tokens": 18688713.0, + "step": 4480 + }, + { + "entropy": 0.25198244825005534, + "epoch": 1.0454598438046392, + "grad_norm": 1.3984375, + "learning_rate": 1.9970909721392123e-05, + "loss": 0.3858, + "mean_token_accuracy": 0.9182367920875549, + "num_tokens": 18712780.0, + "step": 4485 + }, + { + "entropy": 0.21226853989064692, + "epoch": 1.046625480825271, + "grad_norm": 1.9921875, + "learning_rate": 1.9970843918234953e-05, + "loss": 0.3497, + "mean_token_accuracy": 0.9377843379974365, + "num_tokens": 18735852.0, + "step": 4490 + }, + { + "entropy": 0.3144254297018051, + "epoch": 1.0477911178459027, + "grad_norm": 1.953125, + "learning_rate": 1.997077804095502e-05, + "loss": 0.6218, + "mean_token_accuracy": 0.8912947416305542, + "num_tokens": 18747949.0, + "step": 4495 + }, + { + "entropy": 0.2468037974089384, + "epoch": 1.0489567548665346, + "grad_norm": 1.3828125, + "learning_rate": 1.9970712089553312e-05, + "loss": 0.3826, + "mean_token_accuracy": 0.9294294059276581, + "num_tokens": 18764902.0, + "step": 4500 + }, + { + "entropy": 0.3669187381863594, + "epoch": 1.0501223918871663, + "grad_norm": 2.46875, + "learning_rate": 1.997064606403081e-05, + "loss": 0.6487, + "mean_token_accuracy": 0.8842283308506012, + "num_tokens": 18775403.0, + "step": 4505 + }, + { + "entropy": 0.222046391479671, + "epoch": 1.0512880289077982, + "grad_norm": 1.4375, + "learning_rate": 1.99705799643885e-05, + "loss": 0.2955, + "mean_token_accuracy": 0.9272170722484588, + "num_tokens": 18806888.0, + "step": 4510 + }, + { + "entropy": 0.24561046734452247, + "epoch": 1.0524536659284298, + "grad_norm": 0.65625, + "learning_rate": 1.9970513790627363e-05, + "loss": 0.4845, + "mean_token_accuracy": 0.9211707890033722, + "num_tokens": 18820932.0, + "step": 4515 + }, + { + "entropy": 0.24179197587072848, + "epoch": 1.0536193029490617, + "grad_norm": 1.4140625, + "learning_rate": 1.997044754274839e-05, + "loss": 0.4912, + "mean_token_accuracy": 0.9171702861785889, + "num_tokens": 18836876.0, + "step": 4520 + }, + { + "entropy": 0.2697492055594921, + "epoch": 1.0547849399696934, + "grad_norm": 1.8828125, + "learning_rate": 1.997038122075257e-05, + "loss": 0.479, + "mean_token_accuracy": 0.9130570411682128, + "num_tokens": 18851087.0, + "step": 4525 + }, + { + "entropy": 0.35710594058036804, + "epoch": 1.0559505769903252, + "grad_norm": 1.34375, + "learning_rate": 1.9970314824640892e-05, + "loss": 0.58, + "mean_token_accuracy": 0.8559439688920975, + "num_tokens": 18883555.0, + "step": 4530 + }, + { + "entropy": 0.2868450716137886, + "epoch": 1.057116214010957, + "grad_norm": 0.2265625, + "learning_rate": 1.9970248354414342e-05, + "loss": 0.6734, + "mean_token_accuracy": 0.8887735664844513, + "num_tokens": 18901186.0, + "step": 4535 + }, + { + "entropy": 0.23753126077353953, + "epoch": 1.0582818510315888, + "grad_norm": 1.8046875, + "learning_rate": 1.9970181810073913e-05, + "loss": 0.4314, + "mean_token_accuracy": 0.9144180476665497, + "num_tokens": 18922703.0, + "step": 4540 + }, + { + "entropy": 0.2350205697119236, + "epoch": 1.0594474880522204, + "grad_norm": 0.72265625, + "learning_rate": 1.9970115191620596e-05, + "loss": 0.3227, + "mean_token_accuracy": 0.9272505462169647, + "num_tokens": 18938623.0, + "step": 4545 + }, + { + "entropy": 0.29542321562767027, + "epoch": 1.0606131250728523, + "grad_norm": 0.73828125, + "learning_rate": 1.997004849905539e-05, + "loss": 0.5377, + "mean_token_accuracy": 0.8950551569461822, + "num_tokens": 18954884.0, + "step": 4550 + }, + { + "entropy": 0.2510738968849182, + "epoch": 1.061778762093484, + "grad_norm": 2.296875, + "learning_rate": 1.996998173237928e-05, + "loss": 0.4795, + "mean_token_accuracy": 0.9140961229801178, + "num_tokens": 18970242.0, + "step": 4555 + }, + { + "entropy": 0.3822752878069878, + "epoch": 1.0629443991141159, + "grad_norm": 4.21875, + "learning_rate": 1.9969914891593272e-05, + "loss": 0.6895, + "mean_token_accuracy": 0.8611569821834564, + "num_tokens": 18990243.0, + "step": 4560 + }, + { + "entropy": 0.28315904922783375, + "epoch": 1.0641100361347475, + "grad_norm": 3.125, + "learning_rate": 1.9969847976698355e-05, + "loss": 0.4318, + "mean_token_accuracy": 0.9066355526447296, + "num_tokens": 19006220.0, + "step": 4565 + }, + { + "entropy": 0.1834595672786236, + "epoch": 1.0652756731553794, + "grad_norm": 0.69921875, + "learning_rate": 1.996978098769553e-05, + "loss": 0.2685, + "mean_token_accuracy": 0.940112954378128, + "num_tokens": 19029958.0, + "step": 4570 + }, + { + "entropy": 0.2601656034588814, + "epoch": 1.0664413101760113, + "grad_norm": 3.203125, + "learning_rate": 1.99697139245858e-05, + "loss": 0.6005, + "mean_token_accuracy": 0.8940555512905121, + "num_tokens": 19045954.0, + "step": 4575 + }, + { + "entropy": 0.31239586509764194, + "epoch": 1.067606947196643, + "grad_norm": 3.8125, + "learning_rate": 1.9969646787370154e-05, + "loss": 0.4227, + "mean_token_accuracy": 0.9133547961711883, + "num_tokens": 19068136.0, + "step": 4580 + }, + { + "entropy": 0.24956375658512114, + "epoch": 1.0687725842172748, + "grad_norm": 1.8046875, + "learning_rate": 1.9969579576049603e-05, + "loss": 0.4452, + "mean_token_accuracy": 0.9185151219367981, + "num_tokens": 19090732.0, + "step": 4585 + }, + { + "entropy": 0.2317537029273808, + "epoch": 1.0699382212379065, + "grad_norm": 3.328125, + "learning_rate": 1.9969512290625145e-05, + "loss": 0.3482, + "mean_token_accuracy": 0.9122158706188201, + "num_tokens": 19117200.0, + "step": 4590 + }, + { + "entropy": 0.32455471605062486, + "epoch": 1.0711038582585384, + "grad_norm": 3.328125, + "learning_rate": 1.9969444931097782e-05, + "loss": 0.6336, + "mean_token_accuracy": 0.8840024530887604, + "num_tokens": 19131499.0, + "step": 4595 + }, + { + "entropy": 0.31562632527202367, + "epoch": 1.07226949527917, + "grad_norm": 1.1875, + "learning_rate": 1.9969377497468524e-05, + "loss": 0.4875, + "mean_token_accuracy": 0.8833531379699707, + "num_tokens": 19174599.0, + "step": 4600 + }, + { + "entropy": 0.19211040288209916, + "epoch": 1.073435132299802, + "grad_norm": 1.4140625, + "learning_rate": 1.996930998973837e-05, + "loss": 0.2376, + "mean_token_accuracy": 0.935130262374878, + "num_tokens": 19194152.0, + "step": 4605 + }, + { + "entropy": 0.2705673351883888, + "epoch": 1.0746007693204336, + "grad_norm": 2.171875, + "learning_rate": 1.9969242407908336e-05, + "loss": 0.4673, + "mean_token_accuracy": 0.893635481595993, + "num_tokens": 19210561.0, + "step": 4610 + }, + { + "entropy": 0.22310225944966078, + "epoch": 1.0757664063410655, + "grad_norm": 0.3984375, + "learning_rate": 1.996917475197942e-05, + "loss": 0.3425, + "mean_token_accuracy": 0.934300822019577, + "num_tokens": 19249267.0, + "step": 4615 + }, + { + "entropy": 0.21514166854321956, + "epoch": 1.0769320433616971, + "grad_norm": 0.271484375, + "learning_rate": 1.996910702195263e-05, + "loss": 0.2464, + "mean_token_accuracy": 0.9217098355293274, + "num_tokens": 19279689.0, + "step": 4620 + }, + { + "entropy": 0.421810145303607, + "epoch": 1.078097680382329, + "grad_norm": 2.953125, + "learning_rate": 1.9969039217828988e-05, + "loss": 0.5759, + "mean_token_accuracy": 0.8948563575744629, + "num_tokens": 19309629.0, + "step": 4625 + }, + { + "entropy": 0.30465604811906816, + "epoch": 1.0792633174029607, + "grad_norm": 3.203125, + "learning_rate": 1.9968971339609494e-05, + "loss": 0.5313, + "mean_token_accuracy": 0.9070425748825073, + "num_tokens": 19332872.0, + "step": 4630 + }, + { + "entropy": 0.21195731237530707, + "epoch": 1.0804289544235925, + "grad_norm": 0.89453125, + "learning_rate": 1.9968903387295162e-05, + "loss": 0.2566, + "mean_token_accuracy": 0.943495512008667, + "num_tokens": 19354020.0, + "step": 4635 + }, + { + "entropy": 0.28489233925938606, + "epoch": 1.0815945914442242, + "grad_norm": 2.125, + "learning_rate": 1.9968835360887007e-05, + "loss": 0.4362, + "mean_token_accuracy": 0.9148045003414154, + "num_tokens": 19373694.0, + "step": 4640 + }, + { + "entropy": 0.20208041854202746, + "epoch": 1.082760228464856, + "grad_norm": 4.0, + "learning_rate": 1.9968767260386043e-05, + "loss": 0.4345, + "mean_token_accuracy": 0.9049011528491974, + "num_tokens": 19394838.0, + "step": 4645 + }, + { + "entropy": 0.27856975942850115, + "epoch": 1.0839258654854877, + "grad_norm": 3.546875, + "learning_rate": 1.996869908579329e-05, + "loss": 0.5698, + "mean_token_accuracy": 0.9010435700416565, + "num_tokens": 19410829.0, + "step": 4650 + }, + { + "entropy": 0.36488936431705954, + "epoch": 1.0850915025061196, + "grad_norm": 2.578125, + "learning_rate": 1.9968630837109756e-05, + "loss": 0.7248, + "mean_token_accuracy": 0.8689317822456359, + "num_tokens": 19423986.0, + "step": 4655 + }, + { + "entropy": 0.247793560475111, + "epoch": 1.0862571395267513, + "grad_norm": 0.466796875, + "learning_rate": 1.9968562514336464e-05, + "loss": 0.4697, + "mean_token_accuracy": 0.9112800538539887, + "num_tokens": 19446363.0, + "step": 4660 + }, + { + "entropy": 0.26282162182033064, + "epoch": 1.0874227765473832, + "grad_norm": 3.40625, + "learning_rate": 1.996849411747443e-05, + "loss": 0.5178, + "mean_token_accuracy": 0.9006511569023132, + "num_tokens": 19479115.0, + "step": 4665 + }, + { + "entropy": 0.2753569819033146, + "epoch": 1.0885884135680148, + "grad_norm": 2.046875, + "learning_rate": 1.9968425646524682e-05, + "loss": 0.6274, + "mean_token_accuracy": 0.8891318142414093, + "num_tokens": 19489958.0, + "step": 4670 + }, + { + "entropy": 0.2802404096350074, + "epoch": 1.0897540505886467, + "grad_norm": 3.6875, + "learning_rate": 1.9968357101488227e-05, + "loss": 0.4688, + "mean_token_accuracy": 0.9078739762306214, + "num_tokens": 19522416.0, + "step": 4675 + }, + { + "entropy": 0.3198259465396404, + "epoch": 1.0909196876092784, + "grad_norm": 2.265625, + "learning_rate": 1.9968288482366097e-05, + "loss": 0.5392, + "mean_token_accuracy": 0.8862555027008057, + "num_tokens": 19545618.0, + "step": 4680 + }, + { + "entropy": 0.23124271370470523, + "epoch": 1.0920853246299103, + "grad_norm": 0.236328125, + "learning_rate": 1.9968219789159314e-05, + "loss": 0.5302, + "mean_token_accuracy": 0.9184695065021515, + "num_tokens": 19568029.0, + "step": 4685 + }, + { + "entropy": 0.20576266888529063, + "epoch": 1.093250961650542, + "grad_norm": 2.03125, + "learning_rate": 1.9968151021868906e-05, + "loss": 0.3619, + "mean_token_accuracy": 0.9128438830375671, + "num_tokens": 19595024.0, + "step": 4690 + }, + { + "entropy": 0.2861837536096573, + "epoch": 1.0944165986711738, + "grad_norm": 3.234375, + "learning_rate": 1.9968082180495887e-05, + "loss": 0.5211, + "mean_token_accuracy": 0.9105264604091644, + "num_tokens": 19609621.0, + "step": 4695 + }, + { + "entropy": 0.36588003784418105, + "epoch": 1.0955822356918055, + "grad_norm": 4.8125, + "learning_rate": 1.9968013265041293e-05, + "loss": 0.5441, + "mean_token_accuracy": 0.8853360414505005, + "num_tokens": 19628917.0, + "step": 4700 + }, + { + "entropy": 0.2513444259762764, + "epoch": 1.0967478727124373, + "grad_norm": 1.515625, + "learning_rate": 1.996794427550615e-05, + "loss": 0.3729, + "mean_token_accuracy": 0.9267160654067993, + "num_tokens": 19649010.0, + "step": 4705 + }, + { + "entropy": 0.26096164882183076, + "epoch": 1.0979135097330692, + "grad_norm": 2.203125, + "learning_rate": 1.9967875211891483e-05, + "loss": 0.5006, + "mean_token_accuracy": 0.9162486016750335, + "num_tokens": 19667982.0, + "step": 4710 + }, + { + "entropy": 0.34309937320649625, + "epoch": 1.0990791467537009, + "grad_norm": 1.40625, + "learning_rate": 1.9967806074198323e-05, + "loss": 0.512, + "mean_token_accuracy": 0.8793802082538604, + "num_tokens": 19688359.0, + "step": 4715 + }, + { + "entropy": 0.2940907657146454, + "epoch": 1.1002447837743328, + "grad_norm": 1.1953125, + "learning_rate": 1.9967736862427707e-05, + "loss": 0.452, + "mean_token_accuracy": 0.9185880184173584, + "num_tokens": 19702524.0, + "step": 4720 + }, + { + "entropy": 0.2799779180437326, + "epoch": 1.1014104207949644, + "grad_norm": 0.66015625, + "learning_rate": 1.996766757658066e-05, + "loss": 0.4462, + "mean_token_accuracy": 0.889285272359848, + "num_tokens": 19732863.0, + "step": 4725 + }, + { + "entropy": 0.38467673361301424, + "epoch": 1.1025760578155963, + "grad_norm": 2.078125, + "learning_rate": 1.9967598216658217e-05, + "loss": 0.6861, + "mean_token_accuracy": 0.8668268322944641, + "num_tokens": 19756176.0, + "step": 4730 + }, + { + "entropy": 0.3017943359911442, + "epoch": 1.103741694836228, + "grad_norm": 0.43359375, + "learning_rate": 1.9967528782661413e-05, + "loss": 0.6322, + "mean_token_accuracy": 0.9024563610553742, + "num_tokens": 19774248.0, + "step": 4735 + }, + { + "entropy": 0.26271404810249804, + "epoch": 1.1049073318568599, + "grad_norm": 1.8203125, + "learning_rate": 1.9967459274591286e-05, + "loss": 0.304, + "mean_token_accuracy": 0.906380957365036, + "num_tokens": 19794918.0, + "step": 4740 + }, + { + "entropy": 0.4135320819914341, + "epoch": 1.1060729688774915, + "grad_norm": 1.78125, + "learning_rate": 1.996738969244887e-05, + "loss": 0.5758, + "mean_token_accuracy": 0.8603252410888672, + "num_tokens": 19818348.0, + "step": 4745 + }, + { + "entropy": 0.32035259939730165, + "epoch": 1.1072386058981234, + "grad_norm": 0.451171875, + "learning_rate": 1.9967320036235198e-05, + "loss": 0.4718, + "mean_token_accuracy": 0.8999540865421295, + "num_tokens": 19835870.0, + "step": 4750 + }, + { + "entropy": 0.2962488478049636, + "epoch": 1.108404242918755, + "grad_norm": 0.267578125, + "learning_rate": 1.9967250305951317e-05, + "loss": 0.2977, + "mean_token_accuracy": 0.9045934975147247, + "num_tokens": 19869134.0, + "step": 4755 + }, + { + "entropy": 0.2296114858239889, + "epoch": 1.109569879939387, + "grad_norm": 1.9453125, + "learning_rate": 1.9967180501598257e-05, + "loss": 0.3174, + "mean_token_accuracy": 0.9314020931720733, + "num_tokens": 19895029.0, + "step": 4760 + }, + { + "entropy": 0.265631403028965, + "epoch": 1.1107355169600186, + "grad_norm": 1.6640625, + "learning_rate": 1.9967110623177072e-05, + "loss": 0.5776, + "mean_token_accuracy": 0.8974580585956573, + "num_tokens": 19907777.0, + "step": 4765 + }, + { + "entropy": 0.41383567191660403, + "epoch": 1.1119011539806505, + "grad_norm": 3.203125, + "learning_rate": 1.9967040670688792e-05, + "loss": 0.803, + "mean_token_accuracy": 0.8468187242746353, + "num_tokens": 19933236.0, + "step": 4770 + }, + { + "entropy": 0.21881723627448083, + "epoch": 1.1130667910012821, + "grad_norm": 0.330078125, + "learning_rate": 1.9966970644134467e-05, + "loss": 0.2632, + "mean_token_accuracy": 0.9160628378391266, + "num_tokens": 19971416.0, + "step": 4775 + }, + { + "entropy": 0.21530752796679736, + "epoch": 1.114232428021914, + "grad_norm": 1.828125, + "learning_rate": 1.9966900543515137e-05, + "loss": 0.3804, + "mean_token_accuracy": 0.9274242103099823, + "num_tokens": 20001318.0, + "step": 4780 + }, + { + "entropy": 0.2666290858760476, + "epoch": 1.1153980650425457, + "grad_norm": 3.59375, + "learning_rate": 1.9966830368831852e-05, + "loss": 0.5918, + "mean_token_accuracy": 0.8934563815593719, + "num_tokens": 20019964.0, + "step": 4785 + }, + { + "entropy": 0.3114991795271635, + "epoch": 1.1165637020631776, + "grad_norm": 3.25, + "learning_rate": 1.9966760120085654e-05, + "loss": 0.6155, + "mean_token_accuracy": 0.8941902697086335, + "num_tokens": 20040676.0, + "step": 4790 + }, + { + "entropy": 0.1996636178344488, + "epoch": 1.1177293390838092, + "grad_norm": 0.412109375, + "learning_rate": 1.996668979727759e-05, + "loss": 0.2037, + "mean_token_accuracy": 0.9335273861885071, + "num_tokens": 20078019.0, + "step": 4795 + }, + { + "entropy": 0.3700167939066887, + "epoch": 1.118894976104441, + "grad_norm": 2.40625, + "learning_rate": 1.996661940040871e-05, + "loss": 0.8312, + "mean_token_accuracy": 0.8724365890026092, + "num_tokens": 20089226.0, + "step": 4800 + }, + { + "entropy": 0.285097012668848, + "epoch": 1.1200606131250728, + "grad_norm": 1.625, + "learning_rate": 1.9966548929480072e-05, + "loss": 0.6656, + "mean_token_accuracy": 0.8899259626865387, + "num_tokens": 20100172.0, + "step": 4805 + }, + { + "entropy": 0.28418309725821017, + "epoch": 1.1212262501457047, + "grad_norm": 2.546875, + "learning_rate": 1.9966478384492713e-05, + "loss": 0.42, + "mean_token_accuracy": 0.9119695782661438, + "num_tokens": 20125120.0, + "step": 4810 + }, + { + "entropy": 0.24601214230060578, + "epoch": 1.1223918871663363, + "grad_norm": 1.5546875, + "learning_rate": 1.9966407765447694e-05, + "loss": 0.4867, + "mean_token_accuracy": 0.8957095205783844, + "num_tokens": 20145895.0, + "step": 4815 + }, + { + "entropy": 0.2464228618890047, + "epoch": 1.1235575241869682, + "grad_norm": 1.3046875, + "learning_rate": 1.9966337072346063e-05, + "loss": 0.2837, + "mean_token_accuracy": 0.9086942136287689, + "num_tokens": 20174555.0, + "step": 4820 + }, + { + "entropy": 0.29267266392707825, + "epoch": 1.1247231612075999, + "grad_norm": 1.9609375, + "learning_rate": 1.996626630518888e-05, + "loss": 0.4338, + "mean_token_accuracy": 0.9187252819538116, + "num_tokens": 20194808.0, + "step": 4825 + }, + { + "entropy": 0.2548982771113515, + "epoch": 1.1258887982282317, + "grad_norm": 1.6171875, + "learning_rate": 1.9966195463977193e-05, + "loss": 0.3892, + "mean_token_accuracy": 0.9128529787063598, + "num_tokens": 20220293.0, + "step": 4830 + }, + { + "entropy": 0.43977917432785035, + "epoch": 1.1270544352488634, + "grad_norm": 2.3125, + "learning_rate": 1.9966124548712064e-05, + "loss": 0.7689, + "mean_token_accuracy": 0.8565711915493012, + "num_tokens": 20246105.0, + "step": 4835 + }, + { + "entropy": 0.22334610745310784, + "epoch": 1.1282200722694953, + "grad_norm": 0.8046875, + "learning_rate": 1.9966053559394548e-05, + "loss": 0.3492, + "mean_token_accuracy": 0.9118761956691742, + "num_tokens": 20274785.0, + "step": 4840 + }, + { + "entropy": 0.2549382247030735, + "epoch": 1.1293857092901272, + "grad_norm": 1.1015625, + "learning_rate": 1.9965982496025705e-05, + "loss": 0.3671, + "mean_token_accuracy": 0.9273378610610962, + "num_tokens": 20297017.0, + "step": 4845 + }, + { + "entropy": 0.26040482223033906, + "epoch": 1.1305513463107588, + "grad_norm": 1.6015625, + "learning_rate": 1.9965911358606592e-05, + "loss": 0.5535, + "mean_token_accuracy": 0.9075103044509888, + "num_tokens": 20308131.0, + "step": 4850 + }, + { + "entropy": 0.2684775315225124, + "epoch": 1.1317169833313905, + "grad_norm": 0.46875, + "learning_rate": 1.9965840147138273e-05, + "loss": 0.4772, + "mean_token_accuracy": 0.9086149275302887, + "num_tokens": 20332934.0, + "step": 4855 + }, + { + "entropy": 0.26287127695977686, + "epoch": 1.1328826203520224, + "grad_norm": 1.59375, + "learning_rate": 1.996576886162181e-05, + "loss": 0.3848, + "mean_token_accuracy": 0.9158682942390441, + "num_tokens": 20355529.0, + "step": 4860 + }, + { + "entropy": 0.2752557501196861, + "epoch": 1.1340482573726542, + "grad_norm": 4.1875, + "learning_rate": 1.996569750205826e-05, + "loss": 0.6068, + "mean_token_accuracy": 0.8869381487369538, + "num_tokens": 20374204.0, + "step": 4865 + }, + { + "entropy": 0.19664042815566063, + "epoch": 1.135213894393286, + "grad_norm": 5.21875, + "learning_rate": 1.9965626068448694e-05, + "loss": 0.5432, + "mean_token_accuracy": 0.9174998879432679, + "num_tokens": 20390154.0, + "step": 4870 + }, + { + "entropy": 0.21189433336257935, + "epoch": 1.1363795314139178, + "grad_norm": 2.0625, + "learning_rate": 1.9965554560794173e-05, + "loss": 0.433, + "mean_token_accuracy": 0.9120612025260926, + "num_tokens": 20411386.0, + "step": 4875 + }, + { + "entropy": 0.27202789932489396, + "epoch": 1.1375451684345494, + "grad_norm": 3.171875, + "learning_rate": 1.9965482979095766e-05, + "loss": 0.4594, + "mean_token_accuracy": 0.8918597221374511, + "num_tokens": 20429358.0, + "step": 4880 + }, + { + "entropy": 0.24662722423672676, + "epoch": 1.1387108054551813, + "grad_norm": 0.462890625, + "learning_rate": 1.996541132335454e-05, + "loss": 0.3717, + "mean_token_accuracy": 0.9205475330352784, + "num_tokens": 20457099.0, + "step": 4885 + }, + { + "entropy": 0.33891555294394493, + "epoch": 1.139876442475813, + "grad_norm": 1.171875, + "learning_rate": 1.9965339593571562e-05, + "loss": 0.545, + "mean_token_accuracy": 0.8842925369739533, + "num_tokens": 20478711.0, + "step": 4890 + }, + { + "entropy": 0.3443553917109966, + "epoch": 1.1410420794964449, + "grad_norm": 1.09375, + "learning_rate": 1.9965267789747902e-05, + "loss": 0.5574, + "mean_token_accuracy": 0.8829470694065094, + "num_tokens": 20499220.0, + "step": 4895 + }, + { + "entropy": 0.2517301285639405, + "epoch": 1.1422077165170765, + "grad_norm": 0.419921875, + "learning_rate": 1.9965195911884632e-05, + "loss": 0.5739, + "mean_token_accuracy": 0.8933994770050049, + "num_tokens": 20518241.0, + "step": 4900 + }, + { + "entropy": 0.23689071610569953, + "epoch": 1.1433733535377084, + "grad_norm": 2.5625, + "learning_rate": 1.996512395998282e-05, + "loss": 0.4953, + "mean_token_accuracy": 0.921712464094162, + "num_tokens": 20532325.0, + "step": 4905 + }, + { + "entropy": 0.2861016098409891, + "epoch": 1.14453899055834, + "grad_norm": 4.15625, + "learning_rate": 1.9965051934043545e-05, + "loss": 0.4887, + "mean_token_accuracy": 0.9079478204250335, + "num_tokens": 20555088.0, + "step": 4910 + }, + { + "entropy": 0.35219028070569036, + "epoch": 1.145704627578972, + "grad_norm": 0.318359375, + "learning_rate": 1.996497983406788e-05, + "loss": 0.6508, + "mean_token_accuracy": 0.8878498017787934, + "num_tokens": 20577242.0, + "step": 4915 + }, + { + "entropy": 0.17518207393586635, + "epoch": 1.1468702645996036, + "grad_norm": 1.1171875, + "learning_rate": 1.9964907660056894e-05, + "loss": 0.3614, + "mean_token_accuracy": 0.9309583485126496, + "num_tokens": 20599132.0, + "step": 4920 + }, + { + "entropy": 0.3855657190084457, + "epoch": 1.1480359016202355, + "grad_norm": 0.490234375, + "learning_rate": 1.9964835412011667e-05, + "loss": 0.6255, + "mean_token_accuracy": 0.8820390582084656, + "num_tokens": 20611755.0, + "step": 4925 + }, + { + "entropy": 0.19828929752111435, + "epoch": 1.1492015386408672, + "grad_norm": 1.453125, + "learning_rate": 1.996476308993328e-05, + "loss": 0.4292, + "mean_token_accuracy": 0.92506183385849, + "num_tokens": 20631471.0, + "step": 4930 + }, + { + "entropy": 0.24127081036567688, + "epoch": 1.150367175661499, + "grad_norm": 1.2578125, + "learning_rate": 1.9964690693822805e-05, + "loss": 0.3354, + "mean_token_accuracy": 0.9134461104869842, + "num_tokens": 20653340.0, + "step": 4935 + }, + { + "entropy": 0.25572728682309387, + "epoch": 1.1515328126821307, + "grad_norm": 3.75, + "learning_rate": 1.996461822368132e-05, + "loss": 0.3822, + "mean_token_accuracy": 0.9080056071281433, + "num_tokens": 20691385.0, + "step": 4940 + }, + { + "entropy": 0.3221442990005016, + "epoch": 1.1526984497027626, + "grad_norm": 0.59765625, + "learning_rate": 1.9964545679509917e-05, + "loss": 0.674, + "mean_token_accuracy": 0.8798101782798767, + "num_tokens": 20703113.0, + "step": 4945 + }, + { + "entropy": 0.3144264120608568, + "epoch": 1.1538640867233942, + "grad_norm": 0.376953125, + "learning_rate": 1.996447306130967e-05, + "loss": 0.4135, + "mean_token_accuracy": 0.8957395732402802, + "num_tokens": 20730511.0, + "step": 4950 + }, + { + "entropy": 0.3189440816640854, + "epoch": 1.1550297237440261, + "grad_norm": 2.15625, + "learning_rate": 1.996440036908166e-05, + "loss": 0.3236, + "mean_token_accuracy": 0.9054770946502686, + "num_tokens": 20759904.0, + "step": 4955 + }, + { + "entropy": 0.2833704300224781, + "epoch": 1.156195360764658, + "grad_norm": 0.458984375, + "learning_rate": 1.9964327602826977e-05, + "loss": 0.5359, + "mean_token_accuracy": 0.9107962608337402, + "num_tokens": 20793843.0, + "step": 4960 + }, + { + "entropy": 0.28943902999162674, + "epoch": 1.1573609977852897, + "grad_norm": 1.890625, + "learning_rate": 1.99642547625467e-05, + "loss": 0.6363, + "mean_token_accuracy": 0.8925088405609131, + "num_tokens": 20803894.0, + "step": 4965 + }, + { + "entropy": 0.31220489740371704, + "epoch": 1.1585266348059213, + "grad_norm": 0.9375, + "learning_rate": 1.996418184824192e-05, + "loss": 0.4212, + "mean_token_accuracy": 0.9167026937007904, + "num_tokens": 20827567.0, + "step": 4970 + }, + { + "entropy": 0.2499225214123726, + "epoch": 1.1596922718265532, + "grad_norm": 3.40625, + "learning_rate": 1.996410885991372e-05, + "loss": 0.5375, + "mean_token_accuracy": 0.9086990296840668, + "num_tokens": 20847726.0, + "step": 4975 + }, + { + "entropy": 0.1842884048819542, + "epoch": 1.160857908847185, + "grad_norm": 1.71875, + "learning_rate": 1.996403579756319e-05, + "loss": 0.307, + "mean_token_accuracy": 0.9375099897384643, + "num_tokens": 20876274.0, + "step": 4980 + }, + { + "entropy": 0.2099345738068223, + "epoch": 1.1620235458678168, + "grad_norm": 0.2470703125, + "learning_rate": 1.996396266119142e-05, + "loss": 0.3601, + "mean_token_accuracy": 0.9239444255828857, + "num_tokens": 20908707.0, + "step": 4985 + }, + { + "entropy": 0.2952556751668453, + "epoch": 1.1631891828884484, + "grad_norm": 2.6875, + "learning_rate": 1.9963889450799503e-05, + "loss": 0.538, + "mean_token_accuracy": 0.9095062971115112, + "num_tokens": 20923367.0, + "step": 4990 + }, + { + "entropy": 0.32244512140750886, + "epoch": 1.1643548199090803, + "grad_norm": 0.390625, + "learning_rate": 1.9963816166388527e-05, + "loss": 0.3306, + "mean_token_accuracy": 0.8819191873073577, + "num_tokens": 20956465.0, + "step": 4995 + }, + { + "entropy": 0.28892406783998015, + "epoch": 1.1655204569297122, + "grad_norm": 0.75, + "learning_rate": 1.9963742807959587e-05, + "loss": 0.5125, + "mean_token_accuracy": 0.8838415026664734, + "num_tokens": 20980348.0, + "step": 5000 + }, + { + "entropy": 0.31543378755450246, + "epoch": 1.1666860939503438, + "grad_norm": 2.625, + "learning_rate": 1.9963669375513773e-05, + "loss": 0.6096, + "mean_token_accuracy": 0.8829708397388458, + "num_tokens": 20997289.0, + "step": 5005 + }, + { + "entropy": 0.2742725571617484, + "epoch": 1.1678517309709757, + "grad_norm": 0.54296875, + "learning_rate": 1.9963595869052185e-05, + "loss": 0.5468, + "mean_token_accuracy": 0.8958122670650482, + "num_tokens": 21022460.0, + "step": 5010 + }, + { + "entropy": 0.4142576478421688, + "epoch": 1.1690173679916074, + "grad_norm": 1.59375, + "learning_rate": 1.9963522288575915e-05, + "loss": 0.6416, + "mean_token_accuracy": 0.8969951689243316, + "num_tokens": 21049444.0, + "step": 5015 + }, + { + "entropy": 0.3312222182750702, + "epoch": 1.1701830050122393, + "grad_norm": 3.203125, + "learning_rate": 1.9963448634086063e-05, + "loss": 0.7295, + "mean_token_accuracy": 0.8790445506572724, + "num_tokens": 21058389.0, + "step": 5020 + }, + { + "entropy": 0.3322805389761925, + "epoch": 1.171348642032871, + "grad_norm": 0.78515625, + "learning_rate": 1.996337490558373e-05, + "loss": 0.5016, + "mean_token_accuracy": 0.9036110222339631, + "num_tokens": 21069310.0, + "step": 5025 + }, + { + "entropy": 0.2525691881775856, + "epoch": 1.1725142790535028, + "grad_norm": 3.140625, + "learning_rate": 1.9963301103070007e-05, + "loss": 0.314, + "mean_token_accuracy": 0.8964376389980316, + "num_tokens": 21103588.0, + "step": 5030 + }, + { + "entropy": 0.2757076404988766, + "epoch": 1.1736799160741345, + "grad_norm": 2.296875, + "learning_rate": 1.9963227226546e-05, + "loss": 0.5008, + "mean_token_accuracy": 0.9100005924701691, + "num_tokens": 21115956.0, + "step": 5035 + }, + { + "entropy": 0.24521611034870147, + "epoch": 1.1748455530947663, + "grad_norm": 3.453125, + "learning_rate": 1.996315327601281e-05, + "loss": 0.5026, + "mean_token_accuracy": 0.9170917510986328, + "num_tokens": 21129529.0, + "step": 5040 + }, + { + "entropy": 0.38413129895925524, + "epoch": 1.176011190115398, + "grad_norm": 2.71875, + "learning_rate": 1.996307925147154e-05, + "loss": 0.7491, + "mean_token_accuracy": 0.8473885953426361, + "num_tokens": 21149606.0, + "step": 5045 + }, + { + "entropy": 0.2860517233610153, + "epoch": 1.17717682713603, + "grad_norm": 1.4453125, + "learning_rate": 1.9963005152923297e-05, + "loss": 0.6112, + "mean_token_accuracy": 0.906189215183258, + "num_tokens": 21160383.0, + "step": 5050 + }, + { + "entropy": 0.25727585405111314, + "epoch": 1.1783424641566616, + "grad_norm": 0.625, + "learning_rate": 1.9962930980369178e-05, + "loss": 0.3184, + "mean_token_accuracy": 0.9098371982574462, + "num_tokens": 21184716.0, + "step": 5055 + }, + { + "entropy": 0.2614014007151127, + "epoch": 1.1795081011772934, + "grad_norm": 2.640625, + "learning_rate": 1.9962856733810295e-05, + "loss": 0.6666, + "mean_token_accuracy": 0.8902623951435089, + "num_tokens": 21197902.0, + "step": 5060 + }, + { + "entropy": 0.28945720940828323, + "epoch": 1.180673738197925, + "grad_norm": 2.6875, + "learning_rate": 1.9962782413247753e-05, + "loss": 0.6894, + "mean_token_accuracy": 0.8888876676559448, + "num_tokens": 21208021.0, + "step": 5065 + }, + { + "entropy": 0.18397099822759627, + "epoch": 1.181839375218557, + "grad_norm": 0.7890625, + "learning_rate": 1.9962708018682663e-05, + "loss": 0.25, + "mean_token_accuracy": 0.9296178877353668, + "num_tokens": 21243231.0, + "step": 5070 + }, + { + "entropy": 0.2348667562007904, + "epoch": 1.1830050122391886, + "grad_norm": 2.09375, + "learning_rate": 1.996263355011613e-05, + "loss": 0.4452, + "mean_token_accuracy": 0.9184268653392792, + "num_tokens": 21262906.0, + "step": 5075 + }, + { + "entropy": 0.21639835610985755, + "epoch": 1.1841706492598205, + "grad_norm": 6.71875, + "learning_rate": 1.9962559007549265e-05, + "loss": 0.3633, + "mean_token_accuracy": 0.9239116966724396, + "num_tokens": 21276857.0, + "step": 5080 + }, + { + "entropy": 0.32814215682446957, + "epoch": 1.1853362862804522, + "grad_norm": 0.53515625, + "learning_rate": 1.9962484390983182e-05, + "loss": 0.3529, + "mean_token_accuracy": 0.9116757154464722, + "num_tokens": 21307183.0, + "step": 5085 + }, + { + "entropy": 0.25477695763111113, + "epoch": 1.186501923301084, + "grad_norm": 3.421875, + "learning_rate": 1.9962409700418993e-05, + "loss": 0.4097, + "mean_token_accuracy": 0.9303404748439789, + "num_tokens": 21334651.0, + "step": 5090 + }, + { + "entropy": 0.2113716546446085, + "epoch": 1.1876675603217157, + "grad_norm": 0.43359375, + "learning_rate": 1.9962334935857813e-05, + "loss": 0.3035, + "mean_token_accuracy": 0.9402331054210663, + "num_tokens": 21359790.0, + "step": 5095 + }, + { + "entropy": 0.23614692371338605, + "epoch": 1.1888331973423476, + "grad_norm": 3.09375, + "learning_rate": 1.9962260097300752e-05, + "loss": 0.23, + "mean_token_accuracy": 0.9286585927009583, + "num_tokens": 21403308.0, + "step": 5100 + }, + { + "entropy": 0.29054297506809235, + "epoch": 1.1899988343629793, + "grad_norm": 4.6875, + "learning_rate": 1.9962185184748934e-05, + "loss": 0.5162, + "mean_token_accuracy": 0.9029807567596435, + "num_tokens": 21417092.0, + "step": 5105 + }, + { + "entropy": 0.30761214941740034, + "epoch": 1.1911644713836111, + "grad_norm": 0.51171875, + "learning_rate": 1.996211019820347e-05, + "loss": 0.433, + "mean_token_accuracy": 0.9027220129966735, + "num_tokens": 21444507.0, + "step": 5110 + }, + { + "entropy": 0.3023512065410614, + "epoch": 1.192330108404243, + "grad_norm": 0.392578125, + "learning_rate": 1.9962035137665475e-05, + "loss": 0.7219, + "mean_token_accuracy": 0.8693067491054535, + "num_tokens": 21458658.0, + "step": 5115 + }, + { + "entropy": 0.38721234649419783, + "epoch": 1.1934957454248747, + "grad_norm": 0.6484375, + "learning_rate": 1.996196000313608e-05, + "loss": 0.8143, + "mean_token_accuracy": 0.8422433733940125, + "num_tokens": 21475926.0, + "step": 5120 + }, + { + "entropy": 0.31085776090621947, + "epoch": 1.1946613824455063, + "grad_norm": 2.03125, + "learning_rate": 1.9961884794616393e-05, + "loss": 0.4117, + "mean_token_accuracy": 0.8949326932430267, + "num_tokens": 21494892.0, + "step": 5125 + }, + { + "entropy": 0.36102984845638275, + "epoch": 1.1958270194661382, + "grad_norm": 0.9609375, + "learning_rate": 1.996180951210754e-05, + "loss": 0.5535, + "mean_token_accuracy": 0.8549659192562103, + "num_tokens": 21525176.0, + "step": 5130 + }, + { + "entropy": 0.319421348720789, + "epoch": 1.1969926564867701, + "grad_norm": 3.328125, + "learning_rate": 1.9961734155610647e-05, + "loss": 0.4569, + "mean_token_accuracy": 0.899403166770935, + "num_tokens": 21540302.0, + "step": 5135 + }, + { + "entropy": 0.3183010257780552, + "epoch": 1.1981582935074018, + "grad_norm": 0.6953125, + "learning_rate": 1.9961658725126835e-05, + "loss": 0.4923, + "mean_token_accuracy": 0.8722436010837555, + "num_tokens": 21565453.0, + "step": 5140 + }, + { + "entropy": 0.3733881928026676, + "epoch": 1.1993239305280337, + "grad_norm": 2.28125, + "learning_rate": 1.9961583220657228e-05, + "loss": 0.6894, + "mean_token_accuracy": 0.8586771845817566, + "num_tokens": 21581316.0, + "step": 5145 + }, + { + "entropy": 0.27310655564069747, + "epoch": 1.2004895675486653, + "grad_norm": 0.76953125, + "learning_rate": 1.9961507642202953e-05, + "loss": 0.4088, + "mean_token_accuracy": 0.9163376450538635, + "num_tokens": 21601574.0, + "step": 5150 + }, + { + "entropy": 0.2638257570564747, + "epoch": 1.2016552045692972, + "grad_norm": 2.890625, + "learning_rate": 1.9961431989765137e-05, + "loss": 0.4367, + "mean_token_accuracy": 0.9224029779434204, + "num_tokens": 21638480.0, + "step": 5155 + }, + { + "entropy": 0.25862696319818496, + "epoch": 1.2028208415899289, + "grad_norm": 2.390625, + "learning_rate": 1.9961356263344903e-05, + "loss": 0.4565, + "mean_token_accuracy": 0.896664959192276, + "num_tokens": 21660075.0, + "step": 5160 + }, + { + "entropy": 0.28866573721170424, + "epoch": 1.2039864786105607, + "grad_norm": 0.82421875, + "learning_rate": 1.996128046294339e-05, + "loss": 0.7012, + "mean_token_accuracy": 0.8824924468994141, + "num_tokens": 21671753.0, + "step": 5165 + }, + { + "entropy": 0.3241452187299728, + "epoch": 1.2051521156311924, + "grad_norm": 1.625, + "learning_rate": 1.9961204588561723e-05, + "loss": 0.5501, + "mean_token_accuracy": 0.8935443520545959, + "num_tokens": 21684460.0, + "step": 5170 + }, + { + "entropy": 0.21291088834404945, + "epoch": 1.2063177526518243, + "grad_norm": 0.3125, + "learning_rate": 1.996112864020103e-05, + "loss": 0.3549, + "mean_token_accuracy": 0.9189221918582916, + "num_tokens": 21709414.0, + "step": 5175 + }, + { + "entropy": 0.2792430866509676, + "epoch": 1.207483389672456, + "grad_norm": 4.09375, + "learning_rate": 1.9961052617862447e-05, + "loss": 0.4852, + "mean_token_accuracy": 0.910968005657196, + "num_tokens": 21728226.0, + "step": 5180 + }, + { + "entropy": 0.27534177508205177, + "epoch": 1.2086490266930878, + "grad_norm": 4.34375, + "learning_rate": 1.9960976521547112e-05, + "loss": 0.7243, + "mean_token_accuracy": 0.8686468303203583, + "num_tokens": 21748996.0, + "step": 5185 + }, + { + "entropy": 0.2846062559634447, + "epoch": 1.2098146637137195, + "grad_norm": 2.140625, + "learning_rate": 1.9960900351256154e-05, + "loss": 0.4689, + "mean_token_accuracy": 0.8924234092235566, + "num_tokens": 21769334.0, + "step": 5190 + }, + { + "entropy": 0.305383824557066, + "epoch": 1.2109803007343514, + "grad_norm": 2.890625, + "learning_rate": 1.996082410699071e-05, + "loss": 0.5887, + "mean_token_accuracy": 0.9004212081432342, + "num_tokens": 21783300.0, + "step": 5195 + }, + { + "entropy": 0.3224423822015524, + "epoch": 1.212145937754983, + "grad_norm": 2.25, + "learning_rate": 1.9960747788751916e-05, + "loss": 0.6121, + "mean_token_accuracy": 0.8869613647460938, + "num_tokens": 21814313.0, + "step": 5200 + }, + { + "entropy": 0.23042884096503258, + "epoch": 1.213311574775615, + "grad_norm": 1.21875, + "learning_rate": 1.9960671396540908e-05, + "loss": 0.3764, + "mean_token_accuracy": 0.9182167291641236, + "num_tokens": 21835573.0, + "step": 5205 + }, + { + "entropy": 0.3127889070659876, + "epoch": 1.2144772117962466, + "grad_norm": 7.875, + "learning_rate": 1.996059493035883e-05, + "loss": 0.4681, + "mean_token_accuracy": 0.885968142747879, + "num_tokens": 21854284.0, + "step": 5210 + }, + { + "entropy": 0.27360687740147116, + "epoch": 1.2156428488168785, + "grad_norm": 0.62109375, + "learning_rate": 1.9960518390206824e-05, + "loss": 0.5724, + "mean_token_accuracy": 0.8911721289157868, + "num_tokens": 21877998.0, + "step": 5215 + }, + { + "entropy": 0.2860093414783478, + "epoch": 1.2168084858375101, + "grad_norm": 6.65625, + "learning_rate": 1.9960441776086025e-05, + "loss": 0.691, + "mean_token_accuracy": 0.8818713963031769, + "num_tokens": 21893428.0, + "step": 5220 + }, + { + "entropy": 0.24104835242033004, + "epoch": 1.217974122858142, + "grad_norm": 2.0, + "learning_rate": 1.996036508799758e-05, + "loss": 0.3223, + "mean_token_accuracy": 0.9073219478130341, + "num_tokens": 21913016.0, + "step": 5225 + }, + { + "entropy": 0.2667414344847202, + "epoch": 1.2191397598787737, + "grad_norm": 3.328125, + "learning_rate": 1.9960288325942628e-05, + "loss": 0.6249, + "mean_token_accuracy": 0.899876493215561, + "num_tokens": 21926214.0, + "step": 5230 + }, + { + "entropy": 0.34385873265564443, + "epoch": 1.2203053968994055, + "grad_norm": 1.5390625, + "learning_rate": 1.9960211489922318e-05, + "loss": 0.4407, + "mean_token_accuracy": 0.8673520445823669, + "num_tokens": 21955376.0, + "step": 5235 + }, + { + "entropy": 0.20846946127712726, + "epoch": 1.2214710339200372, + "grad_norm": 2.359375, + "learning_rate": 1.9960134579937796e-05, + "loss": 0.2482, + "mean_token_accuracy": 0.9406843543052673, + "num_tokens": 21979736.0, + "step": 5240 + }, + { + "entropy": 0.27167031932622193, + "epoch": 1.222636670940669, + "grad_norm": 0.2001953125, + "learning_rate": 1.9960057595990205e-05, + "loss": 0.6442, + "mean_token_accuracy": 0.8813684284687042, + "num_tokens": 21997966.0, + "step": 5245 + }, + { + "entropy": 0.36240064799785615, + "epoch": 1.223802307961301, + "grad_norm": 1.8671875, + "learning_rate": 1.9959980538080696e-05, + "loss": 0.6134, + "mean_token_accuracy": 0.898576021194458, + "num_tokens": 22018016.0, + "step": 5250 + }, + { + "entropy": 0.26327665261924266, + "epoch": 1.2249679449819326, + "grad_norm": 4.21875, + "learning_rate": 1.9959903406210415e-05, + "loss": 0.5013, + "mean_token_accuracy": 0.9144660592079162, + "num_tokens": 22032216.0, + "step": 5255 + }, + { + "entropy": 0.27930256724357605, + "epoch": 1.2261335820025643, + "grad_norm": 5.03125, + "learning_rate": 1.9959826200380514e-05, + "loss": 0.5056, + "mean_token_accuracy": 0.9050133049488067, + "num_tokens": 22044317.0, + "step": 5260 + }, + { + "entropy": 0.252337658405304, + "epoch": 1.2272992190231962, + "grad_norm": 1.0078125, + "learning_rate": 1.9959748920592147e-05, + "loss": 0.3191, + "mean_token_accuracy": 0.929460322856903, + "num_tokens": 22063934.0, + "step": 5265 + }, + { + "entropy": 0.251625837571919, + "epoch": 1.228464856043828, + "grad_norm": 2.671875, + "learning_rate": 1.995967156684646e-05, + "loss": 0.2712, + "mean_token_accuracy": 0.9225124597549439, + "num_tokens": 22087458.0, + "step": 5270 + }, + { + "entropy": 0.3579515844583511, + "epoch": 1.2296304930644597, + "grad_norm": 1.0234375, + "learning_rate": 1.995959413914461e-05, + "loss": 0.6229, + "mean_token_accuracy": 0.8749721348285675, + "num_tokens": 22112344.0, + "step": 5275 + }, + { + "entropy": 0.2837061192840338, + "epoch": 1.2307961300850916, + "grad_norm": 1.515625, + "learning_rate": 1.9959516637487758e-05, + "loss": 0.3442, + "mean_token_accuracy": 0.8971266686916352, + "num_tokens": 22139433.0, + "step": 5280 + }, + { + "entropy": 0.28239931985735894, + "epoch": 1.2319617671057232, + "grad_norm": 3.21875, + "learning_rate": 1.9959439061877045e-05, + "loss": 0.5502, + "mean_token_accuracy": 0.8951669692993164, + "num_tokens": 22151982.0, + "step": 5285 + }, + { + "entropy": 0.2798332400619984, + "epoch": 1.2331274041263551, + "grad_norm": 2.78125, + "learning_rate": 1.9959361412313644e-05, + "loss": 0.5572, + "mean_token_accuracy": 0.8993366360664368, + "num_tokens": 22172368.0, + "step": 5290 + }, + { + "entropy": 0.21451436076313257, + "epoch": 1.2342930411469868, + "grad_norm": 2.609375, + "learning_rate": 1.99592836887987e-05, + "loss": 0.4159, + "mean_token_accuracy": 0.9279858708381653, + "num_tokens": 22198315.0, + "step": 5295 + }, + { + "entropy": 0.24950225427746772, + "epoch": 1.2354586781676187, + "grad_norm": 2.453125, + "learning_rate": 1.9959205891333377e-05, + "loss": 0.3212, + "mean_token_accuracy": 0.9208178699016571, + "num_tokens": 22226935.0, + "step": 5300 + }, + { + "entropy": 0.2923054873943329, + "epoch": 1.2366243151882503, + "grad_norm": 2.328125, + "learning_rate": 1.995912801991884e-05, + "loss": 0.6991, + "mean_token_accuracy": 0.881962114572525, + "num_tokens": 22237313.0, + "step": 5305 + }, + { + "entropy": 0.20678225150331855, + "epoch": 1.2377899522088822, + "grad_norm": 1.0625, + "learning_rate": 1.995905007455624e-05, + "loss": 0.2865, + "mean_token_accuracy": 0.9330097615718842, + "num_tokens": 22264438.0, + "step": 5310 + }, + { + "entropy": 0.24442143216729165, + "epoch": 1.2389555892295139, + "grad_norm": 0.435546875, + "learning_rate": 1.9958972055246745e-05, + "loss": 0.3176, + "mean_token_accuracy": 0.9177972376346588, + "num_tokens": 22290522.0, + "step": 5315 + }, + { + "entropy": 0.24426558166742324, + "epoch": 1.2401212262501458, + "grad_norm": 2.15625, + "learning_rate": 1.995889396199152e-05, + "loss": 0.4477, + "mean_token_accuracy": 0.9186632454395294, + "num_tokens": 22325579.0, + "step": 5320 + }, + { + "entropy": 0.3336485348641872, + "epoch": 1.2412868632707774, + "grad_norm": 3.234375, + "learning_rate": 1.995881579479173e-05, + "loss": 0.6578, + "mean_token_accuracy": 0.8619132041931152, + "num_tokens": 22338730.0, + "step": 5325 + }, + { + "entropy": 0.22589795142412186, + "epoch": 1.2424525002914093, + "grad_norm": 1.453125, + "learning_rate": 1.9958737553648534e-05, + "loss": 0.393, + "mean_token_accuracy": 0.9276214420795441, + "num_tokens": 22358940.0, + "step": 5330 + }, + { + "entropy": 0.3180350840091705, + "epoch": 1.243618137312041, + "grad_norm": 4.1875, + "learning_rate": 1.99586592385631e-05, + "loss": 0.516, + "mean_token_accuracy": 0.9004910588264465, + "num_tokens": 22373909.0, + "step": 5335 + }, + { + "entropy": 0.22264700792729855, + "epoch": 1.2447837743326728, + "grad_norm": 2.03125, + "learning_rate": 1.9958580849536605e-05, + "loss": 0.3238, + "mean_token_accuracy": 0.9395315706729889, + "num_tokens": 22399772.0, + "step": 5340 + }, + { + "entropy": 0.2370637021958828, + "epoch": 1.2459494113533045, + "grad_norm": 0.4375, + "learning_rate": 1.9958502386570205e-05, + "loss": 0.4085, + "mean_token_accuracy": 0.9249156236648559, + "num_tokens": 22419605.0, + "step": 5345 + }, + { + "entropy": 0.20505453906953336, + "epoch": 1.2471150483739364, + "grad_norm": 0.890625, + "learning_rate": 1.9958423849665083e-05, + "loss": 0.363, + "mean_token_accuracy": 0.9317021369934082, + "num_tokens": 22447540.0, + "step": 5350 + }, + { + "entropy": 0.24239457100629808, + "epoch": 1.248280685394568, + "grad_norm": 0.8828125, + "learning_rate": 1.9958345238822398e-05, + "loss": 0.5966, + "mean_token_accuracy": 0.9081678748130798, + "num_tokens": 22459109.0, + "step": 5355 + }, + { + "entropy": 0.2920355159789324, + "epoch": 1.2494463224152, + "grad_norm": 2.484375, + "learning_rate": 1.995826655404333e-05, + "loss": 0.3709, + "mean_token_accuracy": 0.9113083839416504, + "num_tokens": 22479242.0, + "step": 5360 + }, + { + "entropy": 0.18992096073925496, + "epoch": 1.2506119594358318, + "grad_norm": 2.828125, + "learning_rate": 1.9958187795329052e-05, + "loss": 0.3623, + "mean_token_accuracy": 0.932866907119751, + "num_tokens": 22498918.0, + "step": 5365 + }, + { + "entropy": 0.2889205154031515, + "epoch": 1.2517775964564635, + "grad_norm": 1.1796875, + "learning_rate": 1.9958108962680734e-05, + "loss": 0.3862, + "mean_token_accuracy": 0.870019656419754, + "num_tokens": 22546690.0, + "step": 5370 + }, + { + "entropy": 0.2940490383654833, + "epoch": 1.2529432334770951, + "grad_norm": 0.37109375, + "learning_rate": 1.9958030056099554e-05, + "loss": 0.4688, + "mean_token_accuracy": 0.9204876840114593, + "num_tokens": 22573571.0, + "step": 5375 + }, + { + "entropy": 0.2702594131231308, + "epoch": 1.254108870497727, + "grad_norm": 4.71875, + "learning_rate": 1.995795107558669e-05, + "loss": 0.7299, + "mean_token_accuracy": 0.8841011583805084, + "num_tokens": 22585871.0, + "step": 5380 + }, + { + "entropy": 0.31155082285404206, + "epoch": 1.255274507518359, + "grad_norm": 4.78125, + "learning_rate": 1.9957872021143315e-05, + "loss": 0.4556, + "mean_token_accuracy": 0.8956121504306793, + "num_tokens": 22600566.0, + "step": 5385 + }, + { + "entropy": 0.24358467012643814, + "epoch": 1.2564401445389906, + "grad_norm": 2.546875, + "learning_rate": 1.9957792892770614e-05, + "loss": 0.6497, + "mean_token_accuracy": 0.9009511232376098, + "num_tokens": 22611432.0, + "step": 5390 + }, + { + "entropy": 0.23342385441064833, + "epoch": 1.2576057815596222, + "grad_norm": 4.78125, + "learning_rate": 1.9957713690469765e-05, + "loss": 0.3691, + "mean_token_accuracy": 0.9175182461738587, + "num_tokens": 22640108.0, + "step": 5395 + }, + { + "entropy": 0.20412432737648487, + "epoch": 1.258771418580254, + "grad_norm": 0.65234375, + "learning_rate": 1.9957634414241947e-05, + "loss": 0.2297, + "mean_token_accuracy": 0.9177448451519012, + "num_tokens": 22676472.0, + "step": 5400 + }, + { + "entropy": 0.22115157768130303, + "epoch": 1.259937055600886, + "grad_norm": 4.09375, + "learning_rate": 1.995755506408834e-05, + "loss": 0.5732, + "mean_token_accuracy": 0.9084285914897918, + "num_tokens": 22699491.0, + "step": 5405 + }, + { + "entropy": 0.23604003936052323, + "epoch": 1.2611026926215176, + "grad_norm": 4.0625, + "learning_rate": 1.9957475640010134e-05, + "loss": 0.4428, + "mean_token_accuracy": 0.9152187645435333, + "num_tokens": 22718024.0, + "step": 5410 + }, + { + "entropy": 0.18227957040071488, + "epoch": 1.2622683296421493, + "grad_norm": 1.7109375, + "learning_rate": 1.9957396142008508e-05, + "loss": 0.2653, + "mean_token_accuracy": 0.9294990241527558, + "num_tokens": 22736249.0, + "step": 5415 + }, + { + "entropy": 0.242751706764102, + "epoch": 1.2634339666627812, + "grad_norm": 3.046875, + "learning_rate": 1.9957316570084653e-05, + "loss": 0.499, + "mean_token_accuracy": 0.9050007402896881, + "num_tokens": 22757046.0, + "step": 5420 + }, + { + "entropy": 0.24649465046823024, + "epoch": 1.264599603683413, + "grad_norm": 0.283203125, + "learning_rate": 1.9957236924239747e-05, + "loss": 0.4445, + "mean_token_accuracy": 0.91389000415802, + "num_tokens": 22787756.0, + "step": 5425 + }, + { + "entropy": 0.27705603912472726, + "epoch": 1.2657652407040447, + "grad_norm": 3.390625, + "learning_rate": 1.9957157204474985e-05, + "loss": 0.5771, + "mean_token_accuracy": 0.8832474410533905, + "num_tokens": 22800485.0, + "step": 5430 + }, + { + "entropy": 0.2584805965423584, + "epoch": 1.2669308777246766, + "grad_norm": 1.6640625, + "learning_rate": 1.995707741079155e-05, + "loss": 0.4395, + "mean_token_accuracy": 0.9243484199047088, + "num_tokens": 22812750.0, + "step": 5435 + }, + { + "entropy": 0.20337900295853614, + "epoch": 1.2680965147453083, + "grad_norm": 1.2265625, + "learning_rate": 1.995699754319064e-05, + "loss": 0.3228, + "mean_token_accuracy": 0.9291253626346588, + "num_tokens": 22837745.0, + "step": 5440 + }, + { + "entropy": 0.38868461102247237, + "epoch": 1.2692621517659401, + "grad_norm": 2.515625, + "learning_rate": 1.9956917601673437e-05, + "loss": 0.6526, + "mean_token_accuracy": 0.8844451904296875, + "num_tokens": 22856058.0, + "step": 5445 + }, + { + "entropy": 0.21811575591564178, + "epoch": 1.2704277887865718, + "grad_norm": 2.96875, + "learning_rate": 1.9956837586241138e-05, + "loss": 0.4726, + "mean_token_accuracy": 0.9248144567012787, + "num_tokens": 22876617.0, + "step": 5450 + }, + { + "entropy": 0.2282697781920433, + "epoch": 1.2715934258072037, + "grad_norm": 1.203125, + "learning_rate": 1.9956757496894935e-05, + "loss": 0.242, + "mean_token_accuracy": 0.9245195806026458, + "num_tokens": 22917771.0, + "step": 5455 + }, + { + "entropy": 0.30149990916252134, + "epoch": 1.2727590628278354, + "grad_norm": 0.51171875, + "learning_rate": 1.9956677333636024e-05, + "loss": 0.6098, + "mean_token_accuracy": 0.8778238594532013, + "num_tokens": 22932042.0, + "step": 5460 + }, + { + "entropy": 0.25813721865415573, + "epoch": 1.2739246998484672, + "grad_norm": 3.734375, + "learning_rate": 1.9956597096465594e-05, + "loss": 0.6734, + "mean_token_accuracy": 0.8907998919486999, + "num_tokens": 22942112.0, + "step": 5465 + }, + { + "entropy": 0.1798843963071704, + "epoch": 1.275090336869099, + "grad_norm": 1.6328125, + "learning_rate": 1.995651678538485e-05, + "loss": 0.3044, + "mean_token_accuracy": 0.9292000591754913, + "num_tokens": 22969289.0, + "step": 5470 + }, + { + "entropy": 0.30131426751613616, + "epoch": 1.2762559738897308, + "grad_norm": 5.6875, + "learning_rate": 1.9956436400394984e-05, + "loss": 0.6164, + "mean_token_accuracy": 0.8864310383796692, + "num_tokens": 22980735.0, + "step": 5475 + }, + { + "entropy": 0.29428491592407224, + "epoch": 1.2774216109103624, + "grad_norm": 3.546875, + "learning_rate": 1.99563559414972e-05, + "loss": 0.6925, + "mean_token_accuracy": 0.8707791328430176, + "num_tokens": 22994196.0, + "step": 5480 + }, + { + "entropy": 0.2878478910773993, + "epoch": 1.2785872479309943, + "grad_norm": 4.34375, + "learning_rate": 1.995627540869269e-05, + "loss": 0.5568, + "mean_token_accuracy": 0.8926146745681762, + "num_tokens": 23014522.0, + "step": 5485 + }, + { + "entropy": 0.263381066173315, + "epoch": 1.279752884951626, + "grad_norm": 1.2421875, + "learning_rate": 1.995619480198266e-05, + "loss": 0.4982, + "mean_token_accuracy": 0.9074977576732636, + "num_tokens": 23033023.0, + "step": 5490 + }, + { + "entropy": 0.295852642133832, + "epoch": 1.2809185219722579, + "grad_norm": 0.63671875, + "learning_rate": 1.9956114121368314e-05, + "loss": 0.3381, + "mean_token_accuracy": 0.9136807262897492, + "num_tokens": 23058135.0, + "step": 5495 + }, + { + "entropy": 0.2935203604400158, + "epoch": 1.2820841589928897, + "grad_norm": 0.6015625, + "learning_rate": 1.9956033366850847e-05, + "loss": 0.5256, + "mean_token_accuracy": 0.8971765220165253, + "num_tokens": 23070936.0, + "step": 5500 + }, + { + "entropy": 0.2395356010645628, + "epoch": 1.2832497960135214, + "grad_norm": 4.96875, + "learning_rate": 1.995595253843147e-05, + "loss": 0.5813, + "mean_token_accuracy": 0.88962482213974, + "num_tokens": 23089938.0, + "step": 5505 + }, + { + "entropy": 0.419968231767416, + "epoch": 1.284415433034153, + "grad_norm": 4.0, + "learning_rate": 1.9955871636111386e-05, + "loss": 0.5162, + "mean_token_accuracy": 0.8872650682926178, + "num_tokens": 23110821.0, + "step": 5510 + }, + { + "entropy": 0.2630806386470795, + "epoch": 1.285581070054785, + "grad_norm": 5.125, + "learning_rate": 1.9955790659891804e-05, + "loss": 0.5462, + "mean_token_accuracy": 0.9106649100780487, + "num_tokens": 23125481.0, + "step": 5515 + }, + { + "entropy": 0.38911786004900933, + "epoch": 1.2867467070754168, + "grad_norm": 2.640625, + "learning_rate": 1.995570960977393e-05, + "loss": 0.7633, + "mean_token_accuracy": 0.8669470906257629, + "num_tokens": 23139471.0, + "step": 5520 + }, + { + "entropy": 0.26729471050202847, + "epoch": 1.2879123440960485, + "grad_norm": 5.09375, + "learning_rate": 1.9955628485758968e-05, + "loss": 0.4119, + "mean_token_accuracy": 0.9227097749710083, + "num_tokens": 23156969.0, + "step": 5525 + }, + { + "entropy": 0.194175586104393, + "epoch": 1.2890779811166801, + "grad_norm": 0.490234375, + "learning_rate": 1.9955547287848136e-05, + "loss": 0.3486, + "mean_token_accuracy": 0.914922684431076, + "num_tokens": 23186931.0, + "step": 5530 + }, + { + "entropy": 0.2041798748075962, + "epoch": 1.290243618137312, + "grad_norm": 2.296875, + "learning_rate": 1.9955466016042637e-05, + "loss": 0.3021, + "mean_token_accuracy": 0.9194576680660248, + "num_tokens": 23218246.0, + "step": 5535 + }, + { + "entropy": 0.20812569046393037, + "epoch": 1.291409255157944, + "grad_norm": 0.27734375, + "learning_rate": 1.995538467034369e-05, + "loss": 0.2525, + "mean_token_accuracy": 0.9183135628700256, + "num_tokens": 23252019.0, + "step": 5540 + }, + { + "entropy": 0.3059225469827652, + "epoch": 1.2925748921785756, + "grad_norm": 5.25, + "learning_rate": 1.99553032507525e-05, + "loss": 0.7214, + "mean_token_accuracy": 0.8827582836151123, + "num_tokens": 23260474.0, + "step": 5545 + }, + { + "entropy": 0.42859417796134947, + "epoch": 1.2937405291992072, + "grad_norm": 4.78125, + "learning_rate": 1.9955221757270287e-05, + "loss": 0.6725, + "mean_token_accuracy": 0.8570225417613984, + "num_tokens": 23275961.0, + "step": 5550 + }, + { + "entropy": 0.33137104101479053, + "epoch": 1.2949061662198391, + "grad_norm": 2.140625, + "learning_rate": 1.9955140189898262e-05, + "loss": 0.5805, + "mean_token_accuracy": 0.8772634506225586, + "num_tokens": 23288893.0, + "step": 5555 + }, + { + "entropy": 0.2654805898666382, + "epoch": 1.296071803240471, + "grad_norm": 5.4375, + "learning_rate": 1.995505854863765e-05, + "loss": 0.5707, + "mean_token_accuracy": 0.9057930707931519, + "num_tokens": 23299913.0, + "step": 5560 + }, + { + "entropy": 0.26006845086812974, + "epoch": 1.2972374402611027, + "grad_norm": 3.40625, + "learning_rate": 1.995497683348966e-05, + "loss": 0.378, + "mean_token_accuracy": 0.9138223767280579, + "num_tokens": 23323199.0, + "step": 5565 + }, + { + "entropy": 0.2738994713872671, + "epoch": 1.2984030772817345, + "grad_norm": 1.7578125, + "learning_rate": 1.995489504445551e-05, + "loss": 0.3923, + "mean_token_accuracy": 0.9132127881050109, + "num_tokens": 23340559.0, + "step": 5570 + }, + { + "entropy": 0.34965813905000687, + "epoch": 1.2995687143023662, + "grad_norm": 0.59765625, + "learning_rate": 1.995481318153642e-05, + "loss": 0.8147, + "mean_token_accuracy": 0.8552458584308624, + "num_tokens": 23358466.0, + "step": 5575 + }, + { + "entropy": 0.23392754904925822, + "epoch": 1.300734351322998, + "grad_norm": 1.171875, + "learning_rate": 1.9954731244733618e-05, + "loss": 0.3513, + "mean_token_accuracy": 0.9144553184509278, + "num_tokens": 23381477.0, + "step": 5580 + }, + { + "entropy": 0.23823288679122925, + "epoch": 1.3018999883436297, + "grad_norm": 2.96875, + "learning_rate": 1.995464923404832e-05, + "loss": 0.4633, + "mean_token_accuracy": 0.9204339981079102, + "num_tokens": 23403290.0, + "step": 5585 + }, + { + "entropy": 0.2805923163890839, + "epoch": 1.3030656253642616, + "grad_norm": 1.296875, + "learning_rate": 1.995456714948175e-05, + "loss": 0.5294, + "mean_token_accuracy": 0.9073796331882477, + "num_tokens": 23413307.0, + "step": 5590 + }, + { + "entropy": 0.36135985255241393, + "epoch": 1.3042312623848933, + "grad_norm": 7.03125, + "learning_rate": 1.995448499103513e-05, + "loss": 0.8949, + "mean_token_accuracy": 0.8443944990634918, + "num_tokens": 23423224.0, + "step": 5595 + }, + { + "entropy": 0.3975703451782465, + "epoch": 1.3053968994055252, + "grad_norm": 0.291015625, + "learning_rate": 1.9954402758709687e-05, + "loss": 0.6798, + "mean_token_accuracy": 0.869120967388153, + "num_tokens": 23449403.0, + "step": 5600 + }, + { + "entropy": 0.26895866096019744, + "epoch": 1.3065625364261568, + "grad_norm": 0.66015625, + "learning_rate": 1.9954320452506648e-05, + "loss": 0.573, + "mean_token_accuracy": 0.9046859323978425, + "num_tokens": 23477768.0, + "step": 5605 + }, + { + "entropy": 0.2528399731963873, + "epoch": 1.3077281734467887, + "grad_norm": 0.65234375, + "learning_rate": 1.995423807242724e-05, + "loss": 0.4023, + "mean_token_accuracy": 0.9266422271728516, + "num_tokens": 23504880.0, + "step": 5610 + }, + { + "entropy": 0.2804453056305647, + "epoch": 1.3088938104674204, + "grad_norm": 3.671875, + "learning_rate": 1.9954155618472687e-05, + "loss": 0.488, + "mean_token_accuracy": 0.9003461420536041, + "num_tokens": 23532967.0, + "step": 5615 + }, + { + "entropy": 0.26205198690295217, + "epoch": 1.3100594474880523, + "grad_norm": 0.88671875, + "learning_rate": 1.9954073090644227e-05, + "loss": 0.3315, + "mean_token_accuracy": 0.9261995434761048, + "num_tokens": 23548656.0, + "step": 5620 + }, + { + "entropy": 0.18076314106583596, + "epoch": 1.311225084508684, + "grad_norm": 0.337890625, + "learning_rate": 1.9953990488943086e-05, + "loss": 0.2923, + "mean_token_accuracy": 0.9447229325771331, + "num_tokens": 23576455.0, + "step": 5625 + }, + { + "entropy": 0.267459000647068, + "epoch": 1.3123907215293158, + "grad_norm": 0.9296875, + "learning_rate": 1.9953907813370494e-05, + "loss": 0.3465, + "mean_token_accuracy": 0.9184731841087341, + "num_tokens": 23591955.0, + "step": 5630 + }, + { + "entropy": 0.3997308075428009, + "epoch": 1.3135563585499477, + "grad_norm": 5.15625, + "learning_rate": 1.9953825063927684e-05, + "loss": 0.7307, + "mean_token_accuracy": 0.8641902565956116, + "num_tokens": 23600933.0, + "step": 5635 + }, + { + "entropy": 0.4334285452961922, + "epoch": 1.3147219955705793, + "grad_norm": 1.1640625, + "learning_rate": 1.995374224061589e-05, + "loss": 0.5922, + "mean_token_accuracy": 0.8764832258224488, + "num_tokens": 23615985.0, + "step": 5640 + }, + { + "entropy": 0.4441813049837947, + "epoch": 1.315887632591211, + "grad_norm": 0.40625, + "learning_rate": 1.9953659343436352e-05, + "loss": 0.8662, + "mean_token_accuracy": 0.8815126717090607, + "num_tokens": 23644565.0, + "step": 5645 + }, + { + "entropy": 0.27254144847393036, + "epoch": 1.3170532696118429, + "grad_norm": 2.78125, + "learning_rate": 1.99535763723903e-05, + "loss": 0.4853, + "mean_token_accuracy": 0.9063039720058441, + "num_tokens": 23663586.0, + "step": 5650 + }, + { + "entropy": 0.37559669390320777, + "epoch": 1.3182189066324748, + "grad_norm": 4.84375, + "learning_rate": 1.9953493327478976e-05, + "loss": 0.7446, + "mean_token_accuracy": 0.8823855638504028, + "num_tokens": 23672706.0, + "step": 5655 + }, + { + "entropy": 0.20802447032183408, + "epoch": 1.3193845436531064, + "grad_norm": 0.228515625, + "learning_rate": 1.9953410208703614e-05, + "loss": 0.4037, + "mean_token_accuracy": 0.919158661365509, + "num_tokens": 23697302.0, + "step": 5660 + }, + { + "entropy": 0.3208934962749481, + "epoch": 1.320550180673738, + "grad_norm": 2.546875, + "learning_rate": 1.9953327016065455e-05, + "loss": 0.686, + "mean_token_accuracy": 0.8717946231365203, + "num_tokens": 23707498.0, + "step": 5665 + }, + { + "entropy": 0.3227735310792923, + "epoch": 1.32171581769437, + "grad_norm": 2.609375, + "learning_rate": 1.9953243749565742e-05, + "loss": 0.8797, + "mean_token_accuracy": 0.8639743089675903, + "num_tokens": 23716875.0, + "step": 5670 + }, + { + "entropy": 0.27695200871676207, + "epoch": 1.3228814547150018, + "grad_norm": 6.625, + "learning_rate": 1.9953160409205714e-05, + "loss": 0.3936, + "mean_token_accuracy": 0.9095885276794433, + "num_tokens": 23758465.0, + "step": 5675 + }, + { + "entropy": 0.22672717571258544, + "epoch": 1.3240470917356335, + "grad_norm": 1.8984375, + "learning_rate": 1.9953076994986613e-05, + "loss": 0.3288, + "mean_token_accuracy": 0.9335966229438781, + "num_tokens": 23774175.0, + "step": 5680 + }, + { + "entropy": 0.2518630506470799, + "epoch": 1.3252127287562652, + "grad_norm": 0.318359375, + "learning_rate": 1.9952993506909687e-05, + "loss": 0.3842, + "mean_token_accuracy": 0.9146360695362091, + "num_tokens": 23806579.0, + "step": 5685 + }, + { + "entropy": 0.2931936949491501, + "epoch": 1.326378365776897, + "grad_norm": 3.859375, + "learning_rate": 1.9952909944976175e-05, + "loss": 0.5051, + "mean_token_accuracy": 0.8879533410072327, + "num_tokens": 23821716.0, + "step": 5690 + }, + { + "entropy": 0.2536670383065939, + "epoch": 1.327544002797529, + "grad_norm": 2.9375, + "learning_rate": 1.995282630918733e-05, + "loss": 0.4721, + "mean_token_accuracy": 0.9183348596096039, + "num_tokens": 23836439.0, + "step": 5695 + }, + { + "entropy": 0.25506643392145634, + "epoch": 1.3287096398181606, + "grad_norm": 0.765625, + "learning_rate": 1.995274259954439e-05, + "loss": 0.4695, + "mean_token_accuracy": 0.91966432929039, + "num_tokens": 23850720.0, + "step": 5700 + }, + { + "entropy": 0.2509689211845398, + "epoch": 1.3298752768387925, + "grad_norm": 4.875, + "learning_rate": 1.9952658816048612e-05, + "loss": 0.5005, + "mean_token_accuracy": 0.8925885140895844, + "num_tokens": 23869638.0, + "step": 5705 + }, + { + "entropy": 0.2714238610118628, + "epoch": 1.3310409138594241, + "grad_norm": 2.609375, + "learning_rate": 1.995257495870124e-05, + "loss": 0.4868, + "mean_token_accuracy": 0.917405641078949, + "num_tokens": 23886300.0, + "step": 5710 + }, + { + "entropy": 0.23347382061183453, + "epoch": 1.332206550880056, + "grad_norm": 2.25, + "learning_rate": 1.9952491027503527e-05, + "loss": 0.4869, + "mean_token_accuracy": 0.9239233791828155, + "num_tokens": 23922439.0, + "step": 5715 + }, + { + "entropy": 0.24450993463397025, + "epoch": 1.3333721879006877, + "grad_norm": 4.21875, + "learning_rate": 1.9952407022456722e-05, + "loss": 0.4458, + "mean_token_accuracy": 0.9130438029766083, + "num_tokens": 23940457.0, + "step": 5720 + }, + { + "entropy": 0.15998954940587282, + "epoch": 1.3345378249213196, + "grad_norm": 0.69140625, + "learning_rate": 1.9952322943562085e-05, + "loss": 0.2078, + "mean_token_accuracy": 0.9507815659046173, + "num_tokens": 23970127.0, + "step": 5725 + }, + { + "entropy": 0.29249856173992156, + "epoch": 1.3357034619419512, + "grad_norm": 2.34375, + "learning_rate": 1.995223879082086e-05, + "loss": 0.5888, + "mean_token_accuracy": 0.9081490993499756, + "num_tokens": 23980439.0, + "step": 5730 + }, + { + "entropy": 0.2913537845015526, + "epoch": 1.336869098962583, + "grad_norm": 0.3125, + "learning_rate": 1.9952154564234307e-05, + "loss": 0.5545, + "mean_token_accuracy": 0.8999676525592804, + "num_tokens": 24006720.0, + "step": 5735 + }, + { + "entropy": 0.2000728841871023, + "epoch": 1.3380347359832148, + "grad_norm": 0.310546875, + "learning_rate": 1.995207026380368e-05, + "loss": 0.4729, + "mean_token_accuracy": 0.9112296581268311, + "num_tokens": 24030735.0, + "step": 5740 + }, + { + "entropy": 0.40132756531238556, + "epoch": 1.3392003730038466, + "grad_norm": 2.34375, + "learning_rate": 1.9951985889530237e-05, + "loss": 0.7819, + "mean_token_accuracy": 0.8558760762214661, + "num_tokens": 24044026.0, + "step": 5745 + }, + { + "entropy": 0.31329273283481596, + "epoch": 1.3403660100244783, + "grad_norm": 3.421875, + "learning_rate": 1.995190144141524e-05, + "loss": 0.5908, + "mean_token_accuracy": 0.8867635846138, + "num_tokens": 24059893.0, + "step": 5750 + }, + { + "entropy": 0.2145456612110138, + "epoch": 1.3415316470451102, + "grad_norm": 3.09375, + "learning_rate": 1.9951816919459937e-05, + "loss": 0.4101, + "mean_token_accuracy": 0.9166670083999634, + "num_tokens": 24084954.0, + "step": 5755 + }, + { + "entropy": 0.26943669021129607, + "epoch": 1.3426972840657418, + "grad_norm": 3.296875, + "learning_rate": 1.9951732323665602e-05, + "loss": 0.4286, + "mean_token_accuracy": 0.9149986863136291, + "num_tokens": 24097300.0, + "step": 5760 + }, + { + "entropy": 0.4137574560940266, + "epoch": 1.3438629210863737, + "grad_norm": 1.9375, + "learning_rate": 1.9951647654033487e-05, + "loss": 0.7597, + "mean_token_accuracy": 0.8728756129741668, + "num_tokens": 24119361.0, + "step": 5765 + }, + { + "entropy": 0.22645775750279426, + "epoch": 1.3450285581070056, + "grad_norm": 0.388671875, + "learning_rate": 1.995156291056486e-05, + "loss": 0.4117, + "mean_token_accuracy": 0.9256348073482513, + "num_tokens": 24144326.0, + "step": 5770 + }, + { + "entropy": 0.3713815161027014, + "epoch": 1.3461941951276373, + "grad_norm": 1.515625, + "learning_rate": 1.9951478093260984e-05, + "loss": 0.3942, + "mean_token_accuracy": 0.8806808292865753, + "num_tokens": 24179091.0, + "step": 5775 + }, + { + "entropy": 0.3129763476550579, + "epoch": 1.347359832148269, + "grad_norm": 2.90625, + "learning_rate": 1.995139320212312e-05, + "loss": 0.6692, + "mean_token_accuracy": 0.8922302544116973, + "num_tokens": 24191822.0, + "step": 5780 + }, + { + "entropy": 0.2801304005086422, + "epoch": 1.3485254691689008, + "grad_norm": 1.1015625, + "learning_rate": 1.9951308237152534e-05, + "loss": 0.5596, + "mean_token_accuracy": 0.8922988891601562, + "num_tokens": 24206746.0, + "step": 5785 + }, + { + "entropy": 0.3644409075379372, + "epoch": 1.3496911061895327, + "grad_norm": 5.3125, + "learning_rate": 1.9951223198350494e-05, + "loss": 0.6947, + "mean_token_accuracy": 0.8766247451305389, + "num_tokens": 24220264.0, + "step": 5790 + }, + { + "entropy": 0.2567942202091217, + "epoch": 1.3508567432101644, + "grad_norm": 2.484375, + "learning_rate": 1.9951138085718274e-05, + "loss": 0.4551, + "mean_token_accuracy": 0.9105157017707824, + "num_tokens": 24254482.0, + "step": 5795 + }, + { + "entropy": 0.2791744988411665, + "epoch": 1.352022380230796, + "grad_norm": 0.91015625, + "learning_rate": 1.9951052899257138e-05, + "loss": 0.4648, + "mean_token_accuracy": 0.9042120218276978, + "num_tokens": 24278510.0, + "step": 5800 + }, + { + "entropy": 0.23117900416254997, + "epoch": 1.353188017251428, + "grad_norm": 0.298828125, + "learning_rate": 1.9950967638968352e-05, + "loss": 0.1542, + "mean_token_accuracy": 0.9249145030975342, + "num_tokens": 24320079.0, + "step": 5805 + }, + { + "entropy": 0.2455504924058914, + "epoch": 1.3543536542720598, + "grad_norm": 0.28515625, + "learning_rate": 1.9950882304853195e-05, + "loss": 0.2634, + "mean_token_accuracy": 0.9251305639743805, + "num_tokens": 24351086.0, + "step": 5810 + }, + { + "entropy": 0.32813689541071656, + "epoch": 1.3555192912926914, + "grad_norm": 0.353515625, + "learning_rate": 1.9950796896912937e-05, + "loss": 0.4891, + "mean_token_accuracy": 0.8899535357952117, + "num_tokens": 24400487.0, + "step": 5815 + }, + { + "entropy": 0.24501474015414715, + "epoch": 1.356684928313323, + "grad_norm": 0.51171875, + "learning_rate": 1.995071141514885e-05, + "loss": 0.5379, + "mean_token_accuracy": 0.9088938236236572, + "num_tokens": 24427331.0, + "step": 5820 + }, + { + "entropy": 0.3071946881711483, + "epoch": 1.357850565333955, + "grad_norm": 1.578125, + "learning_rate": 1.9950625859562208e-05, + "loss": 0.5551, + "mean_token_accuracy": 0.8883362352848053, + "num_tokens": 24438312.0, + "step": 5825 + }, + { + "entropy": 0.19779857322573663, + "epoch": 1.3590162023545869, + "grad_norm": 1.359375, + "learning_rate": 1.995054023015429e-05, + "loss": 0.3946, + "mean_token_accuracy": 0.9297601401805877, + "num_tokens": 24451773.0, + "step": 5830 + }, + { + "entropy": 0.22592936605215072, + "epoch": 1.3601818393752185, + "grad_norm": 0.41796875, + "learning_rate": 1.995045452692637e-05, + "loss": 0.4492, + "mean_token_accuracy": 0.9175360441207886, + "num_tokens": 24483192.0, + "step": 5835 + }, + { + "entropy": 0.2569009017199278, + "epoch": 1.3613474763958504, + "grad_norm": 4.9375, + "learning_rate": 1.9950368749879726e-05, + "loss": 0.5167, + "mean_token_accuracy": 0.9054693639278412, + "num_tokens": 24500215.0, + "step": 5840 + }, + { + "entropy": 0.1963417749851942, + "epoch": 1.362513113416482, + "grad_norm": 1.65625, + "learning_rate": 1.995028289901564e-05, + "loss": 0.301, + "mean_token_accuracy": 0.9437475621700286, + "num_tokens": 24527876.0, + "step": 5845 + }, + { + "entropy": 0.35903219059109687, + "epoch": 1.363678750437114, + "grad_norm": 3.640625, + "learning_rate": 1.9950196974335392e-05, + "loss": 0.5245, + "mean_token_accuracy": 0.8531217724084854, + "num_tokens": 24556353.0, + "step": 5850 + }, + { + "entropy": 0.23062839526683093, + "epoch": 1.3648443874577456, + "grad_norm": 0.23828125, + "learning_rate": 1.9950110975840256e-05, + "loss": 0.3947, + "mean_token_accuracy": 0.9208827018737793, + "num_tokens": 24585285.0, + "step": 5855 + }, + { + "entropy": 0.3217948623001575, + "epoch": 1.3660100244783775, + "grad_norm": 3.09375, + "learning_rate": 1.9950024903531525e-05, + "loss": 0.5242, + "mean_token_accuracy": 0.8897144675254822, + "num_tokens": 24602223.0, + "step": 5860 + }, + { + "entropy": 0.25288745760917664, + "epoch": 1.3671756614990092, + "grad_norm": 0.357421875, + "learning_rate": 1.994993875741048e-05, + "loss": 0.3786, + "mean_token_accuracy": 0.9193249821662903, + "num_tokens": 24622073.0, + "step": 5865 + }, + { + "entropy": 0.22469098567962648, + "epoch": 1.368341298519641, + "grad_norm": 3.578125, + "learning_rate": 1.9949852537478396e-05, + "loss": 0.411, + "mean_token_accuracy": 0.9251877844333649, + "num_tokens": 24638083.0, + "step": 5870 + }, + { + "entropy": 0.2176618270576, + "epoch": 1.3695069355402727, + "grad_norm": 2.5625, + "learning_rate": 1.9949766243736567e-05, + "loss": 0.3303, + "mean_token_accuracy": 0.934071135520935, + "num_tokens": 24653018.0, + "step": 5875 + }, + { + "entropy": 0.2513718821108341, + "epoch": 1.3706725725609046, + "grad_norm": 3.609375, + "learning_rate": 1.9949679876186282e-05, + "loss": 0.4386, + "mean_token_accuracy": 0.9089162170886993, + "num_tokens": 24674972.0, + "step": 5880 + }, + { + "entropy": 0.28676617220044137, + "epoch": 1.3718382095815362, + "grad_norm": 4.40625, + "learning_rate": 1.9949593434828826e-05, + "loss": 0.6229, + "mean_token_accuracy": 0.8899110794067383, + "num_tokens": 24689403.0, + "step": 5885 + }, + { + "entropy": 0.23323919475078583, + "epoch": 1.3730038466021681, + "grad_norm": 0.64453125, + "learning_rate": 1.9949506919665483e-05, + "loss": 0.4513, + "mean_token_accuracy": 0.9205905556678772, + "num_tokens": 24715786.0, + "step": 5890 + }, + { + "entropy": 0.2819278556853533, + "epoch": 1.3741694836227998, + "grad_norm": 0.70703125, + "learning_rate": 1.994942033069755e-05, + "loss": 0.3828, + "mean_token_accuracy": 0.9168027639389038, + "num_tokens": 24734448.0, + "step": 5895 + }, + { + "entropy": 0.3614561915397644, + "epoch": 1.3753351206434317, + "grad_norm": 3.484375, + "learning_rate": 1.9949333667926315e-05, + "loss": 0.633, + "mean_token_accuracy": 0.8805983364582062, + "num_tokens": 24753262.0, + "step": 5900 + }, + { + "entropy": 0.24026509281247854, + "epoch": 1.3765007576640635, + "grad_norm": 3.578125, + "learning_rate": 1.994924693135307e-05, + "loss": 0.2847, + "mean_token_accuracy": 0.9050538659095764, + "num_tokens": 24788999.0, + "step": 5905 + }, + { + "entropy": 0.21759585216641425, + "epoch": 1.3776663946846952, + "grad_norm": 6.65625, + "learning_rate": 1.9949160120979106e-05, + "loss": 0.4946, + "mean_token_accuracy": 0.9079568982124329, + "num_tokens": 24808413.0, + "step": 5910 + }, + { + "entropy": 0.19574114717543126, + "epoch": 1.3788320317053269, + "grad_norm": 0.52734375, + "learning_rate": 1.9949073236805727e-05, + "loss": 0.3247, + "mean_token_accuracy": 0.9310945808887482, + "num_tokens": 24832143.0, + "step": 5915 + }, + { + "entropy": 0.2865985196083784, + "epoch": 1.3799976687259587, + "grad_norm": 2.75, + "learning_rate": 1.994898627883422e-05, + "loss": 0.418, + "mean_token_accuracy": 0.9289934575557709, + "num_tokens": 24849135.0, + "step": 5920 + }, + { + "entropy": 0.31515090465545653, + "epoch": 1.3811633057465906, + "grad_norm": 0.62109375, + "learning_rate": 1.9948899247065882e-05, + "loss": 0.5842, + "mean_token_accuracy": 0.8844206809997559, + "num_tokens": 24875602.0, + "step": 5925 + }, + { + "entropy": 0.336475083976984, + "epoch": 1.3823289427672223, + "grad_norm": 1.3125, + "learning_rate": 1.9948812141502015e-05, + "loss": 0.609, + "mean_token_accuracy": 0.8807213425636291, + "num_tokens": 24896407.0, + "step": 5930 + }, + { + "entropy": 0.23444087468087674, + "epoch": 1.383494579787854, + "grad_norm": 0.90625, + "learning_rate": 1.994872496214391e-05, + "loss": 0.2867, + "mean_token_accuracy": 0.9178794384002685, + "num_tokens": 24928449.0, + "step": 5935 + }, + { + "entropy": 0.2076961003243923, + "epoch": 1.3846602168084858, + "grad_norm": 1.265625, + "learning_rate": 1.994863770899288e-05, + "loss": 0.4733, + "mean_token_accuracy": 0.9192006587982178, + "num_tokens": 24943725.0, + "step": 5940 + }, + { + "entropy": 0.25617918446660043, + "epoch": 1.3858258538291177, + "grad_norm": 3.9375, + "learning_rate": 1.9948550382050217e-05, + "loss": 0.3418, + "mean_token_accuracy": 0.8931475222110749, + "num_tokens": 24962709.0, + "step": 5945 + }, + { + "entropy": 0.2509648621082306, + "epoch": 1.3869914908497494, + "grad_norm": 4.5, + "learning_rate": 1.9948462981317224e-05, + "loss": 0.5186, + "mean_token_accuracy": 0.9041432499885559, + "num_tokens": 24975572.0, + "step": 5950 + }, + { + "entropy": 0.28281467258930204, + "epoch": 1.388157127870381, + "grad_norm": 4.71875, + "learning_rate": 1.9948375506795203e-05, + "loss": 0.6613, + "mean_token_accuracy": 0.8867592930793762, + "num_tokens": 24985140.0, + "step": 5955 + }, + { + "entropy": 0.1830880269408226, + "epoch": 1.389322764891013, + "grad_norm": 3.3125, + "learning_rate": 1.9948287958485462e-05, + "loss": 0.3379, + "mean_token_accuracy": 0.9403021216392518, + "num_tokens": 25003798.0, + "step": 5960 + }, + { + "entropy": 0.20799617059528827, + "epoch": 1.3904884019116448, + "grad_norm": 1.1796875, + "learning_rate": 1.9948200336389306e-05, + "loss": 0.3895, + "mean_token_accuracy": 0.918929374217987, + "num_tokens": 25038105.0, + "step": 5965 + }, + { + "entropy": 0.216236861795187, + "epoch": 1.3916540389322765, + "grad_norm": 1.875, + "learning_rate": 1.9948112640508038e-05, + "loss": 0.5133, + "mean_token_accuracy": 0.9075867176055908, + "num_tokens": 25054679.0, + "step": 5970 + }, + { + "entropy": 0.22477278523147107, + "epoch": 1.3928196759529083, + "grad_norm": 3.546875, + "learning_rate": 1.994802487084297e-05, + "loss": 0.5587, + "mean_token_accuracy": 0.9134108006954194, + "num_tokens": 25079947.0, + "step": 5975 + }, + { + "entropy": 0.23681939952075481, + "epoch": 1.39398531297354, + "grad_norm": 1.5625, + "learning_rate": 1.9947937027395407e-05, + "loss": 0.3169, + "mean_token_accuracy": 0.9177536785602569, + "num_tokens": 25106368.0, + "step": 5980 + }, + { + "entropy": 0.22461127489805222, + "epoch": 1.3951509499941719, + "grad_norm": 0.54296875, + "learning_rate": 1.994784911016666e-05, + "loss": 0.3875, + "mean_token_accuracy": 0.9304892122745514, + "num_tokens": 25132141.0, + "step": 5985 + }, + { + "entropy": 0.24312784448266028, + "epoch": 1.3963165870148035, + "grad_norm": 4.75, + "learning_rate": 1.9947761119158046e-05, + "loss": 0.4565, + "mean_token_accuracy": 0.910781592130661, + "num_tokens": 25163258.0, + "step": 5990 + }, + { + "entropy": 0.2798545081168413, + "epoch": 1.3974822240354354, + "grad_norm": 2.25, + "learning_rate": 1.9947673054370867e-05, + "loss": 0.3462, + "mean_token_accuracy": 0.884056442975998, + "num_tokens": 25185321.0, + "step": 5995 + }, + { + "entropy": 0.2866420477628708, + "epoch": 1.398647861056067, + "grad_norm": 5.78125, + "learning_rate": 1.9947584915806444e-05, + "loss": 0.7835, + "mean_token_accuracy": 0.8801892518997192, + "num_tokens": 25195532.0, + "step": 6000 + }, + { + "entropy": 0.2600490044802427, + "epoch": 1.399813498076699, + "grad_norm": 4.09375, + "learning_rate": 1.9947496703466088e-05, + "loss": 0.3731, + "mean_token_accuracy": 0.9133882701396943, + "num_tokens": 25218885.0, + "step": 6005 + }, + { + "entropy": 0.18085310496389867, + "epoch": 1.4009791350973306, + "grad_norm": 3.140625, + "learning_rate": 1.9947408417351114e-05, + "loss": 0.2987, + "mean_token_accuracy": 0.9389593541622162, + "num_tokens": 25248167.0, + "step": 6010 + }, + { + "entropy": 0.401553612947464, + "epoch": 1.4021447721179625, + "grad_norm": 2.9375, + "learning_rate": 1.9947320057462836e-05, + "loss": 0.6126, + "mean_token_accuracy": 0.8375705778598785, + "num_tokens": 25272827.0, + "step": 6015 + }, + { + "entropy": 0.27304190024733543, + "epoch": 1.4033104091385942, + "grad_norm": 2.34375, + "learning_rate": 1.994723162380258e-05, + "loss": 0.4852, + "mean_token_accuracy": 0.9171205282211303, + "num_tokens": 25284313.0, + "step": 6020 + }, + { + "entropy": 0.3541749894618988, + "epoch": 1.404476046159226, + "grad_norm": 1.1953125, + "learning_rate": 1.9947143116371656e-05, + "loss": 0.61, + "mean_token_accuracy": 0.8762133181095123, + "num_tokens": 25303657.0, + "step": 6025 + }, + { + "entropy": 0.21734450608491898, + "epoch": 1.4056416831798577, + "grad_norm": 2.34375, + "learning_rate": 1.994705453517139e-05, + "loss": 0.4161, + "mean_token_accuracy": 0.9308688163757324, + "num_tokens": 25316085.0, + "step": 6030 + }, + { + "entropy": 0.21970502883195878, + "epoch": 1.4068073202004896, + "grad_norm": 0.74609375, + "learning_rate": 1.9946965880203098e-05, + "loss": 0.3519, + "mean_token_accuracy": 0.9339965403079986, + "num_tokens": 25336522.0, + "step": 6035 + }, + { + "entropy": 0.2662087522447109, + "epoch": 1.4079729572211215, + "grad_norm": 2.328125, + "learning_rate": 1.9946877151468103e-05, + "loss": 0.5657, + "mean_token_accuracy": 0.8919131517410278, + "num_tokens": 25352103.0, + "step": 6040 + }, + { + "entropy": 0.3002671368420124, + "epoch": 1.4091385942417531, + "grad_norm": 0.63671875, + "learning_rate": 1.9946788348967732e-05, + "loss": 0.4606, + "mean_token_accuracy": 0.9111246347427369, + "num_tokens": 25371971.0, + "step": 6045 + }, + { + "entropy": 0.2485044565051794, + "epoch": 1.4103042312623848, + "grad_norm": 3.15625, + "learning_rate": 1.9946699472703305e-05, + "loss": 0.4732, + "mean_token_accuracy": 0.9000851452350617, + "num_tokens": 25397621.0, + "step": 6050 + }, + { + "entropy": 0.3733428567647934, + "epoch": 1.4114698682830167, + "grad_norm": 2.0625, + "learning_rate": 1.9946610522676148e-05, + "loss": 0.4188, + "mean_token_accuracy": 0.9120821356773376, + "num_tokens": 25408478.0, + "step": 6055 + }, + { + "entropy": 0.18461912628263236, + "epoch": 1.4126355053036486, + "grad_norm": 2.4375, + "learning_rate": 1.9946521498887587e-05, + "loss": 0.2562, + "mean_token_accuracy": 0.9476583361625671, + "num_tokens": 25430854.0, + "step": 6060 + }, + { + "entropy": 0.3762295678257942, + "epoch": 1.4138011423242802, + "grad_norm": 4.28125, + "learning_rate": 1.9946432401338952e-05, + "loss": 0.6702, + "mean_token_accuracy": 0.8688174486160278, + "num_tokens": 25443403.0, + "step": 6065 + }, + { + "entropy": 0.22310646399855613, + "epoch": 1.4149667793449119, + "grad_norm": 0.9375, + "learning_rate": 1.994634323003157e-05, + "loss": 0.2445, + "mean_token_accuracy": 0.9442370891571045, + "num_tokens": 25461873.0, + "step": 6070 + }, + { + "entropy": 0.1819608300924301, + "epoch": 1.4161324163655438, + "grad_norm": 2.703125, + "learning_rate": 1.994625398496677e-05, + "loss": 0.319, + "mean_token_accuracy": 0.9387645661830902, + "num_tokens": 25479569.0, + "step": 6075 + }, + { + "entropy": 0.2639749272726476, + "epoch": 1.4172980533861756, + "grad_norm": 2.203125, + "learning_rate": 1.9946164666145887e-05, + "loss": 0.4714, + "mean_token_accuracy": 0.9076428413391113, + "num_tokens": 25498608.0, + "step": 6080 + }, + { + "entropy": 0.33660699874162675, + "epoch": 1.4184636904068073, + "grad_norm": 1.9453125, + "learning_rate": 1.9946075273570246e-05, + "loss": 0.712, + "mean_token_accuracy": 0.8807506680488586, + "num_tokens": 25508015.0, + "step": 6085 + }, + { + "entropy": 0.24338987190276384, + "epoch": 1.419629327427439, + "grad_norm": 4.90625, + "learning_rate": 1.9945985807241183e-05, + "loss": 0.4234, + "mean_token_accuracy": 0.9177171468734742, + "num_tokens": 25536317.0, + "step": 6090 + }, + { + "entropy": 0.1747225273400545, + "epoch": 1.4207949644480709, + "grad_norm": 0.38671875, + "learning_rate": 1.9945896267160033e-05, + "loss": 0.1504, + "mean_token_accuracy": 0.9511634230613708, + "num_tokens": 25581641.0, + "step": 6095 + }, + { + "entropy": 0.24279556684195996, + "epoch": 1.4219606014687027, + "grad_norm": 6.4375, + "learning_rate": 1.994580665332813e-05, + "loss": 0.3479, + "mean_token_accuracy": 0.9131585538387299, + "num_tokens": 25609683.0, + "step": 6100 + }, + { + "entropy": 0.21206288058310746, + "epoch": 1.4231262384893344, + "grad_norm": 5.125, + "learning_rate": 1.9945716965746815e-05, + "loss": 0.4123, + "mean_token_accuracy": 0.9267059683799743, + "num_tokens": 25631691.0, + "step": 6105 + }, + { + "entropy": 0.20703665874898433, + "epoch": 1.4242918755099663, + "grad_norm": 0.53515625, + "learning_rate": 1.9945627204417417e-05, + "loss": 0.3589, + "mean_token_accuracy": 0.9129282414913178, + "num_tokens": 25647570.0, + "step": 6110 + }, + { + "entropy": 0.2758414391428232, + "epoch": 1.425457512530598, + "grad_norm": 4.03125, + "learning_rate": 1.994553736934128e-05, + "loss": 0.4752, + "mean_token_accuracy": 0.9155567049980163, + "num_tokens": 25666958.0, + "step": 6115 + }, + { + "entropy": 0.24596691727638245, + "epoch": 1.4266231495512298, + "grad_norm": 1.9140625, + "learning_rate": 1.9945447460519744e-05, + "loss": 0.4994, + "mean_token_accuracy": 0.907034718990326, + "num_tokens": 25683703.0, + "step": 6120 + }, + { + "entropy": 0.22788504436612128, + "epoch": 1.4277887865718615, + "grad_norm": 2.21875, + "learning_rate": 1.9945357477954146e-05, + "loss": 0.4048, + "mean_token_accuracy": 0.9154785871505737, + "num_tokens": 25704111.0, + "step": 6125 + }, + { + "entropy": 0.2857265181839466, + "epoch": 1.4289544235924934, + "grad_norm": 3.890625, + "learning_rate": 1.994526742164583e-05, + "loss": 0.5323, + "mean_token_accuracy": 0.9008615195751191, + "num_tokens": 25715542.0, + "step": 6130 + }, + { + "entropy": 0.33440128406509756, + "epoch": 1.430120060613125, + "grad_norm": 0.38671875, + "learning_rate": 1.9945177291596138e-05, + "loss": 0.5732, + "mean_token_accuracy": 0.8830467939376831, + "num_tokens": 25748108.0, + "step": 6135 + }, + { + "entropy": 0.20241359770298004, + "epoch": 1.431285697633757, + "grad_norm": 1.734375, + "learning_rate": 1.9945087087806418e-05, + "loss": 0.3625, + "mean_token_accuracy": 0.9333572685718536, + "num_tokens": 25762188.0, + "step": 6140 + }, + { + "entropy": 0.2195176776498556, + "epoch": 1.4324513346543886, + "grad_norm": 2.984375, + "learning_rate": 1.9944996810278004e-05, + "loss": 0.3942, + "mean_token_accuracy": 0.9176979422569275, + "num_tokens": 25779783.0, + "step": 6145 + }, + { + "entropy": 0.24785535782575607, + "epoch": 1.4336169716750204, + "grad_norm": 0.44921875, + "learning_rate": 1.9944906459012256e-05, + "loss": 0.3539, + "mean_token_accuracy": 0.917992216348648, + "num_tokens": 25808857.0, + "step": 6150 + }, + { + "entropy": 0.2307575661689043, + "epoch": 1.434782608695652, + "grad_norm": 1.5078125, + "learning_rate": 1.9944816034010514e-05, + "loss": 0.3912, + "mean_token_accuracy": 0.9268999874591828, + "num_tokens": 25827633.0, + "step": 6155 + }, + { + "entropy": 0.3657380998134613, + "epoch": 1.435948245716284, + "grad_norm": 3.875, + "learning_rate": 1.994472553527413e-05, + "loss": 0.6706, + "mean_token_accuracy": 0.8987206935882568, + "num_tokens": 25835309.0, + "step": 6160 + }, + { + "entropy": 0.2581876091659069, + "epoch": 1.4371138827369156, + "grad_norm": 1.6484375, + "learning_rate": 1.9944634962804447e-05, + "loss": 0.3631, + "mean_token_accuracy": 0.9164925396442414, + "num_tokens": 25854498.0, + "step": 6165 + }, + { + "entropy": 0.23693926855921746, + "epoch": 1.4382795197575475, + "grad_norm": 3.921875, + "learning_rate": 1.9944544316602822e-05, + "loss": 0.4131, + "mean_token_accuracy": 0.9203273713588714, + "num_tokens": 25867321.0, + "step": 6170 + }, + { + "entropy": 0.21460621878504754, + "epoch": 1.4394451567781794, + "grad_norm": 3.53125, + "learning_rate": 1.99444535966706e-05, + "loss": 0.3414, + "mean_token_accuracy": 0.9152821362018585, + "num_tokens": 25883601.0, + "step": 6175 + }, + { + "entropy": 0.18589368872344494, + "epoch": 1.440610793798811, + "grad_norm": 0.55078125, + "learning_rate": 1.9944362803009143e-05, + "loss": 0.2293, + "mean_token_accuracy": 0.9320219933986664, + "num_tokens": 25907701.0, + "step": 6180 + }, + { + "entropy": 0.19626751802861692, + "epoch": 1.4417764308194427, + "grad_norm": 2.46875, + "learning_rate": 1.99442719356198e-05, + "loss": 0.325, + "mean_token_accuracy": 0.9340706586837768, + "num_tokens": 25923590.0, + "step": 6185 + }, + { + "entropy": 0.22295918092131614, + "epoch": 1.4429420678400746, + "grad_norm": 4.78125, + "learning_rate": 1.9944180994503924e-05, + "loss": 0.3643, + "mean_token_accuracy": 0.9329910993576049, + "num_tokens": 25949742.0, + "step": 6190 + }, + { + "entropy": 0.2750854782760143, + "epoch": 1.4441077048607065, + "grad_norm": 7.5, + "learning_rate": 1.994408997966287e-05, + "loss": 0.5914, + "mean_token_accuracy": 0.894541758298874, + "num_tokens": 25966558.0, + "step": 6195 + }, + { + "entropy": 0.2991751916706562, + "epoch": 1.4452733418813382, + "grad_norm": 4.09375, + "learning_rate": 1.9943998891098002e-05, + "loss": 0.541, + "mean_token_accuracy": 0.9036483585834503, + "num_tokens": 25980801.0, + "step": 6200 + }, + { + "entropy": 0.24568053856492042, + "epoch": 1.4464389789019698, + "grad_norm": 2.40625, + "learning_rate": 1.9943907728810675e-05, + "loss": 0.3813, + "mean_token_accuracy": 0.9138169586658478, + "num_tokens": 25998947.0, + "step": 6205 + }, + { + "entropy": 0.35819384828209877, + "epoch": 1.4476046159226017, + "grad_norm": 7.03125, + "learning_rate": 1.9943816492802245e-05, + "loss": 0.7506, + "mean_token_accuracy": 0.8739102721214295, + "num_tokens": 26018713.0, + "step": 6210 + }, + { + "entropy": 0.31613691374659536, + "epoch": 1.4487702529432336, + "grad_norm": 0.96875, + "learning_rate": 1.9943725183074078e-05, + "loss": 0.5115, + "mean_token_accuracy": 0.8758203983306885, + "num_tokens": 26046018.0, + "step": 6215 + }, + { + "entropy": 0.29431228525936604, + "epoch": 1.4499358899638652, + "grad_norm": 0.6171875, + "learning_rate": 1.994363379962753e-05, + "loss": 0.5854, + "mean_token_accuracy": 0.8824435234069824, + "num_tokens": 26072080.0, + "step": 6220 + }, + { + "entropy": 0.2346148520708084, + "epoch": 1.451101526984497, + "grad_norm": 0.95703125, + "learning_rate": 1.9943542342463967e-05, + "loss": 0.3645, + "mean_token_accuracy": 0.9219622015953064, + "num_tokens": 26095299.0, + "step": 6225 + }, + { + "entropy": 0.2745875285938382, + "epoch": 1.4522671640051288, + "grad_norm": 4.21875, + "learning_rate": 1.9943450811584754e-05, + "loss": 0.4315, + "mean_token_accuracy": 0.9061646342277527, + "num_tokens": 26114670.0, + "step": 6230 + }, + { + "entropy": 0.2439524047076702, + "epoch": 1.4534328010257607, + "grad_norm": 1.8203125, + "learning_rate": 1.994335920699125e-05, + "loss": 0.49, + "mean_token_accuracy": 0.9195599555969238, + "num_tokens": 26127294.0, + "step": 6235 + }, + { + "entropy": 0.2234463717788458, + "epoch": 1.4545984380463923, + "grad_norm": 0.90234375, + "learning_rate": 1.9943267528684825e-05, + "loss": 0.2566, + "mean_token_accuracy": 0.9297012031078339, + "num_tokens": 26157921.0, + "step": 6240 + }, + { + "entropy": 0.2692530658096075, + "epoch": 1.4557640750670242, + "grad_norm": 0.2294921875, + "learning_rate": 1.994317577666685e-05, + "loss": 0.57, + "mean_token_accuracy": 0.9084669768810272, + "num_tokens": 26184039.0, + "step": 6245 + }, + { + "entropy": 0.24896195121109485, + "epoch": 1.4569297120876559, + "grad_norm": 4.59375, + "learning_rate": 1.9943083950938688e-05, + "loss": 0.249, + "mean_token_accuracy": 0.9177485883235932, + "num_tokens": 26210767.0, + "step": 6250 + }, + { + "entropy": 0.3322709981352091, + "epoch": 1.4580953491082878, + "grad_norm": 4.9375, + "learning_rate": 1.994299205150171e-05, + "loss": 0.553, + "mean_token_accuracy": 0.882332730293274, + "num_tokens": 26232162.0, + "step": 6255 + }, + { + "entropy": 0.28127521388232707, + "epoch": 1.4592609861289194, + "grad_norm": 5.96875, + "learning_rate": 1.9942900078357285e-05, + "loss": 0.484, + "mean_token_accuracy": 0.9012116551399231, + "num_tokens": 26249149.0, + "step": 6260 + }, + { + "entropy": 0.27540549375116824, + "epoch": 1.4604266231495513, + "grad_norm": 5.03125, + "learning_rate": 1.9942808031506783e-05, + "loss": 0.5053, + "mean_token_accuracy": 0.8840371131896972, + "num_tokens": 26262178.0, + "step": 6265 + }, + { + "entropy": 0.35837407857179643, + "epoch": 1.461592260170183, + "grad_norm": 3.28125, + "learning_rate": 1.9942715910951584e-05, + "loss": 0.5515, + "mean_token_accuracy": 0.8817245364189148, + "num_tokens": 26290230.0, + "step": 6270 + }, + { + "entropy": 0.18010530173778533, + "epoch": 1.4627578971908148, + "grad_norm": 0.1943359375, + "learning_rate": 1.9942623716693053e-05, + "loss": 0.2224, + "mean_token_accuracy": 0.9430947363376617, + "num_tokens": 26329113.0, + "step": 6275 + }, + { + "entropy": 0.22894675768911837, + "epoch": 1.4639235342114465, + "grad_norm": 2.625, + "learning_rate": 1.994253144873257e-05, + "loss": 0.3407, + "mean_token_accuracy": 0.9291040778160096, + "num_tokens": 26354065.0, + "step": 6280 + }, + { + "entropy": 0.3529716327786446, + "epoch": 1.4650891712320784, + "grad_norm": 3.890625, + "learning_rate": 1.9942439107071508e-05, + "loss": 0.6958, + "mean_token_accuracy": 0.8839489579200744, + "num_tokens": 26364152.0, + "step": 6285 + }, + { + "entropy": 0.29959765523672105, + "epoch": 1.46625480825271, + "grad_norm": 2.515625, + "learning_rate": 1.9942346691711245e-05, + "loss": 0.6005, + "mean_token_accuracy": 0.9027487754821777, + "num_tokens": 26373463.0, + "step": 6290 + }, + { + "entropy": 0.2790189601480961, + "epoch": 1.467420445273342, + "grad_norm": 0.48828125, + "learning_rate": 1.9942254202653156e-05, + "loss": 0.3838, + "mean_token_accuracy": 0.896321427822113, + "num_tokens": 26397161.0, + "step": 6295 + }, + { + "entropy": 0.29518115893006325, + "epoch": 1.4685860822939736, + "grad_norm": 0.44921875, + "learning_rate": 1.994216163989863e-05, + "loss": 0.6156, + "mean_token_accuracy": 0.8970506012439727, + "num_tokens": 26418252.0, + "step": 6300 + }, + { + "entropy": 0.24554601386189462, + "epoch": 1.4697517193146055, + "grad_norm": 4.25, + "learning_rate": 1.9942069003449035e-05, + "loss": 0.4689, + "mean_token_accuracy": 0.9052075982093811, + "num_tokens": 26439931.0, + "step": 6305 + }, + { + "entropy": 0.17828930951654912, + "epoch": 1.4709173563352371, + "grad_norm": 1.6796875, + "learning_rate": 1.994197629330576e-05, + "loss": 0.1967, + "mean_token_accuracy": 0.9368055939674378, + "num_tokens": 26481238.0, + "step": 6310 + }, + { + "entropy": 0.23349117934703828, + "epoch": 1.472082993355869, + "grad_norm": 3.078125, + "learning_rate": 1.9941883509470185e-05, + "loss": 0.4269, + "mean_token_accuracy": 0.9104501307010651, + "num_tokens": 26495108.0, + "step": 6315 + }, + { + "entropy": 0.18809721656143666, + "epoch": 1.4732486303765007, + "grad_norm": 0.40234375, + "learning_rate": 1.9941790651943694e-05, + "loss": 0.3237, + "mean_token_accuracy": 0.9333221316337585, + "num_tokens": 26522784.0, + "step": 6320 + }, + { + "entropy": 0.3458826899528503, + "epoch": 1.4744142673971325, + "grad_norm": 4.78125, + "learning_rate": 1.9941697720727674e-05, + "loss": 0.6462, + "mean_token_accuracy": 0.8908734023571014, + "num_tokens": 26532735.0, + "step": 6325 + }, + { + "entropy": 0.23778062015771867, + "epoch": 1.4755799044177644, + "grad_norm": 1.3125, + "learning_rate": 1.9941604715823505e-05, + "loss": 0.4793, + "mean_token_accuracy": 0.9031484723091125, + "num_tokens": 26554290.0, + "step": 6330 + }, + { + "entropy": 0.2718735795468092, + "epoch": 1.476745541438396, + "grad_norm": 7.4375, + "learning_rate": 1.994151163723258e-05, + "loss": 0.2499, + "mean_token_accuracy": 0.9027468800544739, + "num_tokens": 26595658.0, + "step": 6335 + }, + { + "entropy": 0.2860164247453213, + "epoch": 1.4779111784590278, + "grad_norm": 0.4296875, + "learning_rate": 1.994141848495628e-05, + "loss": 0.4303, + "mean_token_accuracy": 0.9010000109672547, + "num_tokens": 26618053.0, + "step": 6340 + }, + { + "entropy": 0.33337589353322983, + "epoch": 1.4790768154796596, + "grad_norm": 3.078125, + "learning_rate": 1.9941325258996e-05, + "loss": 0.5813, + "mean_token_accuracy": 0.8955310165882111, + "num_tokens": 26627218.0, + "step": 6345 + }, + { + "entropy": 0.2800572752952576, + "epoch": 1.4802424525002915, + "grad_norm": 3.125, + "learning_rate": 1.9941231959353127e-05, + "loss": 0.5138, + "mean_token_accuracy": 0.9197395205497741, + "num_tokens": 26646367.0, + "step": 6350 + }, + { + "entropy": 0.24946743622422218, + "epoch": 1.4814080895209232, + "grad_norm": 1.7734375, + "learning_rate": 1.9941138586029057e-05, + "loss": 0.4069, + "mean_token_accuracy": 0.8970791816711425, + "num_tokens": 26666179.0, + "step": 6355 + }, + { + "entropy": 0.2526698425412178, + "epoch": 1.4825737265415548, + "grad_norm": 2.09375, + "learning_rate": 1.9941045139025175e-05, + "loss": 0.3809, + "mean_token_accuracy": 0.919477504491806, + "num_tokens": 26682431.0, + "step": 6360 + }, + { + "entropy": 0.2440191276371479, + "epoch": 1.4837393635621867, + "grad_norm": 0.404296875, + "learning_rate": 1.994095161834288e-05, + "loss": 0.2937, + "mean_token_accuracy": 0.916581517457962, + "num_tokens": 26708653.0, + "step": 6365 + }, + { + "entropy": 0.2495467372238636, + "epoch": 1.4849050005828186, + "grad_norm": 3.40625, + "learning_rate": 1.9940858023983564e-05, + "loss": 0.4609, + "mean_token_accuracy": 0.9264641582965851, + "num_tokens": 26729923.0, + "step": 6370 + }, + { + "entropy": 0.31366735994815825, + "epoch": 1.4860706376034503, + "grad_norm": 2.453125, + "learning_rate": 1.9940764355948624e-05, + "loss": 0.5459, + "mean_token_accuracy": 0.8930998623371125, + "num_tokens": 26747852.0, + "step": 6375 + }, + { + "entropy": 0.17747897058725357, + "epoch": 1.4872362746240821, + "grad_norm": 0.296875, + "learning_rate": 1.9940670614239455e-05, + "loss": 0.4019, + "mean_token_accuracy": 0.9336903989315033, + "num_tokens": 26771977.0, + "step": 6380 + }, + { + "entropy": 0.31067068725824354, + "epoch": 1.4884019116447138, + "grad_norm": 2.171875, + "learning_rate": 1.9940576798857458e-05, + "loss": 0.5135, + "mean_token_accuracy": 0.8752427935600281, + "num_tokens": 26803778.0, + "step": 6385 + }, + { + "entropy": 0.2708073750138283, + "epoch": 1.4895675486653457, + "grad_norm": 0.34765625, + "learning_rate": 1.994048290980403e-05, + "loss": 0.5891, + "mean_token_accuracy": 0.901687479019165, + "num_tokens": 26825704.0, + "step": 6390 + }, + { + "entropy": 0.2440513141453266, + "epoch": 1.4907331856859773, + "grad_norm": 0.86328125, + "learning_rate": 1.994038894708057e-05, + "loss": 0.3371, + "mean_token_accuracy": 0.9266144573688507, + "num_tokens": 26850678.0, + "step": 6395 + }, + { + "entropy": 0.21062698140740393, + "epoch": 1.4918988227066092, + "grad_norm": 2.140625, + "learning_rate": 1.994029491068848e-05, + "loss": 0.2388, + "mean_token_accuracy": 0.9315842866897583, + "num_tokens": 26871585.0, + "step": 6400 + }, + { + "entropy": 0.37964350283145903, + "epoch": 1.493064459727241, + "grad_norm": 5.0625, + "learning_rate": 1.994020080062916e-05, + "loss": 0.8883, + "mean_token_accuracy": 0.8595846474170685, + "num_tokens": 26879318.0, + "step": 6405 + }, + { + "entropy": 0.26095555648207663, + "epoch": 1.4942300967478728, + "grad_norm": 3.59375, + "learning_rate": 1.9940106616904018e-05, + "loss": 0.5295, + "mean_token_accuracy": 0.8947687804698944, + "num_tokens": 26898969.0, + "step": 6410 + }, + { + "entropy": 0.25472576431930066, + "epoch": 1.4953957337685044, + "grad_norm": 3.28125, + "learning_rate": 1.9940012359514456e-05, + "loss": 0.3701, + "mean_token_accuracy": 0.9185781836509704, + "num_tokens": 26921627.0, + "step": 6415 + }, + { + "entropy": 0.3112226489931345, + "epoch": 1.4965613707891363, + "grad_norm": 3.0, + "learning_rate": 1.993991802846188e-05, + "loss": 0.4863, + "mean_token_accuracy": 0.8925800144672393, + "num_tokens": 26943250.0, + "step": 6420 + }, + { + "entropy": 0.2728789247572422, + "epoch": 1.497727007809768, + "grad_norm": 0.75, + "learning_rate": 1.9939823623747695e-05, + "loss": 0.4913, + "mean_token_accuracy": 0.9044696092605591, + "num_tokens": 26966039.0, + "step": 6425 + }, + { + "entropy": 0.22179466150701047, + "epoch": 1.4988926448303999, + "grad_norm": 0.703125, + "learning_rate": 1.993972914537331e-05, + "loss": 0.465, + "mean_token_accuracy": 0.9178914129734039, + "num_tokens": 26984909.0, + "step": 6430 + }, + { + "entropy": 0.22185338092967868, + "epoch": 1.5000582818510315, + "grad_norm": 3.640625, + "learning_rate": 1.9939634593340133e-05, + "loss": 0.3028, + "mean_token_accuracy": 0.9346797227859497, + "num_tokens": 27017339.0, + "step": 6435 + }, + { + "entropy": 0.29497579857707024, + "epoch": 1.5012239188716634, + "grad_norm": 5.78125, + "learning_rate": 1.9939539967649576e-05, + "loss": 0.7158, + "mean_token_accuracy": 0.876299786567688, + "num_tokens": 27029104.0, + "step": 6440 + }, + { + "entropy": 0.2623417802155018, + "epoch": 1.5023895558922953, + "grad_norm": 0.6875, + "learning_rate": 1.9939445268303047e-05, + "loss": 0.5434, + "mean_token_accuracy": 0.8918785274028778, + "num_tokens": 27047768.0, + "step": 6445 + }, + { + "entropy": 0.20947806648910045, + "epoch": 1.503555192912927, + "grad_norm": 0.357421875, + "learning_rate": 1.993935049530196e-05, + "loss": 0.4116, + "mean_token_accuracy": 0.9175561010837555, + "num_tokens": 27073094.0, + "step": 6450 + }, + { + "entropy": 0.3465562131255865, + "epoch": 1.5047208299335586, + "grad_norm": 2.015625, + "learning_rate": 1.9939255648647732e-05, + "loss": 0.542, + "mean_token_accuracy": 0.8854370713233948, + "num_tokens": 27096021.0, + "step": 6455 + }, + { + "entropy": 0.27758454382419584, + "epoch": 1.5058864669541905, + "grad_norm": 3.421875, + "learning_rate": 1.993916072834177e-05, + "loss": 0.5268, + "mean_token_accuracy": 0.8995888471603394, + "num_tokens": 27123583.0, + "step": 6460 + }, + { + "entropy": 0.24073500484228133, + "epoch": 1.5070521039748224, + "grad_norm": 2.6875, + "learning_rate": 1.993906573438549e-05, + "loss": 0.383, + "mean_token_accuracy": 0.9229654431343078, + "num_tokens": 27142117.0, + "step": 6465 + }, + { + "entropy": 0.2729795090854168, + "epoch": 1.508217740995454, + "grad_norm": 0.59765625, + "learning_rate": 1.9938970666780312e-05, + "loss": 0.6103, + "mean_token_accuracy": 0.8879354178905488, + "num_tokens": 27155818.0, + "step": 6470 + }, + { + "entropy": 0.18979131281375886, + "epoch": 1.5093833780160857, + "grad_norm": 0.7890625, + "learning_rate": 1.9938875525527658e-05, + "loss": 0.192, + "mean_token_accuracy": 0.9271632313728333, + "num_tokens": 27183172.0, + "step": 6475 + }, + { + "entropy": 0.21792179644107817, + "epoch": 1.5105490150367176, + "grad_norm": 1.1875, + "learning_rate": 1.9938780310628935e-05, + "loss": 0.3931, + "mean_token_accuracy": 0.9339845836162567, + "num_tokens": 27196815.0, + "step": 6480 + }, + { + "entropy": 0.2289236381649971, + "epoch": 1.5117146520573495, + "grad_norm": 5.625, + "learning_rate": 1.9938685022085573e-05, + "loss": 0.452, + "mean_token_accuracy": 0.9143205046653747, + "num_tokens": 27207717.0, + "step": 6485 + }, + { + "entropy": 0.33930057361721994, + "epoch": 1.512880289077981, + "grad_norm": 5.90625, + "learning_rate": 1.9938589659898987e-05, + "loss": 0.5654, + "mean_token_accuracy": 0.8818209409713745, + "num_tokens": 27233471.0, + "step": 6490 + }, + { + "entropy": 0.2666054558008909, + "epoch": 1.5140459260986128, + "grad_norm": 0.427734375, + "learning_rate": 1.9938494224070605e-05, + "loss": 0.6512, + "mean_token_accuracy": 0.8890865564346313, + "num_tokens": 27253476.0, + "step": 6495 + }, + { + "entropy": 0.2990569584071636, + "epoch": 1.5152115631192447, + "grad_norm": 6.03125, + "learning_rate": 1.993839871460184e-05, + "loss": 0.5722, + "mean_token_accuracy": 0.9082420587539672, + "num_tokens": 27270163.0, + "step": 6500 + }, + { + "entropy": 0.25313875190913676, + "epoch": 1.5163772001398765, + "grad_norm": 5.625, + "learning_rate": 1.9938303131494127e-05, + "loss": 0.42, + "mean_token_accuracy": 0.9069484531879425, + "num_tokens": 27287541.0, + "step": 6505 + }, + { + "entropy": 0.2387036457657814, + "epoch": 1.5175428371605082, + "grad_norm": 4.8125, + "learning_rate": 1.9938207474748886e-05, + "loss": 0.5459, + "mean_token_accuracy": 0.9052523851394654, + "num_tokens": 27302864.0, + "step": 6510 + }, + { + "entropy": 0.2785582607612014, + "epoch": 1.5187084741811399, + "grad_norm": 4.03125, + "learning_rate": 1.9938111744367545e-05, + "loss": 0.4533, + "mean_token_accuracy": 0.8836054503917694, + "num_tokens": 27338949.0, + "step": 6515 + }, + { + "entropy": 0.2182111766189337, + "epoch": 1.5198741112017717, + "grad_norm": 0.59375, + "learning_rate": 1.9938015940351528e-05, + "loss": 0.4181, + "mean_token_accuracy": 0.9164810359477997, + "num_tokens": 27356468.0, + "step": 6520 + }, + { + "entropy": 0.3113727495074272, + "epoch": 1.5210397482224036, + "grad_norm": 2.546875, + "learning_rate": 1.9937920062702267e-05, + "loss": 0.501, + "mean_token_accuracy": 0.905505508184433, + "num_tokens": 27366690.0, + "step": 6525 + }, + { + "entropy": 0.23484750241041183, + "epoch": 1.5222053852430353, + "grad_norm": 7.5, + "learning_rate": 1.993782411142119e-05, + "loss": 0.3325, + "mean_token_accuracy": 0.9258572041988373, + "num_tokens": 27396712.0, + "step": 6530 + }, + { + "entropy": 0.22683720849454403, + "epoch": 1.523371022263667, + "grad_norm": 4.96875, + "learning_rate": 1.9937728086509732e-05, + "loss": 0.2282, + "mean_token_accuracy": 0.9178965389728546, + "num_tokens": 27428456.0, + "step": 6535 + }, + { + "entropy": 0.2929589316248894, + "epoch": 1.5245366592842988, + "grad_norm": 1.4765625, + "learning_rate": 1.9937631987969318e-05, + "loss": 0.5289, + "mean_token_accuracy": 0.8907686352729798, + "num_tokens": 27446640.0, + "step": 6540 + }, + { + "entropy": 0.22855143621563911, + "epoch": 1.5257022963049307, + "grad_norm": 2.8125, + "learning_rate": 1.9937535815801385e-05, + "loss": 0.3996, + "mean_token_accuracy": 0.907921028137207, + "num_tokens": 27460744.0, + "step": 6545 + }, + { + "entropy": 0.3677578168921173, + "epoch": 1.5268679333255624, + "grad_norm": 5.28125, + "learning_rate": 1.993743957000737e-05, + "loss": 0.612, + "mean_token_accuracy": 0.8666451275348663, + "num_tokens": 27492855.0, + "step": 6550 + }, + { + "entropy": 0.3148959677666426, + "epoch": 1.5280335703461942, + "grad_norm": 4.46875, + "learning_rate": 1.9937343250588698e-05, + "loss": 0.4407, + "mean_token_accuracy": 0.8961562097072602, + "num_tokens": 27518578.0, + "step": 6555 + }, + { + "entropy": 0.2055843833833933, + "epoch": 1.5291992073668261, + "grad_norm": 0.6953125, + "learning_rate": 1.9937246857546817e-05, + "loss": 0.3653, + "mean_token_accuracy": 0.928001344203949, + "num_tokens": 27534989.0, + "step": 6560 + }, + { + "entropy": 0.3105059742927551, + "epoch": 1.5303648443874578, + "grad_norm": 3.046875, + "learning_rate": 1.9937150390883156e-05, + "loss": 0.7427, + "mean_token_accuracy": 0.8818167328834534, + "num_tokens": 27545231.0, + "step": 6565 + }, + { + "entropy": 0.21620201570913194, + "epoch": 1.5315304814080895, + "grad_norm": 6.46875, + "learning_rate": 1.9937053850599163e-05, + "loss": 0.3257, + "mean_token_accuracy": 0.9393852114677429, + "num_tokens": 27566195.0, + "step": 6570 + }, + { + "entropy": 0.21292497627437115, + "epoch": 1.5326961184287213, + "grad_norm": 0.349609375, + "learning_rate": 1.9936957236696264e-05, + "loss": 0.4618, + "mean_token_accuracy": 0.9288613259792328, + "num_tokens": 27589114.0, + "step": 6575 + }, + { + "entropy": 0.23033034205436706, + "epoch": 1.5338617554493532, + "grad_norm": 4.8125, + "learning_rate": 1.993686054917591e-05, + "loss": 0.449, + "mean_token_accuracy": 0.9169645845890045, + "num_tokens": 27610688.0, + "step": 6580 + }, + { + "entropy": 0.24333803839981555, + "epoch": 1.5350273924699849, + "grad_norm": 6.71875, + "learning_rate": 1.9936763788039543e-05, + "loss": 0.3681, + "mean_token_accuracy": 0.9251050651073456, + "num_tokens": 27628200.0, + "step": 6585 + }, + { + "entropy": 0.24377419464290143, + "epoch": 1.5361930294906165, + "grad_norm": 2.625, + "learning_rate": 1.9936666953288598e-05, + "loss": 0.3051, + "mean_token_accuracy": 0.9210731863975525, + "num_tokens": 27651910.0, + "step": 6590 + }, + { + "entropy": 0.23668837770819665, + "epoch": 1.5373586665112484, + "grad_norm": 1.015625, + "learning_rate": 1.9936570044924526e-05, + "loss": 0.4558, + "mean_token_accuracy": 0.9113388657569885, + "num_tokens": 27665162.0, + "step": 6595 + }, + { + "entropy": 0.2739979222416878, + "epoch": 1.5385243035318803, + "grad_norm": 3.34375, + "learning_rate": 1.9936473062948765e-05, + "loss": 0.6057, + "mean_token_accuracy": 0.9049014866352081, + "num_tokens": 27676458.0, + "step": 6600 + }, + { + "entropy": 0.32901543751358986, + "epoch": 1.539689940552512, + "grad_norm": 2.734375, + "learning_rate": 1.993637600736277e-05, + "loss": 0.3299, + "mean_token_accuracy": 0.8835425734519958, + "num_tokens": 27711994.0, + "step": 6605 + }, + { + "entropy": 0.292670027539134, + "epoch": 1.5408555775731436, + "grad_norm": 0.5078125, + "learning_rate": 1.9936278878167985e-05, + "loss": 0.565, + "mean_token_accuracy": 0.8792059600353241, + "num_tokens": 27732770.0, + "step": 6610 + }, + { + "entropy": 0.2861447758972645, + "epoch": 1.5420212145937755, + "grad_norm": 3.96875, + "learning_rate": 1.9936181675365856e-05, + "loss": 0.5155, + "mean_token_accuracy": 0.9069945335388183, + "num_tokens": 27756632.0, + "step": 6615 + }, + { + "entropy": 0.22006136178970337, + "epoch": 1.5431868516144074, + "grad_norm": 1.4921875, + "learning_rate": 1.9936084398957834e-05, + "loss": 0.3175, + "mean_token_accuracy": 0.9396511971950531, + "num_tokens": 27778830.0, + "step": 6620 + }, + { + "entropy": 0.33254698645323516, + "epoch": 1.544352488635039, + "grad_norm": 4.21875, + "learning_rate": 1.993598704894537e-05, + "loss": 0.5365, + "mean_token_accuracy": 0.8744619786739349, + "num_tokens": 27805975.0, + "step": 6625 + }, + { + "entropy": 0.3152335677295923, + "epoch": 1.5455181256556707, + "grad_norm": 0.48828125, + "learning_rate": 1.9935889625329913e-05, + "loss": 0.5482, + "mean_token_accuracy": 0.9010307848453522, + "num_tokens": 27828444.0, + "step": 6630 + }, + { + "entropy": 0.2763508759438992, + "epoch": 1.5466837626763026, + "grad_norm": 1.78125, + "learning_rate": 1.9935792128112918e-05, + "loss": 0.416, + "mean_token_accuracy": 0.899656081199646, + "num_tokens": 27856252.0, + "step": 6635 + }, + { + "entropy": 0.29827424213290216, + "epoch": 1.5478493996969345, + "grad_norm": 0.5, + "learning_rate": 1.993569455729584e-05, + "loss": 0.462, + "mean_token_accuracy": 0.8946270823478699, + "num_tokens": 27880426.0, + "step": 6640 + }, + { + "entropy": 0.20729903429746627, + "epoch": 1.5490150367175661, + "grad_norm": 1.1796875, + "learning_rate": 1.993559691288013e-05, + "loss": 0.2037, + "mean_token_accuracy": 0.9211410582065582, + "num_tokens": 27901734.0, + "step": 6645 + }, + { + "entropy": 0.34493443965911863, + "epoch": 1.5501806737381978, + "grad_norm": 8.625, + "learning_rate": 1.993549919486725e-05, + "loss": 0.7576, + "mean_token_accuracy": 0.864366763830185, + "num_tokens": 27921161.0, + "step": 6650 + }, + { + "entropy": 0.29178492836654185, + "epoch": 1.5513463107588297, + "grad_norm": 10.25, + "learning_rate": 1.993540140325865e-05, + "loss": 0.5045, + "mean_token_accuracy": 0.885959267616272, + "num_tokens": 27954097.0, + "step": 6655 + }, + { + "entropy": 0.19429666809737683, + "epoch": 1.5525119477794616, + "grad_norm": 0.49609375, + "learning_rate": 1.9935303538055796e-05, + "loss": 0.2965, + "mean_token_accuracy": 0.9343727469444275, + "num_tokens": 27981520.0, + "step": 6660 + }, + { + "entropy": 0.25541278421878816, + "epoch": 1.5536775848000932, + "grad_norm": 4.0, + "learning_rate": 1.993520559926014e-05, + "loss": 0.4689, + "mean_token_accuracy": 0.9169036328792572, + "num_tokens": 27994168.0, + "step": 6665 + }, + { + "entropy": 0.32678827494382856, + "epoch": 1.5548432218207249, + "grad_norm": 3.375, + "learning_rate": 1.9935107586873145e-05, + "loss": 0.661, + "mean_token_accuracy": 0.8745863318443299, + "num_tokens": 28006200.0, + "step": 6670 + }, + { + "entropy": 0.2589764386415482, + "epoch": 1.5560088588413568, + "grad_norm": 1.0859375, + "learning_rate": 1.9935009500896273e-05, + "loss": 0.378, + "mean_token_accuracy": 0.9126704931259155, + "num_tokens": 28022450.0, + "step": 6675 + }, + { + "entropy": 0.28034481406211853, + "epoch": 1.5571744958619886, + "grad_norm": 1.0859375, + "learning_rate": 1.9934911341330986e-05, + "loss": 0.4009, + "mean_token_accuracy": 0.9167719483375549, + "num_tokens": 28045500.0, + "step": 6680 + }, + { + "entropy": 0.30468578450381756, + "epoch": 1.5583401328826203, + "grad_norm": 1.015625, + "learning_rate": 1.9934813108178752e-05, + "loss": 0.3279, + "mean_token_accuracy": 0.9120621025562287, + "num_tokens": 28088547.0, + "step": 6685 + }, + { + "entropy": 0.20318310568109155, + "epoch": 1.5595057699032522, + "grad_norm": 0.23046875, + "learning_rate": 1.9934714801441032e-05, + "loss": 0.4988, + "mean_token_accuracy": 0.9183111727237702, + "num_tokens": 28120417.0, + "step": 6690 + }, + { + "entropy": 0.19271549321711062, + "epoch": 1.560671406923884, + "grad_norm": 5.25, + "learning_rate": 1.9934616421119287e-05, + "loss": 0.2812, + "mean_token_accuracy": 0.9212694585323333, + "num_tokens": 28146606.0, + "step": 6695 + }, + { + "entropy": 0.23064907528460027, + "epoch": 1.5618370439445157, + "grad_norm": 3.609375, + "learning_rate": 1.9934517967214993e-05, + "loss": 0.2526, + "mean_token_accuracy": 0.9413141012191772, + "num_tokens": 28177886.0, + "step": 6700 + }, + { + "entropy": 0.2984270632266998, + "epoch": 1.5630026809651474, + "grad_norm": 4.125, + "learning_rate": 1.9934419439729615e-05, + "loss": 0.5552, + "mean_token_accuracy": 0.9000501334667206, + "num_tokens": 28196847.0, + "step": 6705 + }, + { + "entropy": 0.1965206727385521, + "epoch": 1.5641683179857793, + "grad_norm": 1.84375, + "learning_rate": 1.993432083866462e-05, + "loss": 0.2889, + "mean_token_accuracy": 0.9421004772186279, + "num_tokens": 28219554.0, + "step": 6710 + }, + { + "entropy": 0.3038263775408268, + "epoch": 1.5653339550064111, + "grad_norm": 4.59375, + "learning_rate": 1.993422216402148e-05, + "loss": 0.4245, + "mean_token_accuracy": 0.9077933967113495, + "num_tokens": 28241847.0, + "step": 6715 + }, + { + "entropy": 0.22943210899829863, + "epoch": 1.5664995920270428, + "grad_norm": 3.265625, + "learning_rate": 1.9934123415801666e-05, + "loss": 0.4105, + "mean_token_accuracy": 0.9183744013309478, + "num_tokens": 28270049.0, + "step": 6720 + }, + { + "entropy": 0.3342425040900707, + "epoch": 1.5676652290476745, + "grad_norm": 3.53125, + "learning_rate": 1.993402459400665e-05, + "loss": 0.6575, + "mean_token_accuracy": 0.8717486083507537, + "num_tokens": 28291347.0, + "step": 6725 + }, + { + "entropy": 0.20970200896263122, + "epoch": 1.5688308660683064, + "grad_norm": 0.419921875, + "learning_rate": 1.9933925698637905e-05, + "loss": 0.2657, + "mean_token_accuracy": 0.928368890285492, + "num_tokens": 28314009.0, + "step": 6730 + }, + { + "entropy": 0.3026204094290733, + "epoch": 1.5699965030889382, + "grad_norm": 5.34375, + "learning_rate": 1.993382672969691e-05, + "loss": 0.6261, + "mean_token_accuracy": 0.8962989449501038, + "num_tokens": 28323739.0, + "step": 6735 + }, + { + "entropy": 0.23009593058377503, + "epoch": 1.57116214010957, + "grad_norm": 0.703125, + "learning_rate": 1.9933727687185137e-05, + "loss": 0.3074, + "mean_token_accuracy": 0.9207689642906189, + "num_tokens": 28364853.0, + "step": 6740 + }, + { + "entropy": 0.35999022983014584, + "epoch": 1.5723277771302016, + "grad_norm": 1.03125, + "learning_rate": 1.993362857110406e-05, + "loss": 0.576, + "mean_token_accuracy": 0.8650194406509399, + "num_tokens": 28389330.0, + "step": 6745 + }, + { + "entropy": 0.27033420875668523, + "epoch": 1.5734934141508334, + "grad_norm": 2.671875, + "learning_rate": 1.9933529381455163e-05, + "loss": 0.4767, + "mean_token_accuracy": 0.9208850085735321, + "num_tokens": 28411791.0, + "step": 6750 + }, + { + "entropy": 0.19342294819653033, + "epoch": 1.5746590511714653, + "grad_norm": 2.0, + "learning_rate": 1.993343011823992e-05, + "loss": 0.3283, + "mean_token_accuracy": 0.9273386657238006, + "num_tokens": 28434134.0, + "step": 6755 + }, + { + "entropy": 0.22842872738838196, + "epoch": 1.575824688192097, + "grad_norm": 0.48046875, + "learning_rate": 1.993333078145982e-05, + "loss": 0.4746, + "mean_token_accuracy": 0.9192339181900024, + "num_tokens": 28455393.0, + "step": 6760 + }, + { + "entropy": 0.20059427116066217, + "epoch": 1.5769903252127286, + "grad_norm": 4.1875, + "learning_rate": 1.993323137111633e-05, + "loss": 0.4625, + "mean_token_accuracy": 0.9268925726413727, + "num_tokens": 28485132.0, + "step": 6765 + }, + { + "entropy": 0.25664917901158335, + "epoch": 1.5781559622333605, + "grad_norm": 5.28125, + "learning_rate": 1.9933131887210944e-05, + "loss": 0.499, + "mean_token_accuracy": 0.9118769764900208, + "num_tokens": 28496012.0, + "step": 6770 + }, + { + "entropy": 0.29543297439813615, + "epoch": 1.5793215992539924, + "grad_norm": 3.53125, + "learning_rate": 1.9933032329745144e-05, + "loss": 0.513, + "mean_token_accuracy": 0.8905039012432099, + "num_tokens": 28508468.0, + "step": 6775 + }, + { + "entropy": 0.27861638814210893, + "epoch": 1.580487236274624, + "grad_norm": 1.984375, + "learning_rate": 1.993293269872041e-05, + "loss": 0.7512, + "mean_token_accuracy": 0.8770630955696106, + "num_tokens": 28520357.0, + "step": 6780 + }, + { + "entropy": 0.26711825728416444, + "epoch": 1.5816528732952557, + "grad_norm": 3.234375, + "learning_rate": 1.993283299413823e-05, + "loss": 0.6114, + "mean_token_accuracy": 0.8977750062942504, + "num_tokens": 28529643.0, + "step": 6785 + }, + { + "entropy": 0.3120358551852405, + "epoch": 1.5828185103158876, + "grad_norm": 0.337890625, + "learning_rate": 1.993273321600009e-05, + "loss": 0.5817, + "mean_token_accuracy": 0.8871180176734924, + "num_tokens": 28549177.0, + "step": 6790 + }, + { + "entropy": 0.2286653283983469, + "epoch": 1.5839841473365195, + "grad_norm": 0.671875, + "learning_rate": 1.9932633364307478e-05, + "loss": 0.3054, + "mean_token_accuracy": 0.9169903874397278, + "num_tokens": 28585854.0, + "step": 6795 + }, + { + "entropy": 0.37026937305927277, + "epoch": 1.5851497843571511, + "grad_norm": 5.0, + "learning_rate": 1.9932533439061886e-05, + "loss": 0.516, + "mean_token_accuracy": 0.8859709143638611, + "num_tokens": 28603616.0, + "step": 6800 + }, + { + "entropy": 0.4093021884560585, + "epoch": 1.5863154213777828, + "grad_norm": 3.6875, + "learning_rate": 1.9932433440264798e-05, + "loss": 0.6387, + "mean_token_accuracy": 0.8732196629047394, + "num_tokens": 28623869.0, + "step": 6805 + }, + { + "entropy": 0.2910556077957153, + "epoch": 1.5874810583984147, + "grad_norm": 1.8828125, + "learning_rate": 1.993233336791771e-05, + "loss": 0.5225, + "mean_token_accuracy": 0.8892961859703064, + "num_tokens": 28653785.0, + "step": 6810 + }, + { + "entropy": 0.2758349239826202, + "epoch": 1.5886466954190466, + "grad_norm": 6.09375, + "learning_rate": 1.9932233222022113e-05, + "loss": 0.6505, + "mean_token_accuracy": 0.8887870073318481, + "num_tokens": 28664354.0, + "step": 6815 + }, + { + "entropy": 0.2991601226851344, + "epoch": 1.5898123324396782, + "grad_norm": 8.4375, + "learning_rate": 1.9932133002579502e-05, + "loss": 0.6383, + "mean_token_accuracy": 0.8858659088611602, + "num_tokens": 28687769.0, + "step": 6820 + }, + { + "entropy": 0.263477098941803, + "epoch": 1.59097796946031, + "grad_norm": 1.3359375, + "learning_rate": 1.9932032709591368e-05, + "loss": 0.2739, + "mean_token_accuracy": 0.9084848940372467, + "num_tokens": 28720836.0, + "step": 6825 + }, + { + "entropy": 0.19756391048431396, + "epoch": 1.592143606480942, + "grad_norm": 1.9609375, + "learning_rate": 1.9931932343059208e-05, + "loss": 0.414, + "mean_token_accuracy": 0.9262231230735779, + "num_tokens": 28736798.0, + "step": 6830 + }, + { + "entropy": 0.2347656786441803, + "epoch": 1.5933092435015737, + "grad_norm": 0.69921875, + "learning_rate": 1.993183190298452e-05, + "loss": 0.4251, + "mean_token_accuracy": 0.9266911625862122, + "num_tokens": 28752927.0, + "step": 6835 + }, + { + "entropy": 0.2072462685406208, + "epoch": 1.5944748805222053, + "grad_norm": 3.71875, + "learning_rate": 1.9931731389368797e-05, + "loss": 0.4016, + "mean_token_accuracy": 0.9348695755004883, + "num_tokens": 28788562.0, + "step": 6840 + }, + { + "entropy": 0.22971604485064745, + "epoch": 1.5956405175428372, + "grad_norm": 3.03125, + "learning_rate": 1.9931630802213543e-05, + "loss": 0.4706, + "mean_token_accuracy": 0.9057029843330383, + "num_tokens": 28809929.0, + "step": 6845 + }, + { + "entropy": 0.22827026546001433, + "epoch": 1.596806154563469, + "grad_norm": 3.953125, + "learning_rate": 1.9931530141520258e-05, + "loss": 0.5347, + "mean_token_accuracy": 0.9116665422916412, + "num_tokens": 28827185.0, + "step": 6850 + }, + { + "entropy": 0.2197150869295001, + "epoch": 1.5979717915841007, + "grad_norm": 0.5625, + "learning_rate": 1.993142940729044e-05, + "loss": 0.5541, + "mean_token_accuracy": 0.8902185618877411, + "num_tokens": 28860613.0, + "step": 6855 + }, + { + "entropy": 0.2837262708693743, + "epoch": 1.5991374286047324, + "grad_norm": 0.60546875, + "learning_rate": 1.9931328599525593e-05, + "loss": 0.67, + "mean_token_accuracy": 0.8655347049236297, + "num_tokens": 28875950.0, + "step": 6860 + }, + { + "entropy": 0.24237373284995556, + "epoch": 1.6003030656253643, + "grad_norm": 0.47265625, + "learning_rate": 1.993122771822722e-05, + "loss": 0.4202, + "mean_token_accuracy": 0.9144159615039825, + "num_tokens": 28901490.0, + "step": 6865 + }, + { + "entropy": 0.24070470854640008, + "epoch": 1.6014687026459962, + "grad_norm": 1.703125, + "learning_rate": 1.9931126763396823e-05, + "loss": 0.3715, + "mean_token_accuracy": 0.9270369708538055, + "num_tokens": 28917847.0, + "step": 6870 + }, + { + "entropy": 0.3077314287424088, + "epoch": 1.6026343396666278, + "grad_norm": 3.515625, + "learning_rate": 1.9931025735035908e-05, + "loss": 0.4596, + "mean_token_accuracy": 0.8983159244060517, + "num_tokens": 28941625.0, + "step": 6875 + }, + { + "entropy": 0.2766427606344223, + "epoch": 1.6037999766872595, + "grad_norm": 2.15625, + "learning_rate": 1.9930924633145983e-05, + "loss": 0.5664, + "mean_token_accuracy": 0.9030839681625367, + "num_tokens": 28952505.0, + "step": 6880 + }, + { + "entropy": 0.21303706653416157, + "epoch": 1.6049656137078914, + "grad_norm": 0.36328125, + "learning_rate": 1.9930823457728556e-05, + "loss": 0.3332, + "mean_token_accuracy": 0.9256323635578155, + "num_tokens": 28980831.0, + "step": 6885 + }, + { + "entropy": 0.26443149968981744, + "epoch": 1.6061312507285233, + "grad_norm": 4.28125, + "learning_rate": 1.9930722208785137e-05, + "loss": 0.4801, + "mean_token_accuracy": 0.9085823595523834, + "num_tokens": 28993269.0, + "step": 6890 + }, + { + "entropy": 0.3455100655555725, + "epoch": 1.607296887749155, + "grad_norm": 4.875, + "learning_rate": 1.993062088631723e-05, + "loss": 0.5618, + "mean_token_accuracy": 0.8891852796077728, + "num_tokens": 29002688.0, + "step": 6895 + }, + { + "entropy": 0.29940509535372256, + "epoch": 1.6084625247697866, + "grad_norm": 0.27734375, + "learning_rate": 1.9930519490326354e-05, + "loss": 0.474, + "mean_token_accuracy": 0.9166371762752533, + "num_tokens": 29033884.0, + "step": 6900 + }, + { + "entropy": 0.13665635734796525, + "epoch": 1.6096281617904185, + "grad_norm": 2.1875, + "learning_rate": 1.993041802081401e-05, + "loss": 0.1412, + "mean_token_accuracy": 0.9601432979106903, + "num_tokens": 29071820.0, + "step": 6905 + }, + { + "entropy": 0.24027777165174485, + "epoch": 1.6107937988110503, + "grad_norm": 0.357421875, + "learning_rate": 1.9930316477781722e-05, + "loss": 0.4429, + "mean_token_accuracy": 0.9213906168937683, + "num_tokens": 29091290.0, + "step": 6910 + }, + { + "entropy": 0.37214391976594924, + "epoch": 1.611959435831682, + "grad_norm": 1.09375, + "learning_rate": 1.9930214861230998e-05, + "loss": 0.5121, + "mean_token_accuracy": 0.886109858751297, + "num_tokens": 29123342.0, + "step": 6915 + }, + { + "entropy": 0.12675322201102973, + "epoch": 1.6131250728523137, + "grad_norm": 0.43359375, + "learning_rate": 1.9930113171163356e-05, + "loss": 0.0864, + "mean_token_accuracy": 0.9702485501766205, + "num_tokens": 29174885.0, + "step": 6920 + }, + { + "entropy": 0.2880819053389132, + "epoch": 1.6142907098729455, + "grad_norm": 0.828125, + "learning_rate": 1.9930011407580312e-05, + "loss": 0.3827, + "mean_token_accuracy": 0.8916058301925659, + "num_tokens": 29214406.0, + "step": 6925 + }, + { + "entropy": 0.23583326674997807, + "epoch": 1.6154563468935774, + "grad_norm": 6.3125, + "learning_rate": 1.9929909570483383e-05, + "loss": 0.3784, + "mean_token_accuracy": 0.9003340721130371, + "num_tokens": 29238310.0, + "step": 6930 + }, + { + "entropy": 0.23994187042117118, + "epoch": 1.616621983914209, + "grad_norm": 1.6171875, + "learning_rate": 1.9929807659874085e-05, + "loss": 0.4249, + "mean_token_accuracy": 0.9134172797203064, + "num_tokens": 29258683.0, + "step": 6935 + }, + { + "entropy": 0.38729229420423505, + "epoch": 1.6177876209348407, + "grad_norm": 4.15625, + "learning_rate": 1.992970567575394e-05, + "loss": 0.5001, + "mean_token_accuracy": 0.9179852843284607, + "num_tokens": 29280874.0, + "step": 6940 + }, + { + "entropy": 0.30107422918081284, + "epoch": 1.6189532579554726, + "grad_norm": 2.328125, + "learning_rate": 1.992960361812447e-05, + "loss": 0.7504, + "mean_token_accuracy": 0.8799044847488403, + "num_tokens": 29289798.0, + "step": 6945 + }, + { + "entropy": 0.25648994743824005, + "epoch": 1.6201188949761045, + "grad_norm": 2.4375, + "learning_rate": 1.9929501486987196e-05, + "loss": 0.5502, + "mean_token_accuracy": 0.9077392220497131, + "num_tokens": 29300408.0, + "step": 6950 + }, + { + "entropy": 0.2755729131400585, + "epoch": 1.6212845319967362, + "grad_norm": 2.59375, + "learning_rate": 1.992939928234364e-05, + "loss": 0.6221, + "mean_token_accuracy": 0.9050759017467499, + "num_tokens": 29313660.0, + "step": 6955 + }, + { + "entropy": 0.1966082751750946, + "epoch": 1.6224501690173678, + "grad_norm": 0.3203125, + "learning_rate": 1.9929297004195328e-05, + "loss": 0.3258, + "mean_token_accuracy": 0.9240009307861328, + "num_tokens": 29343988.0, + "step": 6960 + }, + { + "entropy": 0.2664320237934589, + "epoch": 1.623615806038, + "grad_norm": 4.25, + "learning_rate": 1.9929194652543784e-05, + "loss": 0.3882, + "mean_token_accuracy": 0.9111813545227051, + "num_tokens": 29363067.0, + "step": 6965 + }, + { + "entropy": 0.4264032058417797, + "epoch": 1.6247814430586316, + "grad_norm": 2.46875, + "learning_rate": 1.9929092227390528e-05, + "loss": 0.6451, + "mean_token_accuracy": 0.8786540269851685, + "num_tokens": 29386786.0, + "step": 6970 + }, + { + "entropy": 0.33386958464980127, + "epoch": 1.6259470800792633, + "grad_norm": 6.25, + "learning_rate": 1.9928989728737097e-05, + "loss": 0.6114, + "mean_token_accuracy": 0.8769558131694793, + "num_tokens": 29417271.0, + "step": 6975 + }, + { + "entropy": 0.29403403401374817, + "epoch": 1.6271127170998951, + "grad_norm": 4.25, + "learning_rate": 1.9928887156585017e-05, + "loss": 0.4335, + "mean_token_accuracy": 0.9137054145336151, + "num_tokens": 29442906.0, + "step": 6980 + }, + { + "entropy": 0.23713123053312302, + "epoch": 1.628278354120527, + "grad_norm": 2.703125, + "learning_rate": 1.9928784510935814e-05, + "loss": 0.5125, + "mean_token_accuracy": 0.919365006685257, + "num_tokens": 29454928.0, + "step": 6985 + }, + { + "entropy": 0.36967662423849107, + "epoch": 1.6294439911411587, + "grad_norm": 11.6875, + "learning_rate": 1.992868179179102e-05, + "loss": 0.9495, + "mean_token_accuracy": 0.8575898349285126, + "num_tokens": 29462426.0, + "step": 6990 + }, + { + "entropy": 0.24759368523955344, + "epoch": 1.6306096281617903, + "grad_norm": 5.59375, + "learning_rate": 1.9928578999152168e-05, + "loss": 0.435, + "mean_token_accuracy": 0.9145466089248657, + "num_tokens": 29481420.0, + "step": 6995 + }, + { + "entropy": 0.3167073156684637, + "epoch": 1.6317752651824222, + "grad_norm": 7.0, + "learning_rate": 1.992847613302079e-05, + "loss": 0.7354, + "mean_token_accuracy": 0.8781661272048951, + "num_tokens": 29493112.0, + "step": 7000 + }, + { + "entropy": 0.23697575367987156, + "epoch": 1.632940902203054, + "grad_norm": 3.578125, + "learning_rate": 1.992837319339842e-05, + "loss": 0.3644, + "mean_token_accuracy": 0.9137583911418915, + "num_tokens": 29521475.0, + "step": 7005 + }, + { + "entropy": 0.28280975893139837, + "epoch": 1.6341065392236858, + "grad_norm": 0.69921875, + "learning_rate": 1.9928270180286594e-05, + "loss": 0.4446, + "mean_token_accuracy": 0.9044004201889038, + "num_tokens": 29544330.0, + "step": 7010 + }, + { + "entropy": 0.294763021543622, + "epoch": 1.6352721762443174, + "grad_norm": 4.6875, + "learning_rate": 1.9928167093686848e-05, + "loss": 0.45, + "mean_token_accuracy": 0.8993681728839874, + "num_tokens": 29567647.0, + "step": 7015 + }, + { + "entropy": 0.29370351433753966, + "epoch": 1.6364378132649493, + "grad_norm": 0.78515625, + "learning_rate": 1.9928063933600716e-05, + "loss": 0.6359, + "mean_token_accuracy": 0.893934679031372, + "num_tokens": 29579581.0, + "step": 7020 + }, + { + "entropy": 0.21342548448592424, + "epoch": 1.6376034502855812, + "grad_norm": 3.703125, + "learning_rate": 1.992796070002974e-05, + "loss": 0.4403, + "mean_token_accuracy": 0.9128808677196503, + "num_tokens": 29601334.0, + "step": 7025 + }, + { + "entropy": 0.27890672609210015, + "epoch": 1.6387690873062128, + "grad_norm": 0.330078125, + "learning_rate": 1.9927857392975456e-05, + "loss": 0.4488, + "mean_token_accuracy": 0.8986436545848846, + "num_tokens": 29631924.0, + "step": 7030 + }, + { + "entropy": 0.24148422684520482, + "epoch": 1.6399347243268445, + "grad_norm": 7.25, + "learning_rate": 1.9927754012439407e-05, + "loss": 0.4239, + "mean_token_accuracy": 0.9171118617057801, + "num_tokens": 29660277.0, + "step": 7035 + }, + { + "entropy": 0.2776141263544559, + "epoch": 1.6411003613474764, + "grad_norm": 4.96875, + "learning_rate": 1.9927650558423136e-05, + "loss": 0.524, + "mean_token_accuracy": 0.905617070198059, + "num_tokens": 29671778.0, + "step": 7040 + }, + { + "entropy": 0.19468433856964112, + "epoch": 1.6422659983681083, + "grad_norm": 1.3046875, + "learning_rate": 1.9927547030928182e-05, + "loss": 0.3035, + "mean_token_accuracy": 0.9407100915908814, + "num_tokens": 29688438.0, + "step": 7045 + }, + { + "entropy": 0.3036914974451065, + "epoch": 1.64343163538874, + "grad_norm": 1.4765625, + "learning_rate": 1.9927443429956092e-05, + "loss": 0.5092, + "mean_token_accuracy": 0.8930172383785248, + "num_tokens": 29718105.0, + "step": 7050 + }, + { + "entropy": 0.270086932182312, + "epoch": 1.6445972724093716, + "grad_norm": 6.625, + "learning_rate": 1.9927339755508406e-05, + "loss": 0.5219, + "mean_token_accuracy": 0.907888799905777, + "num_tokens": 29732103.0, + "step": 7055 + }, + { + "entropy": 0.2985291965305805, + "epoch": 1.6457629094300035, + "grad_norm": 0.76953125, + "learning_rate": 1.9927236007586674e-05, + "loss": 0.3884, + "mean_token_accuracy": 0.9194758594036102, + "num_tokens": 29763323.0, + "step": 7060 + }, + { + "entropy": 0.24300597086548806, + "epoch": 1.6469285464506354, + "grad_norm": 0.72265625, + "learning_rate": 1.9927132186192443e-05, + "loss": 0.5663, + "mean_token_accuracy": 0.9005017936229706, + "num_tokens": 29783730.0, + "step": 7065 + }, + { + "entropy": 0.20479331091046332, + "epoch": 1.648094183471267, + "grad_norm": 0.5390625, + "learning_rate": 1.9927028291327262e-05, + "loss": 0.4199, + "mean_token_accuracy": 0.9226056575775147, + "num_tokens": 29805567.0, + "step": 7070 + }, + { + "entropy": 0.24875131323933602, + "epoch": 1.6492598204918987, + "grad_norm": 0.369140625, + "learning_rate": 1.9926924322992677e-05, + "loss": 0.4545, + "mean_token_accuracy": 0.90796457529068, + "num_tokens": 29833232.0, + "step": 7075 + }, + { + "entropy": 0.4148341499269009, + "epoch": 1.6504254575125306, + "grad_norm": 1.2265625, + "learning_rate": 1.9926820281190237e-05, + "loss": 0.5433, + "mean_token_accuracy": 0.8647517114877701, + "num_tokens": 29851404.0, + "step": 7080 + }, + { + "entropy": 0.32308564633131026, + "epoch": 1.6515910945331624, + "grad_norm": 6.15625, + "learning_rate": 1.99267161659215e-05, + "loss": 0.7448, + "mean_token_accuracy": 0.8654117107391357, + "num_tokens": 29859904.0, + "step": 7085 + }, + { + "entropy": 0.22259309757500886, + "epoch": 1.652756731553794, + "grad_norm": 9.3125, + "learning_rate": 1.9926611977188013e-05, + "loss": 0.4774, + "mean_token_accuracy": 0.9207153677940368, + "num_tokens": 29885795.0, + "step": 7090 + }, + { + "entropy": 0.20272818468511106, + "epoch": 1.6539223685744258, + "grad_norm": 0.375, + "learning_rate": 1.992650771499133e-05, + "loss": 0.4031, + "mean_token_accuracy": 0.9296795845031738, + "num_tokens": 29906835.0, + "step": 7095 + }, + { + "entropy": 0.31789565905928613, + "epoch": 1.6550880055950579, + "grad_norm": 3.890625, + "learning_rate": 1.992640337933301e-05, + "loss": 0.4696, + "mean_token_accuracy": 0.8921868026256561, + "num_tokens": 29929819.0, + "step": 7100 + }, + { + "entropy": 0.30611699670553205, + "epoch": 1.6562536426156895, + "grad_norm": 0.8671875, + "learning_rate": 1.9926298970214605e-05, + "loss": 0.6593, + "mean_token_accuracy": 0.8659330070018768, + "num_tokens": 29943553.0, + "step": 7105 + }, + { + "entropy": 0.2864277273416519, + "epoch": 1.6574192796363212, + "grad_norm": 1.375, + "learning_rate": 1.992619448763767e-05, + "loss": 0.6515, + "mean_token_accuracy": 0.8865693151950836, + "num_tokens": 29959174.0, + "step": 7110 + }, + { + "entropy": 0.3368852075189352, + "epoch": 1.658584916656953, + "grad_norm": 5.34375, + "learning_rate": 1.992608993160377e-05, + "loss": 0.661, + "mean_token_accuracy": 0.8898778975009918, + "num_tokens": 29970033.0, + "step": 7115 + }, + { + "entropy": 0.205504653416574, + "epoch": 1.659750553677585, + "grad_norm": 0.62890625, + "learning_rate": 1.9925985302114458e-05, + "loss": 0.3969, + "mean_token_accuracy": 0.9380353331565857, + "num_tokens": 30002128.0, + "step": 7120 + }, + { + "entropy": 0.36266718367114664, + "epoch": 1.6609161906982166, + "grad_norm": 1.6640625, + "learning_rate": 1.9925880599171297e-05, + "loss": 0.6024, + "mean_token_accuracy": 0.8730053007602692, + "num_tokens": 30031590.0, + "step": 7125 + }, + { + "entropy": 0.4454704590141773, + "epoch": 1.6620818277188483, + "grad_norm": 3.8125, + "learning_rate": 1.9925775822775848e-05, + "loss": 0.743, + "mean_token_accuracy": 0.8447412520647049, + "num_tokens": 30047795.0, + "step": 7130 + }, + { + "entropy": 0.24732003286480903, + "epoch": 1.6632474647394802, + "grad_norm": 2.75, + "learning_rate": 1.992567097292967e-05, + "loss": 0.4618, + "mean_token_accuracy": 0.9004866123199463, + "num_tokens": 30060337.0, + "step": 7135 + }, + { + "entropy": 0.299695748090744, + "epoch": 1.664413101760112, + "grad_norm": 0.50390625, + "learning_rate": 1.992556604963433e-05, + "loss": 0.3696, + "mean_token_accuracy": 0.9032521545886993, + "num_tokens": 30097236.0, + "step": 7140 + }, + { + "entropy": 0.29932774901390075, + "epoch": 1.6655787387807437, + "grad_norm": 4.5, + "learning_rate": 1.9925461052891394e-05, + "loss": 0.5304, + "mean_token_accuracy": 0.8978306174278259, + "num_tokens": 30123892.0, + "step": 7145 + }, + { + "entropy": 0.2939157888293266, + "epoch": 1.6667443758013754, + "grad_norm": 5.28125, + "learning_rate": 1.9925355982702422e-05, + "loss": 0.485, + "mean_token_accuracy": 0.9099433124065399, + "num_tokens": 30140095.0, + "step": 7150 + }, + { + "entropy": 0.3226566888391972, + "epoch": 1.6679100128220072, + "grad_norm": 5.71875, + "learning_rate": 1.9925250839068985e-05, + "loss": 0.6285, + "mean_token_accuracy": 0.878937691450119, + "num_tokens": 30153066.0, + "step": 7155 + }, + { + "entropy": 0.317183431237936, + "epoch": 1.6690756498426391, + "grad_norm": 0.5625, + "learning_rate": 1.992514562199265e-05, + "loss": 0.5418, + "mean_token_accuracy": 0.8883817255496979, + "num_tokens": 30172082.0, + "step": 7160 + }, + { + "entropy": 0.36269724033772943, + "epoch": 1.6702412868632708, + "grad_norm": 4.625, + "learning_rate": 1.992504033147499e-05, + "loss": 0.6548, + "mean_token_accuracy": 0.8761373639106751, + "num_tokens": 30194571.0, + "step": 7165 + }, + { + "entropy": 0.30474191829562186, + "epoch": 1.6714069238839024, + "grad_norm": 2.65625, + "learning_rate": 1.9924934967517566e-05, + "loss": 0.494, + "mean_token_accuracy": 0.9052575767040253, + "num_tokens": 30205882.0, + "step": 7170 + }, + { + "entropy": 0.27710217610001564, + "epoch": 1.6725725609045343, + "grad_norm": 0.49609375, + "learning_rate": 1.9924829530121955e-05, + "loss": 0.5382, + "mean_token_accuracy": 0.8841014266014099, + "num_tokens": 30227634.0, + "step": 7175 + }, + { + "entropy": 0.35291581228375435, + "epoch": 1.6737381979251662, + "grad_norm": 3.5625, + "learning_rate": 1.992472401928973e-05, + "loss": 0.5598, + "mean_token_accuracy": 0.8761431634426117, + "num_tokens": 30247877.0, + "step": 7180 + }, + { + "entropy": 0.20774665847420692, + "epoch": 1.6749038349457979, + "grad_norm": 0.4140625, + "learning_rate": 1.992461843502246e-05, + "loss": 0.1989, + "mean_token_accuracy": 0.9413784801959991, + "num_tokens": 30284908.0, + "step": 7185 + }, + { + "entropy": 0.1597726047039032, + "epoch": 1.6760694719664295, + "grad_norm": 1.0234375, + "learning_rate": 1.9924512777321724e-05, + "loss": 0.2075, + "mean_token_accuracy": 0.9432705879211426, + "num_tokens": 30313340.0, + "step": 7190 + }, + { + "entropy": 0.2070239342749119, + "epoch": 1.6772351089870614, + "grad_norm": 3.625, + "learning_rate": 1.9924407046189097e-05, + "loss": 0.3967, + "mean_token_accuracy": 0.937429141998291, + "num_tokens": 30327617.0, + "step": 7195 + }, + { + "entropy": 0.4867036297917366, + "epoch": 1.6784007460076933, + "grad_norm": 5.40625, + "learning_rate": 1.992430124162615e-05, + "loss": 0.8424, + "mean_token_accuracy": 0.8647673785686493, + "num_tokens": 30345129.0, + "step": 7200 + }, + { + "entropy": 0.3646535977721214, + "epoch": 1.679566383028325, + "grad_norm": 1.03125, + "learning_rate": 1.992419536363447e-05, + "loss": 0.6495, + "mean_token_accuracy": 0.8817200243473053, + "num_tokens": 30361294.0, + "step": 7205 + }, + { + "entropy": 0.21611667182296515, + "epoch": 1.6807320200489566, + "grad_norm": 4.3125, + "learning_rate": 1.992408941221563e-05, + "loss": 0.5187, + "mean_token_accuracy": 0.901872044801712, + "num_tokens": 30382658.0, + "step": 7210 + }, + { + "entropy": 0.2701580785214901, + "epoch": 1.6818976570695885, + "grad_norm": 2.28125, + "learning_rate": 1.9923983387371207e-05, + "loss": 0.4991, + "mean_token_accuracy": 0.906497186422348, + "num_tokens": 30401438.0, + "step": 7215 + }, + { + "entropy": 0.3351647056639194, + "epoch": 1.6830632940902204, + "grad_norm": 0.5703125, + "learning_rate": 1.992387728910279e-05, + "loss": 0.5747, + "mean_token_accuracy": 0.8796181321144104, + "num_tokens": 30421983.0, + "step": 7220 + }, + { + "entropy": 0.3249632440507412, + "epoch": 1.684228931110852, + "grad_norm": 1.15625, + "learning_rate": 1.992377111741195e-05, + "loss": 0.6371, + "mean_token_accuracy": 0.8933514654636383, + "num_tokens": 30433163.0, + "step": 7225 + }, + { + "entropy": 0.15651183463633062, + "epoch": 1.6853945681314837, + "grad_norm": 1.265625, + "learning_rate": 1.9923664872300284e-05, + "loss": 0.265, + "mean_token_accuracy": 0.9518414855003356, + "num_tokens": 30461301.0, + "step": 7230 + }, + { + "entropy": 0.29888438284397123, + "epoch": 1.6865602051521156, + "grad_norm": 4.59375, + "learning_rate": 1.9923558553769363e-05, + "loss": 0.4514, + "mean_token_accuracy": 0.8969875752925873, + "num_tokens": 30490325.0, + "step": 7235 + }, + { + "entropy": 0.2710587065666914, + "epoch": 1.6877258421727475, + "grad_norm": 1.9453125, + "learning_rate": 1.9923452161820785e-05, + "loss": 0.431, + "mean_token_accuracy": 0.9071939885616302, + "num_tokens": 30507491.0, + "step": 7240 + }, + { + "entropy": 0.2726780742406845, + "epoch": 1.6888914791933791, + "grad_norm": 6.0, + "learning_rate": 1.9923345696456126e-05, + "loss": 0.7774, + "mean_token_accuracy": 0.8846882879734039, + "num_tokens": 30517367.0, + "step": 7245 + }, + { + "entropy": 0.31769172176718713, + "epoch": 1.690057116214011, + "grad_norm": 2.875, + "learning_rate": 1.9923239157676978e-05, + "loss": 0.5541, + "mean_token_accuracy": 0.898011964559555, + "num_tokens": 30538251.0, + "step": 7250 + }, + { + "entropy": 0.40227584093809127, + "epoch": 1.6912227532346429, + "grad_norm": 3.71875, + "learning_rate": 1.992313254548493e-05, + "loss": 0.7193, + "mean_token_accuracy": 0.8725230753421783, + "num_tokens": 30556048.0, + "step": 7255 + }, + { + "entropy": 0.2761504050344229, + "epoch": 1.6923883902552745, + "grad_norm": 0.6875, + "learning_rate": 1.9923025859881567e-05, + "loss": 0.6691, + "mean_token_accuracy": 0.8683241248130799, + "num_tokens": 30576298.0, + "step": 7260 + }, + { + "entropy": 0.27000463437289, + "epoch": 1.6935540272759062, + "grad_norm": 3.09375, + "learning_rate": 1.992291910086849e-05, + "loss": 0.4296, + "mean_token_accuracy": 0.909073394536972, + "num_tokens": 30603726.0, + "step": 7265 + }, + { + "entropy": 0.2889959217980504, + "epoch": 1.694719664296538, + "grad_norm": 4.59375, + "learning_rate": 1.9922812268447283e-05, + "loss": 0.4767, + "mean_token_accuracy": 0.9106442332267761, + "num_tokens": 30641205.0, + "step": 7270 + }, + { + "entropy": 0.2706956649199128, + "epoch": 1.69588530131717, + "grad_norm": 1.8359375, + "learning_rate": 1.992270536261954e-05, + "loss": 0.471, + "mean_token_accuracy": 0.8838472962379456, + "num_tokens": 30671221.0, + "step": 7275 + }, + { + "entropy": 0.2506210308521986, + "epoch": 1.6970509383378016, + "grad_norm": 1.7109375, + "learning_rate": 1.9922598383386854e-05, + "loss": 0.2305, + "mean_token_accuracy": 0.8959549248218537, + "num_tokens": 30697512.0, + "step": 7280 + }, + { + "entropy": 0.2710033968091011, + "epoch": 1.6982165753584333, + "grad_norm": 9.5, + "learning_rate": 1.9922491330750824e-05, + "loss": 0.6513, + "mean_token_accuracy": 0.8868758141994476, + "num_tokens": 30714445.0, + "step": 7285 + }, + { + "entropy": 0.2399160273373127, + "epoch": 1.6993822123790652, + "grad_norm": 2.890625, + "learning_rate": 1.9922384204713044e-05, + "loss": 0.4042, + "mean_token_accuracy": 0.9155187606811523, + "num_tokens": 30732133.0, + "step": 7290 + }, + { + "entropy": 0.23681249283254147, + "epoch": 1.700547849399697, + "grad_norm": 0.4453125, + "learning_rate": 1.992227700527511e-05, + "loss": 0.2615, + "mean_token_accuracy": 0.9078514516353607, + "num_tokens": 30763956.0, + "step": 7295 + }, + { + "entropy": 0.15826738066971302, + "epoch": 1.7017134864203287, + "grad_norm": 0.46484375, + "learning_rate": 1.9922169732438624e-05, + "loss": 0.2281, + "mean_token_accuracy": 0.9464801192283631, + "num_tokens": 30803369.0, + "step": 7300 + }, + { + "entropy": 0.3240881063044071, + "epoch": 1.7028791234409604, + "grad_norm": 0.59765625, + "learning_rate": 1.9922062386205187e-05, + "loss": 0.4112, + "mean_token_accuracy": 0.9000102639198303, + "num_tokens": 30823285.0, + "step": 7305 + }, + { + "entropy": 0.1817174531519413, + "epoch": 1.7040447604615923, + "grad_norm": 0.7734375, + "learning_rate": 1.9921954966576392e-05, + "loss": 0.2731, + "mean_token_accuracy": 0.9324170172214508, + "num_tokens": 30843868.0, + "step": 7310 + }, + { + "entropy": 0.26547779366374014, + "epoch": 1.7052103974822241, + "grad_norm": 4.3125, + "learning_rate": 1.992184747355385e-05, + "loss": 0.563, + "mean_token_accuracy": 0.8946157336235047, + "num_tokens": 30857862.0, + "step": 7315 + }, + { + "entropy": 0.19620814472436904, + "epoch": 1.7063760345028558, + "grad_norm": 2.46875, + "learning_rate": 1.9921739907139153e-05, + "loss": 0.3691, + "mean_token_accuracy": 0.9361901104450225, + "num_tokens": 30870783.0, + "step": 7320 + }, + { + "entropy": 0.21691591143608094, + "epoch": 1.7075416715234875, + "grad_norm": 1.734375, + "learning_rate": 1.9921632267333915e-05, + "loss": 0.5116, + "mean_token_accuracy": 0.914357042312622, + "num_tokens": 30883953.0, + "step": 7325 + }, + { + "entropy": 0.2352630764245987, + "epoch": 1.7087073085441193, + "grad_norm": 4.15625, + "learning_rate": 1.9921524554139736e-05, + "loss": 0.4476, + "mean_token_accuracy": 0.916910320520401, + "num_tokens": 30899288.0, + "step": 7330 + }, + { + "entropy": 0.15292470753192902, + "epoch": 1.7098729455647512, + "grad_norm": 0.423828125, + "learning_rate": 1.9921416767558227e-05, + "loss": 0.2157, + "mean_token_accuracy": 0.945728212594986, + "num_tokens": 30931355.0, + "step": 7335 + }, + { + "entropy": 0.2981669930741191, + "epoch": 1.7110385825853829, + "grad_norm": 1.6640625, + "learning_rate": 1.992130890759099e-05, + "loss": 0.5402, + "mean_token_accuracy": 0.8897511839866639, + "num_tokens": 30950440.0, + "step": 7340 + }, + { + "entropy": 0.23863260447978973, + "epoch": 1.7122042196060145, + "grad_norm": 5.53125, + "learning_rate": 1.9921200974239632e-05, + "loss": 0.5498, + "mean_token_accuracy": 0.9120193362236023, + "num_tokens": 30962258.0, + "step": 7345 + }, + { + "entropy": 0.19053780883550644, + "epoch": 1.7133698566266464, + "grad_norm": 0.7265625, + "learning_rate": 1.992109296750577e-05, + "loss": 0.2194, + "mean_token_accuracy": 0.934450340270996, + "num_tokens": 30997839.0, + "step": 7350 + }, + { + "entropy": 0.18329034112393855, + "epoch": 1.7145354936472783, + "grad_norm": 2.59375, + "learning_rate": 1.9920984887391005e-05, + "loss": 0.4037, + "mean_token_accuracy": 0.9329216539859772, + "num_tokens": 31025334.0, + "step": 7355 + }, + { + "entropy": 0.19702217131853103, + "epoch": 1.71570113066791, + "grad_norm": 3.234375, + "learning_rate": 1.9920876733896957e-05, + "loss": 0.3438, + "mean_token_accuracy": 0.9348407089710236, + "num_tokens": 31050064.0, + "step": 7360 + }, + { + "entropy": 0.320951434969902, + "epoch": 1.7168667676885416, + "grad_norm": 3.3125, + "learning_rate": 1.9920768507025235e-05, + "loss": 0.5197, + "mean_token_accuracy": 0.8957397937774658, + "num_tokens": 31060139.0, + "step": 7365 + }, + { + "entropy": 0.19964747801423072, + "epoch": 1.7180324047091735, + "grad_norm": 3.953125, + "learning_rate": 1.9920660206777457e-05, + "loss": 0.3856, + "mean_token_accuracy": 0.9334841907024384, + "num_tokens": 31072742.0, + "step": 7370 + }, + { + "entropy": 0.17681464105844497, + "epoch": 1.7191980417298054, + "grad_norm": 0.435546875, + "learning_rate": 1.992055183315523e-05, + "loss": 0.211, + "mean_token_accuracy": 0.9506916999816895, + "num_tokens": 31104957.0, + "step": 7375 + }, + { + "entropy": 0.22340110838413238, + "epoch": 1.720363678750437, + "grad_norm": 6.9375, + "learning_rate": 1.9920443386160177e-05, + "loss": 0.4826, + "mean_token_accuracy": 0.9081883072853089, + "num_tokens": 31123653.0, + "step": 7380 + }, + { + "entropy": 0.18643004707992078, + "epoch": 1.721529315771069, + "grad_norm": 0.65234375, + "learning_rate": 1.992033486579391e-05, + "loss": 0.2215, + "mean_token_accuracy": 0.9491021037101746, + "num_tokens": 31157290.0, + "step": 7385 + }, + { + "entropy": 0.2977891772985458, + "epoch": 1.7226949527917008, + "grad_norm": 4.625, + "learning_rate": 1.992022627205805e-05, + "loss": 0.6306, + "mean_token_accuracy": 0.8962555050849914, + "num_tokens": 31167243.0, + "step": 7390 + }, + { + "entropy": 0.1719819199293852, + "epoch": 1.7238605898123325, + "grad_norm": 0.8203125, + "learning_rate": 1.9920117604954213e-05, + "loss": 0.1391, + "mean_token_accuracy": 0.9344309747219086, + "num_tokens": 31205604.0, + "step": 7395 + }, + { + "entropy": 0.2909649141132832, + "epoch": 1.7250262268329641, + "grad_norm": 5.40625, + "learning_rate": 1.9920008864484026e-05, + "loss": 0.4333, + "mean_token_accuracy": 0.9086243093013764, + "num_tokens": 31219789.0, + "step": 7400 + }, + { + "entropy": 0.31774648912250997, + "epoch": 1.726191863853596, + "grad_norm": 0.9140625, + "learning_rate": 1.9919900050649106e-05, + "loss": 0.6789, + "mean_token_accuracy": 0.8734631478786469, + "num_tokens": 31234682.0, + "step": 7405 + }, + { + "entropy": 0.2277542307972908, + "epoch": 1.727357500874228, + "grad_norm": 0.609375, + "learning_rate": 1.9919791163451078e-05, + "loss": 0.349, + "mean_token_accuracy": 0.9085290551185607, + "num_tokens": 31252277.0, + "step": 7410 + }, + { + "entropy": 0.23079803325235843, + "epoch": 1.7285231378948596, + "grad_norm": 5.0, + "learning_rate": 1.991968220289156e-05, + "loss": 0.3643, + "mean_token_accuracy": 0.9307330787181854, + "num_tokens": 31275731.0, + "step": 7415 + }, + { + "entropy": 0.3496759317815304, + "epoch": 1.7296887749154912, + "grad_norm": 6.5625, + "learning_rate": 1.991957316897218e-05, + "loss": 0.7099, + "mean_token_accuracy": 0.8756554901599884, + "num_tokens": 31296718.0, + "step": 7420 + }, + { + "entropy": 0.32018161565065384, + "epoch": 1.730854411936123, + "grad_norm": 0.83984375, + "learning_rate": 1.9919464061694573e-05, + "loss": 0.513, + "mean_token_accuracy": 0.8961473703384399, + "num_tokens": 31309746.0, + "step": 7425 + }, + { + "entropy": 0.2859540117904544, + "epoch": 1.732020048956755, + "grad_norm": 2.78125, + "learning_rate": 1.991935488106035e-05, + "loss": 0.6537, + "mean_token_accuracy": 0.887366658449173, + "num_tokens": 31327616.0, + "step": 7430 + }, + { + "entropy": 0.31319847926497457, + "epoch": 1.7331856859773866, + "grad_norm": 1.53125, + "learning_rate": 1.991924562707115e-05, + "loss": 0.5387, + "mean_token_accuracy": 0.8870364725589752, + "num_tokens": 31349824.0, + "step": 7435 + }, + { + "entropy": 0.3842922620475292, + "epoch": 1.7343513229980183, + "grad_norm": 0.490234375, + "learning_rate": 1.9919136299728597e-05, + "loss": 0.4692, + "mean_token_accuracy": 0.8711851298809051, + "num_tokens": 31373296.0, + "step": 7440 + }, + { + "entropy": 0.24882024489343166, + "epoch": 1.7355169600186502, + "grad_norm": 4.25, + "learning_rate": 1.991902689903432e-05, + "loss": 0.4052, + "mean_token_accuracy": 0.9191643178462983, + "num_tokens": 31396449.0, + "step": 7445 + }, + { + "entropy": 0.299553207680583, + "epoch": 1.736682597039282, + "grad_norm": 3.921875, + "learning_rate": 1.991891742498996e-05, + "loss": 0.454, + "mean_token_accuracy": 0.9090847849845887, + "num_tokens": 31416553.0, + "step": 7450 + }, + { + "entropy": 0.22123729214072227, + "epoch": 1.7378482340599137, + "grad_norm": 5.78125, + "learning_rate": 1.9918807877597137e-05, + "loss": 0.3916, + "mean_token_accuracy": 0.9285433471202851, + "num_tokens": 31431941.0, + "step": 7455 + }, + { + "entropy": 0.2913926810026169, + "epoch": 1.7390138710805454, + "grad_norm": 2.265625, + "learning_rate": 1.991869825685749e-05, + "loss": 0.6338, + "mean_token_accuracy": 0.8911004424095154, + "num_tokens": 31441855.0, + "step": 7460 + }, + { + "entropy": 0.29126920998096467, + "epoch": 1.7401795081011773, + "grad_norm": 5.125, + "learning_rate": 1.9918588562772658e-05, + "loss": 0.4673, + "mean_token_accuracy": 0.9109508275985718, + "num_tokens": 31452820.0, + "step": 7465 + }, + { + "entropy": 0.23168888092041015, + "epoch": 1.7413451451218092, + "grad_norm": 3.203125, + "learning_rate": 1.991847879534427e-05, + "loss": 0.4493, + "mean_token_accuracy": 0.9215346693992614, + "num_tokens": 31465636.0, + "step": 7470 + }, + { + "entropy": 0.2087232932448387, + "epoch": 1.7425107821424408, + "grad_norm": 1.328125, + "learning_rate": 1.991836895457397e-05, + "loss": 0.36, + "mean_token_accuracy": 0.9289436519145966, + "num_tokens": 31481900.0, + "step": 7475 + }, + { + "entropy": 0.27741892635822296, + "epoch": 1.7436764191630725, + "grad_norm": 4.0625, + "learning_rate": 1.9918259040463387e-05, + "loss": 0.5337, + "mean_token_accuracy": 0.9090081572532653, + "num_tokens": 31493712.0, + "step": 7480 + }, + { + "entropy": 0.5755782432854175, + "epoch": 1.7448420561837044, + "grad_norm": 1.359375, + "learning_rate": 1.9918149053014165e-05, + "loss": 1.1664, + "mean_token_accuracy": 0.8490794718265533, + "num_tokens": 31521846.0, + "step": 7485 + }, + { + "entropy": 0.26485144533216953, + "epoch": 1.7460076932043362, + "grad_norm": 8.6875, + "learning_rate": 1.9918038992227942e-05, + "loss": 0.5802, + "mean_token_accuracy": 0.9102464139461517, + "num_tokens": 31535230.0, + "step": 7490 + }, + { + "entropy": 0.32285096058622004, + "epoch": 1.747173330224968, + "grad_norm": 6.6875, + "learning_rate": 1.9917928858106363e-05, + "loss": 0.5593, + "mean_token_accuracy": 0.8858695566654206, + "num_tokens": 31560034.0, + "step": 7495 + }, + { + "entropy": 0.312009833753109, + "epoch": 1.7483389672455996, + "grad_norm": 5.0625, + "learning_rate": 1.9917818650651062e-05, + "loss": 0.5231, + "mean_token_accuracy": 0.8962408840656281, + "num_tokens": 31572987.0, + "step": 7500 + }, + { + "entropy": 0.1852190401405096, + "epoch": 1.7495046042662314, + "grad_norm": 2.53125, + "learning_rate": 1.9917708369863695e-05, + "loss": 0.3109, + "mean_token_accuracy": 0.9369418799877167, + "num_tokens": 31593919.0, + "step": 7505 + }, + { + "entropy": 0.28790936544537543, + "epoch": 1.7506702412868633, + "grad_norm": 10.25, + "learning_rate": 1.9917598015745897e-05, + "loss": 0.4879, + "mean_token_accuracy": 0.8942620098590851, + "num_tokens": 31614322.0, + "step": 7510 + }, + { + "entropy": 0.2300134082324803, + "epoch": 1.751835878307495, + "grad_norm": 0.56640625, + "learning_rate": 1.9917487588299315e-05, + "loss": 0.4391, + "mean_token_accuracy": 0.9022059321403504, + "num_tokens": 31645430.0, + "step": 7515 + }, + { + "entropy": 0.2961542010307312, + "epoch": 1.7530015153281269, + "grad_norm": 4.1875, + "learning_rate": 1.99173770875256e-05, + "loss": 0.3211, + "mean_token_accuracy": 0.9309226989746093, + "num_tokens": 31668774.0, + "step": 7520 + }, + { + "entropy": 0.29538979530334475, + "epoch": 1.7541671523487588, + "grad_norm": 0.94140625, + "learning_rate": 1.9917266513426395e-05, + "loss": 0.4159, + "mean_token_accuracy": 0.9046856939792634, + "num_tokens": 31690334.0, + "step": 7525 + }, + { + "entropy": 0.26101839244365693, + "epoch": 1.7553327893693904, + "grad_norm": 3.71875, + "learning_rate": 1.9917155866003348e-05, + "loss": 0.4387, + "mean_token_accuracy": 0.919017630815506, + "num_tokens": 31713885.0, + "step": 7530 + }, + { + "entropy": 0.2481499405577779, + "epoch": 1.756498426390022, + "grad_norm": 1.1796875, + "learning_rate": 1.9917045145258113e-05, + "loss": 0.3334, + "mean_token_accuracy": 0.9115027070045472, + "num_tokens": 31738068.0, + "step": 7535 + }, + { + "entropy": 0.24259122014045714, + "epoch": 1.757664063410654, + "grad_norm": 1.2890625, + "learning_rate": 1.9916934351192337e-05, + "loss": 0.4097, + "mean_token_accuracy": 0.9250048696994781, + "num_tokens": 31751725.0, + "step": 7540 + }, + { + "entropy": 0.2630830302834511, + "epoch": 1.7588297004312858, + "grad_norm": 6.03125, + "learning_rate": 1.9916823483807677e-05, + "loss": 0.6452, + "mean_token_accuracy": 0.8848126351833343, + "num_tokens": 31766007.0, + "step": 7545 + }, + { + "entropy": 0.29085163548588755, + "epoch": 1.7599953374519175, + "grad_norm": 7.65625, + "learning_rate": 1.9916712543105784e-05, + "loss": 0.591, + "mean_token_accuracy": 0.8972097933292389, + "num_tokens": 31777744.0, + "step": 7550 + }, + { + "entropy": 0.21113423565402628, + "epoch": 1.7611609744725492, + "grad_norm": 4.65625, + "learning_rate": 1.991660152908831e-05, + "loss": 0.3933, + "mean_token_accuracy": 0.919322258234024, + "num_tokens": 31808311.0, + "step": 7555 + }, + { + "entropy": 0.2468874402344227, + "epoch": 1.762326611493181, + "grad_norm": 3.3125, + "learning_rate": 1.9916490441756916e-05, + "loss": 0.3811, + "mean_token_accuracy": 0.9185133218765259, + "num_tokens": 31829676.0, + "step": 7560 + }, + { + "entropy": 0.22626171559095382, + "epoch": 1.763492248513813, + "grad_norm": 6.3125, + "learning_rate": 1.991637928111325e-05, + "loss": 0.4106, + "mean_token_accuracy": 0.921566492319107, + "num_tokens": 31851995.0, + "step": 7565 + }, + { + "entropy": 0.33512529246509076, + "epoch": 1.7646578855344446, + "grad_norm": 7.46875, + "learning_rate": 1.9916268047158976e-05, + "loss": 0.6447, + "mean_token_accuracy": 0.8610825300216675, + "num_tokens": 31876760.0, + "step": 7570 + }, + { + "entropy": 0.3237742781639099, + "epoch": 1.7658235225550762, + "grad_norm": 0.6328125, + "learning_rate": 1.9916156739895748e-05, + "loss": 0.555, + "mean_token_accuracy": 0.885119891166687, + "num_tokens": 31900782.0, + "step": 7575 + }, + { + "entropy": 0.2589297104626894, + "epoch": 1.7669891595757081, + "grad_norm": 0.8828125, + "learning_rate": 1.9916045359325234e-05, + "loss": 0.3046, + "mean_token_accuracy": 0.9248237907886505, + "num_tokens": 31933210.0, + "step": 7580 + }, + { + "entropy": 0.18404842913150787, + "epoch": 1.76815479659634, + "grad_norm": 0.8125, + "learning_rate": 1.9915933905449087e-05, + "loss": 0.1783, + "mean_token_accuracy": 0.9453945815563202, + "num_tokens": 31967439.0, + "step": 7585 + }, + { + "entropy": 0.14019958060234786, + "epoch": 1.7693204336169717, + "grad_norm": 2.015625, + "learning_rate": 1.9915822378268973e-05, + "loss": 0.2196, + "mean_token_accuracy": 0.9397667944431305, + "num_tokens": 31994838.0, + "step": 7590 + }, + { + "entropy": 0.24855429008603097, + "epoch": 1.7704860706376033, + "grad_norm": 1.9375, + "learning_rate": 1.991571077778655e-05, + "loss": 0.5965, + "mean_token_accuracy": 0.8934241354465484, + "num_tokens": 32009912.0, + "step": 7595 + }, + { + "entropy": 0.18048623353242874, + "epoch": 1.7716517076582352, + "grad_norm": 2.671875, + "learning_rate": 1.9915599104003486e-05, + "loss": 0.171, + "mean_token_accuracy": 0.9239882111549378, + "num_tokens": 32039346.0, + "step": 7600 + }, + { + "entropy": 0.20122657679021358, + "epoch": 1.772817344678867, + "grad_norm": 0.65234375, + "learning_rate": 1.991548735692145e-05, + "loss": 0.3101, + "mean_token_accuracy": 0.9241252362728118, + "num_tokens": 32068148.0, + "step": 7605 + }, + { + "entropy": 0.24722633212804795, + "epoch": 1.7739829816994988, + "grad_norm": 2.5625, + "learning_rate": 1.99153755365421e-05, + "loss": 0.4331, + "mean_token_accuracy": 0.9065563023090363, + "num_tokens": 32088079.0, + "step": 7610 + }, + { + "entropy": 0.2641620749607682, + "epoch": 1.7751486187201304, + "grad_norm": 1.1875, + "learning_rate": 1.991526364286711e-05, + "loss": 0.5494, + "mean_token_accuracy": 0.8998633921146393, + "num_tokens": 32112963.0, + "step": 7615 + }, + { + "entropy": 0.28498165756464006, + "epoch": 1.7763142557407623, + "grad_norm": 4.0, + "learning_rate": 1.9915151675898144e-05, + "loss": 0.6748, + "mean_token_accuracy": 0.8952282905578614, + "num_tokens": 32123851.0, + "step": 7620 + }, + { + "entropy": 0.33082397319376466, + "epoch": 1.7774798927613942, + "grad_norm": 5.75, + "learning_rate": 1.9915039635636876e-05, + "loss": 0.6175, + "mean_token_accuracy": 0.8862141251564026, + "num_tokens": 32138905.0, + "step": 7625 + }, + { + "entropy": 0.2725643038749695, + "epoch": 1.7786455297820258, + "grad_norm": 2.9375, + "learning_rate": 1.9914927522084975e-05, + "loss": 0.4621, + "mean_token_accuracy": 0.9120152533054352, + "num_tokens": 32159510.0, + "step": 7630 + }, + { + "entropy": 0.21518983766436578, + "epoch": 1.7798111668026575, + "grad_norm": 6.71875, + "learning_rate": 1.991481533524411e-05, + "loss": 0.3764, + "mean_token_accuracy": 0.9207785665988922, + "num_tokens": 32180441.0, + "step": 7635 + }, + { + "entropy": 0.2944613240659237, + "epoch": 1.7809768038232894, + "grad_norm": 5.4375, + "learning_rate": 1.991470307511596e-05, + "loss": 0.5341, + "mean_token_accuracy": 0.8910565853118897, + "num_tokens": 32195329.0, + "step": 7640 + }, + { + "entropy": 0.37638199105858805, + "epoch": 1.7821424408439213, + "grad_norm": 10.75, + "learning_rate": 1.9914590741702188e-05, + "loss": 0.818, + "mean_token_accuracy": 0.8501861274242402, + "num_tokens": 32212617.0, + "step": 7645 + }, + { + "entropy": 0.20262998938560486, + "epoch": 1.783308077864553, + "grad_norm": 0.94140625, + "learning_rate": 1.9914478335004482e-05, + "loss": 0.2963, + "mean_token_accuracy": 0.9370038688182831, + "num_tokens": 32233596.0, + "step": 7650 + }, + { + "entropy": 0.24313185326755046, + "epoch": 1.7844737148851848, + "grad_norm": 1.2265625, + "learning_rate": 1.9914365855024508e-05, + "loss": 0.3603, + "mean_token_accuracy": 0.9154565274715424, + "num_tokens": 32252957.0, + "step": 7655 + }, + { + "entropy": 0.3318315625190735, + "epoch": 1.7856393519058167, + "grad_norm": 4.96875, + "learning_rate": 1.9914253301763953e-05, + "loss": 0.6415, + "mean_token_accuracy": 0.8702471375465393, + "num_tokens": 32267022.0, + "step": 7660 + }, + { + "entropy": 0.2816934622824192, + "epoch": 1.7868049889264483, + "grad_norm": 0.8984375, + "learning_rate": 1.9914140675224483e-05, + "loss": 0.4164, + "mean_token_accuracy": 0.9272620856761933, + "num_tokens": 32280466.0, + "step": 7665 + }, + { + "entropy": 0.22036347948014737, + "epoch": 1.78797062594708, + "grad_norm": 1.140625, + "learning_rate": 1.991402797540779e-05, + "loss": 0.4315, + "mean_token_accuracy": 0.9199424505233764, + "num_tokens": 32294548.0, + "step": 7670 + }, + { + "entropy": 0.34325801730155947, + "epoch": 1.7891362629677119, + "grad_norm": 4.8125, + "learning_rate": 1.9913915202315544e-05, + "loss": 0.5674, + "mean_token_accuracy": 0.874156790971756, + "num_tokens": 32308005.0, + "step": 7675 + }, + { + "entropy": 0.22082845419645308, + "epoch": 1.7903018999883438, + "grad_norm": 1.53125, + "learning_rate": 1.9913802355949436e-05, + "loss": 0.4455, + "mean_token_accuracy": 0.9140099585056305, + "num_tokens": 32323466.0, + "step": 7680 + }, + { + "entropy": 0.2676783286035061, + "epoch": 1.7914675370089754, + "grad_norm": 5.3125, + "learning_rate": 1.9913689436311142e-05, + "loss": 0.5678, + "mean_token_accuracy": 0.8990582466125489, + "num_tokens": 32336418.0, + "step": 7685 + }, + { + "entropy": 0.38137408494949343, + "epoch": 1.792633174029607, + "grad_norm": 5.71875, + "learning_rate": 1.991357644340235e-05, + "loss": 0.7375, + "mean_token_accuracy": 0.8788779199123382, + "num_tokens": 32344359.0, + "step": 7690 + }, + { + "entropy": 0.3954359903931618, + "epoch": 1.793798811050239, + "grad_norm": 9.75, + "learning_rate": 1.9913463377224738e-05, + "loss": 0.7942, + "mean_token_accuracy": 0.8664931178092956, + "num_tokens": 32354899.0, + "step": 7695 + }, + { + "entropy": 0.27664031460881233, + "epoch": 1.7949644480708709, + "grad_norm": 1.015625, + "learning_rate": 1.9913350237780002e-05, + "loss": 0.3941, + "mean_token_accuracy": 0.9219299912452698, + "num_tokens": 32367076.0, + "step": 7700 + }, + { + "entropy": 0.17025743499398233, + "epoch": 1.7961300850915025, + "grad_norm": 7.0625, + "learning_rate": 1.991323702506982e-05, + "loss": 0.2769, + "mean_token_accuracy": 0.9408093333244324, + "num_tokens": 32397739.0, + "step": 7705 + }, + { + "entropy": 0.3191656589508057, + "epoch": 1.7972957221121342, + "grad_norm": 3.640625, + "learning_rate": 1.9913123739095885e-05, + "loss": 0.6224, + "mean_token_accuracy": 0.8828663647174835, + "num_tokens": 32408754.0, + "step": 7710 + }, + { + "entropy": 0.266577872633934, + "epoch": 1.798461359132766, + "grad_norm": 2.359375, + "learning_rate": 1.9913010379859885e-05, + "loss": 0.4473, + "mean_token_accuracy": 0.907596331834793, + "num_tokens": 32426950.0, + "step": 7715 + }, + { + "entropy": 0.2586742855608463, + "epoch": 1.799626996153398, + "grad_norm": 8.0625, + "learning_rate": 1.991289694736351e-05, + "loss": 0.4857, + "mean_token_accuracy": 0.9078260838985444, + "num_tokens": 32442630.0, + "step": 7720 + }, + { + "entropy": 0.21624571941792964, + "epoch": 1.8007926331740296, + "grad_norm": 3.484375, + "learning_rate": 1.9912783441608457e-05, + "loss": 0.2872, + "mean_token_accuracy": 0.9356445431709289, + "num_tokens": 32471635.0, + "step": 7725 + }, + { + "entropy": 0.27893735766410827, + "epoch": 1.8019582701946613, + "grad_norm": 6.03125, + "learning_rate": 1.9912669862596407e-05, + "loss": 0.5273, + "mean_token_accuracy": 0.9174108982086182, + "num_tokens": 32485082.0, + "step": 7730 + }, + { + "entropy": 0.28767716884613037, + "epoch": 1.8031239072152931, + "grad_norm": 6.34375, + "learning_rate": 1.9912556210329065e-05, + "loss": 0.61, + "mean_token_accuracy": 0.8947061181068421, + "num_tokens": 32496872.0, + "step": 7735 + }, + { + "entropy": 0.2328474037349224, + "epoch": 1.804289544235925, + "grad_norm": 2.859375, + "learning_rate": 1.9912442484808117e-05, + "loss": 0.2921, + "mean_token_accuracy": 0.9219469904899598, + "num_tokens": 32528904.0, + "step": 7740 + }, + { + "entropy": 0.23528299368917943, + "epoch": 1.8054551812565567, + "grad_norm": 0.828125, + "learning_rate": 1.9912328686035266e-05, + "loss": 0.2192, + "mean_token_accuracy": 0.9276223361492157, + "num_tokens": 32554343.0, + "step": 7745 + }, + { + "entropy": 0.2867948904633522, + "epoch": 1.8066208182771883, + "grad_norm": 1.9921875, + "learning_rate": 1.991221481401221e-05, + "loss": 0.4437, + "mean_token_accuracy": 0.9267707407474518, + "num_tokens": 32563606.0, + "step": 7750 + }, + { + "entropy": 0.3321516253054142, + "epoch": 1.8077864552978202, + "grad_norm": 9.25, + "learning_rate": 1.9912100868740635e-05, + "loss": 0.507, + "mean_token_accuracy": 0.8905054688453674, + "num_tokens": 32592476.0, + "step": 7755 + }, + { + "entropy": 0.2881818190217018, + "epoch": 1.808952092318452, + "grad_norm": 4.09375, + "learning_rate": 1.991198685022225e-05, + "loss": 0.527, + "mean_token_accuracy": 0.8902274250984192, + "num_tokens": 32610498.0, + "step": 7760 + }, + { + "entropy": 0.2766866091638803, + "epoch": 1.8101177293390838, + "grad_norm": 0.5859375, + "learning_rate": 1.991187275845875e-05, + "loss": 0.3293, + "mean_token_accuracy": 0.9011591911315918, + "num_tokens": 32636296.0, + "step": 7765 + }, + { + "entropy": 0.2094460479915142, + "epoch": 1.8112833663597154, + "grad_norm": 0.314453125, + "learning_rate": 1.9911758593451845e-05, + "loss": 0.3593, + "mean_token_accuracy": 0.9299488604068756, + "num_tokens": 32671123.0, + "step": 7770 + }, + { + "entropy": 0.3513875052332878, + "epoch": 1.8124490033803473, + "grad_norm": 5.28125, + "learning_rate": 1.991164435520323e-05, + "loss": 0.677, + "mean_token_accuracy": 0.8863472640514374, + "num_tokens": 32679984.0, + "step": 7775 + }, + { + "entropy": 0.20216977708041667, + "epoch": 1.8136146404009792, + "grad_norm": 4.09375, + "learning_rate": 1.9911530043714607e-05, + "loss": 0.4019, + "mean_token_accuracy": 0.9245426237583161, + "num_tokens": 32696537.0, + "step": 7780 + }, + { + "entropy": 0.2335958130657673, + "epoch": 1.8147802774216109, + "grad_norm": 5.8125, + "learning_rate": 1.991141565898769e-05, + "loss": 0.4809, + "mean_token_accuracy": 0.9251964628696442, + "num_tokens": 32708173.0, + "step": 7785 + }, + { + "entropy": 0.21332778688520193, + "epoch": 1.8159459144422427, + "grad_norm": 5.3125, + "learning_rate": 1.9911301201024174e-05, + "loss": 0.4063, + "mean_token_accuracy": 0.9151061058044434, + "num_tokens": 32736539.0, + "step": 7790 + }, + { + "entropy": 0.27434416934847833, + "epoch": 1.8171115514628746, + "grad_norm": 7.65625, + "learning_rate": 1.991118666982577e-05, + "loss": 0.4694, + "mean_token_accuracy": 0.9003516376018524, + "num_tokens": 32762365.0, + "step": 7795 + }, + { + "entropy": 0.23991770818829536, + "epoch": 1.8182771884835063, + "grad_norm": 0.5703125, + "learning_rate": 1.9911072065394186e-05, + "loss": 0.5031, + "mean_token_accuracy": 0.9172163844108582, + "num_tokens": 32777557.0, + "step": 7800 + }, + { + "entropy": 0.2554063454270363, + "epoch": 1.819442825504138, + "grad_norm": 0.7734375, + "learning_rate": 1.9910957387731133e-05, + "loss": 0.5378, + "mean_token_accuracy": 0.9102888762950897, + "num_tokens": 32791647.0, + "step": 7805 + }, + { + "entropy": 0.21996467523276805, + "epoch": 1.8206084625247698, + "grad_norm": 5.4375, + "learning_rate": 1.9910842636838317e-05, + "loss": 0.3305, + "mean_token_accuracy": 0.9108978271484375, + "num_tokens": 32820950.0, + "step": 7810 + }, + { + "entropy": 0.16689216084778308, + "epoch": 1.8217740995454017, + "grad_norm": 0.470703125, + "learning_rate": 1.9910727812717453e-05, + "loss": 0.2346, + "mean_token_accuracy": 0.9388123393058777, + "num_tokens": 32852381.0, + "step": 7815 + }, + { + "entropy": 0.20206578820943832, + "epoch": 1.8229397365660334, + "grad_norm": 3.171875, + "learning_rate": 1.9910612915370246e-05, + "loss": 0.2481, + "mean_token_accuracy": 0.9353121876716614, + "num_tokens": 32880441.0, + "step": 7820 + }, + { + "entropy": 0.17326834462583066, + "epoch": 1.824105373586665, + "grad_norm": 0.447265625, + "learning_rate": 1.991049794479842e-05, + "loss": 0.2222, + "mean_token_accuracy": 0.943918788433075, + "num_tokens": 32906058.0, + "step": 7825 + }, + { + "entropy": 0.24531815834343434, + "epoch": 1.825271010607297, + "grad_norm": 6.09375, + "learning_rate": 1.991038290100368e-05, + "loss": 0.3671, + "mean_token_accuracy": 0.9264979779720306, + "num_tokens": 32925098.0, + "step": 7830 + }, + { + "entropy": 0.15863981209695338, + "epoch": 1.8264366476279288, + "grad_norm": 0.97265625, + "learning_rate": 1.9910267783987747e-05, + "loss": 0.2095, + "mean_token_accuracy": 0.9451212227344513, + "num_tokens": 32950290.0, + "step": 7835 + }, + { + "entropy": 0.30518715754151343, + "epoch": 1.8276022846485604, + "grad_norm": 5.78125, + "learning_rate": 1.9910152593752335e-05, + "loss": 0.6845, + "mean_token_accuracy": 0.8782230734825134, + "num_tokens": 32964196.0, + "step": 7840 + }, + { + "entropy": 0.23406622558832169, + "epoch": 1.828767921669192, + "grad_norm": 0.412109375, + "learning_rate": 1.9910037330299165e-05, + "loss": 0.4499, + "mean_token_accuracy": 0.9033817291259766, + "num_tokens": 32989094.0, + "step": 7845 + }, + { + "entropy": 0.3372978284955025, + "epoch": 1.829933558689824, + "grad_norm": 6.96875, + "learning_rate": 1.9909921993629954e-05, + "loss": 0.5547, + "mean_token_accuracy": 0.876672887802124, + "num_tokens": 32998403.0, + "step": 7850 + }, + { + "entropy": 0.28051941096782684, + "epoch": 1.8310991957104559, + "grad_norm": 7.6875, + "learning_rate": 1.9909806583746417e-05, + "loss": 0.6219, + "mean_token_accuracy": 0.9012742102146148, + "num_tokens": 33007291.0, + "step": 7855 + }, + { + "entropy": 0.41001638807356355, + "epoch": 1.8322648327310875, + "grad_norm": 0.703125, + "learning_rate": 1.990969110065028e-05, + "loss": 0.4781, + "mean_token_accuracy": 0.8795218944549561, + "num_tokens": 33040644.0, + "step": 7860 + }, + { + "entropy": 0.2866490498185158, + "epoch": 1.8334304697517192, + "grad_norm": 2.109375, + "learning_rate": 1.9909575544343265e-05, + "loss": 0.7216, + "mean_token_accuracy": 0.8838229179382324, + "num_tokens": 33059451.0, + "step": 7865 + }, + { + "entropy": 0.3855082355439663, + "epoch": 1.834596106772351, + "grad_norm": 0.474609375, + "learning_rate": 1.9909459914827094e-05, + "loss": 0.6194, + "mean_token_accuracy": 0.8708645105361938, + "num_tokens": 33094571.0, + "step": 7870 + }, + { + "entropy": 0.33956164717674253, + "epoch": 1.835761743792983, + "grad_norm": 16.75, + "learning_rate": 1.990934421210349e-05, + "loss": 0.9041, + "mean_token_accuracy": 0.8637166440486908, + "num_tokens": 33102764.0, + "step": 7875 + }, + { + "entropy": 0.21683834567666055, + "epoch": 1.8369273808136146, + "grad_norm": 5.65625, + "learning_rate": 1.9909228436174184e-05, + "loss": 0.4211, + "mean_token_accuracy": 0.9251314640045166, + "num_tokens": 33128588.0, + "step": 7880 + }, + { + "entropy": 0.240068347658962, + "epoch": 1.8380930178342463, + "grad_norm": 7.03125, + "learning_rate": 1.9909112587040895e-05, + "loss": 0.4946, + "mean_token_accuracy": 0.888495671749115, + "num_tokens": 33155358.0, + "step": 7885 + }, + { + "entropy": 0.32991256155073645, + "epoch": 1.8392586548548782, + "grad_norm": 1.8046875, + "learning_rate": 1.9908996664705356e-05, + "loss": 0.6577, + "mean_token_accuracy": 0.8739120721817016, + "num_tokens": 33173942.0, + "step": 7890 + }, + { + "entropy": 0.2920090861618519, + "epoch": 1.84042429187551, + "grad_norm": 8.0625, + "learning_rate": 1.990888066916929e-05, + "loss": 0.5685, + "mean_token_accuracy": 0.8976893186569214, + "num_tokens": 33183850.0, + "step": 7895 + }, + { + "entropy": 0.21282969750463962, + "epoch": 1.8415899288961417, + "grad_norm": 6.4375, + "learning_rate": 1.9908764600434432e-05, + "loss": 0.327, + "mean_token_accuracy": 0.9291573703289032, + "num_tokens": 33209034.0, + "step": 7900 + }, + { + "entropy": 0.20020358134061098, + "epoch": 1.8427555659167734, + "grad_norm": 1.2265625, + "learning_rate": 1.9908648458502508e-05, + "loss": 0.2122, + "mean_token_accuracy": 0.9414223849773407, + "num_tokens": 33242582.0, + "step": 7905 + }, + { + "entropy": 0.263228052854538, + "epoch": 1.8439212029374052, + "grad_norm": 4.28125, + "learning_rate": 1.9908532243375254e-05, + "loss": 0.4269, + "mean_token_accuracy": 0.9060485661029816, + "num_tokens": 33253504.0, + "step": 7910 + }, + { + "entropy": 0.2593220267444849, + "epoch": 1.8450868399580371, + "grad_norm": 3.3125, + "learning_rate": 1.9908415955054403e-05, + "loss": 0.4668, + "mean_token_accuracy": 0.8901701092720031, + "num_tokens": 33269108.0, + "step": 7915 + }, + { + "entropy": 0.20201030634343625, + "epoch": 1.8462524769786688, + "grad_norm": 1.875, + "learning_rate": 1.9908299593541686e-05, + "loss": 0.4202, + "mean_token_accuracy": 0.9283289551734925, + "num_tokens": 33286890.0, + "step": 7920 + }, + { + "entropy": 0.22947550304234027, + "epoch": 1.8474181139993007, + "grad_norm": 2.296875, + "learning_rate": 1.990818315883884e-05, + "loss": 0.3026, + "mean_token_accuracy": 0.913777369260788, + "num_tokens": 33313771.0, + "step": 7925 + }, + { + "entropy": 0.481933256983757, + "epoch": 1.8485837510199326, + "grad_norm": 1.0546875, + "learning_rate": 1.99080666509476e-05, + "loss": 0.6726, + "mean_token_accuracy": 0.82081817984581, + "num_tokens": 33335889.0, + "step": 7930 + }, + { + "entropy": 0.29660770744085313, + "epoch": 1.8497493880405642, + "grad_norm": 3.53125, + "learning_rate": 1.9907950069869704e-05, + "loss": 0.572, + "mean_token_accuracy": 0.8943454444408416, + "num_tokens": 33346056.0, + "step": 7935 + }, + { + "entropy": 0.2565585695207119, + "epoch": 1.8509150250611959, + "grad_norm": 5.90625, + "learning_rate": 1.9907833415606893e-05, + "loss": 0.475, + "mean_token_accuracy": 0.911063802242279, + "num_tokens": 33356506.0, + "step": 7940 + }, + { + "entropy": 0.19112351574003697, + "epoch": 1.8520806620818278, + "grad_norm": 0.9609375, + "learning_rate": 1.99077166881609e-05, + "loss": 0.2182, + "mean_token_accuracy": 0.9242577075958252, + "num_tokens": 33380844.0, + "step": 7945 + }, + { + "entropy": 0.24360812529921533, + "epoch": 1.8532462991024596, + "grad_norm": 3.40625, + "learning_rate": 1.990759988753347e-05, + "loss": 0.4411, + "mean_token_accuracy": 0.9256996273994446, + "num_tokens": 33404874.0, + "step": 7950 + }, + { + "entropy": 0.2820186048746109, + "epoch": 1.8544119361230913, + "grad_norm": 3.296875, + "learning_rate": 1.9907483013726347e-05, + "loss": 0.6969, + "mean_token_accuracy": 0.8906833052635192, + "num_tokens": 33414812.0, + "step": 7955 + }, + { + "entropy": 0.24926723837852477, + "epoch": 1.855577573143723, + "grad_norm": 6.9375, + "learning_rate": 1.9907366066741272e-05, + "loss": 0.462, + "mean_token_accuracy": 0.9122967898845673, + "num_tokens": 33426282.0, + "step": 7960 + }, + { + "entropy": 0.2691373065114021, + "epoch": 1.8567432101643548, + "grad_norm": 7.15625, + "learning_rate": 1.9907249046579984e-05, + "loss": 0.7012, + "mean_token_accuracy": 0.8873024821281433, + "num_tokens": 33435803.0, + "step": 7965 + }, + { + "entropy": 0.22381224762648344, + "epoch": 1.8579088471849867, + "grad_norm": 0.40625, + "learning_rate": 1.9907131953244234e-05, + "loss": 0.4412, + "mean_token_accuracy": 0.9174625515937805, + "num_tokens": 33461242.0, + "step": 7970 + }, + { + "entropy": 0.24329218193888663, + "epoch": 1.8590744842056184, + "grad_norm": 7.1875, + "learning_rate": 1.9907014786735764e-05, + "loss": 0.5472, + "mean_token_accuracy": 0.9130038440227508, + "num_tokens": 33488300.0, + "step": 7975 + }, + { + "entropy": 0.2573273852467537, + "epoch": 1.86024012122625, + "grad_norm": 1.171875, + "learning_rate": 1.9906897547056326e-05, + "loss": 0.3734, + "mean_token_accuracy": 0.9164382576942444, + "num_tokens": 33510876.0, + "step": 7980 + }, + { + "entropy": 0.25100057646632196, + "epoch": 1.861405758246882, + "grad_norm": 0.8125, + "learning_rate": 1.9906780234207662e-05, + "loss": 0.4022, + "mean_token_accuracy": 0.9272303938865661, + "num_tokens": 33529188.0, + "step": 7985 + }, + { + "entropy": 0.16136855445802212, + "epoch": 1.8625713952675138, + "grad_norm": 0.640625, + "learning_rate": 1.9906662848191525e-05, + "loss": 0.1747, + "mean_token_accuracy": 0.9569616258144379, + "num_tokens": 33560189.0, + "step": 7990 + }, + { + "entropy": 0.22136666998267174, + "epoch": 1.8637370322881455, + "grad_norm": 1.1171875, + "learning_rate": 1.990654538900967e-05, + "loss": 0.3632, + "mean_token_accuracy": 0.9338773906230926, + "num_tokens": 33572755.0, + "step": 7995 + }, + { + "entropy": 0.20970266573131086, + "epoch": 1.8649026693087771, + "grad_norm": 8.8125, + "learning_rate": 1.9906427856663837e-05, + "loss": 0.4116, + "mean_token_accuracy": 0.9170647919178009, + "num_tokens": 33593609.0, + "step": 8000 + }, + { + "entropy": 0.282043930888176, + "epoch": 1.866068306329409, + "grad_norm": 3.546875, + "learning_rate": 1.9906310251155786e-05, + "loss": 0.4717, + "mean_token_accuracy": 0.9160214245319367, + "num_tokens": 33603633.0, + "step": 8005 + }, + { + "entropy": 0.22577755972743035, + "epoch": 1.867233943350041, + "grad_norm": 5.78125, + "learning_rate": 1.990619257248727e-05, + "loss": 0.4227, + "mean_token_accuracy": 0.9012669861316681, + "num_tokens": 33616958.0, + "step": 8010 + }, + { + "entropy": 0.2106149446219206, + "epoch": 1.8683995803706726, + "grad_norm": 4.53125, + "learning_rate": 1.9906074820660042e-05, + "loss": 0.1722, + "mean_token_accuracy": 0.9327871203422546, + "num_tokens": 33652503.0, + "step": 8015 + }, + { + "entropy": 0.24818255249410867, + "epoch": 1.8695652173913042, + "grad_norm": 6.84375, + "learning_rate": 1.9905956995675864e-05, + "loss": 0.4292, + "mean_token_accuracy": 0.9094979107379914, + "num_tokens": 33678232.0, + "step": 8020 + }, + { + "entropy": 0.26563177108764646, + "epoch": 1.870730854411936, + "grad_norm": 0.51953125, + "learning_rate": 1.9905839097536486e-05, + "loss": 0.3926, + "mean_token_accuracy": 0.912151038646698, + "num_tokens": 33701126.0, + "step": 8025 + }, + { + "entropy": 0.2256366118788719, + "epoch": 1.871896491432568, + "grad_norm": 0.765625, + "learning_rate": 1.990572112624367e-05, + "loss": 0.7137, + "mean_token_accuracy": 0.8795951902866364, + "num_tokens": 33716219.0, + "step": 8030 + }, + { + "entropy": 0.27713018376380205, + "epoch": 1.8730621284531996, + "grad_norm": 1.1875, + "learning_rate": 1.990560308179917e-05, + "loss": 0.4991, + "mean_token_accuracy": 0.9037650108337403, + "num_tokens": 33735725.0, + "step": 8035 + }, + { + "entropy": 0.20286475904285908, + "epoch": 1.8742277654738313, + "grad_norm": 0.4921875, + "learning_rate": 1.990548496420475e-05, + "loss": 0.3685, + "mean_token_accuracy": 0.9244837462902069, + "num_tokens": 33767813.0, + "step": 8040 + }, + { + "entropy": 0.1981831956654787, + "epoch": 1.8753934024944632, + "grad_norm": 0.427734375, + "learning_rate": 1.990536677346217e-05, + "loss": 0.384, + "mean_token_accuracy": 0.9329765677452088, + "num_tokens": 33786229.0, + "step": 8045 + }, + { + "entropy": 0.2955906016752124, + "epoch": 1.876559039515095, + "grad_norm": 6.75, + "learning_rate": 1.99052485095732e-05, + "loss": 0.4128, + "mean_token_accuracy": 0.9093118727207183, + "num_tokens": 33807214.0, + "step": 8050 + }, + { + "entropy": 0.3324073665775359, + "epoch": 1.8777246765357267, + "grad_norm": 6.5, + "learning_rate": 1.9905130172539587e-05, + "loss": 0.5357, + "mean_token_accuracy": 0.9027265429496765, + "num_tokens": 33825464.0, + "step": 8055 + }, + { + "entropy": 0.26243541240692136, + "epoch": 1.8788903135563586, + "grad_norm": 0.73046875, + "learning_rate": 1.9905011762363114e-05, + "loss": 0.5523, + "mean_token_accuracy": 0.9006957054138184, + "num_tokens": 33854687.0, + "step": 8060 + }, + { + "entropy": 0.1906348394230008, + "epoch": 1.8800559505769905, + "grad_norm": 0.427734375, + "learning_rate": 1.9904893279045535e-05, + "loss": 0.2821, + "mean_token_accuracy": 0.9366668999195099, + "num_tokens": 33887491.0, + "step": 8065 + }, + { + "entropy": 0.30027063712477686, + "epoch": 1.8812215875976221, + "grad_norm": 4.625, + "learning_rate": 1.9904774722588617e-05, + "loss": 0.4721, + "mean_token_accuracy": 0.9096989035606384, + "num_tokens": 33926836.0, + "step": 8070 + }, + { + "entropy": 0.25279470570385454, + "epoch": 1.8823872246182538, + "grad_norm": 3.09375, + "learning_rate": 1.9904656092994134e-05, + "loss": 0.5085, + "mean_token_accuracy": 0.9071327090263367, + "num_tokens": 33942968.0, + "step": 8075 + }, + { + "entropy": 0.35142759159207343, + "epoch": 1.8835528616388857, + "grad_norm": 7.4375, + "learning_rate": 1.990453739026385e-05, + "loss": 0.5752, + "mean_token_accuracy": 0.8828444004058837, + "num_tokens": 33959545.0, + "step": 8080 + }, + { + "entropy": 0.21633378714323043, + "epoch": 1.8847184986595176, + "grad_norm": 7.96875, + "learning_rate": 1.9904418614399537e-05, + "loss": 0.4535, + "mean_token_accuracy": 0.9193155586719512, + "num_tokens": 33972285.0, + "step": 8085 + }, + { + "entropy": 0.3462574012577534, + "epoch": 1.8858841356801492, + "grad_norm": 2.265625, + "learning_rate": 1.9904299765402965e-05, + "loss": 0.55, + "mean_token_accuracy": 0.8902999520301819, + "num_tokens": 33990022.0, + "step": 8090 + }, + { + "entropy": 0.2331024192273617, + "epoch": 1.887049772700781, + "grad_norm": 6.53125, + "learning_rate": 1.990418084327591e-05, + "loss": 0.4441, + "mean_token_accuracy": 0.917637175321579, + "num_tokens": 34012120.0, + "step": 8095 + }, + { + "entropy": 0.2364817973226309, + "epoch": 1.8882154097214128, + "grad_norm": 4.25, + "learning_rate": 1.990406184802014e-05, + "loss": 0.365, + "mean_token_accuracy": 0.9218486249446869, + "num_tokens": 34029217.0, + "step": 8100 + }, + { + "entropy": 0.15866220146417617, + "epoch": 1.8893810467420447, + "grad_norm": 4.71875, + "learning_rate": 1.990394277963743e-05, + "loss": 0.2626, + "mean_token_accuracy": 0.9401340663433075, + "num_tokens": 34057228.0, + "step": 8105 + }, + { + "entropy": 0.2617270015180111, + "epoch": 1.8905466837626763, + "grad_norm": 2.75, + "learning_rate": 1.990382363812956e-05, + "loss": 0.4059, + "mean_token_accuracy": 0.9226888716220856, + "num_tokens": 34072082.0, + "step": 8110 + }, + { + "entropy": 0.22469778992235662, + "epoch": 1.891712320783308, + "grad_norm": 5.84375, + "learning_rate": 1.9903704423498305e-05, + "loss": 0.4612, + "mean_token_accuracy": 0.9184304594993591, + "num_tokens": 34091194.0, + "step": 8115 + }, + { + "entropy": 0.2838134203106165, + "epoch": 1.8928779578039399, + "grad_norm": 0.9140625, + "learning_rate": 1.9903585135745442e-05, + "loss": 0.579, + "mean_token_accuracy": 0.8832531034946441, + "num_tokens": 34110127.0, + "step": 8120 + }, + { + "entropy": 0.30173076689243317, + "epoch": 1.8940435948245717, + "grad_norm": 5.0, + "learning_rate": 1.9903465774872744e-05, + "loss": 0.6166, + "mean_token_accuracy": 0.8917320251464844, + "num_tokens": 34119487.0, + "step": 8125 + }, + { + "entropy": 0.3589056760072708, + "epoch": 1.8952092318452034, + "grad_norm": 1.9765625, + "learning_rate": 1.9903346340881998e-05, + "loss": 0.4016, + "mean_token_accuracy": 0.9043947756290436, + "num_tokens": 34140987.0, + "step": 8130 + }, + { + "entropy": 0.21588889956474305, + "epoch": 1.896374868865835, + "grad_norm": 9.5625, + "learning_rate": 1.9903226833774985e-05, + "loss": 0.4883, + "mean_token_accuracy": 0.9089700102806091, + "num_tokens": 34160289.0, + "step": 8135 + }, + { + "entropy": 0.3851318970322609, + "epoch": 1.897540505886467, + "grad_norm": 2.734375, + "learning_rate": 1.9903107253553484e-05, + "loss": 0.7253, + "mean_token_accuracy": 0.8677813231945037, + "num_tokens": 34180349.0, + "step": 8140 + }, + { + "entropy": 0.2314096439629793, + "epoch": 1.8987061429070988, + "grad_norm": 3.09375, + "learning_rate": 1.990298760021928e-05, + "loss": 0.2881, + "mean_token_accuracy": 0.9264096140861511, + "num_tokens": 34199167.0, + "step": 8145 + }, + { + "entropy": 0.23651491105556488, + "epoch": 1.8998717799277305, + "grad_norm": 1.125, + "learning_rate": 1.9902867873774155e-05, + "loss": 0.4517, + "mean_token_accuracy": 0.901793110370636, + "num_tokens": 34213910.0, + "step": 8150 + }, + { + "entropy": 0.28956699259579183, + "epoch": 1.9010374169483621, + "grad_norm": 0.671875, + "learning_rate": 1.9902748074219896e-05, + "loss": 0.3724, + "mean_token_accuracy": 0.910913062095642, + "num_tokens": 34232143.0, + "step": 8155 + }, + { + "entropy": 0.26046003252267835, + "epoch": 1.902203053968994, + "grad_norm": 6.09375, + "learning_rate": 1.9902628201558288e-05, + "loss": 0.5784, + "mean_token_accuracy": 0.8950169742107391, + "num_tokens": 34251076.0, + "step": 8160 + }, + { + "entropy": 0.22758226692676545, + "epoch": 1.903368690989626, + "grad_norm": 3.3125, + "learning_rate": 1.9902508255791122e-05, + "loss": 0.3644, + "mean_token_accuracy": 0.9190545618534088, + "num_tokens": 34272139.0, + "step": 8165 + }, + { + "entropy": 0.2524715639650822, + "epoch": 1.9045343280102576, + "grad_norm": 5.125, + "learning_rate": 1.9902388236920182e-05, + "loss": 0.4574, + "mean_token_accuracy": 0.9164434134960174, + "num_tokens": 34282761.0, + "step": 8170 + }, + { + "entropy": 0.21465678084641696, + "epoch": 1.9056999650308892, + "grad_norm": 1.21875, + "learning_rate": 1.990226814494726e-05, + "loss": 0.2894, + "mean_token_accuracy": 0.9352861166000366, + "num_tokens": 34314814.0, + "step": 8175 + }, + { + "entropy": 0.20759768709540366, + "epoch": 1.9068656020515211, + "grad_norm": 1.03125, + "learning_rate": 1.990214797987415e-05, + "loss": 0.2024, + "mean_token_accuracy": 0.936842006444931, + "num_tokens": 34355228.0, + "step": 8180 + }, + { + "entropy": 0.22588071897625922, + "epoch": 1.908031239072153, + "grad_norm": 5.84375, + "learning_rate": 1.9902027741702636e-05, + "loss": 0.4937, + "mean_token_accuracy": 0.924340796470642, + "num_tokens": 34375090.0, + "step": 8185 + }, + { + "entropy": 0.21736494824290276, + "epoch": 1.9091968760927847, + "grad_norm": 2.328125, + "learning_rate": 1.9901907430434516e-05, + "loss": 0.3859, + "mean_token_accuracy": 0.9157032668590546, + "num_tokens": 34390572.0, + "step": 8190 + }, + { + "entropy": 0.2940547376871109, + "epoch": 1.9103625131134163, + "grad_norm": 3.625, + "learning_rate": 1.9901787046071587e-05, + "loss": 0.6023, + "mean_token_accuracy": 0.8895644903182983, + "num_tokens": 34410885.0, + "step": 8195 + }, + { + "entropy": 0.26008196324110033, + "epoch": 1.9115281501340484, + "grad_norm": 5.34375, + "learning_rate": 1.9901666588615636e-05, + "loss": 0.3953, + "mean_token_accuracy": 0.9115445196628571, + "num_tokens": 34436963.0, + "step": 8200 + }, + { + "entropy": 0.3289894135668874, + "epoch": 1.91269378715468, + "grad_norm": 0.458984375, + "learning_rate": 1.9901546058068467e-05, + "loss": 0.4833, + "mean_token_accuracy": 0.8767846524715424, + "num_tokens": 34466493.0, + "step": 8205 + }, + { + "entropy": 0.2564432403072715, + "epoch": 1.9138594241753117, + "grad_norm": 2.328125, + "learning_rate": 1.990142545443187e-05, + "loss": 0.4118, + "mean_token_accuracy": 0.9165556848049163, + "num_tokens": 34485513.0, + "step": 8210 + }, + { + "entropy": 0.2016835320740938, + "epoch": 1.9150250611959436, + "grad_norm": 0.4609375, + "learning_rate": 1.990130477770765e-05, + "loss": 0.352, + "mean_token_accuracy": 0.9331004083156585, + "num_tokens": 34516163.0, + "step": 8215 + }, + { + "entropy": 0.19940345510840415, + "epoch": 1.9161906982165755, + "grad_norm": 5.03125, + "learning_rate": 1.99011840278976e-05, + "loss": 0.4198, + "mean_token_accuracy": 0.9229754269123077, + "num_tokens": 34543852.0, + "step": 8220 + }, + { + "entropy": 0.24907640200108289, + "epoch": 1.9173563352372072, + "grad_norm": 0.59765625, + "learning_rate": 1.990106320500353e-05, + "loss": 0.4389, + "mean_token_accuracy": 0.9167008459568023, + "num_tokens": 34572670.0, + "step": 8225 + }, + { + "entropy": 0.2844414710998535, + "epoch": 1.9185219722578388, + "grad_norm": 3.0, + "learning_rate": 1.9900942309027234e-05, + "loss": 0.4554, + "mean_token_accuracy": 0.8996649920940399, + "num_tokens": 34592548.0, + "step": 8230 + }, + { + "entropy": 0.2390742838382721, + "epoch": 1.9196876092784707, + "grad_norm": 0.796875, + "learning_rate": 1.9900821339970516e-05, + "loss": 0.499, + "mean_token_accuracy": 0.9069253146648407, + "num_tokens": 34615695.0, + "step": 8235 + }, + { + "entropy": 0.2000205848366022, + "epoch": 1.9208532462991026, + "grad_norm": 0.455078125, + "learning_rate": 1.9900700297835183e-05, + "loss": 0.1793, + "mean_token_accuracy": 0.93028022646904, + "num_tokens": 34643986.0, + "step": 8240 + }, + { + "entropy": 0.3019634872674942, + "epoch": 1.9220188833197343, + "grad_norm": 4.75, + "learning_rate": 1.9900579182623034e-05, + "loss": 0.4367, + "mean_token_accuracy": 0.9008702993392944, + "num_tokens": 34665139.0, + "step": 8245 + }, + { + "entropy": 0.2354019209742546, + "epoch": 1.923184520340366, + "grad_norm": 1.1640625, + "learning_rate": 1.990045799433588e-05, + "loss": 0.4314, + "mean_token_accuracy": 0.9259082734584808, + "num_tokens": 34681508.0, + "step": 8250 + }, + { + "entropy": 0.28061345368623736, + "epoch": 1.9243501573609978, + "grad_norm": 3.921875, + "learning_rate": 1.9900336732975528e-05, + "loss": 0.4488, + "mean_token_accuracy": 0.9200140237808228, + "num_tokens": 34708622.0, + "step": 8255 + }, + { + "entropy": 0.34055967777967455, + "epoch": 1.9255157943816297, + "grad_norm": 10.125, + "learning_rate": 1.990021539854378e-05, + "loss": 0.7395, + "mean_token_accuracy": 0.8829732298851013, + "num_tokens": 34717903.0, + "step": 8260 + }, + { + "entropy": 0.2822610292583704, + "epoch": 1.9266814314022613, + "grad_norm": 0.328125, + "learning_rate": 1.9900093991042453e-05, + "loss": 0.568, + "mean_token_accuracy": 0.8975172460079193, + "num_tokens": 34744369.0, + "step": 8265 + }, + { + "entropy": 0.20326493717730046, + "epoch": 1.927847068422893, + "grad_norm": 0.423828125, + "learning_rate": 1.9899972510473356e-05, + "loss": 0.4511, + "mean_token_accuracy": 0.9232929706573486, + "num_tokens": 34766479.0, + "step": 8270 + }, + { + "entropy": 0.3232834428548813, + "epoch": 1.9290127054435249, + "grad_norm": 3.421875, + "learning_rate": 1.98998509568383e-05, + "loss": 0.5616, + "mean_token_accuracy": 0.885380220413208, + "num_tokens": 34781112.0, + "step": 8275 + }, + { + "entropy": 0.17031664066016675, + "epoch": 1.9301783424641568, + "grad_norm": 0.373046875, + "learning_rate": 1.9899729330139092e-05, + "loss": 0.2872, + "mean_token_accuracy": 0.9361357748508453, + "num_tokens": 34812316.0, + "step": 8280 + }, + { + "entropy": 0.17128741517663001, + "epoch": 1.9313439794847884, + "grad_norm": 1.8046875, + "learning_rate": 1.9899607630377553e-05, + "loss": 0.2588, + "mean_token_accuracy": 0.9484428107738495, + "num_tokens": 34827639.0, + "step": 8285 + }, + { + "entropy": 0.2159100666642189, + "epoch": 1.93250961650542, + "grad_norm": 0.6484375, + "learning_rate": 1.9899485857555492e-05, + "loss": 0.3604, + "mean_token_accuracy": 0.9325559318065644, + "num_tokens": 34846897.0, + "step": 8290 + }, + { + "entropy": 0.24534763917326927, + "epoch": 1.933675253526052, + "grad_norm": 5.90625, + "learning_rate": 1.9899364011674732e-05, + "loss": 0.4428, + "mean_token_accuracy": 0.9214135646820069, + "num_tokens": 34858892.0, + "step": 8295 + }, + { + "entropy": 0.2856233362108469, + "epoch": 1.9348408905466838, + "grad_norm": 2.03125, + "learning_rate": 1.9899242092737083e-05, + "loss": 0.5202, + "mean_token_accuracy": 0.887435781955719, + "num_tokens": 34873653.0, + "step": 8300 + }, + { + "entropy": 0.2192686103284359, + "epoch": 1.9360065275673155, + "grad_norm": 1.4609375, + "learning_rate": 1.9899120100744366e-05, + "loss": 0.3997, + "mean_token_accuracy": 0.9170324444770813, + "num_tokens": 34902515.0, + "step": 8305 + }, + { + "entropy": 0.24926665425300598, + "epoch": 1.9371721645879472, + "grad_norm": 2.9375, + "learning_rate": 1.98989980356984e-05, + "loss": 0.492, + "mean_token_accuracy": 0.908800745010376, + "num_tokens": 34914633.0, + "step": 8310 + }, + { + "entropy": 0.5896746765822172, + "epoch": 1.938337801608579, + "grad_norm": 5.75, + "learning_rate": 1.9898875897601006e-05, + "loss": 0.9085, + "mean_token_accuracy": 0.8313942730426789, + "num_tokens": 34960403.0, + "step": 8315 + }, + { + "entropy": 0.21931098848581315, + "epoch": 1.939503438629211, + "grad_norm": 2.828125, + "learning_rate": 1.9898753686454e-05, + "loss": 0.3373, + "mean_token_accuracy": 0.9282977223396301, + "num_tokens": 34976041.0, + "step": 8320 + }, + { + "entropy": 0.19581068167462945, + "epoch": 1.9406690756498426, + "grad_norm": 5.46875, + "learning_rate": 1.989863140225921e-05, + "loss": 0.3874, + "mean_token_accuracy": 0.9340884864330292, + "num_tokens": 35013548.0, + "step": 8325 + }, + { + "entropy": 0.22232236564159394, + "epoch": 1.9418347126704743, + "grad_norm": 6.84375, + "learning_rate": 1.9898509045018457e-05, + "loss": 0.5121, + "mean_token_accuracy": 0.9140840828418731, + "num_tokens": 35031301.0, + "step": 8330 + }, + { + "entropy": 0.36525106728076934, + "epoch": 1.9430003496911064, + "grad_norm": 6.0, + "learning_rate": 1.989838661473357e-05, + "loss": 0.7153, + "mean_token_accuracy": 0.8757256925106048, + "num_tokens": 35039659.0, + "step": 8335 + }, + { + "entropy": 0.24257199615240096, + "epoch": 1.944165986711738, + "grad_norm": 8.3125, + "learning_rate": 1.9898264111406368e-05, + "loss": 0.5561, + "mean_token_accuracy": 0.8935992956161499, + "num_tokens": 35050633.0, + "step": 8340 + }, + { + "entropy": 0.16945633180439473, + "epoch": 1.9453316237323697, + "grad_norm": 3.78125, + "learning_rate": 1.9898141535038682e-05, + "loss": 0.2938, + "mean_token_accuracy": 0.9406558275222778, + "num_tokens": 35073446.0, + "step": 8345 + }, + { + "entropy": 0.19805988781154155, + "epoch": 1.9464972607530016, + "grad_norm": 5.15625, + "learning_rate": 1.9898018885632338e-05, + "loss": 0.3331, + "mean_token_accuracy": 0.9338889300823212, + "num_tokens": 35090802.0, + "step": 8350 + }, + { + "entropy": 0.2623408816754818, + "epoch": 1.9476628977736334, + "grad_norm": 1.2578125, + "learning_rate": 1.9897896163189165e-05, + "loss": 0.3252, + "mean_token_accuracy": 0.9054560959339142, + "num_tokens": 35121812.0, + "step": 8355 + }, + { + "entropy": 0.2672824915498495, + "epoch": 1.948828534794265, + "grad_norm": 0.9765625, + "learning_rate": 1.9897773367710993e-05, + "loss": 0.3856, + "mean_token_accuracy": 0.9044731080532074, + "num_tokens": 35157778.0, + "step": 8360 + }, + { + "entropy": 0.3403429910540581, + "epoch": 1.9499941718148968, + "grad_norm": 13.75, + "learning_rate": 1.9897650499199654e-05, + "loss": 0.8828, + "mean_token_accuracy": 0.8561728596687317, + "num_tokens": 35174497.0, + "step": 8365 + }, + { + "entropy": 0.20890892669558525, + "epoch": 1.9511598088355286, + "grad_norm": 6.25, + "learning_rate": 1.989752755765698e-05, + "loss": 0.4039, + "mean_token_accuracy": 0.927830719947815, + "num_tokens": 35187734.0, + "step": 8370 + }, + { + "entropy": 0.29691824987530707, + "epoch": 1.9523254458561605, + "grad_norm": 7.78125, + "learning_rate": 1.9897404543084804e-05, + "loss": 0.515, + "mean_token_accuracy": 0.8886303842067719, + "num_tokens": 35202524.0, + "step": 8375 + }, + { + "entropy": 0.2688115481287241, + "epoch": 1.9534910828767922, + "grad_norm": 5.71875, + "learning_rate": 1.989728145548496e-05, + "loss": 0.4996, + "mean_token_accuracy": 0.9002360701560974, + "num_tokens": 35217656.0, + "step": 8380 + }, + { + "entropy": 0.24384147226810454, + "epoch": 1.9546567198974238, + "grad_norm": 0.9296875, + "learning_rate": 1.9897158294859282e-05, + "loss": 0.5896, + "mean_token_accuracy": 0.9057703733444213, + "num_tokens": 35231596.0, + "step": 8385 + }, + { + "entropy": 0.17410086654126644, + "epoch": 1.9558223569180557, + "grad_norm": 0.6015625, + "learning_rate": 1.9897035061209608e-05, + "loss": 0.2515, + "mean_token_accuracy": 0.9428678154945374, + "num_tokens": 35258131.0, + "step": 8390 + }, + { + "entropy": 0.27097597122192385, + "epoch": 1.9569879939386876, + "grad_norm": 11.1875, + "learning_rate": 1.989691175453778e-05, + "loss": 0.516, + "mean_token_accuracy": 0.9027996301651001, + "num_tokens": 35279654.0, + "step": 8395 + }, + { + "entropy": 0.2009141666814685, + "epoch": 1.9581536309593193, + "grad_norm": 5.40625, + "learning_rate": 1.9896788374845628e-05, + "loss": 0.4027, + "mean_token_accuracy": 0.9286759912967681, + "num_tokens": 35298667.0, + "step": 8400 + }, + { + "entropy": 0.27596412152051925, + "epoch": 1.959319267979951, + "grad_norm": 5.40625, + "learning_rate": 1.9896664922134995e-05, + "loss": 0.6089, + "mean_token_accuracy": 0.8886427342891693, + "num_tokens": 35312271.0, + "step": 8405 + }, + { + "entropy": 0.26666624546051027, + "epoch": 1.9604849050005828, + "grad_norm": 1.7578125, + "learning_rate": 1.9896541396407727e-05, + "loss": 0.4534, + "mean_token_accuracy": 0.8949091792106628, + "num_tokens": 35326375.0, + "step": 8410 + }, + { + "entropy": 0.25538104176521303, + "epoch": 1.9616505420212147, + "grad_norm": 0.71484375, + "learning_rate": 1.9896417797665663e-05, + "loss": 0.4964, + "mean_token_accuracy": 0.912453830242157, + "num_tokens": 35346842.0, + "step": 8415 + }, + { + "entropy": 0.25083237886428833, + "epoch": 1.9628161790418464, + "grad_norm": 4.53125, + "learning_rate": 1.989629412591064e-05, + "loss": 0.5104, + "mean_token_accuracy": 0.9088790953159332, + "num_tokens": 35366749.0, + "step": 8420 + }, + { + "entropy": 0.31124439314007757, + "epoch": 1.963981816062478, + "grad_norm": 4.96875, + "learning_rate": 1.989617038114451e-05, + "loss": 0.6628, + "mean_token_accuracy": 0.8773128747940063, + "num_tokens": 35379874.0, + "step": 8425 + }, + { + "entropy": 0.2417039457708597, + "epoch": 1.96514745308311, + "grad_norm": 2.8125, + "learning_rate": 1.9896046563369114e-05, + "loss": 0.4759, + "mean_token_accuracy": 0.9157770335674286, + "num_tokens": 35395582.0, + "step": 8430 + }, + { + "entropy": 0.24686218202114105, + "epoch": 1.9663130901037418, + "grad_norm": 9.375, + "learning_rate": 1.9895922672586302e-05, + "loss": 0.5481, + "mean_token_accuracy": 0.9166857957839966, + "num_tokens": 35407840.0, + "step": 8435 + }, + { + "entropy": 0.3242442309856415, + "epoch": 1.9674787271243734, + "grad_norm": 0.73046875, + "learning_rate": 1.9895798708797917e-05, + "loss": 0.5274, + "mean_token_accuracy": 0.8759232670068741, + "num_tokens": 35431071.0, + "step": 8440 + }, + { + "entropy": 0.22474229484796523, + "epoch": 1.968644364145005, + "grad_norm": 5.1875, + "learning_rate": 1.9895674672005812e-05, + "loss": 0.3409, + "mean_token_accuracy": 0.9216860592365265, + "num_tokens": 35451477.0, + "step": 8445 + }, + { + "entropy": 0.27756930217146875, + "epoch": 1.969810001165637, + "grad_norm": 1.09375, + "learning_rate": 1.9895550562211833e-05, + "loss": 0.4468, + "mean_token_accuracy": 0.9098743259906769, + "num_tokens": 35465074.0, + "step": 8450 + }, + { + "entropy": 0.24696202836930753, + "epoch": 1.9709756381862689, + "grad_norm": 0.80078125, + "learning_rate": 1.9895426379417828e-05, + "loss": 0.2022, + "mean_token_accuracy": 0.9224382996559143, + "num_tokens": 35489306.0, + "step": 8455 + }, + { + "entropy": 0.2881707139313221, + "epoch": 1.9721412752069005, + "grad_norm": 2.6875, + "learning_rate": 1.9895302123625656e-05, + "loss": 0.6088, + "mean_token_accuracy": 0.8947436928749084, + "num_tokens": 35498686.0, + "step": 8460 + }, + { + "entropy": 0.3253455236554146, + "epoch": 1.9733069122275322, + "grad_norm": 3.65625, + "learning_rate": 1.9895177794837167e-05, + "loss": 0.6071, + "mean_token_accuracy": 0.8912819802761078, + "num_tokens": 35508013.0, + "step": 8465 + }, + { + "entropy": 0.2203302625566721, + "epoch": 1.9744725492481643, + "grad_norm": 0.96875, + "learning_rate": 1.9895053393054214e-05, + "loss": 0.378, + "mean_token_accuracy": 0.9241073846817016, + "num_tokens": 35522649.0, + "step": 8470 + }, + { + "entropy": 0.18942125625908374, + "epoch": 1.975638186268796, + "grad_norm": 3.859375, + "learning_rate": 1.9894928918278652e-05, + "loss": 0.2913, + "mean_token_accuracy": 0.9361551582813263, + "num_tokens": 35540177.0, + "step": 8475 + }, + { + "entropy": 0.24017982184886932, + "epoch": 1.9768038232894276, + "grad_norm": 4.28125, + "learning_rate": 1.989480437051234e-05, + "loss": 0.4545, + "mean_token_accuracy": 0.9062156975269318, + "num_tokens": 35553338.0, + "step": 8480 + }, + { + "entropy": 0.2074093535542488, + "epoch": 1.9779694603100595, + "grad_norm": 2.640625, + "learning_rate": 1.9894679749757126e-05, + "loss": 0.3295, + "mean_token_accuracy": 0.9237248361110687, + "num_tokens": 35571435.0, + "step": 8485 + }, + { + "entropy": 0.42545375488698484, + "epoch": 1.9791350973306914, + "grad_norm": 5.4375, + "learning_rate": 1.989455505601488e-05, + "loss": 0.7273, + "mean_token_accuracy": 0.8888332307338714, + "num_tokens": 35597466.0, + "step": 8490 + }, + { + "entropy": 0.20387993920594455, + "epoch": 1.980300734351323, + "grad_norm": 0.9375, + "learning_rate": 1.9894430289287453e-05, + "loss": 0.286, + "mean_token_accuracy": 0.9397811949253082, + "num_tokens": 35621826.0, + "step": 8495 + }, + { + "entropy": 0.25947265028953553, + "epoch": 1.9814663713719547, + "grad_norm": 4.59375, + "learning_rate": 1.9894305449576713e-05, + "loss": 0.3965, + "mean_token_accuracy": 0.92665935754776, + "num_tokens": 35633794.0, + "step": 8500 + }, + { + "entropy": 0.30034587234258653, + "epoch": 1.9826320083925866, + "grad_norm": 0.59765625, + "learning_rate": 1.9894180536884514e-05, + "loss": 0.5105, + "mean_token_accuracy": 0.9011545956134797, + "num_tokens": 35656258.0, + "step": 8505 + }, + { + "entropy": 0.17847388423979282, + "epoch": 1.9837976454132185, + "grad_norm": 5.65625, + "learning_rate": 1.9894055551212725e-05, + "loss": 0.2273, + "mean_token_accuracy": 0.925702440738678, + "num_tokens": 35683494.0, + "step": 8510 + }, + { + "entropy": 0.17927013970911504, + "epoch": 1.9849632824338501, + "grad_norm": 1.5625, + "learning_rate": 1.9893930492563203e-05, + "loss": 0.2927, + "mean_token_accuracy": 0.9423952460289001, + "num_tokens": 35713313.0, + "step": 8515 + }, + { + "entropy": 0.30927509516477586, + "epoch": 1.9861289194544818, + "grad_norm": 1.234375, + "learning_rate": 1.9893805360937818e-05, + "loss": 0.5729, + "mean_token_accuracy": 0.8941884458065033, + "num_tokens": 35735043.0, + "step": 8520 + }, + { + "entropy": 0.23369169272482396, + "epoch": 1.9872945564751137, + "grad_norm": 4.65625, + "learning_rate": 1.9893680156338434e-05, + "loss": 0.4675, + "mean_token_accuracy": 0.9103543817996979, + "num_tokens": 35752057.0, + "step": 8525 + }, + { + "entropy": 0.27408935129642487, + "epoch": 1.9884601934957455, + "grad_norm": 2.265625, + "learning_rate": 1.9893554878766918e-05, + "loss": 0.5507, + "mean_token_accuracy": 0.8969059348106384, + "num_tokens": 35765013.0, + "step": 8530 + }, + { + "entropy": 0.2391377042979002, + "epoch": 1.9896258305163772, + "grad_norm": 3.21875, + "learning_rate": 1.9893429528225143e-05, + "loss": 0.375, + "mean_token_accuracy": 0.925329464673996, + "num_tokens": 35780367.0, + "step": 8535 + }, + { + "entropy": 0.2847207933664322, + "epoch": 1.9907914675370089, + "grad_norm": 2.171875, + "learning_rate": 1.989330410471497e-05, + "loss": 0.5161, + "mean_token_accuracy": 0.9022741556167603, + "num_tokens": 35791583.0, + "step": 8540 + }, + { + "entropy": 0.26735531222075226, + "epoch": 1.9919571045576407, + "grad_norm": 0.63671875, + "learning_rate": 1.989317860823827e-05, + "loss": 0.4382, + "mean_token_accuracy": 0.9113502144813538, + "num_tokens": 35812087.0, + "step": 8545 + }, + { + "entropy": 0.28238332718610765, + "epoch": 1.9931227415782726, + "grad_norm": 6.96875, + "learning_rate": 1.989305303879692e-05, + "loss": 0.647, + "mean_token_accuracy": 0.8785060703754425, + "num_tokens": 35822522.0, + "step": 8550 + }, + { + "entropy": 0.24268617071211337, + "epoch": 1.9942883785989043, + "grad_norm": 5.875, + "learning_rate": 1.989292739639279e-05, + "loss": 0.3323, + "mean_token_accuracy": 0.9189518809318542, + "num_tokens": 35843104.0, + "step": 8555 + }, + { + "entropy": 0.28384055122733115, + "epoch": 1.995454015619536, + "grad_norm": 3.75, + "learning_rate": 1.9892801681027754e-05, + "loss": 0.5158, + "mean_token_accuracy": 0.8955554962158203, + "num_tokens": 35856347.0, + "step": 8560 + }, + { + "entropy": 0.21551661379635334, + "epoch": 1.9966196526401678, + "grad_norm": 1.5546875, + "learning_rate": 1.9892675892703685e-05, + "loss": 0.3563, + "mean_token_accuracy": 0.9304292321205139, + "num_tokens": 35880168.0, + "step": 8565 + }, + { + "entropy": 0.2399544682353735, + "epoch": 1.9977852896607997, + "grad_norm": 0.6640625, + "learning_rate": 1.989255003142246e-05, + "loss": 0.4766, + "mean_token_accuracy": 0.9071892321109771, + "num_tokens": 35901547.0, + "step": 8570 + }, + { + "entropy": 0.1419397760182619, + "epoch": 1.9989509266814314, + "grad_norm": 0.423828125, + "learning_rate": 1.9892424097185953e-05, + "loss": 0.1211, + "mean_token_accuracy": 0.9566732287406922, + "num_tokens": 35938802.0, + "step": 8575 + }, + { + "entropy": 0.2179360402127107, + "epoch": 2.0, + "grad_norm": 14.5625, + "learning_rate": 1.989229808999604e-05, + "loss": 0.4882, + "mean_token_accuracy": 0.9187219209141202, + "num_tokens": 35961500.0, + "step": 8580 + }, + { + "entropy": 0.23930302262306213, + "epoch": 2.0011656370206317, + "grad_norm": 1.0859375, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.2686, + "mean_token_accuracy": 0.918672627210617, + "num_tokens": 33462.0, + "step": 8585 + }, + { + "entropy": 0.17616038862615824, + "epoch": 2.0023312740412638, + "grad_norm": 6.09375, + "learning_rate": 6.923076923076923e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.9223396599292755, + "num_tokens": 68428.0, + "step": 8590 + }, + { + "entropy": 0.22200995236635207, + "epoch": 2.0034969110618954, + "grad_norm": 1.875, + "learning_rate": 1.0769230769230771e-05, + "loss": 0.3672, + "mean_token_accuracy": 0.9198292434215546, + "num_tokens": 80228.0, + "step": 8595 + }, + { + "entropy": 0.19995370060205458, + "epoch": 2.004662548082527, + "grad_norm": 1.734375, + "learning_rate": 1.4615384615384617e-05, + "loss": 0.3429, + "mean_token_accuracy": 0.9279805183410644, + "num_tokens": 99552.0, + "step": 8600 + }, + { + "entropy": 0.19114257879555224, + "epoch": 2.0058281851031587, + "grad_norm": 0.98828125, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.3238, + "mean_token_accuracy": 0.9297996819019317, + "num_tokens": 115995.0, + "step": 8605 + }, + { + "entropy": 0.12970165833830832, + "epoch": 2.006993822123791, + "grad_norm": 0.9296875, + "learning_rate": 2.230769230769231e-05, + "loss": 0.2439, + "mean_token_accuracy": 0.9543951272964477, + "num_tokens": 136920.0, + "step": 8610 + }, + { + "entropy": 0.2722026415169239, + "epoch": 2.0081594591444225, + "grad_norm": 2.0625, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.4356, + "mean_token_accuracy": 0.9024666368961334, + "num_tokens": 158942.0, + "step": 8615 + }, + { + "entropy": 0.21159663870930673, + "epoch": 2.009325096165054, + "grad_norm": 6.28125, + "learning_rate": 3e-05, + "loss": 0.3646, + "mean_token_accuracy": 0.9342296838760376, + "num_tokens": 178917.0, + "step": 8620 + }, + { + "entropy": 0.18026983588933945, + "epoch": 2.010490733185686, + "grad_norm": 2.203125, + "learning_rate": 3.384615384615385e-05, + "loss": 0.2755, + "mean_token_accuracy": 0.93057501912117, + "num_tokens": 195722.0, + "step": 8625 + }, + { + "entropy": 0.26880242880433797, + "epoch": 2.011656370206318, + "grad_norm": 11.375, + "learning_rate": 3.769230769230769e-05, + "loss": 0.5401, + "mean_token_accuracy": 0.9098369419574738, + "num_tokens": 219314.0, + "step": 8630 + }, + { + "entropy": 0.2589976190589368, + "epoch": 2.0128220072269496, + "grad_norm": 6.75, + "learning_rate": 4.1538461538461544e-05, + "loss": 0.492, + "mean_token_accuracy": 0.909549605846405, + "num_tokens": 247321.0, + "step": 8635 + }, + { + "entropy": 0.2384048379957676, + "epoch": 2.0139876442475813, + "grad_norm": 7.28125, + "learning_rate": 4.538461538461539e-05, + "loss": 0.3351, + "mean_token_accuracy": 0.9222664594650268, + "num_tokens": 267350.0, + "step": 8640 + }, + { + "entropy": 0.1944281917065382, + "epoch": 2.015153281268213, + "grad_norm": 3.375, + "learning_rate": 4.923076923076924e-05, + "loss": 0.2959, + "mean_token_accuracy": 0.9433736026287078, + "num_tokens": 282504.0, + "step": 8645 + }, + { + "entropy": 0.28175385296344757, + "epoch": 2.016318918288845, + "grad_norm": 14.75, + "learning_rate": 4.999999994035399e-05, + "loss": 0.6312, + "mean_token_accuracy": 0.8823612987995147, + "num_tokens": 300273.0, + "step": 8650 + }, + { + "entropy": 0.22739908546209336, + "epoch": 2.0174845553094767, + "grad_norm": 1.65625, + "learning_rate": 4.999999969804205e-05, + "loss": 0.3442, + "mean_token_accuracy": 0.9122324585914612, + "num_tokens": 323539.0, + "step": 8655 + }, + { + "entropy": 0.2628894064575434, + "epoch": 2.0186501923301083, + "grad_norm": 1.4765625, + "learning_rate": 4.9999999269336304e-05, + "loss": 0.427, + "mean_token_accuracy": 0.9008684635162354, + "num_tokens": 348652.0, + "step": 8660 + }, + { + "entropy": 0.2020172208547592, + "epoch": 2.01981582935074, + "grad_norm": 0.65625, + "learning_rate": 4.999999865423678e-05, + "loss": 0.3984, + "mean_token_accuracy": 0.9196797311306, + "num_tokens": 369047.0, + "step": 8665 + }, + { + "entropy": 0.21287547498941423, + "epoch": 2.020981466371372, + "grad_norm": 7.15625, + "learning_rate": 4.9999997852743475e-05, + "loss": 0.5729, + "mean_token_accuracy": 0.8993529319763184, + "num_tokens": 388371.0, + "step": 8670 + }, + { + "entropy": 0.19923520274460316, + "epoch": 2.0221471033920038, + "grad_norm": 1.1875, + "learning_rate": 4.9999996864856396e-05, + "loss": 0.2562, + "mean_token_accuracy": 0.9336078345775605, + "num_tokens": 412629.0, + "step": 8675 + }, + { + "entropy": 0.30621002092957494, + "epoch": 2.0233127404126354, + "grad_norm": 7.9375, + "learning_rate": 4.999999569057556e-05, + "loss": 0.5006, + "mean_token_accuracy": 0.903810465335846, + "num_tokens": 432090.0, + "step": 8680 + }, + { + "entropy": 0.21448403932154178, + "epoch": 2.024478377433267, + "grad_norm": 5.21875, + "learning_rate": 4.9999994329900996e-05, + "loss": 0.4303, + "mean_token_accuracy": 0.9238200068473816, + "num_tokens": 456098.0, + "step": 8685 + }, + { + "entropy": 0.2428566601127386, + "epoch": 2.025644014453899, + "grad_norm": 0.373046875, + "learning_rate": 4.999999278283271e-05, + "loss": 0.3631, + "mean_token_accuracy": 0.9116427540779114, + "num_tokens": 495551.0, + "step": 8690 + }, + { + "entropy": 0.29837720319628713, + "epoch": 2.026809651474531, + "grad_norm": 3.84375, + "learning_rate": 4.999999104937073e-05, + "loss": 0.3483, + "mean_token_accuracy": 0.9145365118980407, + "num_tokens": 511921.0, + "step": 8695 + }, + { + "entropy": 0.20137713775038718, + "epoch": 2.0279752884951625, + "grad_norm": 5.34375, + "learning_rate": 4.9999989129515084e-05, + "loss": 0.3361, + "mean_token_accuracy": 0.9304125428199768, + "num_tokens": 531784.0, + "step": 8700 + }, + { + "entropy": 0.2114957518875599, + "epoch": 2.029140925515794, + "grad_norm": 1.8515625, + "learning_rate": 4.9999987023265806e-05, + "loss": 0.4151, + "mean_token_accuracy": 0.9195410490036011, + "num_tokens": 547546.0, + "step": 8705 + }, + { + "entropy": 0.20500697158277034, + "epoch": 2.0303065625364263, + "grad_norm": 1.59375, + "learning_rate": 4.9999984730622904e-05, + "loss": 0.3087, + "mean_token_accuracy": 0.922219431400299, + "num_tokens": 564629.0, + "step": 8710 + }, + { + "entropy": 0.19661780446767807, + "epoch": 2.031472199557058, + "grad_norm": 4.9375, + "learning_rate": 4.9999982251586444e-05, + "loss": 0.419, + "mean_token_accuracy": 0.9265513479709625, + "num_tokens": 579540.0, + "step": 8715 + }, + { + "entropy": 0.3375698685646057, + "epoch": 2.0326378365776896, + "grad_norm": 5.78125, + "learning_rate": 4.999997958615644e-05, + "loss": 0.5782, + "mean_token_accuracy": 0.8879574716091156, + "num_tokens": 596627.0, + "step": 8720 + }, + { + "entropy": 0.21210809499025346, + "epoch": 2.0338034735983217, + "grad_norm": 2.34375, + "learning_rate": 4.999997673433294e-05, + "loss": 0.3076, + "mean_token_accuracy": 0.9169882595539093, + "num_tokens": 617829.0, + "step": 8725 + }, + { + "entropy": 0.2801529258489609, + "epoch": 2.0349691106189534, + "grad_norm": 7.75, + "learning_rate": 4.9999973696115984e-05, + "loss": 0.8278, + "mean_token_accuracy": 0.8704663276672363, + "num_tokens": 627421.0, + "step": 8730 + }, + { + "entropy": 0.31733031272888185, + "epoch": 2.036134747639585, + "grad_norm": 7.65625, + "learning_rate": 4.9999970471505634e-05, + "loss": 0.6683, + "mean_token_accuracy": 0.8924909234046936, + "num_tokens": 639800.0, + "step": 8735 + }, + { + "entropy": 0.21051425635814666, + "epoch": 2.0373003846602167, + "grad_norm": 6.84375, + "learning_rate": 4.999996706050191e-05, + "loss": 0.4327, + "mean_token_accuracy": 0.9305059254169464, + "num_tokens": 654829.0, + "step": 8740 + }, + { + "entropy": 0.2580501724034548, + "epoch": 2.038466021680849, + "grad_norm": 0.5234375, + "learning_rate": 4.999996346310487e-05, + "loss": 0.3248, + "mean_token_accuracy": 0.9246833562850952, + "num_tokens": 681095.0, + "step": 8745 + }, + { + "entropy": 0.3147982403635979, + "epoch": 2.0396316587014804, + "grad_norm": 4.09375, + "learning_rate": 4.9999959679314586e-05, + "loss": 0.5683, + "mean_token_accuracy": 0.902126133441925, + "num_tokens": 700848.0, + "step": 8750 + }, + { + "entropy": 0.21838752701878547, + "epoch": 2.040797295722112, + "grad_norm": 1.0625, + "learning_rate": 4.99999557091311e-05, + "loss": 0.3909, + "mean_token_accuracy": 0.9119821190834045, + "num_tokens": 720120.0, + "step": 8755 + }, + { + "entropy": 0.2715337313711643, + "epoch": 2.0419629327427438, + "grad_norm": 3.921875, + "learning_rate": 4.999995155255447e-05, + "loss": 0.5753, + "mean_token_accuracy": 0.8893308877944947, + "num_tokens": 740168.0, + "step": 8760 + }, + { + "entropy": 0.37294210195541383, + "epoch": 2.043128569763376, + "grad_norm": 4.34375, + "learning_rate": 4.9999947209584754e-05, + "loss": 0.6569, + "mean_token_accuracy": 0.8944745361804962, + "num_tokens": 751619.0, + "step": 8765 + }, + { + "entropy": 0.25972907468676565, + "epoch": 2.0442942067840075, + "grad_norm": 4.8125, + "learning_rate": 4.9999942680222036e-05, + "loss": 0.4362, + "mean_token_accuracy": 0.9104464769363403, + "num_tokens": 763407.0, + "step": 8770 + }, + { + "entropy": 0.24379628524184227, + "epoch": 2.045459843804639, + "grad_norm": 2.34375, + "learning_rate": 4.999993796446637e-05, + "loss": 0.3387, + "mean_token_accuracy": 0.9292517900466919, + "num_tokens": 781722.0, + "step": 8775 + }, + { + "entropy": 0.3385071136057377, + "epoch": 2.046625480825271, + "grad_norm": 1.203125, + "learning_rate": 4.9999933062317826e-05, + "loss": 0.7102, + "mean_token_accuracy": 0.8811876237392425, + "num_tokens": 801495.0, + "step": 8780 + }, + { + "entropy": 0.31932273507118225, + "epoch": 2.047791117845903, + "grad_norm": 7.53125, + "learning_rate": 4.9999927973776475e-05, + "loss": 0.8247, + "mean_token_accuracy": 0.8673913955688477, + "num_tokens": 809945.0, + "step": 8785 + }, + { + "entropy": 0.40632414594292643, + "epoch": 2.0489567548665346, + "grad_norm": 5.0, + "learning_rate": 4.999992269884241e-05, + "loss": 0.7773, + "mean_token_accuracy": 0.867892587184906, + "num_tokens": 821067.0, + "step": 8790 + }, + { + "entropy": 0.2714473832398653, + "epoch": 2.0501223918871663, + "grad_norm": 1.2734375, + "learning_rate": 4.9999917237515684e-05, + "loss": 0.3201, + "mean_token_accuracy": 0.9276492118835449, + "num_tokens": 843342.0, + "step": 8795 + }, + { + "entropy": 0.25552888177335265, + "epoch": 2.051288028907798, + "grad_norm": 4.15625, + "learning_rate": 4.9999911589796386e-05, + "loss": 0.5356, + "mean_token_accuracy": 0.9006185114383698, + "num_tokens": 867328.0, + "step": 8800 + }, + { + "entropy": 0.42955063804984095, + "epoch": 2.05245366592843, + "grad_norm": 0.96875, + "learning_rate": 4.999990575568462e-05, + "loss": 0.9482, + "mean_token_accuracy": 0.8779349982738495, + "num_tokens": 894639.0, + "step": 8805 + }, + { + "entropy": 0.25447643976658585, + "epoch": 2.0536193029490617, + "grad_norm": 2.453125, + "learning_rate": 4.999989973518046e-05, + "loss": 0.5433, + "mean_token_accuracy": 0.90226930975914, + "num_tokens": 914097.0, + "step": 8810 + }, + { + "entropy": 0.3173729632049799, + "epoch": 2.0547849399696934, + "grad_norm": 0.625, + "learning_rate": 4.999989352828398e-05, + "loss": 0.4612, + "mean_token_accuracy": 0.8994117856025696, + "num_tokens": 937133.0, + "step": 8815 + }, + { + "entropy": 0.27526839822530746, + "epoch": 2.055950576990325, + "grad_norm": 1.4375, + "learning_rate": 4.999988713499529e-05, + "loss": 0.6232, + "mean_token_accuracy": 0.8864486396312714, + "num_tokens": 954495.0, + "step": 8820 + }, + { + "entropy": 0.19231984689831733, + "epoch": 2.057116214010957, + "grad_norm": 5.53125, + "learning_rate": 4.999988055531449e-05, + "loss": 0.3043, + "mean_token_accuracy": 0.928051370382309, + "num_tokens": 985578.0, + "step": 8825 + }, + { + "entropy": 0.22631950937211515, + "epoch": 2.058281851031589, + "grad_norm": 0.6640625, + "learning_rate": 4.999987378924166e-05, + "loss": 0.4999, + "mean_token_accuracy": 0.9139285802841186, + "num_tokens": 1007174.0, + "step": 8830 + }, + { + "entropy": 0.24679026156663894, + "epoch": 2.0594474880522204, + "grad_norm": 4.3125, + "learning_rate": 4.999986683677691e-05, + "loss": 0.4819, + "mean_token_accuracy": 0.9124704420566558, + "num_tokens": 1022221.0, + "step": 8835 + }, + { + "entropy": 0.3226664915680885, + "epoch": 2.060613125072852, + "grad_norm": 4.1875, + "learning_rate": 4.999985969792036e-05, + "loss": 0.6056, + "mean_token_accuracy": 0.9017972528934479, + "num_tokens": 1034663.0, + "step": 8840 + }, + { + "entropy": 0.35001400858163834, + "epoch": 2.061778762093484, + "grad_norm": 2.0625, + "learning_rate": 4.999985237267209e-05, + "loss": 0.5449, + "mean_token_accuracy": 0.8659915328025818, + "num_tokens": 1068813.0, + "step": 8845 + }, + { + "entropy": 0.28483418338000777, + "epoch": 2.062944399114116, + "grad_norm": 2.15625, + "learning_rate": 4.999984486103222e-05, + "loss": 0.3729, + "mean_token_accuracy": 0.9030164957046509, + "num_tokens": 1085834.0, + "step": 8850 + }, + { + "entropy": 0.2964754268527031, + "epoch": 2.0641100361347475, + "grad_norm": 1.8046875, + "learning_rate": 4.999983716300086e-05, + "loss": 0.4947, + "mean_token_accuracy": 0.9007986903190612, + "num_tokens": 1107579.0, + "step": 8855 + }, + { + "entropy": 0.22273534834384917, + "epoch": 2.065275673155379, + "grad_norm": 1.890625, + "learning_rate": 4.999982927857814e-05, + "loss": 0.4777, + "mean_token_accuracy": 0.9066579163074493, + "num_tokens": 1140203.0, + "step": 8860 + }, + { + "entropy": 0.2912256710231304, + "epoch": 2.0664413101760113, + "grad_norm": 2.265625, + "learning_rate": 4.999982120776415e-05, + "loss": 0.5221, + "mean_token_accuracy": 0.8961217284202576, + "num_tokens": 1152463.0, + "step": 8865 + }, + { + "entropy": 0.2787528317421675, + "epoch": 2.067606947196643, + "grad_norm": 0.59765625, + "learning_rate": 4.999981295055903e-05, + "loss": 0.363, + "mean_token_accuracy": 0.8967165648937225, + "num_tokens": 1184273.0, + "step": 8870 + }, + { + "entropy": 0.27786948047578336, + "epoch": 2.0687725842172746, + "grad_norm": 1.34375, + "learning_rate": 4.99998045069629e-05, + "loss": 0.5324, + "mean_token_accuracy": 0.9002691745758057, + "num_tokens": 1203672.0, + "step": 8875 + }, + { + "entropy": 0.25195387527346613, + "epoch": 2.0699382212379067, + "grad_norm": 4.5, + "learning_rate": 4.999979587697589e-05, + "loss": 0.678, + "mean_token_accuracy": 0.890131413936615, + "num_tokens": 1215263.0, + "step": 8880 + }, + { + "entropy": 0.21976680774241686, + "epoch": 2.0711038582585384, + "grad_norm": 2.375, + "learning_rate": 4.9999787060598106e-05, + "loss": 0.2744, + "mean_token_accuracy": 0.9232886552810669, + "num_tokens": 1249096.0, + "step": 8885 + }, + { + "entropy": 0.2308237187564373, + "epoch": 2.07226949527917, + "grad_norm": 4.25, + "learning_rate": 4.999977805782971e-05, + "loss": 0.4887, + "mean_token_accuracy": 0.9085934281349182, + "num_tokens": 1279525.0, + "step": 8890 + }, + { + "entropy": 0.26388890072703364, + "epoch": 2.0734351322998017, + "grad_norm": 1.609375, + "learning_rate": 4.999976886867082e-05, + "loss": 0.4447, + "mean_token_accuracy": 0.9158779978752136, + "num_tokens": 1301732.0, + "step": 8895 + }, + { + "entropy": 0.20551118552684783, + "epoch": 2.074600769320434, + "grad_norm": 0.9140625, + "learning_rate": 4.999975949312157e-05, + "loss": 0.3894, + "mean_token_accuracy": 0.9103192627429962, + "num_tokens": 1319107.0, + "step": 8900 + }, + { + "entropy": 0.335835388302803, + "epoch": 2.0757664063410655, + "grad_norm": 1.46875, + "learning_rate": 4.999974993118211e-05, + "loss": 0.4564, + "mean_token_accuracy": 0.8883191406726837, + "num_tokens": 1344284.0, + "step": 8905 + }, + { + "entropy": 0.2874822109937668, + "epoch": 2.076932043361697, + "grad_norm": 5.90625, + "learning_rate": 4.999974018285258e-05, + "loss": 0.6495, + "mean_token_accuracy": 0.8915266275405884, + "num_tokens": 1353734.0, + "step": 8910 + }, + { + "entropy": 0.22463925033807755, + "epoch": 2.078097680382329, + "grad_norm": 1.765625, + "learning_rate": 4.9999730248133115e-05, + "loss": 0.3548, + "mean_token_accuracy": 0.9258445799350739, + "num_tokens": 1383642.0, + "step": 8915 + }, + { + "entropy": 0.3080735132098198, + "epoch": 2.079263317402961, + "grad_norm": 2.59375, + "learning_rate": 4.999972012702388e-05, + "loss": 0.531, + "mean_token_accuracy": 0.9089854300022125, + "num_tokens": 1395352.0, + "step": 8920 + }, + { + "entropy": 0.28653545752167703, + "epoch": 2.0804289544235925, + "grad_norm": 1.0, + "learning_rate": 4.999970981952501e-05, + "loss": 0.4904, + "mean_token_accuracy": 0.8980461597442627, + "num_tokens": 1409925.0, + "step": 8925 + }, + { + "entropy": 0.28012751042842865, + "epoch": 2.081594591444224, + "grad_norm": 6.5625, + "learning_rate": 4.999969932563667e-05, + "loss": 0.7632, + "mean_token_accuracy": 0.8697837233543396, + "num_tokens": 1419995.0, + "step": 8930 + }, + { + "entropy": 0.23538232855498792, + "epoch": 2.082760228464856, + "grad_norm": 0.65234375, + "learning_rate": 4.999968864535901e-05, + "loss": 0.4925, + "mean_token_accuracy": 0.9042471885681153, + "num_tokens": 1449443.0, + "step": 8935 + }, + { + "entropy": 0.30510496273636817, + "epoch": 2.083925865485488, + "grad_norm": 5.0625, + "learning_rate": 4.99996777786922e-05, + "loss": 0.5945, + "mean_token_accuracy": 0.8913271486759186, + "num_tokens": 1473675.0, + "step": 8940 + }, + { + "entropy": 0.3335975080728531, + "epoch": 2.0850915025061196, + "grad_norm": 1.640625, + "learning_rate": 4.9999666725636384e-05, + "loss": 0.4122, + "mean_token_accuracy": 0.9008392214775085, + "num_tokens": 1492252.0, + "step": 8945 + }, + { + "entropy": 0.40691106468439103, + "epoch": 2.0862571395267513, + "grad_norm": 2.59375, + "learning_rate": 4.999965548619174e-05, + "loss": 0.909, + "mean_token_accuracy": 0.8463218748569489, + "num_tokens": 1507068.0, + "step": 8950 + }, + { + "entropy": 0.27208645939826964, + "epoch": 2.087422776547383, + "grad_norm": 1.3046875, + "learning_rate": 4.999964406035843e-05, + "loss": 0.3961, + "mean_token_accuracy": 0.9165595531463623, + "num_tokens": 1539096.0, + "step": 8955 + }, + { + "entropy": 0.2435051068663597, + "epoch": 2.088588413568015, + "grad_norm": 4.125, + "learning_rate": 4.9999632448136634e-05, + "loss": 0.5914, + "mean_token_accuracy": 0.9110420107841491, + "num_tokens": 1561118.0, + "step": 8960 + }, + { + "entropy": 0.17810285724699498, + "epoch": 2.0897540505886467, + "grad_norm": 0.55859375, + "learning_rate": 4.999962064952651e-05, + "loss": 0.2275, + "mean_token_accuracy": 0.9336501121520996, + "num_tokens": 1587774.0, + "step": 8965 + }, + { + "entropy": 0.3193057609722018, + "epoch": 2.0909196876092784, + "grad_norm": 0.81640625, + "learning_rate": 4.9999608664528244e-05, + "loss": 0.3811, + "mean_token_accuracy": 0.8954979538917541, + "num_tokens": 1619172.0, + "step": 8970 + }, + { + "entropy": 0.22312114909291267, + "epoch": 2.09208532462991, + "grad_norm": 0.71875, + "learning_rate": 4.999959649314202e-05, + "loss": 0.5581, + "mean_token_accuracy": 0.9176813662052155, + "num_tokens": 1648290.0, + "step": 8975 + }, + { + "entropy": 0.2536992236971855, + "epoch": 2.093250961650542, + "grad_norm": 3.15625, + "learning_rate": 4.999958413536799e-05, + "loss": 0.4422, + "mean_token_accuracy": 0.9238847613334655, + "num_tokens": 1673427.0, + "step": 8980 + }, + { + "entropy": 0.22880229726433754, + "epoch": 2.094416598671174, + "grad_norm": 2.984375, + "learning_rate": 4.999957159120639e-05, + "loss": 0.4996, + "mean_token_accuracy": 0.918055659532547, + "num_tokens": 1694895.0, + "step": 8985 + }, + { + "entropy": 0.27447298876941206, + "epoch": 2.0955822356918055, + "grad_norm": 1.6640625, + "learning_rate": 4.999955886065735e-05, + "loss": 0.4274, + "mean_token_accuracy": 0.8964529633522034, + "num_tokens": 1720601.0, + "step": 8990 + }, + { + "entropy": 0.3642170369625092, + "epoch": 2.096747872712437, + "grad_norm": 5.40625, + "learning_rate": 4.9999545943721105e-05, + "loss": 0.9018, + "mean_token_accuracy": 0.8594711780548095, + "num_tokens": 1729024.0, + "step": 8995 + }, + { + "entropy": 0.3267651729285717, + "epoch": 2.0979135097330692, + "grad_norm": 2.578125, + "learning_rate": 4.999953284039782e-05, + "loss": 0.7126, + "mean_token_accuracy": 0.8791795670986176, + "num_tokens": 1741471.0, + "step": 9000 + }, + { + "entropy": 0.28976150676608087, + "epoch": 2.099079146753701, + "grad_norm": 3.03125, + "learning_rate": 4.99995195506877e-05, + "loss": 0.4418, + "mean_token_accuracy": 0.8930846333503724, + "num_tokens": 1766579.0, + "step": 9005 + }, + { + "entropy": 0.24256085567176341, + "epoch": 2.1002447837743325, + "grad_norm": 4.125, + "learning_rate": 4.999950607459095e-05, + "loss": 0.3859, + "mean_token_accuracy": 0.9247425496578217, + "num_tokens": 1784147.0, + "step": 9010 + }, + { + "entropy": 0.2715363338589668, + "epoch": 2.1014104207949647, + "grad_norm": 1.9453125, + "learning_rate": 4.999949241210776e-05, + "loss": 0.5245, + "mean_token_accuracy": 0.8970009207725524, + "num_tokens": 1799052.0, + "step": 9015 + }, + { + "entropy": 0.2423691965639591, + "epoch": 2.1025760578155963, + "grad_norm": 2.546875, + "learning_rate": 4.999947856323834e-05, + "loss": 0.3323, + "mean_token_accuracy": 0.9294469296932221, + "num_tokens": 1827445.0, + "step": 9020 + }, + { + "entropy": 0.36271433904767036, + "epoch": 2.103741694836228, + "grad_norm": 6.0, + "learning_rate": 4.9999464527982886e-05, + "loss": 0.8631, + "mean_token_accuracy": 0.8470502078533173, + "num_tokens": 1844015.0, + "step": 9025 + }, + { + "entropy": 0.3287566237151623, + "epoch": 2.1049073318568596, + "grad_norm": 5.3125, + "learning_rate": 4.9999450306341627e-05, + "loss": 0.6193, + "mean_token_accuracy": 0.8893731594085693, + "num_tokens": 1862324.0, + "step": 9030 + }, + { + "entropy": 0.2500194745138288, + "epoch": 2.1060729688774917, + "grad_norm": 2.5625, + "learning_rate": 4.999943589831476e-05, + "loss": 0.4734, + "mean_token_accuracy": 0.9056059777736664, + "num_tokens": 1882622.0, + "step": 9035 + }, + { + "entropy": 0.2787175789475441, + "epoch": 2.1072386058981234, + "grad_norm": 4.34375, + "learning_rate": 4.99994213039025e-05, + "loss": 0.64, + "mean_token_accuracy": 0.8964097678661347, + "num_tokens": 1893194.0, + "step": 9040 + }, + { + "entropy": 0.30563063621520997, + "epoch": 2.108404242918755, + "grad_norm": 2.515625, + "learning_rate": 4.999940652310507e-05, + "loss": 0.6266, + "mean_token_accuracy": 0.889850401878357, + "num_tokens": 1903993.0, + "step": 9045 + }, + { + "entropy": 0.2862144157290459, + "epoch": 2.1095698799393867, + "grad_norm": 2.859375, + "learning_rate": 4.9999391555922695e-05, + "loss": 0.7127, + "mean_token_accuracy": 0.8805917859077453, + "num_tokens": 1913810.0, + "step": 9050 + }, + { + "entropy": 0.3218904435634613, + "epoch": 2.110735516960019, + "grad_norm": 6.28125, + "learning_rate": 4.999937640235558e-05, + "loss": 0.578, + "mean_token_accuracy": 0.9038245022296906, + "num_tokens": 1926767.0, + "step": 9055 + }, + { + "entropy": 0.4377041935920715, + "epoch": 2.1119011539806505, + "grad_norm": 5.125, + "learning_rate": 4.9999361062403974e-05, + "loss": 1.034, + "mean_token_accuracy": 0.8412764191627502, + "num_tokens": 1933914.0, + "step": 9060 + }, + { + "entropy": 0.33127313032746314, + "epoch": 2.113066791001282, + "grad_norm": 2.46875, + "learning_rate": 4.999934553606809e-05, + "loss": 0.5326, + "mean_token_accuracy": 0.9018651664257049, + "num_tokens": 1944254.0, + "step": 9065 + }, + { + "entropy": 0.28445643484592437, + "epoch": 2.114232428021914, + "grad_norm": 2.25, + "learning_rate": 4.999932982334816e-05, + "loss": 0.4395, + "mean_token_accuracy": 0.9009204924106597, + "num_tokens": 1987487.0, + "step": 9070 + }, + { + "entropy": 0.27651037350296975, + "epoch": 2.115398065042546, + "grad_norm": 3.265625, + "learning_rate": 4.999931392424442e-05, + "loss": 0.501, + "mean_token_accuracy": 0.9066934764385224, + "num_tokens": 2009000.0, + "step": 9075 + }, + { + "entropy": 0.3002724215388298, + "epoch": 2.1165637020631776, + "grad_norm": 3.515625, + "learning_rate": 4.999929783875712e-05, + "loss": 0.4387, + "mean_token_accuracy": 0.9174822568893433, + "num_tokens": 2022851.0, + "step": 9080 + }, + { + "entropy": 0.3476183444261551, + "epoch": 2.1177293390838092, + "grad_norm": 2.375, + "learning_rate": 4.999928156688648e-05, + "loss": 0.6287, + "mean_token_accuracy": 0.8917272806167602, + "num_tokens": 2033218.0, + "step": 9085 + }, + { + "entropy": 0.19192668609321117, + "epoch": 2.118894976104441, + "grad_norm": 0.7421875, + "learning_rate": 4.999926510863276e-05, + "loss": 0.296, + "mean_token_accuracy": 0.9337211489677429, + "num_tokens": 2052410.0, + "step": 9090 + }, + { + "entropy": 0.3516515165567398, + "epoch": 2.120060613125073, + "grad_norm": 6.40625, + "learning_rate": 4.999924846399619e-05, + "loss": 0.7027, + "mean_token_accuracy": 0.8735112309455871, + "num_tokens": 2071429.0, + "step": 9095 + }, + { + "entropy": 0.18202774375677108, + "epoch": 2.1212262501457047, + "grad_norm": 3.984375, + "learning_rate": 4.999923163297703e-05, + "loss": 0.2499, + "mean_token_accuracy": 0.9505364894866943, + "num_tokens": 2103427.0, + "step": 9100 + }, + { + "entropy": 0.35498632192611695, + "epoch": 2.1223918871663363, + "grad_norm": 2.6875, + "learning_rate": 4.999921461557552e-05, + "loss": 0.7245, + "mean_token_accuracy": 0.8860435366630555, + "num_tokens": 2112754.0, + "step": 9105 + }, + { + "entropy": 0.30842496901750566, + "epoch": 2.123557524186968, + "grad_norm": 0.98046875, + "learning_rate": 4.999919741179193e-05, + "loss": 0.6197, + "mean_token_accuracy": 0.8909704387187958, + "num_tokens": 2131569.0, + "step": 9110 + }, + { + "entropy": 0.30220485720783474, + "epoch": 2.1247231612076, + "grad_norm": 0.68359375, + "learning_rate": 4.99991800216265e-05, + "loss": 0.4954, + "mean_token_accuracy": 0.8969570279121399, + "num_tokens": 2160157.0, + "step": 9115 + }, + { + "entropy": 0.2391287475824356, + "epoch": 2.1258887982282317, + "grad_norm": 1.5, + "learning_rate": 4.99991624450795e-05, + "loss": 0.2911, + "mean_token_accuracy": 0.9212132573127747, + "num_tokens": 2183922.0, + "step": 9120 + }, + { + "entropy": 0.26402276530861857, + "epoch": 2.1270544352488634, + "grad_norm": 4.5625, + "learning_rate": 4.999914468215119e-05, + "loss": 0.6939, + "mean_token_accuracy": 0.8836956083774566, + "num_tokens": 2198004.0, + "step": 9125 + }, + { + "entropy": 0.24503479078412055, + "epoch": 2.128220072269495, + "grad_norm": 0.765625, + "learning_rate": 4.9999126732841825e-05, + "loss": 0.3761, + "mean_token_accuracy": 0.9078773021697998, + "num_tokens": 2222287.0, + "step": 9130 + }, + { + "entropy": 0.24612213671207428, + "epoch": 2.129385709290127, + "grad_norm": 1.4453125, + "learning_rate": 4.9999108597151684e-05, + "loss": 0.5347, + "mean_token_accuracy": 0.8830598950386047, + "num_tokens": 2249866.0, + "step": 9135 + }, + { + "entropy": 0.29214831814169884, + "epoch": 2.130551346310759, + "grad_norm": 0.44921875, + "learning_rate": 4.999909027508104e-05, + "loss": 0.6057, + "mean_token_accuracy": 0.8993490874767304, + "num_tokens": 2272427.0, + "step": 9140 + }, + { + "entropy": 0.31067397333681585, + "epoch": 2.1317169833313905, + "grad_norm": 6.65625, + "learning_rate": 4.999907176663015e-05, + "loss": 0.6869, + "mean_token_accuracy": 0.8875278353691101, + "num_tokens": 2286353.0, + "step": 9145 + }, + { + "entropy": 0.37334904372692107, + "epoch": 2.1328826203520226, + "grad_norm": 3.9375, + "learning_rate": 4.999905307179931e-05, + "loss": 0.9226, + "mean_token_accuracy": 0.858127874135971, + "num_tokens": 2294175.0, + "step": 9150 + }, + { + "entropy": 0.31301565021276473, + "epoch": 2.1340482573726542, + "grad_norm": 4.28125, + "learning_rate": 4.9999034190588776e-05, + "loss": 0.6624, + "mean_token_accuracy": 0.8904252409934997, + "num_tokens": 2307730.0, + "step": 9155 + }, + { + "entropy": 0.2692046908661723, + "epoch": 2.135213894393286, + "grad_norm": 5.625, + "learning_rate": 4.9999015122998855e-05, + "loss": 0.4516, + "mean_token_accuracy": 0.9097493112087249, + "num_tokens": 2336014.0, + "step": 9160 + }, + { + "entropy": 0.32199760563671587, + "epoch": 2.1363795314139176, + "grad_norm": 0.75, + "learning_rate": 4.999899586902982e-05, + "loss": 0.5622, + "mean_token_accuracy": 0.8987860381603241, + "num_tokens": 2350363.0, + "step": 9165 + }, + { + "entropy": 0.2867584332823753, + "epoch": 2.1375451684345497, + "grad_norm": 1.5703125, + "learning_rate": 4.9998976428681946e-05, + "loss": 0.5252, + "mean_token_accuracy": 0.9095795571804046, + "num_tokens": 2361743.0, + "step": 9170 + }, + { + "entropy": 0.23813503086566926, + "epoch": 2.1387108054551813, + "grad_norm": 5.0625, + "learning_rate": 4.999895680195554e-05, + "loss": 0.3408, + "mean_token_accuracy": 0.9232898116111755, + "num_tokens": 2377516.0, + "step": 9175 + }, + { + "entropy": 0.20952856484800578, + "epoch": 2.139876442475813, + "grad_norm": 2.09375, + "learning_rate": 4.999893698885089e-05, + "loss": 0.2643, + "mean_token_accuracy": 0.9201667666435241, + "num_tokens": 2412415.0, + "step": 9180 + }, + { + "entropy": 0.2775612033903599, + "epoch": 2.1410420794964447, + "grad_norm": 2.046875, + "learning_rate": 4.9998916989368286e-05, + "loss": 0.5957, + "mean_token_accuracy": 0.9061172902584076, + "num_tokens": 2434137.0, + "step": 9185 + }, + { + "entropy": 0.33088565692305566, + "epoch": 2.1422077165170768, + "grad_norm": 2.46875, + "learning_rate": 4.999889680350804e-05, + "loss": 0.6631, + "mean_token_accuracy": 0.8853426575660706, + "num_tokens": 2448098.0, + "step": 9190 + }, + { + "entropy": 0.35131275504827497, + "epoch": 2.1433733535377084, + "grad_norm": 4.625, + "learning_rate": 4.999887643127044e-05, + "loss": 0.7744, + "mean_token_accuracy": 0.8763688683509827, + "num_tokens": 2457682.0, + "step": 9195 + }, + { + "entropy": 0.35075569823384284, + "epoch": 2.14453899055834, + "grad_norm": 6.3125, + "learning_rate": 4.999885587265578e-05, + "loss": 0.6041, + "mean_token_accuracy": 0.8755062699317933, + "num_tokens": 2470650.0, + "step": 9200 + }, + { + "entropy": 0.24803269598633051, + "epoch": 2.1457046275789717, + "grad_norm": 1.140625, + "learning_rate": 4.999883512766439e-05, + "loss": 0.4269, + "mean_token_accuracy": 0.9069607615470886, + "num_tokens": 2495284.0, + "step": 9205 + }, + { + "entropy": 0.2419408529996872, + "epoch": 2.146870264599604, + "grad_norm": 3.234375, + "learning_rate": 4.999881419629657e-05, + "loss": 0.522, + "mean_token_accuracy": 0.8945335686206818, + "num_tokens": 2515051.0, + "step": 9210 + }, + { + "entropy": 0.2983535796403885, + "epoch": 2.1480359016202355, + "grad_norm": 1.4921875, + "learning_rate": 4.999879307855263e-05, + "loss": 0.5812, + "mean_token_accuracy": 0.8816781997680664, + "num_tokens": 2534616.0, + "step": 9215 + }, + { + "entropy": 0.2238714762032032, + "epoch": 2.149201538640867, + "grad_norm": 0.85546875, + "learning_rate": 4.9998771774432886e-05, + "loss": 0.2949, + "mean_token_accuracy": 0.9409159839153289, + "num_tokens": 2568484.0, + "step": 9220 + }, + { + "entropy": 0.4029300630092621, + "epoch": 2.150367175661499, + "grad_norm": 5.03125, + "learning_rate": 4.999875028393765e-05, + "loss": 0.9551, + "mean_token_accuracy": 0.8563934624195099, + "num_tokens": 2576709.0, + "step": 9225 + }, + { + "entropy": 0.23757186979055406, + "epoch": 2.151532812682131, + "grad_norm": 1.9375, + "learning_rate": 4.999872860706725e-05, + "loss": 0.5217, + "mean_token_accuracy": 0.9103751957416535, + "num_tokens": 2594634.0, + "step": 9230 + }, + { + "entropy": 0.2861368380486965, + "epoch": 2.1526984497027626, + "grad_norm": 0.78515625, + "learning_rate": 4.999870674382202e-05, + "loss": 0.6282, + "mean_token_accuracy": 0.8889648199081421, + "num_tokens": 2611422.0, + "step": 9235 + }, + { + "entropy": 0.27242892384529116, + "epoch": 2.1538640867233942, + "grad_norm": 1.4453125, + "learning_rate": 4.999868469420226e-05, + "loss": 0.5124, + "mean_token_accuracy": 0.8922030091285705, + "num_tokens": 2626832.0, + "step": 9240 + }, + { + "entropy": 0.2865546464920044, + "epoch": 2.155029723744026, + "grad_norm": 1.5390625, + "learning_rate": 4.999866245820831e-05, + "loss": 0.6028, + "mean_token_accuracy": 0.8990983128547668, + "num_tokens": 2641289.0, + "step": 9245 + }, + { + "entropy": 0.2518985107541084, + "epoch": 2.156195360764658, + "grad_norm": 1.4921875, + "learning_rate": 4.999864003584051e-05, + "loss": 0.5041, + "mean_token_accuracy": 0.9163178861141205, + "num_tokens": 2655191.0, + "step": 9250 + }, + { + "entropy": 0.4007525980472565, + "epoch": 2.1573609977852897, + "grad_norm": 2.546875, + "learning_rate": 4.9998617427099185e-05, + "loss": 0.7206, + "mean_token_accuracy": 0.8826470613479614, + "num_tokens": 2665464.0, + "step": 9255 + }, + { + "entropy": 0.32439158745110036, + "epoch": 2.1585266348059213, + "grad_norm": 1.3828125, + "learning_rate": 4.999859463198468e-05, + "loss": 0.41, + "mean_token_accuracy": 0.8896235227584839, + "num_tokens": 2688611.0, + "step": 9260 + }, + { + "entropy": 0.2774831034243107, + "epoch": 2.159692271826553, + "grad_norm": 3.75, + "learning_rate": 4.999857165049733e-05, + "loss": 0.517, + "mean_token_accuracy": 0.9092964768409729, + "num_tokens": 2700618.0, + "step": 9265 + }, + { + "entropy": 0.22982106544077396, + "epoch": 2.160857908847185, + "grad_norm": 0.77734375, + "learning_rate": 4.999854848263747e-05, + "loss": 0.3502, + "mean_token_accuracy": 0.9041347980499268, + "num_tokens": 2729608.0, + "step": 9270 + }, + { + "entropy": 0.25315537825226786, + "epoch": 2.1620235458678168, + "grad_norm": 1.3046875, + "learning_rate": 4.999852512840546e-05, + "loss": 0.3534, + "mean_token_accuracy": 0.9024879693984985, + "num_tokens": 2754626.0, + "step": 9275 + }, + { + "entropy": 0.42271759584546087, + "epoch": 2.1631891828884484, + "grad_norm": 5.25, + "learning_rate": 4.999850158780164e-05, + "loss": 0.6212, + "mean_token_accuracy": 0.8725330352783203, + "num_tokens": 2773491.0, + "step": 9280 + }, + { + "entropy": 0.3202828958630562, + "epoch": 2.1643548199090805, + "grad_norm": 1.7421875, + "learning_rate": 4.999847786082637e-05, + "loss": 0.5417, + "mean_token_accuracy": 0.8816564381122589, + "num_tokens": 2786083.0, + "step": 9285 + }, + { + "entropy": 0.23811395298689603, + "epoch": 2.165520456929712, + "grad_norm": 4.03125, + "learning_rate": 4.9998453947479986e-05, + "loss": 0.4888, + "mean_token_accuracy": 0.9188705086708069, + "num_tokens": 2814594.0, + "step": 9290 + }, + { + "entropy": 0.3060807779431343, + "epoch": 2.166686093950344, + "grad_norm": 4.65625, + "learning_rate": 4.999842984776285e-05, + "loss": 0.2856, + "mean_token_accuracy": 0.9265418946743011, + "num_tokens": 2847092.0, + "step": 9295 + }, + { + "entropy": 0.4221868872642517, + "epoch": 2.1678517309709755, + "grad_norm": 3.109375, + "learning_rate": 4.999840556167534e-05, + "loss": 0.7217, + "mean_token_accuracy": 0.8905176043510437, + "num_tokens": 2856372.0, + "step": 9300 + }, + { + "entropy": 0.3070937130600214, + "epoch": 2.1690173679916076, + "grad_norm": 5.96875, + "learning_rate": 4.999838108921779e-05, + "loss": 0.4854, + "mean_token_accuracy": 0.9074550926685333, + "num_tokens": 2880328.0, + "step": 9305 + }, + { + "entropy": 0.406447908654809, + "epoch": 2.1701830050122393, + "grad_norm": 7.78125, + "learning_rate": 4.999835643039059e-05, + "loss": 0.7267, + "mean_token_accuracy": 0.8538496255874634, + "num_tokens": 2901644.0, + "step": 9310 + }, + { + "entropy": 0.36476370841264727, + "epoch": 2.171348642032871, + "grad_norm": 4.5, + "learning_rate": 4.9998331585194094e-05, + "loss": 0.6199, + "mean_token_accuracy": 0.8748889327049255, + "num_tokens": 2920190.0, + "step": 9315 + }, + { + "entropy": 0.39984625913202765, + "epoch": 2.1725142790535026, + "grad_norm": 4.0625, + "learning_rate": 4.9998306553628685e-05, + "loss": 0.8104, + "mean_token_accuracy": 0.8600794672966003, + "num_tokens": 2933890.0, + "step": 9320 + }, + { + "entropy": 0.29543534517288206, + "epoch": 2.1736799160741347, + "grad_norm": 4.34375, + "learning_rate": 4.9998281335694715e-05, + "loss": 0.5628, + "mean_token_accuracy": 0.902722305059433, + "num_tokens": 2954062.0, + "step": 9325 + }, + { + "entropy": 0.23410847783088684, + "epoch": 2.1748455530947663, + "grad_norm": 1.6171875, + "learning_rate": 4.999825593139257e-05, + "loss": 0.2356, + "mean_token_accuracy": 0.9225547075271606, + "num_tokens": 2985408.0, + "step": 9330 + }, + { + "entropy": 0.27478082813322546, + "epoch": 2.176011190115398, + "grad_norm": 5.125, + "learning_rate": 4.999823034072264e-05, + "loss": 0.582, + "mean_token_accuracy": 0.8869442343711853, + "num_tokens": 3002720.0, + "step": 9335 + }, + { + "entropy": 0.23570253998041152, + "epoch": 2.1771768271360297, + "grad_norm": 0.6796875, + "learning_rate": 4.999820456368529e-05, + "loss": 0.3097, + "mean_token_accuracy": 0.9236724317073822, + "num_tokens": 3029395.0, + "step": 9340 + }, + { + "entropy": 0.22580685541033746, + "epoch": 2.1783424641566618, + "grad_norm": 0.75390625, + "learning_rate": 4.999817860028092e-05, + "loss": 0.2503, + "mean_token_accuracy": 0.9057783663272858, + "num_tokens": 3063358.0, + "step": 9345 + }, + { + "entropy": 0.2577318917959929, + "epoch": 2.1795081011772934, + "grad_norm": 1.0625, + "learning_rate": 4.99981524505099e-05, + "loss": 0.4401, + "mean_token_accuracy": 0.8984253942966461, + "num_tokens": 3086596.0, + "step": 9350 + }, + { + "entropy": 0.29805635251104834, + "epoch": 2.180673738197925, + "grad_norm": 6.1875, + "learning_rate": 4.9998126114372625e-05, + "loss": 0.431, + "mean_token_accuracy": 0.9108638763427734, + "num_tokens": 3104755.0, + "step": 9355 + }, + { + "entropy": 0.2779708236455917, + "epoch": 2.1818393752185568, + "grad_norm": 5.625, + "learning_rate": 4.99980995918695e-05, + "loss": 0.7074, + "mean_token_accuracy": 0.880797129869461, + "num_tokens": 3115412.0, + "step": 9360 + }, + { + "entropy": 0.2565476704388857, + "epoch": 2.183005012239189, + "grad_norm": 9.375, + "learning_rate": 4.9998072883000916e-05, + "loss": 0.5649, + "mean_token_accuracy": 0.9002249300479889, + "num_tokens": 3128809.0, + "step": 9365 + }, + { + "entropy": 0.24895308613777162, + "epoch": 2.1841706492598205, + "grad_norm": 2.859375, + "learning_rate": 4.999804598776726e-05, + "loss": 0.5438, + "mean_token_accuracy": 0.8996934533119202, + "num_tokens": 3140578.0, + "step": 9370 + }, + { + "entropy": 0.2575036585330963, + "epoch": 2.185336286280452, + "grad_norm": 3.78125, + "learning_rate": 4.999801890616894e-05, + "loss": 0.3789, + "mean_token_accuracy": 0.9098849713802337, + "num_tokens": 3163367.0, + "step": 9375 + }, + { + "entropy": 0.3303291242569685, + "epoch": 2.186501923301084, + "grad_norm": 4.71875, + "learning_rate": 4.999799163820636e-05, + "loss": 0.6691, + "mean_token_accuracy": 0.8991015374660491, + "num_tokens": 3177936.0, + "step": 9380 + }, + { + "entropy": 0.23629399687051772, + "epoch": 2.187667560321716, + "grad_norm": 1.515625, + "learning_rate": 4.999796418387993e-05, + "loss": 0.3031, + "mean_token_accuracy": 0.9319840252399445, + "num_tokens": 3202018.0, + "step": 9385 + }, + { + "entropy": 0.2849699892103672, + "epoch": 2.1888331973423476, + "grad_norm": 3.265625, + "learning_rate": 4.9997936543190055e-05, + "loss": 0.4566, + "mean_token_accuracy": 0.9006444752216339, + "num_tokens": 3215472.0, + "step": 9390 + }, + { + "entropy": 0.2143367573618889, + "epoch": 2.1899988343629793, + "grad_norm": 0.498046875, + "learning_rate": 4.9997908716137144e-05, + "loss": 0.4926, + "mean_token_accuracy": 0.9119088351726532, + "num_tokens": 3240374.0, + "step": 9395 + }, + { + "entropy": 0.28837391287088393, + "epoch": 2.191164471383611, + "grad_norm": 3.484375, + "learning_rate": 4.999788070272162e-05, + "loss": 0.6007, + "mean_token_accuracy": 0.8912509143352508, + "num_tokens": 3260575.0, + "step": 9400 + }, + { + "entropy": 0.28325546756386755, + "epoch": 2.192330108404243, + "grad_norm": 0.70703125, + "learning_rate": 4.99978525029439e-05, + "loss": 0.4228, + "mean_token_accuracy": 0.9063177049160004, + "num_tokens": 3289344.0, + "step": 9405 + }, + { + "entropy": 0.3486835271120071, + "epoch": 2.1934957454248747, + "grad_norm": 0.859375, + "learning_rate": 4.999782411680439e-05, + "loss": 0.5126, + "mean_token_accuracy": 0.8930414140224456, + "num_tokens": 3313588.0, + "step": 9410 + }, + { + "entropy": 0.24004043117165566, + "epoch": 2.1946613824455063, + "grad_norm": 0.671875, + "learning_rate": 4.999779554430354e-05, + "loss": 0.5383, + "mean_token_accuracy": 0.8893763542175293, + "num_tokens": 3335335.0, + "step": 9415 + }, + { + "entropy": 0.34101747423410417, + "epoch": 2.1958270194661385, + "grad_norm": 5.0, + "learning_rate": 4.9997766785441744e-05, + "loss": 0.6741, + "mean_token_accuracy": 0.8797512233257294, + "num_tokens": 3344468.0, + "step": 9420 + }, + { + "entropy": 0.2273413881659508, + "epoch": 2.19699265648677, + "grad_norm": 4.09375, + "learning_rate": 4.999773784021946e-05, + "loss": 0.3916, + "mean_token_accuracy": 0.922112226486206, + "num_tokens": 3368729.0, + "step": 9425 + }, + { + "entropy": 0.3727362662553787, + "epoch": 2.1981582935074018, + "grad_norm": 3.21875, + "learning_rate": 4.99977087086371e-05, + "loss": 0.6149, + "mean_token_accuracy": 0.8818917512893677, + "num_tokens": 3382191.0, + "step": 9430 + }, + { + "entropy": 0.2546384513378143, + "epoch": 2.1993239305280334, + "grad_norm": 2.46875, + "learning_rate": 4.999767939069511e-05, + "loss": 0.3529, + "mean_token_accuracy": 0.9229681074619294, + "num_tokens": 3397221.0, + "step": 9435 + }, + { + "entropy": 0.2805795904248953, + "epoch": 2.2004895675486655, + "grad_norm": 4.34375, + "learning_rate": 4.999764988639393e-05, + "loss": 0.3557, + "mean_token_accuracy": 0.9172143399715423, + "num_tokens": 3420792.0, + "step": 9440 + }, + { + "entropy": 0.16744249165058137, + "epoch": 2.201655204569297, + "grad_norm": 2.5625, + "learning_rate": 4.999762019573398e-05, + "loss": 0.174, + "mean_token_accuracy": 0.9522073805332184, + "num_tokens": 3456134.0, + "step": 9445 + }, + { + "entropy": 0.29852482452988627, + "epoch": 2.202820841589929, + "grad_norm": 4.96875, + "learning_rate": 4.999759031871572e-05, + "loss": 0.425, + "mean_token_accuracy": 0.8770089387893677, + "num_tokens": 3492906.0, + "step": 9450 + }, + { + "entropy": 0.38035735934972764, + "epoch": 2.2039864786105605, + "grad_norm": 3.421875, + "learning_rate": 4.9997560255339594e-05, + "loss": 0.6763, + "mean_token_accuracy": 0.8728886842727661, + "num_tokens": 3511065.0, + "step": 9455 + }, + { + "entropy": 0.27133081033825873, + "epoch": 2.2051521156311926, + "grad_norm": 1.734375, + "learning_rate": 4.999753000560604e-05, + "loss": 0.4138, + "mean_token_accuracy": 0.9164373219013214, + "num_tokens": 3541530.0, + "step": 9460 + }, + { + "entropy": 0.19618333354592324, + "epoch": 2.2063177526518243, + "grad_norm": 1.1484375, + "learning_rate": 4.999749956951552e-05, + "loss": 0.3624, + "mean_token_accuracy": 0.9286963403224945, + "num_tokens": 3584076.0, + "step": 9465 + }, + { + "entropy": 0.28177291825413703, + "epoch": 2.207483389672456, + "grad_norm": 3.53125, + "learning_rate": 4.999746894706848e-05, + "loss": 0.5682, + "mean_token_accuracy": 0.9091279208660126, + "num_tokens": 3597496.0, + "step": 9470 + }, + { + "entropy": 0.2513377882540226, + "epoch": 2.2086490266930876, + "grad_norm": 2.78125, + "learning_rate": 4.999743813826539e-05, + "loss": 0.5108, + "mean_token_accuracy": 0.9064539134502411, + "num_tokens": 3615165.0, + "step": 9475 + }, + { + "entropy": 0.1830581733956933, + "epoch": 2.2098146637137197, + "grad_norm": 2.46875, + "learning_rate": 4.999740714310669e-05, + "loss": 0.3692, + "mean_token_accuracy": 0.9332661211490632, + "num_tokens": 3645536.0, + "step": 9480 + }, + { + "entropy": 0.2112976111471653, + "epoch": 2.2109803007343514, + "grad_norm": 1.5234375, + "learning_rate": 4.999737596159286e-05, + "loss": 0.265, + "mean_token_accuracy": 0.933832836151123, + "num_tokens": 3681490.0, + "step": 9485 + }, + { + "entropy": 0.18377817682921888, + "epoch": 2.212145937754983, + "grad_norm": 0.61328125, + "learning_rate": 4.999734459372436e-05, + "loss": 0.2467, + "mean_token_accuracy": 0.9410947382450103, + "num_tokens": 3720285.0, + "step": 9490 + }, + { + "entropy": 0.2656033456325531, + "epoch": 2.2133115747756147, + "grad_norm": 4.1875, + "learning_rate": 4.9997313039501645e-05, + "loss": 0.3963, + "mean_token_accuracy": 0.9108889162540436, + "num_tokens": 3740548.0, + "step": 9495 + }, + { + "entropy": 0.23448058478534223, + "epoch": 2.214477211796247, + "grad_norm": 0.73046875, + "learning_rate": 4.99972812989252e-05, + "loss": 0.3424, + "mean_token_accuracy": 0.915939325094223, + "num_tokens": 3765814.0, + "step": 9500 + }, + { + "entropy": 0.2662409141659737, + "epoch": 2.2156428488168785, + "grad_norm": 3.1875, + "learning_rate": 4.9997249371995495e-05, + "loss": 0.4384, + "mean_token_accuracy": 0.903855949640274, + "num_tokens": 3788788.0, + "step": 9505 + }, + { + "entropy": 0.3477610141038895, + "epoch": 2.21680848583751, + "grad_norm": 6.4375, + "learning_rate": 4.999721725871301e-05, + "loss": 0.8293, + "mean_token_accuracy": 0.8557975172996521, + "num_tokens": 3797775.0, + "step": 9510 + }, + { + "entropy": 0.2112328600138426, + "epoch": 2.2179741228581418, + "grad_norm": 1.015625, + "learning_rate": 4.999718495907821e-05, + "loss": 0.4428, + "mean_token_accuracy": 0.9236923813819885, + "num_tokens": 3821084.0, + "step": 9515 + }, + { + "entropy": 0.2622928474098444, + "epoch": 2.219139759878774, + "grad_norm": 4.375, + "learning_rate": 4.999715247309159e-05, + "loss": 0.4995, + "mean_token_accuracy": 0.9056877613067627, + "num_tokens": 3843608.0, + "step": 9520 + }, + { + "entropy": 0.27753055542707444, + "epoch": 2.2203053968994055, + "grad_norm": 4.53125, + "learning_rate": 4.999711980075363e-05, + "loss": 0.4655, + "mean_token_accuracy": 0.9027316212654114, + "num_tokens": 3870966.0, + "step": 9525 + }, + { + "entropy": 0.24506778195500373, + "epoch": 2.221471033920037, + "grad_norm": 7.65625, + "learning_rate": 4.999708694206481e-05, + "loss": 0.4823, + "mean_token_accuracy": 0.893078601360321, + "num_tokens": 3898239.0, + "step": 9530 + }, + { + "entropy": 0.2446131432428956, + "epoch": 2.222636670940669, + "grad_norm": 0.78515625, + "learning_rate": 4.999705389702564e-05, + "loss": 0.2505, + "mean_token_accuracy": 0.9135530173778534, + "num_tokens": 3927262.0, + "step": 9535 + }, + { + "entropy": 0.2848976690322161, + "epoch": 2.223802307961301, + "grad_norm": 3.875, + "learning_rate": 4.999702066563658e-05, + "loss": 0.4168, + "mean_token_accuracy": 0.9176702558994293, + "num_tokens": 3970657.0, + "step": 9540 + }, + { + "entropy": 0.26413310021162034, + "epoch": 2.2249679449819326, + "grad_norm": 0.6015625, + "learning_rate": 4.9996987247898165e-05, + "loss": 0.4095, + "mean_token_accuracy": 0.9062711656093597, + "num_tokens": 3998608.0, + "step": 9545 + }, + { + "entropy": 0.26131284348666667, + "epoch": 2.2261335820025643, + "grad_norm": 3.34375, + "learning_rate": 4.9996953643810865e-05, + "loss": 0.6044, + "mean_token_accuracy": 0.8963152229785919, + "num_tokens": 4026551.0, + "step": 9550 + }, + { + "entropy": 0.3163293443620205, + "epoch": 2.2272992190231964, + "grad_norm": 4.125, + "learning_rate": 4.999691985337519e-05, + "loss": 0.5343, + "mean_token_accuracy": 0.9052989423274994, + "num_tokens": 4042248.0, + "step": 9555 + }, + { + "entropy": 0.2981589982286096, + "epoch": 2.228464856043828, + "grad_norm": 2.21875, + "learning_rate": 4.999688587659165e-05, + "loss": 0.3268, + "mean_token_accuracy": 0.9134493827819824, + "num_tokens": 4070174.0, + "step": 9560 + }, + { + "entropy": 0.4530819907784462, + "epoch": 2.2296304930644597, + "grad_norm": 15.125, + "learning_rate": 4.999685171346073e-05, + "loss": 0.7796, + "mean_token_accuracy": 0.8395863771438599, + "num_tokens": 4097335.0, + "step": 9565 + }, + { + "entropy": 0.3206008315086365, + "epoch": 2.2307961300850914, + "grad_norm": 4.6875, + "learning_rate": 4.999681736398297e-05, + "loss": 0.6619, + "mean_token_accuracy": 0.8948346257209778, + "num_tokens": 4108162.0, + "step": 9570 + }, + { + "entropy": 0.22897542864084244, + "epoch": 2.2319617671057235, + "grad_norm": 3.96875, + "learning_rate": 4.9996782828158864e-05, + "loss": 0.5222, + "mean_token_accuracy": 0.8944558799266815, + "num_tokens": 4127142.0, + "step": 9575 + }, + { + "entropy": 0.30314663872122766, + "epoch": 2.233127404126355, + "grad_norm": 1.6875, + "learning_rate": 4.999674810598892e-05, + "loss": 0.6904, + "mean_token_accuracy": 0.8810445845127106, + "num_tokens": 4147146.0, + "step": 9580 + }, + { + "entropy": 0.27426308766007423, + "epoch": 2.234293041146987, + "grad_norm": 4.875, + "learning_rate": 4.9996713197473674e-05, + "loss": 0.4958, + "mean_token_accuracy": 0.8857026934623718, + "num_tokens": 4162818.0, + "step": 9585 + }, + { + "entropy": 0.24289622381329537, + "epoch": 2.2354586781676185, + "grad_norm": 3.65625, + "learning_rate": 4.999667810261364e-05, + "loss": 0.3609, + "mean_token_accuracy": 0.9134008288383484, + "num_tokens": 4180167.0, + "step": 9590 + }, + { + "entropy": 0.27631061524152756, + "epoch": 2.2366243151882506, + "grad_norm": 2.59375, + "learning_rate": 4.9996642821409337e-05, + "loss": 0.5898, + "mean_token_accuracy": 0.9002882838249207, + "num_tokens": 4192730.0, + "step": 9595 + }, + { + "entropy": 0.3148947186768055, + "epoch": 2.237789952208882, + "grad_norm": 1.984375, + "learning_rate": 4.999660735386129e-05, + "loss": 0.5189, + "mean_token_accuracy": 0.905198335647583, + "num_tokens": 4213631.0, + "step": 9600 + }, + { + "entropy": 0.2848841480910778, + "epoch": 2.238955589229514, + "grad_norm": 1.046875, + "learning_rate": 4.999657169997003e-05, + "loss": 0.4329, + "mean_token_accuracy": 0.9120867669582366, + "num_tokens": 4242635.0, + "step": 9605 + }, + { + "entropy": 0.2845593631267548, + "epoch": 2.2401212262501455, + "grad_norm": 0.4921875, + "learning_rate": 4.9996535859736094e-05, + "loss": 0.4789, + "mean_token_accuracy": 0.8860286891460418, + "num_tokens": 4269157.0, + "step": 9610 + }, + { + "entropy": 0.19470894411206247, + "epoch": 2.2412868632707776, + "grad_norm": 1.7890625, + "learning_rate": 4.999649983316002e-05, + "loss": 0.3254, + "mean_token_accuracy": 0.9371213376522064, + "num_tokens": 4294105.0, + "step": 9615 + }, + { + "entropy": 0.227097824588418, + "epoch": 2.2424525002914093, + "grad_norm": 0.5859375, + "learning_rate": 4.999646362024232e-05, + "loss": 0.3103, + "mean_token_accuracy": 0.9307858169078826, + "num_tokens": 4312123.0, + "step": 9620 + }, + { + "entropy": 0.4397062212228775, + "epoch": 2.243618137312041, + "grad_norm": 1.6484375, + "learning_rate": 4.9996427220983566e-05, + "loss": 0.685, + "mean_token_accuracy": 0.8548967897891998, + "num_tokens": 4332766.0, + "step": 9625 + }, + { + "entropy": 0.32833139449357984, + "epoch": 2.2447837743326726, + "grad_norm": 1.65625, + "learning_rate": 4.999639063538428e-05, + "loss": 0.5871, + "mean_token_accuracy": 0.9062894225120545, + "num_tokens": 4343414.0, + "step": 9630 + }, + { + "entropy": 0.2734944049268961, + "epoch": 2.2459494113533047, + "grad_norm": 7.65625, + "learning_rate": 4.9996353863445016e-05, + "loss": 0.4236, + "mean_token_accuracy": 0.8899337708950043, + "num_tokens": 4374207.0, + "step": 9635 + }, + { + "entropy": 0.358840149641037, + "epoch": 2.2471150483739364, + "grad_norm": 4.84375, + "learning_rate": 4.999631690516632e-05, + "loss": 0.8609, + "mean_token_accuracy": 0.862494820356369, + "num_tokens": 4382229.0, + "step": 9640 + }, + { + "entropy": 0.30931427925825117, + "epoch": 2.248280685394568, + "grad_norm": 3.875, + "learning_rate": 4.9996279760548746e-05, + "loss": 0.5868, + "mean_token_accuracy": 0.8713773310184478, + "num_tokens": 4402365.0, + "step": 9645 + }, + { + "entropy": 0.45645159631967547, + "epoch": 2.2494463224151997, + "grad_norm": 4.34375, + "learning_rate": 4.9996242429592846e-05, + "loss": 1.3055, + "mean_token_accuracy": 0.8058351427316666, + "num_tokens": 4412470.0, + "step": 9650 + }, + { + "entropy": 0.38287107944488524, + "epoch": 2.250611959435832, + "grad_norm": 4.15625, + "learning_rate": 4.999620491229917e-05, + "loss": 0.7953, + "mean_token_accuracy": 0.8735459566116333, + "num_tokens": 4420732.0, + "step": 9655 + }, + { + "entropy": 0.19908476956188678, + "epoch": 2.2517775964564635, + "grad_norm": 3.5625, + "learning_rate": 4.9996167208668285e-05, + "loss": 0.2668, + "mean_token_accuracy": 0.9488315343856811, + "num_tokens": 4449369.0, + "step": 9660 + }, + { + "entropy": 0.2650495123118162, + "epoch": 2.252943233477095, + "grad_norm": 3.96875, + "learning_rate": 4.9996129318700754e-05, + "loss": 0.3855, + "mean_token_accuracy": 0.8953422248363495, + "num_tokens": 4469436.0, + "step": 9665 + }, + { + "entropy": 0.267975103110075, + "epoch": 2.254108870497727, + "grad_norm": 1.546875, + "learning_rate": 4.999609124239714e-05, + "loss": 0.4036, + "mean_token_accuracy": 0.9119938552379608, + "num_tokens": 4490774.0, + "step": 9670 + }, + { + "entropy": 0.18695454075932502, + "epoch": 2.255274507518359, + "grad_norm": 9.4375, + "learning_rate": 4.999605297975801e-05, + "loss": 0.3051, + "mean_token_accuracy": 0.9388129413127899, + "num_tokens": 4522761.0, + "step": 9675 + }, + { + "entropy": 0.21596661265939474, + "epoch": 2.2564401445389906, + "grad_norm": 3.484375, + "learning_rate": 4.999601453078394e-05, + "loss": 0.4132, + "mean_token_accuracy": 0.9199613273143769, + "num_tokens": 4559702.0, + "step": 9680 + }, + { + "entropy": 0.19011754989624025, + "epoch": 2.257605781559622, + "grad_norm": 7.84375, + "learning_rate": 4.999597589547549e-05, + "loss": 0.4957, + "mean_token_accuracy": 0.9160790145397186, + "num_tokens": 4585921.0, + "step": 9685 + }, + { + "entropy": 0.2749990925192833, + "epoch": 2.2587714185802543, + "grad_norm": 2.21875, + "learning_rate": 4.999593707383324e-05, + "loss": 0.4338, + "mean_token_accuracy": 0.9007754683494568, + "num_tokens": 4602847.0, + "step": 9690 + }, + { + "entropy": 0.3145416095852852, + "epoch": 2.259937055600886, + "grad_norm": 2.234375, + "learning_rate": 4.999589806585778e-05, + "loss": 0.4691, + "mean_token_accuracy": 0.8774303197860718, + "num_tokens": 4639511.0, + "step": 9695 + }, + { + "entropy": 0.2634598663076758, + "epoch": 2.2611026926215176, + "grad_norm": 6.375, + "learning_rate": 4.999585887154969e-05, + "loss": 0.3744, + "mean_token_accuracy": 0.90503711104393, + "num_tokens": 4667015.0, + "step": 9700 + }, + { + "entropy": 0.32402964755892755, + "epoch": 2.2622683296421493, + "grad_norm": 1.5390625, + "learning_rate": 4.9995819490909544e-05, + "loss": 0.6425, + "mean_token_accuracy": 0.8781411290168762, + "num_tokens": 4681786.0, + "step": 9705 + }, + { + "entropy": 0.384226082265377, + "epoch": 2.263433966662781, + "grad_norm": 3.890625, + "learning_rate": 4.999577992393794e-05, + "loss": 0.436, + "mean_token_accuracy": 0.8902345955371856, + "num_tokens": 4698503.0, + "step": 9710 + }, + { + "entropy": 0.2572923541069031, + "epoch": 2.264599603683413, + "grad_norm": 3.15625, + "learning_rate": 4.9995740170635454e-05, + "loss": 0.5012, + "mean_token_accuracy": 0.9116134166717529, + "num_tokens": 4719487.0, + "step": 9715 + }, + { + "entropy": 0.26376195102930067, + "epoch": 2.2657652407040447, + "grad_norm": 3.34375, + "learning_rate": 4.999570023100269e-05, + "loss": 0.6524, + "mean_token_accuracy": 0.8757637798786163, + "num_tokens": 4731607.0, + "step": 9720 + }, + { + "entropy": 0.3332611836493015, + "epoch": 2.2669308777246764, + "grad_norm": 4.71875, + "learning_rate": 4.999566010504024e-05, + "loss": 0.6479, + "mean_token_accuracy": 0.8807567059993744, + "num_tokens": 4747368.0, + "step": 9725 + }, + { + "entropy": 0.25054799765348434, + "epoch": 2.2680965147453085, + "grad_norm": 4.875, + "learning_rate": 4.9995619792748704e-05, + "loss": 0.3999, + "mean_token_accuracy": 0.9023214876651764, + "num_tokens": 4770577.0, + "step": 9730 + }, + { + "entropy": 0.30612233132123945, + "epoch": 2.26926215176594, + "grad_norm": 4.90625, + "learning_rate": 4.999557929412868e-05, + "loss": 0.6414, + "mean_token_accuracy": 0.8935961663722992, + "num_tokens": 4780746.0, + "step": 9735 + }, + { + "entropy": 0.27663658782839773, + "epoch": 2.270427788786572, + "grad_norm": 5.75, + "learning_rate": 4.9995538609180786e-05, + "loss": 0.5946, + "mean_token_accuracy": 0.9024072468280793, + "num_tokens": 4803752.0, + "step": 9740 + }, + { + "entropy": 0.28249269388616083, + "epoch": 2.2715934258072035, + "grad_norm": 0.73046875, + "learning_rate": 4.9995497737905604e-05, + "loss": 0.4113, + "mean_token_accuracy": 0.8761835098266602, + "num_tokens": 4827859.0, + "step": 9745 + }, + { + "entropy": 0.32249484956264496, + "epoch": 2.2727590628278356, + "grad_norm": 4.625, + "learning_rate": 4.999545668030377e-05, + "loss": 0.4384, + "mean_token_accuracy": 0.8888987183570862, + "num_tokens": 4842531.0, + "step": 9750 + }, + { + "entropy": 0.269062814116478, + "epoch": 2.2739246998484672, + "grad_norm": 0.494140625, + "learning_rate": 4.999541543637587e-05, + "loss": 0.2706, + "mean_token_accuracy": 0.9015085399150848, + "num_tokens": 4867038.0, + "step": 9755 + }, + { + "entropy": 0.3270197004079819, + "epoch": 2.275090336869099, + "grad_norm": 2.671875, + "learning_rate": 4.999537400612254e-05, + "loss": 0.7103, + "mean_token_accuracy": 0.8657562255859375, + "num_tokens": 4887793.0, + "step": 9760 + }, + { + "entropy": 0.3015760459005833, + "epoch": 2.2762559738897306, + "grad_norm": 2.203125, + "learning_rate": 4.999533238954438e-05, + "loss": 0.4924, + "mean_token_accuracy": 0.8944982051849365, + "num_tokens": 4903059.0, + "step": 9765 + }, + { + "entropy": 0.3116227902472019, + "epoch": 2.2774216109103627, + "grad_norm": 0.71875, + "learning_rate": 4.999529058664203e-05, + "loss": 0.5879, + "mean_token_accuracy": 0.8805232584476471, + "num_tokens": 4929379.0, + "step": 9770 + }, + { + "entropy": 0.21581790670752526, + "epoch": 2.2785872479309943, + "grad_norm": 4.125, + "learning_rate": 4.9995248597416095e-05, + "loss": 0.409, + "mean_token_accuracy": 0.925852793455124, + "num_tokens": 4951193.0, + "step": 9775 + }, + { + "entropy": 0.2722294881939888, + "epoch": 2.279752884951626, + "grad_norm": 1.3125, + "learning_rate": 4.9995206421867214e-05, + "loss": 0.526, + "mean_token_accuracy": 0.888937509059906, + "num_tokens": 4975705.0, + "step": 9780 + }, + { + "entropy": 0.3000844083726406, + "epoch": 2.280918521972258, + "grad_norm": 1.671875, + "learning_rate": 4.999516405999601e-05, + "loss": 0.4401, + "mean_token_accuracy": 0.8820734798908234, + "num_tokens": 4995047.0, + "step": 9785 + }, + { + "entropy": 0.4159097492694855, + "epoch": 2.2820841589928897, + "grad_norm": 1.8203125, + "learning_rate": 4.999512151180312e-05, + "loss": 0.7088, + "mean_token_accuracy": 0.850404542684555, + "num_tokens": 5011382.0, + "step": 9790 + }, + { + "entropy": 0.3981002628803253, + "epoch": 2.2832497960135214, + "grad_norm": 1.2890625, + "learning_rate": 4.999507877728917e-05, + "loss": 0.6603, + "mean_token_accuracy": 0.8550622880458831, + "num_tokens": 5038138.0, + "step": 9795 + }, + { + "entropy": 0.32265768349170687, + "epoch": 2.284415433034153, + "grad_norm": 3.0625, + "learning_rate": 4.99950358564548e-05, + "loss": 0.5588, + "mean_token_accuracy": 0.8854742705821991, + "num_tokens": 5065301.0, + "step": 9800 + }, + { + "entropy": 0.29461741372942923, + "epoch": 2.2855810700547847, + "grad_norm": 4.625, + "learning_rate": 4.9994992749300656e-05, + "loss": 0.6825, + "mean_token_accuracy": 0.8878226995468139, + "num_tokens": 5082478.0, + "step": 9805 + }, + { + "entropy": 0.29813172612339256, + "epoch": 2.286746707075417, + "grad_norm": 4.5625, + "learning_rate": 4.999494945582738e-05, + "loss": 0.6086, + "mean_token_accuracy": 0.8924034178256989, + "num_tokens": 5101083.0, + "step": 9810 + }, + { + "entropy": 0.2548986107110977, + "epoch": 2.2879123440960485, + "grad_norm": 0.85546875, + "learning_rate": 4.99949059760356e-05, + "loss": 0.3604, + "mean_token_accuracy": 0.9100441932678223, + "num_tokens": 5128706.0, + "step": 9815 + }, + { + "entropy": 0.45110637992620467, + "epoch": 2.28907798111668, + "grad_norm": 6.15625, + "learning_rate": 4.9994862309925995e-05, + "loss": 0.6881, + "mean_token_accuracy": 0.884794396162033, + "num_tokens": 5171832.0, + "step": 9820 + }, + { + "entropy": 0.33002062514424324, + "epoch": 2.2902436181373123, + "grad_norm": 2.390625, + "learning_rate": 4.999481845749919e-05, + "loss": 0.6272, + "mean_token_accuracy": 0.883812814950943, + "num_tokens": 5184622.0, + "step": 9825 + }, + { + "entropy": 0.23434590175747871, + "epoch": 2.291409255157944, + "grad_norm": 0.70703125, + "learning_rate": 4.999477441875585e-05, + "loss": 0.2984, + "mean_token_accuracy": 0.9214715182781219, + "num_tokens": 5212319.0, + "step": 9830 + }, + { + "entropy": 0.20613327473402024, + "epoch": 2.2925748921785756, + "grad_norm": 2.1875, + "learning_rate": 4.9994730193696634e-05, + "loss": 0.2315, + "mean_token_accuracy": 0.9313937187194824, + "num_tokens": 5234725.0, + "step": 9835 + }, + { + "entropy": 0.22340739741921425, + "epoch": 2.2937405291992072, + "grad_norm": 7.78125, + "learning_rate": 4.9994685782322195e-05, + "loss": 0.5222, + "mean_token_accuracy": 0.9099694013595581, + "num_tokens": 5248701.0, + "step": 9840 + }, + { + "entropy": 0.33365772254765036, + "epoch": 2.294906166219839, + "grad_norm": 5.28125, + "learning_rate": 4.9994641184633195e-05, + "loss": 0.6683, + "mean_token_accuracy": 0.8572822451591492, + "num_tokens": 5264275.0, + "step": 9845 + }, + { + "entropy": 0.1759272739291191, + "epoch": 2.296071803240471, + "grad_norm": 1.71875, + "learning_rate": 4.999459640063031e-05, + "loss": 0.2545, + "mean_token_accuracy": 0.9185454368591308, + "num_tokens": 5291729.0, + "step": 9850 + }, + { + "entropy": 0.27733614556491376, + "epoch": 2.2972374402611027, + "grad_norm": 6.375, + "learning_rate": 4.999455143031419e-05, + "loss": 0.5294, + "mean_token_accuracy": 0.8734292328357697, + "num_tokens": 5312219.0, + "step": 9855 + }, + { + "entropy": 0.2136335577815771, + "epoch": 2.2984030772817343, + "grad_norm": 6.03125, + "learning_rate": 4.999450627368552e-05, + "loss": 0.4283, + "mean_token_accuracy": 0.9121579229831696, + "num_tokens": 5353777.0, + "step": 9860 + }, + { + "entropy": 0.26821391507983205, + "epoch": 2.2995687143023664, + "grad_norm": 2.65625, + "learning_rate": 4.999446093074497e-05, + "loss": 0.298, + "mean_token_accuracy": 0.9142944037914276, + "num_tokens": 5370503.0, + "step": 9865 + }, + { + "entropy": 0.22883135303854943, + "epoch": 2.300734351322998, + "grad_norm": 1.7265625, + "learning_rate": 4.999441540149321e-05, + "loss": 0.3735, + "mean_token_accuracy": 0.9307141840457916, + "num_tokens": 5394825.0, + "step": 9870 + }, + { + "entropy": 0.22144275531172752, + "epoch": 2.3018999883436297, + "grad_norm": 2.90625, + "learning_rate": 4.999436968593093e-05, + "loss": 0.4087, + "mean_token_accuracy": 0.9137353122234344, + "num_tokens": 5410431.0, + "step": 9875 + }, + { + "entropy": 0.27658817134797575, + "epoch": 2.3030656253642614, + "grad_norm": 4.90625, + "learning_rate": 4.9994323784058805e-05, + "loss": 0.5001, + "mean_token_accuracy": 0.9053739786148072, + "num_tokens": 5430503.0, + "step": 9880 + }, + { + "entropy": 0.28773905634880065, + "epoch": 2.3042312623848935, + "grad_norm": 5.84375, + "learning_rate": 4.999427769587752e-05, + "loss": 0.6671, + "mean_token_accuracy": 0.8880759894847869, + "num_tokens": 5441651.0, + "step": 9885 + }, + { + "entropy": 0.2942317187786102, + "epoch": 2.305396899405525, + "grad_norm": 2.921875, + "learning_rate": 4.999423142138775e-05, + "loss": 0.6182, + "mean_token_accuracy": 0.8829988837242126, + "num_tokens": 5467590.0, + "step": 9890 + }, + { + "entropy": 0.37470342963933945, + "epoch": 2.306562536426157, + "grad_norm": 4.625, + "learning_rate": 4.9994184960590215e-05, + "loss": 0.8504, + "mean_token_accuracy": 0.8617240965366364, + "num_tokens": 5475251.0, + "step": 9895 + }, + { + "entropy": 0.2544895693659782, + "epoch": 2.3077281734467885, + "grad_norm": 2.234375, + "learning_rate": 4.999413831348558e-05, + "loss": 0.324, + "mean_token_accuracy": 0.9116761982440948, + "num_tokens": 5489862.0, + "step": 9900 + }, + { + "entropy": 0.23354592993855477, + "epoch": 2.3088938104674206, + "grad_norm": 1.609375, + "learning_rate": 4.999409148007455e-05, + "loss": 0.4133, + "mean_token_accuracy": 0.908677488565445, + "num_tokens": 5515027.0, + "step": 9905 + }, + { + "entropy": 0.3375227749347687, + "epoch": 2.3100594474880523, + "grad_norm": 5.4375, + "learning_rate": 4.999404446035782e-05, + "loss": 0.6789, + "mean_token_accuracy": 0.8693827629089356, + "num_tokens": 5524493.0, + "step": 9910 + }, + { + "entropy": 0.3225863866508007, + "epoch": 2.311225084508684, + "grad_norm": 2.96875, + "learning_rate": 4.9993997254336104e-05, + "loss": 0.5651, + "mean_token_accuracy": 0.8935315608978271, + "num_tokens": 5538810.0, + "step": 9915 + }, + { + "entropy": 0.3349943034350872, + "epoch": 2.312390721529316, + "grad_norm": 5.3125, + "learning_rate": 4.99939498620101e-05, + "loss": 0.718, + "mean_token_accuracy": 0.868977564573288, + "num_tokens": 5554161.0, + "step": 9920 + }, + { + "entropy": 0.33131011128425597, + "epoch": 2.3135563585499477, + "grad_norm": 4.84375, + "learning_rate": 4.99939022833805e-05, + "loss": 0.6606, + "mean_token_accuracy": 0.886441296339035, + "num_tokens": 5564690.0, + "step": 9925 + }, + { + "entropy": 0.3090111125260592, + "epoch": 2.3147219955705793, + "grad_norm": 1.140625, + "learning_rate": 4.9993854518448033e-05, + "loss": 0.4239, + "mean_token_accuracy": 0.8963955879211426, + "num_tokens": 5594772.0, + "step": 9930 + }, + { + "entropy": 0.366110710054636, + "epoch": 2.315887632591211, + "grad_norm": 6.15625, + "learning_rate": 4.99938065672134e-05, + "loss": 0.7542, + "mean_token_accuracy": 0.8663625955581665, + "num_tokens": 5604796.0, + "step": 9935 + }, + { + "entropy": 0.2388034023344517, + "epoch": 2.3170532696118427, + "grad_norm": 2.671875, + "learning_rate": 4.9993758429677324e-05, + "loss": 0.4317, + "mean_token_accuracy": 0.9069565892219543, + "num_tokens": 5621314.0, + "step": 9940 + }, + { + "entropy": 0.2781688906252384, + "epoch": 2.3182189066324748, + "grad_norm": 3.265625, + "learning_rate": 4.999371010584051e-05, + "loss": 0.6039, + "mean_token_accuracy": 0.9053974390029907, + "num_tokens": 5634592.0, + "step": 9945 + }, + { + "entropy": 0.40243220534175633, + "epoch": 2.3193845436531064, + "grad_norm": 0.98828125, + "learning_rate": 4.999366159570369e-05, + "loss": 0.679, + "mean_token_accuracy": 0.8980438828468322, + "num_tokens": 5670943.0, + "step": 9950 + }, + { + "entropy": 0.2519418615847826, + "epoch": 2.320550180673738, + "grad_norm": 1.4921875, + "learning_rate": 4.999361289926759e-05, + "loss": 0.4197, + "mean_token_accuracy": 0.9068368792533874, + "num_tokens": 5690810.0, + "step": 9955 + }, + { + "entropy": 0.27322563864290716, + "epoch": 2.32171581769437, + "grad_norm": 1.4765625, + "learning_rate": 4.999356401653292e-05, + "loss": 0.4227, + "mean_token_accuracy": 0.8880882680416107, + "num_tokens": 5715025.0, + "step": 9960 + }, + { + "entropy": 0.26188406348228455, + "epoch": 2.322881454715002, + "grad_norm": 1.3828125, + "learning_rate": 4.999351494750043e-05, + "loss": 0.5393, + "mean_token_accuracy": 0.8850970089435577, + "num_tokens": 5728900.0, + "step": 9965 + }, + { + "entropy": 0.2880741149187088, + "epoch": 2.3240470917356335, + "grad_norm": 2.5, + "learning_rate": 4.9993465692170834e-05, + "loss": 0.5976, + "mean_token_accuracy": 0.8882794141769409, + "num_tokens": 5749009.0, + "step": 9970 + }, + { + "entropy": 0.24220139384269715, + "epoch": 2.325212728756265, + "grad_norm": 3.0625, + "learning_rate": 4.999341625054487e-05, + "loss": 0.3774, + "mean_token_accuracy": 0.9123858332633972, + "num_tokens": 5775137.0, + "step": 9975 + }, + { + "entropy": 0.4156968787312508, + "epoch": 2.326378365776897, + "grad_norm": 2.21875, + "learning_rate": 4.9993366622623286e-05, + "loss": 0.7635, + "mean_token_accuracy": 0.8408162415027618, + "num_tokens": 5796075.0, + "step": 9980 + }, + { + "entropy": 0.28073378577828406, + "epoch": 2.327544002797529, + "grad_norm": 1.5703125, + "learning_rate": 4.9993316808406807e-05, + "loss": 0.3659, + "mean_token_accuracy": 0.9161297023296356, + "num_tokens": 5823916.0, + "step": 9985 + }, + { + "entropy": 0.2217119388282299, + "epoch": 2.3287096398181606, + "grad_norm": 1.890625, + "learning_rate": 4.999326680789619e-05, + "loss": 0.2558, + "mean_token_accuracy": 0.9355304777622223, + "num_tokens": 5847977.0, + "step": 9990 + }, + { + "entropy": 0.29391138851642606, + "epoch": 2.3298752768387923, + "grad_norm": 0.80078125, + "learning_rate": 4.999321662109218e-05, + "loss": 0.4333, + "mean_token_accuracy": 0.9192675650119781, + "num_tokens": 5883540.0, + "step": 9995 + }, + { + "entropy": 0.3789336267858744, + "epoch": 2.3310409138594244, + "grad_norm": 5.28125, + "learning_rate": 4.999316624799551e-05, + "loss": 0.754, + "mean_token_accuracy": 0.8675079941749573, + "num_tokens": 5896164.0, + "step": 10000 + }, + { + "entropy": 0.35315763801336286, + "epoch": 2.332206550880056, + "grad_norm": 5.1875, + "learning_rate": 4.999311568860694e-05, + "loss": 0.7894, + "mean_token_accuracy": 0.8786959171295166, + "num_tokens": 5922708.0, + "step": 10005 + }, + { + "entropy": 0.346806064248085, + "epoch": 2.3333721879006877, + "grad_norm": 5.75, + "learning_rate": 4.999306494292722e-05, + "loss": 0.6594, + "mean_token_accuracy": 0.8650177419185638, + "num_tokens": 5942033.0, + "step": 10010 + }, + { + "entropy": 0.34452898651361463, + "epoch": 2.3345378249213193, + "grad_norm": 4.90625, + "learning_rate": 4.999301401095712e-05, + "loss": 0.7611, + "mean_token_accuracy": 0.8749968349933624, + "num_tokens": 5950976.0, + "step": 10015 + }, + { + "entropy": 0.22225252017378808, + "epoch": 2.3357034619419514, + "grad_norm": 0.65234375, + "learning_rate": 4.999296289269739e-05, + "loss": 0.4898, + "mean_token_accuracy": 0.9027444779872894, + "num_tokens": 5982814.0, + "step": 10020 + }, + { + "entropy": 0.23755434323102237, + "epoch": 2.336869098962583, + "grad_norm": 2.0, + "learning_rate": 4.999291158814879e-05, + "loss": 0.3463, + "mean_token_accuracy": 0.9088250041007996, + "num_tokens": 6005082.0, + "step": 10025 + }, + { + "entropy": 0.29438409954309464, + "epoch": 2.3380347359832148, + "grad_norm": 3.78125, + "learning_rate": 4.999286009731209e-05, + "loss": 0.6279, + "mean_token_accuracy": 0.8902266621589661, + "num_tokens": 6018880.0, + "step": 10030 + }, + { + "entropy": 0.3445430710911751, + "epoch": 2.3392003730038464, + "grad_norm": 2.21875, + "learning_rate": 4.9992808420188056e-05, + "loss": 0.4228, + "mean_token_accuracy": 0.8743822872638702, + "num_tokens": 6045068.0, + "step": 10035 + }, + { + "entropy": 0.24550675339996814, + "epoch": 2.3403660100244785, + "grad_norm": 1.7109375, + "learning_rate": 4.9992756556777457e-05, + "loss": 0.385, + "mean_token_accuracy": 0.9098658263683319, + "num_tokens": 6087604.0, + "step": 10040 + }, + { + "entropy": 0.21833256110548974, + "epoch": 2.34153164704511, + "grad_norm": 4.40625, + "learning_rate": 4.999270450708106e-05, + "loss": 0.3856, + "mean_token_accuracy": 0.9172523081302643, + "num_tokens": 6112153.0, + "step": 10045 + }, + { + "entropy": 0.18027405068278313, + "epoch": 2.342697284065742, + "grad_norm": 0.76953125, + "learning_rate": 4.999265227109966e-05, + "loss": 0.2369, + "mean_token_accuracy": 0.9305807530879975, + "num_tokens": 6142263.0, + "step": 10050 + }, + { + "entropy": 0.3012556668370962, + "epoch": 2.343862921086374, + "grad_norm": 2.5, + "learning_rate": 4.999259984883402e-05, + "loss": 0.3072, + "mean_token_accuracy": 0.9214911222457886, + "num_tokens": 6162712.0, + "step": 10055 + }, + { + "entropy": 0.2414890991523862, + "epoch": 2.3450285581070056, + "grad_norm": 3.8125, + "learning_rate": 4.9992547240284926e-05, + "loss": 0.4834, + "mean_token_accuracy": 0.8984517753124237, + "num_tokens": 6183240.0, + "step": 10060 + }, + { + "entropy": 0.29529370069503785, + "epoch": 2.3461941951276373, + "grad_norm": 0.8984375, + "learning_rate": 4.9992494445453166e-05, + "loss": 0.4558, + "mean_token_accuracy": 0.8840475618839264, + "num_tokens": 6220703.0, + "step": 10065 + }, + { + "entropy": 0.22695901058614254, + "epoch": 2.347359832148269, + "grad_norm": 1.4921875, + "learning_rate": 4.9992441464339516e-05, + "loss": 0.3135, + "mean_token_accuracy": 0.928707069158554, + "num_tokens": 6248979.0, + "step": 10070 + }, + { + "entropy": 0.2766278892755508, + "epoch": 2.3485254691689006, + "grad_norm": 1.953125, + "learning_rate": 4.999238829694478e-05, + "loss": 0.549, + "mean_token_accuracy": 0.9088833332061768, + "num_tokens": 6260919.0, + "step": 10075 + }, + { + "entropy": 0.1924386665225029, + "epoch": 2.3496911061895327, + "grad_norm": 1.03125, + "learning_rate": 4.9992334943269746e-05, + "loss": 0.2784, + "mean_token_accuracy": 0.929617577791214, + "num_tokens": 6286410.0, + "step": 10080 + }, + { + "entropy": 0.32888844013214114, + "epoch": 2.3508567432101644, + "grad_norm": 2.234375, + "learning_rate": 4.9992281403315206e-05, + "loss": 0.6281, + "mean_token_accuracy": 0.8847958862781524, + "num_tokens": 6296942.0, + "step": 10085 + }, + { + "entropy": 0.2987643375992775, + "epoch": 2.352022380230796, + "grad_norm": 4.0, + "learning_rate": 4.999222767708196e-05, + "loss": 0.5998, + "mean_token_accuracy": 0.8935287356376648, + "num_tokens": 6314166.0, + "step": 10090 + }, + { + "entropy": 0.15821853913366796, + "epoch": 2.353188017251428, + "grad_norm": 0.953125, + "learning_rate": 4.999217376457082e-05, + "loss": 0.1544, + "mean_token_accuracy": 0.9383697509765625, + "num_tokens": 6351218.0, + "step": 10095 + }, + { + "entropy": 0.3634513683617115, + "epoch": 2.35435365427206, + "grad_norm": 5.09375, + "learning_rate": 4.9992119665782564e-05, + "loss": 0.6301, + "mean_token_accuracy": 0.8913884639739991, + "num_tokens": 6369883.0, + "step": 10100 + }, + { + "entropy": 0.30150746181607246, + "epoch": 2.3555192912926914, + "grad_norm": 2.0625, + "learning_rate": 4.999206538071802e-05, + "loss": 0.5422, + "mean_token_accuracy": 0.899857884645462, + "num_tokens": 6390839.0, + "step": 10105 + }, + { + "entropy": 0.34220693577080963, + "epoch": 2.356684928313323, + "grad_norm": 3.71875, + "learning_rate": 4.999201090937799e-05, + "loss": 0.5697, + "mean_token_accuracy": 0.8890272676944733, + "num_tokens": 6409615.0, + "step": 10110 + }, + { + "entropy": 0.35875163078308103, + "epoch": 2.3578505653339548, + "grad_norm": 0.9296875, + "learning_rate": 4.99919562517633e-05, + "loss": 0.6268, + "mean_token_accuracy": 0.8982849538326263, + "num_tokens": 6437909.0, + "step": 10115 + }, + { + "entropy": 0.33476879745721816, + "epoch": 2.359016202354587, + "grad_norm": 8.875, + "learning_rate": 4.9991901407874744e-05, + "loss": 0.5472, + "mean_token_accuracy": 0.8878929436206817, + "num_tokens": 6455517.0, + "step": 10120 + }, + { + "entropy": 0.2315022237598896, + "epoch": 2.3601818393752185, + "grad_norm": 3.265625, + "learning_rate": 4.999184637771315e-05, + "loss": 0.3871, + "mean_token_accuracy": 0.9159708678722381, + "num_tokens": 6469407.0, + "step": 10125 + }, + { + "entropy": 0.3491888826712966, + "epoch": 2.36134747639585, + "grad_norm": 5.4375, + "learning_rate": 4.9991791161279335e-05, + "loss": 0.3751, + "mean_token_accuracy": 0.8821563601493836, + "num_tokens": 6502505.0, + "step": 10130 + }, + { + "entropy": 0.23251214995980263, + "epoch": 2.3625131134164823, + "grad_norm": 3.25, + "learning_rate": 4.999173575857413e-05, + "loss": 0.3658, + "mean_token_accuracy": 0.9205073654651642, + "num_tokens": 6529179.0, + "step": 10135 + }, + { + "entropy": 0.22121235951781273, + "epoch": 2.363678750437114, + "grad_norm": 1.21875, + "learning_rate": 4.999168016959836e-05, + "loss": 0.365, + "mean_token_accuracy": 0.9118550777435303, + "num_tokens": 6547764.0, + "step": 10140 + }, + { + "entropy": 0.3553631380200386, + "epoch": 2.3648443874577456, + "grad_norm": 4.34375, + "learning_rate": 4.999162439435284e-05, + "loss": 0.5203, + "mean_token_accuracy": 0.8921437501907349, + "num_tokens": 6566759.0, + "step": 10145 + }, + { + "entropy": 0.2485006831586361, + "epoch": 2.3660100244783773, + "grad_norm": 1.125, + "learning_rate": 4.999156843283842e-05, + "loss": 0.5423, + "mean_token_accuracy": 0.9096591651439667, + "num_tokens": 6582040.0, + "step": 10150 + }, + { + "entropy": 0.24684040918946265, + "epoch": 2.3671756614990094, + "grad_norm": 5.78125, + "learning_rate": 4.9991512285055925e-05, + "loss": 0.3533, + "mean_token_accuracy": 0.9087679147720337, + "num_tokens": 6612175.0, + "step": 10155 + }, + { + "entropy": 0.25654774941504, + "epoch": 2.368341298519641, + "grad_norm": 0.65234375, + "learning_rate": 4.999145595100619e-05, + "loss": 0.4405, + "mean_token_accuracy": 0.9002897024154664, + "num_tokens": 6633403.0, + "step": 10160 + }, + { + "entropy": 0.32544124498963356, + "epoch": 2.3695069355402727, + "grad_norm": 4.4375, + "learning_rate": 4.9991399430690065e-05, + "loss": 0.5944, + "mean_token_accuracy": 0.8853745698928833, + "num_tokens": 6651214.0, + "step": 10165 + }, + { + "entropy": 0.286426867544651, + "epoch": 2.3706725725609044, + "grad_norm": 3.234375, + "learning_rate": 4.9991342724108373e-05, + "loss": 0.5173, + "mean_token_accuracy": 0.9167122066020965, + "num_tokens": 6661904.0, + "step": 10170 + }, + { + "entropy": 0.23976799920201303, + "epoch": 2.3718382095815365, + "grad_norm": 0.90625, + "learning_rate": 4.999128583126199e-05, + "loss": 0.3663, + "mean_token_accuracy": 0.9295875430107117, + "num_tokens": 6689129.0, + "step": 10175 + }, + { + "entropy": 0.2436413548886776, + "epoch": 2.373003846602168, + "grad_norm": 6.21875, + "learning_rate": 4.999122875215173e-05, + "loss": 0.3969, + "mean_token_accuracy": 0.9143813967704773, + "num_tokens": 6704633.0, + "step": 10180 + }, + { + "entropy": 0.20087213069200516, + "epoch": 2.3741694836228, + "grad_norm": 3.9375, + "learning_rate": 4.9991171486778475e-05, + "loss": 0.2498, + "mean_token_accuracy": 0.9344460725784302, + "num_tokens": 6732545.0, + "step": 10185 + }, + { + "entropy": 0.2517100729048252, + "epoch": 2.3753351206434314, + "grad_norm": 3.34375, + "learning_rate": 4.999111403514306e-05, + "loss": 0.4214, + "mean_token_accuracy": 0.9093203961849212, + "num_tokens": 6753968.0, + "step": 10190 + }, + { + "entropy": 0.18460363671183586, + "epoch": 2.3765007576640635, + "grad_norm": 2.5, + "learning_rate": 4.999105639724635e-05, + "loss": 0.2715, + "mean_token_accuracy": 0.9433306694030762, + "num_tokens": 6778317.0, + "step": 10195 + }, + { + "entropy": 0.2778391644358635, + "epoch": 2.377666394684695, + "grad_norm": 2.328125, + "learning_rate": 4.99909985730892e-05, + "loss": 0.5476, + "mean_token_accuracy": 0.9003312945365906, + "num_tokens": 6790474.0, + "step": 10200 + }, + { + "entropy": 0.2117137961089611, + "epoch": 2.378832031705327, + "grad_norm": 2.78125, + "learning_rate": 4.9990940562672466e-05, + "loss": 0.3378, + "mean_token_accuracy": 0.9290489733219147, + "num_tokens": 6809944.0, + "step": 10205 + }, + { + "entropy": 0.25846754014492035, + "epoch": 2.3799976687259585, + "grad_norm": 3.9375, + "learning_rate": 4.999088236599703e-05, + "loss": 0.4026, + "mean_token_accuracy": 0.9089631319046021, + "num_tokens": 6826269.0, + "step": 10210 + }, + { + "entropy": 0.24787395931780337, + "epoch": 2.3811633057465906, + "grad_norm": 6.09375, + "learning_rate": 4.999082398306375e-05, + "loss": 0.4468, + "mean_token_accuracy": 0.9027640461921692, + "num_tokens": 6842642.0, + "step": 10215 + }, + { + "entropy": 0.31294687986373904, + "epoch": 2.3823289427672223, + "grad_norm": 2.734375, + "learning_rate": 4.99907654138735e-05, + "loss": 0.41, + "mean_token_accuracy": 0.9093664288520813, + "num_tokens": 6871017.0, + "step": 10220 + }, + { + "entropy": 0.4047424577176571, + "epoch": 2.383494579787854, + "grad_norm": 5.8125, + "learning_rate": 4.999070665842714e-05, + "loss": 0.6061, + "mean_token_accuracy": 0.8631377577781677, + "num_tokens": 6899208.0, + "step": 10225 + }, + { + "entropy": 0.32310840412974356, + "epoch": 2.384660216808486, + "grad_norm": 4.15625, + "learning_rate": 4.9990647716725565e-05, + "loss": 0.5589, + "mean_token_accuracy": 0.8941978096961976, + "num_tokens": 6915248.0, + "step": 10230 + }, + { + "entropy": 0.17672128304839135, + "epoch": 2.3858258538291177, + "grad_norm": 0.85546875, + "learning_rate": 4.9990588588769636e-05, + "loss": 0.1898, + "mean_token_accuracy": 0.932169246673584, + "num_tokens": 6948634.0, + "step": 10235 + }, + { + "entropy": 0.2601334646344185, + "epoch": 2.3869914908497494, + "grad_norm": 1.890625, + "learning_rate": 4.9990529274560256e-05, + "loss": 0.5304, + "mean_token_accuracy": 0.9085682332515717, + "num_tokens": 6959783.0, + "step": 10240 + }, + { + "entropy": 0.2183239098638296, + "epoch": 2.388157127870381, + "grad_norm": 0.70703125, + "learning_rate": 4.999046977409829e-05, + "loss": 0.3807, + "mean_token_accuracy": 0.9079346299171448, + "num_tokens": 6980023.0, + "step": 10245 + }, + { + "entropy": 0.2789805203676224, + "epoch": 2.3893227648910127, + "grad_norm": 4.3125, + "learning_rate": 4.999041008738464e-05, + "loss": 0.4494, + "mean_token_accuracy": 0.9105753004550934, + "num_tokens": 7001779.0, + "step": 10250 + }, + { + "entropy": 0.23990599066019058, + "epoch": 2.390488401911645, + "grad_norm": 1.2578125, + "learning_rate": 4.999035021442018e-05, + "loss": 0.2052, + "mean_token_accuracy": 0.9130673289299012, + "num_tokens": 7042231.0, + "step": 10255 + }, + { + "entropy": 0.33709273431450126, + "epoch": 2.3916540389322765, + "grad_norm": 3.28125, + "learning_rate": 4.999029015520582e-05, + "loss": 0.4839, + "mean_token_accuracy": 0.9022141516208648, + "num_tokens": 7064958.0, + "step": 10260 + }, + { + "entropy": 0.28133830726146697, + "epoch": 2.392819675952908, + "grad_norm": 3.34375, + "learning_rate": 4.999022990974244e-05, + "loss": 0.5064, + "mean_token_accuracy": 0.9095320522785186, + "num_tokens": 7075220.0, + "step": 10265 + }, + { + "entropy": 0.25721886157989504, + "epoch": 2.3939853129735402, + "grad_norm": 0.90625, + "learning_rate": 4.9990169478030955e-05, + "loss": 0.4456, + "mean_token_accuracy": 0.9056904733180999, + "num_tokens": 7103947.0, + "step": 10270 + }, + { + "entropy": 0.2558513440191746, + "epoch": 2.395150949994172, + "grad_norm": 5.90625, + "learning_rate": 4.999010886007225e-05, + "loss": 0.6014, + "mean_token_accuracy": 0.9011744916439056, + "num_tokens": 7115900.0, + "step": 10275 + }, + { + "entropy": 0.3650269165635109, + "epoch": 2.3963165870148035, + "grad_norm": 5.625, + "learning_rate": 4.9990048055867236e-05, + "loss": 0.8017, + "mean_token_accuracy": 0.8734424829483032, + "num_tokens": 7123489.0, + "step": 10280 + }, + { + "entropy": 0.41591184586286545, + "epoch": 2.397482224035435, + "grad_norm": 2.15625, + "learning_rate": 4.998998706541682e-05, + "loss": 0.7947, + "mean_token_accuracy": 0.8584976613521575, + "num_tokens": 7135823.0, + "step": 10285 + }, + { + "entropy": 0.27260911352932454, + "epoch": 2.3986478610560673, + "grad_norm": 0.8671875, + "learning_rate": 4.998992588872191e-05, + "loss": 0.4248, + "mean_token_accuracy": 0.9116097688674927, + "num_tokens": 7155099.0, + "step": 10290 + }, + { + "entropy": 0.3118731968104839, + "epoch": 2.399813498076699, + "grad_norm": 3.78125, + "learning_rate": 4.9989864525783426e-05, + "loss": 0.673, + "mean_token_accuracy": 0.8843840837478638, + "num_tokens": 7167022.0, + "step": 10295 + }, + { + "entropy": 0.2853694766759872, + "epoch": 2.4009791350973306, + "grad_norm": 1.9375, + "learning_rate": 4.998980297660227e-05, + "loss": 0.5487, + "mean_token_accuracy": 0.9019811570644378, + "num_tokens": 7178667.0, + "step": 10300 + }, + { + "entropy": 0.176923687197268, + "epoch": 2.4021447721179623, + "grad_norm": 3.109375, + "learning_rate": 4.998974124117937e-05, + "loss": 0.3027, + "mean_token_accuracy": 0.9455612003803253, + "num_tokens": 7213035.0, + "step": 10305 + }, + { + "entropy": 0.25026210620999334, + "epoch": 2.4033104091385944, + "grad_norm": 1.265625, + "learning_rate": 4.9989679319515637e-05, + "loss": 0.3769, + "mean_token_accuracy": 0.902446985244751, + "num_tokens": 7230402.0, + "step": 10310 + }, + { + "entropy": 0.31051740124821664, + "epoch": 2.404476046159226, + "grad_norm": 0.765625, + "learning_rate": 4.9989617211612e-05, + "loss": 0.6664, + "mean_token_accuracy": 0.8678971707820893, + "num_tokens": 7262503.0, + "step": 10315 + }, + { + "entropy": 0.3076247863471508, + "epoch": 2.4056416831798577, + "grad_norm": 2.671875, + "learning_rate": 4.998955491746938e-05, + "loss": 0.5008, + "mean_token_accuracy": 0.8952358782291412, + "num_tokens": 7276921.0, + "step": 10320 + }, + { + "entropy": 0.28582123182713987, + "epoch": 2.4068073202004894, + "grad_norm": 0.62109375, + "learning_rate": 4.9989492437088724e-05, + "loss": 0.4781, + "mean_token_accuracy": 0.9080114364624023, + "num_tokens": 7299222.0, + "step": 10325 + }, + { + "entropy": 0.22197759114205837, + "epoch": 2.4079729572211215, + "grad_norm": 3.84375, + "learning_rate": 4.998942977047094e-05, + "loss": 0.4596, + "mean_token_accuracy": 0.9132648468017578, + "num_tokens": 7330389.0, + "step": 10330 + }, + { + "entropy": 0.31743494495749475, + "epoch": 2.409138594241753, + "grad_norm": 4.84375, + "learning_rate": 4.998936691761698e-05, + "loss": 0.642, + "mean_token_accuracy": 0.8967314422130584, + "num_tokens": 7341450.0, + "step": 10335 + }, + { + "entropy": 0.23332953825592995, + "epoch": 2.410304231262385, + "grad_norm": 0.62890625, + "learning_rate": 4.9989303878527774e-05, + "loss": 0.3062, + "mean_token_accuracy": 0.9363838791847229, + "num_tokens": 7378009.0, + "step": 10340 + }, + { + "entropy": 0.2116334456950426, + "epoch": 2.4114698682830165, + "grad_norm": 1.0234375, + "learning_rate": 4.998924065320426e-05, + "loss": 0.4065, + "mean_token_accuracy": 0.9076114535331726, + "num_tokens": 7395486.0, + "step": 10345 + }, + { + "entropy": 0.37385508567094805, + "epoch": 2.4126355053036486, + "grad_norm": 5.125, + "learning_rate": 4.9989177241647376e-05, + "loss": 0.8335, + "mean_token_accuracy": 0.8719491124153137, + "num_tokens": 7403633.0, + "step": 10350 + }, + { + "entropy": 0.2373753260821104, + "epoch": 2.4138011423242802, + "grad_norm": 1.609375, + "learning_rate": 4.998911364385808e-05, + "loss": 0.3948, + "mean_token_accuracy": 0.9229023635387421, + "num_tokens": 7427889.0, + "step": 10355 + }, + { + "entropy": 0.26355861928313973, + "epoch": 2.414966779344912, + "grad_norm": 0.546875, + "learning_rate": 4.998904985983732e-05, + "loss": 0.3508, + "mean_token_accuracy": 0.9301092326641083, + "num_tokens": 7452324.0, + "step": 10360 + }, + { + "entropy": 0.2737158641219139, + "epoch": 2.416132416365544, + "grad_norm": 0.8828125, + "learning_rate": 4.998898588958604e-05, + "loss": 0.414, + "mean_token_accuracy": 0.9086792588233947, + "num_tokens": 7470897.0, + "step": 10365 + }, + { + "entropy": 0.19756668284535409, + "epoch": 2.4172980533861756, + "grad_norm": 2.671875, + "learning_rate": 4.9988921733105196e-05, + "loss": 0.35, + "mean_token_accuracy": 0.9091159641742707, + "num_tokens": 7499445.0, + "step": 10370 + }, + { + "entropy": 0.23579902928322555, + "epoch": 2.4184636904068073, + "grad_norm": 9.0625, + "learning_rate": 4.998885739039574e-05, + "loss": 0.5282, + "mean_token_accuracy": 0.9076445281505585, + "num_tokens": 7532011.0, + "step": 10375 + }, + { + "entropy": 0.32499233111739156, + "epoch": 2.419629327427439, + "grad_norm": 1.421875, + "learning_rate": 4.998879286145863e-05, + "loss": 0.5701, + "mean_token_accuracy": 0.8866388320922851, + "num_tokens": 7552376.0, + "step": 10380 + }, + { + "entropy": 0.24429988935589791, + "epoch": 2.4207949644480706, + "grad_norm": 4.125, + "learning_rate": 4.998872814629485e-05, + "loss": 0.4168, + "mean_token_accuracy": 0.9179956972599029, + "num_tokens": 7576024.0, + "step": 10385 + }, + { + "entropy": 0.2302600122988224, + "epoch": 2.4219606014687027, + "grad_norm": 1.4375, + "learning_rate": 4.998866324490534e-05, + "loss": 0.1357, + "mean_token_accuracy": 0.9121902585029602, + "num_tokens": 7622278.0, + "step": 10390 + }, + { + "entropy": 0.2962619811296463, + "epoch": 2.4231262384893344, + "grad_norm": 6.5625, + "learning_rate": 4.9988598157291076e-05, + "loss": 0.5138, + "mean_token_accuracy": 0.8958117604255676, + "num_tokens": 7644126.0, + "step": 10395 + }, + { + "entropy": 0.2693315804004669, + "epoch": 2.424291875509966, + "grad_norm": 0.69140625, + "learning_rate": 4.998853288345303e-05, + "loss": 0.4465, + "mean_token_accuracy": 0.9200535476207733, + "num_tokens": 7666542.0, + "step": 10400 + }, + { + "entropy": 0.25582237504422667, + "epoch": 2.425457512530598, + "grad_norm": 6.4375, + "learning_rate": 4.9988467423392184e-05, + "loss": 0.3563, + "mean_token_accuracy": 0.9283755540847778, + "num_tokens": 7687016.0, + "step": 10405 + }, + { + "entropy": 0.32354746013879776, + "epoch": 2.42662314955123, + "grad_norm": 5.0, + "learning_rate": 4.99884017771095e-05, + "loss": 0.658, + "mean_token_accuracy": 0.870765084028244, + "num_tokens": 7710754.0, + "step": 10410 + }, + { + "entropy": 0.28638599887490274, + "epoch": 2.4277887865718615, + "grad_norm": 5.59375, + "learning_rate": 4.998833594460596e-05, + "loss": 0.4625, + "mean_token_accuracy": 0.9126111924648285, + "num_tokens": 7722276.0, + "step": 10415 + }, + { + "entropy": 0.27470932640135287, + "epoch": 2.428954423592493, + "grad_norm": 0.8984375, + "learning_rate": 4.9988269925882545e-05, + "loss": 0.2832, + "mean_token_accuracy": 0.9213086009025574, + "num_tokens": 7746462.0, + "step": 10420 + }, + { + "entropy": 0.2574032604694366, + "epoch": 2.4301200606131252, + "grad_norm": 3.703125, + "learning_rate": 4.998820372094024e-05, + "loss": 0.5251, + "mean_token_accuracy": 0.906989449262619, + "num_tokens": 7766979.0, + "step": 10425 + }, + { + "entropy": 0.29713969230651854, + "epoch": 2.431285697633757, + "grad_norm": 4.96875, + "learning_rate": 4.9988137329780045e-05, + "loss": 0.4792, + "mean_token_accuracy": 0.9020000696182251, + "num_tokens": 7788246.0, + "step": 10430 + }, + { + "entropy": 0.29970705658197405, + "epoch": 2.4324513346543886, + "grad_norm": 6.25, + "learning_rate": 4.998807075240293e-05, + "loss": 0.5492, + "mean_token_accuracy": 0.8868921577930451, + "num_tokens": 7806839.0, + "step": 10435 + }, + { + "entropy": 0.24076540470123292, + "epoch": 2.4336169716750202, + "grad_norm": 1.2421875, + "learning_rate": 4.99880039888099e-05, + "loss": 0.2813, + "mean_token_accuracy": 0.9311257004737854, + "num_tokens": 7834109.0, + "step": 10440 + }, + { + "entropy": 0.36454578340053556, + "epoch": 2.4347826086956523, + "grad_norm": 4.25, + "learning_rate": 4.998793703900195e-05, + "loss": 0.7615, + "mean_token_accuracy": 0.8765661299228669, + "num_tokens": 7842120.0, + "step": 10445 + }, + { + "entropy": 0.27470249533653257, + "epoch": 2.435948245716284, + "grad_norm": 5.0625, + "learning_rate": 4.9987869902980075e-05, + "loss": 0.5855, + "mean_token_accuracy": 0.8889828503131867, + "num_tokens": 7853213.0, + "step": 10450 + }, + { + "entropy": 0.2980054959654808, + "epoch": 2.4371138827369156, + "grad_norm": 6.375, + "learning_rate": 4.998780258074527e-05, + "loss": 0.5448, + "mean_token_accuracy": 0.8802853941917419, + "num_tokens": 7871802.0, + "step": 10455 + }, + { + "entropy": 0.21270959489047528, + "epoch": 2.4382795197575473, + "grad_norm": 2.921875, + "learning_rate": 4.998773507229855e-05, + "loss": 0.3678, + "mean_token_accuracy": 0.9348537504673005, + "num_tokens": 7892438.0, + "step": 10460 + }, + { + "entropy": 0.21355258971452712, + "epoch": 2.4394451567781794, + "grad_norm": 2.875, + "learning_rate": 4.9987667377640924e-05, + "loss": 0.3796, + "mean_token_accuracy": 0.9237074673175811, + "num_tokens": 7931868.0, + "step": 10465 + }, + { + "entropy": 0.24989582411944866, + "epoch": 2.440610793798811, + "grad_norm": 6.84375, + "learning_rate": 4.9987599496773385e-05, + "loss": 0.473, + "mean_token_accuracy": 0.9103607594966888, + "num_tokens": 7953162.0, + "step": 10470 + }, + { + "entropy": 0.2442274335771799, + "epoch": 2.4417764308194427, + "grad_norm": 2.59375, + "learning_rate": 4.998753142969696e-05, + "loss": 0.5058, + "mean_token_accuracy": 0.914693397283554, + "num_tokens": 7983835.0, + "step": 10475 + }, + { + "entropy": 0.23022452220320702, + "epoch": 2.4429420678400744, + "grad_norm": 1.0390625, + "learning_rate": 4.9987463176412664e-05, + "loss": 0.2837, + "mean_token_accuracy": 0.9182737469673157, + "num_tokens": 8009550.0, + "step": 10480 + }, + { + "entropy": 0.3430052354931831, + "epoch": 2.4441077048607065, + "grad_norm": 2.859375, + "learning_rate": 4.99873947369215e-05, + "loss": 0.6588, + "mean_token_accuracy": 0.8753048777580261, + "num_tokens": 8021958.0, + "step": 10485 + }, + { + "entropy": 0.29453707188367845, + "epoch": 2.445273341881338, + "grad_norm": 4.4375, + "learning_rate": 4.9987326111224506e-05, + "loss": 0.4625, + "mean_token_accuracy": 0.9004810392856598, + "num_tokens": 8042746.0, + "step": 10490 + }, + { + "entropy": 0.2460491180419922, + "epoch": 2.44643897890197, + "grad_norm": 0.99609375, + "learning_rate": 4.998725729932269e-05, + "loss": 0.3338, + "mean_token_accuracy": 0.9131915867328644, + "num_tokens": 8061108.0, + "step": 10495 + }, + { + "entropy": 0.36034695208072665, + "epoch": 2.447604615922602, + "grad_norm": 1.578125, + "learning_rate": 4.9987188301217095e-05, + "loss": 0.6873, + "mean_token_accuracy": 0.8622980952262879, + "num_tokens": 8081809.0, + "step": 10500 + }, + { + "entropy": 0.30376859456300737, + "epoch": 2.4487702529432336, + "grad_norm": 3.359375, + "learning_rate": 4.9987119116908734e-05, + "loss": 0.4464, + "mean_token_accuracy": 0.9128959953784943, + "num_tokens": 8121401.0, + "step": 10505 + }, + { + "entropy": 0.2379370667040348, + "epoch": 2.4499358899638652, + "grad_norm": 4.125, + "learning_rate": 4.9987049746398645e-05, + "loss": 0.4646, + "mean_token_accuracy": 0.9152859389781952, + "num_tokens": 8145718.0, + "step": 10510 + }, + { + "entropy": 0.2736461482942104, + "epoch": 2.451101526984497, + "grad_norm": 4.78125, + "learning_rate": 4.9986980189687865e-05, + "loss": 0.3996, + "mean_token_accuracy": 0.9089068353176117, + "num_tokens": 8173641.0, + "step": 10515 + }, + { + "entropy": 0.24744717925786971, + "epoch": 2.4522671640051286, + "grad_norm": 1.234375, + "learning_rate": 4.998691044677743e-05, + "loss": 0.3728, + "mean_token_accuracy": 0.9204553902149201, + "num_tokens": 8200222.0, + "step": 10520 + }, + { + "entropy": 0.2574811242520809, + "epoch": 2.4534328010257607, + "grad_norm": 4.21875, + "learning_rate": 4.998684051766838e-05, + "loss": 0.5571, + "mean_token_accuracy": 0.9103271067142487, + "num_tokens": 8214116.0, + "step": 10525 + }, + { + "entropy": 0.31053818836808206, + "epoch": 2.4545984380463923, + "grad_norm": 4.0, + "learning_rate": 4.998677040236175e-05, + "loss": 0.6819, + "mean_token_accuracy": 0.8785336196422577, + "num_tokens": 8224261.0, + "step": 10530 + }, + { + "entropy": 0.35476598888635635, + "epoch": 2.455764075067024, + "grad_norm": 6.1875, + "learning_rate": 4.99867001008586e-05, + "loss": 0.6586, + "mean_token_accuracy": 0.8944872498512269, + "num_tokens": 8232938.0, + "step": 10535 + }, + { + "entropy": 0.2782547645270824, + "epoch": 2.456929712087656, + "grad_norm": 4.09375, + "learning_rate": 4.998662961315996e-05, + "loss": 0.5864, + "mean_token_accuracy": 0.8916796863079071, + "num_tokens": 8243533.0, + "step": 10540 + }, + { + "entropy": 0.34424693398177625, + "epoch": 2.4580953491082878, + "grad_norm": 64.5, + "learning_rate": 4.9986558939266906e-05, + "loss": 0.2403, + "mean_token_accuracy": 0.9354278743267059, + "num_tokens": 8285890.0, + "step": 10545 + }, + { + "entropy": 0.3282710451632738, + "epoch": 2.4592609861289194, + "grad_norm": 3.875, + "learning_rate": 4.9986488079180464e-05, + "loss": 0.441, + "mean_token_accuracy": 0.8940746068954468, + "num_tokens": 8310274.0, + "step": 10550 + }, + { + "entropy": 0.2680108778178692, + "epoch": 2.460426623149551, + "grad_norm": 3.671875, + "learning_rate": 4.9986417032901714e-05, + "loss": 0.5245, + "mean_token_accuracy": 0.8968818008899688, + "num_tokens": 8336585.0, + "step": 10555 + }, + { + "entropy": 0.2213997296988964, + "epoch": 2.461592260170183, + "grad_norm": 4.03125, + "learning_rate": 4.99863458004317e-05, + "loss": 0.3793, + "mean_token_accuracy": 0.911593621969223, + "num_tokens": 8369433.0, + "step": 10560 + }, + { + "entropy": 0.2799895711243153, + "epoch": 2.462757897190815, + "grad_norm": 1.8125, + "learning_rate": 4.998627438177149e-05, + "loss": 0.4496, + "mean_token_accuracy": 0.897063159942627, + "num_tokens": 8390854.0, + "step": 10565 + }, + { + "entropy": 0.3811504438519478, + "epoch": 2.4639235342114465, + "grad_norm": 7.8125, + "learning_rate": 4.998620277692215e-05, + "loss": 0.5295, + "mean_token_accuracy": 0.8757772445678711, + "num_tokens": 8417799.0, + "step": 10570 + }, + { + "entropy": 0.27391971051692965, + "epoch": 2.465089171232078, + "grad_norm": 2.734375, + "learning_rate": 4.998613098588475e-05, + "loss": 0.4542, + "mean_token_accuracy": 0.9075638949871063, + "num_tokens": 8432407.0, + "step": 10575 + }, + { + "entropy": 0.31272627636790273, + "epoch": 2.4662548082527103, + "grad_norm": 0.76171875, + "learning_rate": 4.998605900866035e-05, + "loss": 0.4501, + "mean_token_accuracy": 0.8941393792629242, + "num_tokens": 8454557.0, + "step": 10580 + }, + { + "entropy": 0.2570257782936096, + "epoch": 2.467420445273342, + "grad_norm": 4.78125, + "learning_rate": 4.998598684525003e-05, + "loss": 0.5738, + "mean_token_accuracy": 0.9066219568252564, + "num_tokens": 8467066.0, + "step": 10585 + }, + { + "entropy": 0.22484007887542248, + "epoch": 2.4685860822939736, + "grad_norm": 2.4375, + "learning_rate": 4.9985914495654865e-05, + "loss": 0.2423, + "mean_token_accuracy": 0.9273537278175354, + "num_tokens": 8492449.0, + "step": 10590 + }, + { + "entropy": 0.2611137468367815, + "epoch": 2.4697517193146052, + "grad_norm": 0.6015625, + "learning_rate": 4.998584195987595e-05, + "loss": 0.3819, + "mean_token_accuracy": 0.9144336521625519, + "num_tokens": 8520080.0, + "step": 10595 + }, + { + "entropy": 0.346079520508647, + "epoch": 2.4709173563352373, + "grad_norm": 0.85546875, + "learning_rate": 4.998576923791433e-05, + "loss": 0.5432, + "mean_token_accuracy": 0.8557630747556686, + "num_tokens": 8545422.0, + "step": 10600 + }, + { + "entropy": 0.3471344619989395, + "epoch": 2.472082993355869, + "grad_norm": 1.953125, + "learning_rate": 4.998569632977112e-05, + "loss": 0.7202, + "mean_token_accuracy": 0.8664121389389038, + "num_tokens": 8558882.0, + "step": 10605 + }, + { + "entropy": 0.2631093353033066, + "epoch": 2.4732486303765007, + "grad_norm": 4.46875, + "learning_rate": 4.9985623235447405e-05, + "loss": 0.6804, + "mean_token_accuracy": 0.8926944673061371, + "num_tokens": 8570144.0, + "step": 10610 + }, + { + "entropy": 0.23091779723763467, + "epoch": 2.4744142673971323, + "grad_norm": 1.390625, + "learning_rate": 4.998554995494426e-05, + "loss": 0.3858, + "mean_token_accuracy": 0.9179224729537964, + "num_tokens": 8605575.0, + "step": 10615 + }, + { + "entropy": 0.2587498303502798, + "epoch": 2.4755799044177644, + "grad_norm": 1.328125, + "learning_rate": 4.998547648826279e-05, + "loss": 0.5111, + "mean_token_accuracy": 0.9113299548625946, + "num_tokens": 8634576.0, + "step": 10620 + }, + { + "entropy": 0.3614876292645931, + "epoch": 2.476745541438396, + "grad_norm": 2.234375, + "learning_rate": 4.998540283540408e-05, + "loss": 0.4811, + "mean_token_accuracy": 0.892818284034729, + "num_tokens": 8653804.0, + "step": 10625 + }, + { + "entropy": 0.30118090808391573, + "epoch": 2.4779111784590278, + "grad_norm": 0.458984375, + "learning_rate": 4.998532899636925e-05, + "loss": 0.523, + "mean_token_accuracy": 0.8992532551288605, + "num_tokens": 8681806.0, + "step": 10630 + }, + { + "entropy": 0.29017008394002913, + "epoch": 2.47907681547966, + "grad_norm": 2.03125, + "learning_rate": 4.998525497115937e-05, + "loss": 0.5133, + "mean_token_accuracy": 0.8909639179706573, + "num_tokens": 8694925.0, + "step": 10635 + }, + { + "entropy": 0.23841165266931058, + "epoch": 2.4802424525002915, + "grad_norm": 3.46875, + "learning_rate": 4.9985180759775566e-05, + "loss": 0.3843, + "mean_token_accuracy": 0.9136591196060181, + "num_tokens": 8714809.0, + "step": 10640 + }, + { + "entropy": 0.2732238098978996, + "epoch": 2.481408089520923, + "grad_norm": 5.09375, + "learning_rate": 4.9985106362218935e-05, + "loss": 0.3983, + "mean_token_accuracy": 0.924947077035904, + "num_tokens": 8738299.0, + "step": 10645 + }, + { + "entropy": 0.3165955483913422, + "epoch": 2.482573726541555, + "grad_norm": 4.25, + "learning_rate": 4.998503177849059e-05, + "loss": 0.5408, + "mean_token_accuracy": 0.9061666548252105, + "num_tokens": 8747927.0, + "step": 10650 + }, + { + "entropy": 0.2890742287039757, + "epoch": 2.4837393635621865, + "grad_norm": 2.953125, + "learning_rate": 4.9984957008591644e-05, + "loss": 0.4619, + "mean_token_accuracy": 0.9118583619594574, + "num_tokens": 8770724.0, + "step": 10655 + }, + { + "entropy": 0.2947452884167433, + "epoch": 2.4849050005828186, + "grad_norm": 4.1875, + "learning_rate": 4.9984882052523206e-05, + "loss": 0.6179, + "mean_token_accuracy": 0.8985697567462921, + "num_tokens": 8786494.0, + "step": 10660 + }, + { + "entropy": 0.22197375893592836, + "epoch": 2.4860706376034503, + "grad_norm": 0.96484375, + "learning_rate": 4.9984806910286406e-05, + "loss": 0.4445, + "mean_token_accuracy": 0.9120029747486115, + "num_tokens": 8803727.0, + "step": 10665 + }, + { + "entropy": 0.26896577328443527, + "epoch": 2.487236274624082, + "grad_norm": 5.0, + "learning_rate": 4.9984731581882355e-05, + "loss": 0.5693, + "mean_token_accuracy": 0.8987299025058746, + "num_tokens": 8814973.0, + "step": 10670 + }, + { + "entropy": 0.23743227310478687, + "epoch": 2.488401911644714, + "grad_norm": 2.96875, + "learning_rate": 4.998465606731217e-05, + "loss": 0.4218, + "mean_token_accuracy": 0.9243048131465912, + "num_tokens": 8831458.0, + "step": 10675 + }, + { + "entropy": 0.21038546413183212, + "epoch": 2.4895675486653457, + "grad_norm": 3.71875, + "learning_rate": 4.9984580366576996e-05, + "loss": 0.454, + "mean_token_accuracy": 0.922557246685028, + "num_tokens": 8847184.0, + "step": 10680 + }, + { + "entropy": 0.23371059447526932, + "epoch": 2.4907331856859773, + "grad_norm": 4.90625, + "learning_rate": 4.998450447967794e-05, + "loss": 0.392, + "mean_token_accuracy": 0.8991166591644287, + "num_tokens": 8866368.0, + "step": 10685 + }, + { + "entropy": 0.2095944918692112, + "epoch": 2.491898822706609, + "grad_norm": 6.3125, + "learning_rate": 4.998442840661616e-05, + "loss": 0.5321, + "mean_token_accuracy": 0.9088472187519073, + "num_tokens": 8891458.0, + "step": 10690 + }, + { + "entropy": 0.275274308398366, + "epoch": 2.493064459727241, + "grad_norm": 1.4296875, + "learning_rate": 4.998435214739276e-05, + "loss": 0.3744, + "mean_token_accuracy": 0.9091500878334046, + "num_tokens": 8917181.0, + "step": 10695 + }, + { + "entropy": 0.19943911097943784, + "epoch": 2.4942300967478728, + "grad_norm": 4.15625, + "learning_rate": 4.99842757020089e-05, + "loss": 0.3194, + "mean_token_accuracy": 0.9300190031528472, + "num_tokens": 8948477.0, + "step": 10700 + }, + { + "entropy": 0.22412441447377204, + "epoch": 2.4953957337685044, + "grad_norm": 2.46875, + "learning_rate": 4.9984199070465707e-05, + "loss": 0.3621, + "mean_token_accuracy": 0.9259128868579865, + "num_tokens": 8970395.0, + "step": 10705 + }, + { + "entropy": 0.2479204297065735, + "epoch": 2.496561370789136, + "grad_norm": 0.62109375, + "learning_rate": 4.998412225276433e-05, + "loss": 0.3383, + "mean_token_accuracy": 0.9196534276008606, + "num_tokens": 8995002.0, + "step": 10710 + }, + { + "entropy": 0.1580579474568367, + "epoch": 2.497727007809768, + "grad_norm": 1.3125, + "learning_rate": 4.998404524890592e-05, + "loss": 0.144, + "mean_token_accuracy": 0.954412579536438, + "num_tokens": 9034707.0, + "step": 10715 + }, + { + "entropy": 0.24564942046999932, + "epoch": 2.4988926448304, + "grad_norm": 0.625, + "learning_rate": 4.998396805889161e-05, + "loss": 0.4918, + "mean_token_accuracy": 0.9128167867660523, + "num_tokens": 9055018.0, + "step": 10720 + }, + { + "entropy": 0.26528125405311587, + "epoch": 2.5000582818510315, + "grad_norm": 4.28125, + "learning_rate": 4.998389068272256e-05, + "loss": 0.4535, + "mean_token_accuracy": 0.9169175028800964, + "num_tokens": 9069174.0, + "step": 10725 + }, + { + "entropy": 0.2899410419166088, + "epoch": 2.5012239188716636, + "grad_norm": 1.6171875, + "learning_rate": 4.998381312039992e-05, + "loss": 0.5064, + "mean_token_accuracy": 0.9001691699028015, + "num_tokens": 9087041.0, + "step": 10730 + }, + { + "entropy": 0.19549113065004348, + "epoch": 2.5023895558922953, + "grad_norm": 2.1875, + "learning_rate": 4.998373537192486e-05, + "loss": 0.2986, + "mean_token_accuracy": 0.9320485651493072, + "num_tokens": 9104595.0, + "step": 10735 + }, + { + "entropy": 0.29923654198646543, + "epoch": 2.503555192912927, + "grad_norm": 2.078125, + "learning_rate": 4.998365743729852e-05, + "loss": 0.464, + "mean_token_accuracy": 0.917408549785614, + "num_tokens": 9116161.0, + "step": 10740 + }, + { + "entropy": 0.2908829629421234, + "epoch": 2.5047208299335586, + "grad_norm": 3.4375, + "learning_rate": 4.998357931652208e-05, + "loss": 0.5, + "mean_token_accuracy": 0.9038378894329071, + "num_tokens": 9133475.0, + "step": 10745 + }, + { + "entropy": 0.325673321634531, + "epoch": 2.5058864669541903, + "grad_norm": 3.6875, + "learning_rate": 4.998350100959669e-05, + "loss": 0.5424, + "mean_token_accuracy": 0.8966376125812531, + "num_tokens": 9146126.0, + "step": 10750 + }, + { + "entropy": 0.24616522938013077, + "epoch": 2.5070521039748224, + "grad_norm": 1.5625, + "learning_rate": 4.9983422516523524e-05, + "loss": 0.2964, + "mean_token_accuracy": 0.9170722723007202, + "num_tokens": 9177231.0, + "step": 10755 + }, + { + "entropy": 0.30409452468156817, + "epoch": 2.508217740995454, + "grad_norm": 0.46484375, + "learning_rate": 4.9983343837303755e-05, + "loss": 0.4707, + "mean_token_accuracy": 0.8934083580970764, + "num_tokens": 9201339.0, + "step": 10760 + }, + { + "entropy": 0.3546369731426239, + "epoch": 2.5093833780160857, + "grad_norm": 2.796875, + "learning_rate": 4.998326497193855e-05, + "loss": 0.7408, + "mean_token_accuracy": 0.8691607177257538, + "num_tokens": 9220413.0, + "step": 10765 + }, + { + "entropy": 0.3403468161821365, + "epoch": 2.510549015036718, + "grad_norm": 4.0625, + "learning_rate": 4.998318592042909e-05, + "loss": 0.7227, + "mean_token_accuracy": 0.8796669840812683, + "num_tokens": 9229228.0, + "step": 10770 + }, + { + "entropy": 0.19227586612105368, + "epoch": 2.5117146520573495, + "grad_norm": 0.58984375, + "learning_rate": 4.998310668277655e-05, + "loss": 0.2376, + "mean_token_accuracy": 0.9470361351966858, + "num_tokens": 9262657.0, + "step": 10775 + }, + { + "entropy": 0.2915323942899704, + "epoch": 2.512880289077981, + "grad_norm": 5.125, + "learning_rate": 4.998302725898211e-05, + "loss": 0.6645, + "mean_token_accuracy": 0.884123957157135, + "num_tokens": 9276618.0, + "step": 10780 + }, + { + "entropy": 0.2815995916724205, + "epoch": 2.5140459260986128, + "grad_norm": 4.5, + "learning_rate": 4.9982947649046965e-05, + "loss": 0.3543, + "mean_token_accuracy": 0.9117548644542695, + "num_tokens": 9297003.0, + "step": 10785 + }, + { + "entropy": 0.270504492521286, + "epoch": 2.5152115631192444, + "grad_norm": 6.03125, + "learning_rate": 4.998286785297229e-05, + "loss": 0.47, + "mean_token_accuracy": 0.909794807434082, + "num_tokens": 9316305.0, + "step": 10790 + }, + { + "entropy": 0.260904598236084, + "epoch": 2.5163772001398765, + "grad_norm": 3.1875, + "learning_rate": 4.9982787870759285e-05, + "loss": 0.6683, + "mean_token_accuracy": 0.891952395439148, + "num_tokens": 9327397.0, + "step": 10795 + }, + { + "entropy": 0.3234328027814627, + "epoch": 2.517542837160508, + "grad_norm": 3.515625, + "learning_rate": 4.998270770240914e-05, + "loss": 0.4856, + "mean_token_accuracy": 0.8988953709602356, + "num_tokens": 9362296.0, + "step": 10800 + }, + { + "entropy": 0.3260183773934841, + "epoch": 2.51870847418114, + "grad_norm": 0.6328125, + "learning_rate": 4.998262734792304e-05, + "loss": 0.5501, + "mean_token_accuracy": 0.8724212884902954, + "num_tokens": 9386671.0, + "step": 10805 + }, + { + "entropy": 0.23665817752480506, + "epoch": 2.519874111201772, + "grad_norm": 2.40625, + "learning_rate": 4.99825468073022e-05, + "loss": 0.2604, + "mean_token_accuracy": 0.9227067410945893, + "num_tokens": 9406591.0, + "step": 10810 + }, + { + "entropy": 0.3936871213838458, + "epoch": 2.5210397482224036, + "grad_norm": 1.4921875, + "learning_rate": 4.998246608054781e-05, + "loss": 0.7339, + "mean_token_accuracy": 0.8664057910442352, + "num_tokens": 9436139.0, + "step": 10815 + }, + { + "entropy": 0.35211950838565825, + "epoch": 2.5222053852430353, + "grad_norm": 2.90625, + "learning_rate": 4.9982385167661075e-05, + "loss": 0.6026, + "mean_token_accuracy": 0.894438910484314, + "num_tokens": 9448411.0, + "step": 10820 + }, + { + "entropy": 0.2504681311547756, + "epoch": 2.523371022263667, + "grad_norm": 2.09375, + "learning_rate": 4.998230406864319e-05, + "loss": 0.4038, + "mean_token_accuracy": 0.9226616859436035, + "num_tokens": 9472537.0, + "step": 10825 + }, + { + "entropy": 0.3123652219772339, + "epoch": 2.5245366592842986, + "grad_norm": 0.75390625, + "learning_rate": 4.998222278349539e-05, + "loss": 0.4818, + "mean_token_accuracy": 0.8915808320045471, + "num_tokens": 9500630.0, + "step": 10830 + }, + { + "entropy": 0.30130189210176467, + "epoch": 2.5257022963049307, + "grad_norm": 5.5, + "learning_rate": 4.9982141312218875e-05, + "loss": 0.5678, + "mean_token_accuracy": 0.897449004650116, + "num_tokens": 9514002.0, + "step": 10835 + }, + { + "entropy": 0.25896636620163915, + "epoch": 2.5268679333255624, + "grad_norm": 1.9765625, + "learning_rate": 4.998205965481486e-05, + "loss": 0.4619, + "mean_token_accuracy": 0.902980488538742, + "num_tokens": 9529717.0, + "step": 10840 + }, + { + "entropy": 0.40680873990058897, + "epoch": 2.528033570346194, + "grad_norm": 2.59375, + "learning_rate": 4.998197781128455e-05, + "loss": 0.649, + "mean_token_accuracy": 0.8700441300868988, + "num_tokens": 9542535.0, + "step": 10845 + }, + { + "entropy": 0.26954654008150103, + "epoch": 2.529199207366826, + "grad_norm": 5.71875, + "learning_rate": 4.9981895781629186e-05, + "loss": 0.5731, + "mean_token_accuracy": 0.8942017436027527, + "num_tokens": 9553660.0, + "step": 10850 + }, + { + "entropy": 0.6126003712415695, + "epoch": 2.530364844387458, + "grad_norm": 2.078125, + "learning_rate": 4.9981813565849985e-05, + "loss": 0.8439, + "mean_token_accuracy": 0.8094934105873108, + "num_tokens": 9583350.0, + "step": 10855 + }, + { + "entropy": 0.256740565598011, + "epoch": 2.5315304814080895, + "grad_norm": 7.625, + "learning_rate": 4.998173116394816e-05, + "loss": 0.5263, + "mean_token_accuracy": 0.8800818026065826, + "num_tokens": 9603578.0, + "step": 10860 + }, + { + "entropy": 0.24329185411334037, + "epoch": 2.5326961184287216, + "grad_norm": 2.25, + "learning_rate": 4.9981648575924956e-05, + "loss": 0.4655, + "mean_token_accuracy": 0.9080992102622986, + "num_tokens": 9615426.0, + "step": 10865 + }, + { + "entropy": 0.18521431982517242, + "epoch": 2.533861755449353, + "grad_norm": 2.4375, + "learning_rate": 4.99815658017816e-05, + "loss": 0.2018, + "mean_token_accuracy": 0.9450451016426087, + "num_tokens": 9643789.0, + "step": 10870 + }, + { + "entropy": 0.25476645715534685, + "epoch": 2.535027392469985, + "grad_norm": 1.8359375, + "learning_rate": 4.9981482841519325e-05, + "loss": 0.2879, + "mean_token_accuracy": 0.9163881063461303, + "num_tokens": 9669216.0, + "step": 10875 + }, + { + "entropy": 0.206333290040493, + "epoch": 2.5361930294906165, + "grad_norm": 5.6875, + "learning_rate": 4.998139969513936e-05, + "loss": 0.2351, + "mean_token_accuracy": 0.9330012440681458, + "num_tokens": 9701874.0, + "step": 10880 + }, + { + "entropy": 0.3087175332009792, + "epoch": 2.537358666511248, + "grad_norm": 0.416015625, + "learning_rate": 4.998131636264296e-05, + "loss": 0.6032, + "mean_token_accuracy": 0.8812095761299134, + "num_tokens": 9743844.0, + "step": 10885 + }, + { + "entropy": 0.30272372961044314, + "epoch": 2.5385243035318803, + "grad_norm": 4.90625, + "learning_rate": 4.9981232844031357e-05, + "loss": 0.6009, + "mean_token_accuracy": 0.8849609076976777, + "num_tokens": 9762894.0, + "step": 10890 + }, + { + "entropy": 0.18856640644371508, + "epoch": 2.539689940552512, + "grad_norm": 4.15625, + "learning_rate": 4.998114913930579e-05, + "loss": 0.3114, + "mean_token_accuracy": 0.9368799209594727, + "num_tokens": 9781940.0, + "step": 10895 + }, + { + "entropy": 0.2122984491288662, + "epoch": 2.5408555775731436, + "grad_norm": 0.91015625, + "learning_rate": 4.998106524846753e-05, + "loss": 0.293, + "mean_token_accuracy": 0.9302926242351532, + "num_tokens": 9826919.0, + "step": 10900 + }, + { + "entropy": 0.3222149532288313, + "epoch": 2.5420212145937757, + "grad_norm": 1.0, + "learning_rate": 4.998098117151782e-05, + "loss": 0.4287, + "mean_token_accuracy": 0.900217491388321, + "num_tokens": 9844075.0, + "step": 10905 + }, + { + "entropy": 0.23695617094635962, + "epoch": 2.5431868516144074, + "grad_norm": 0.412109375, + "learning_rate": 4.998089690845789e-05, + "loss": 0.3804, + "mean_token_accuracy": 0.9084706485271454, + "num_tokens": 9869917.0, + "step": 10910 + }, + { + "entropy": 0.28979590311646464, + "epoch": 2.544352488635039, + "grad_norm": 9.875, + "learning_rate": 4.998081245928903e-05, + "loss": 0.5758, + "mean_token_accuracy": 0.8905119478702546, + "num_tokens": 9885346.0, + "step": 10915 + }, + { + "entropy": 0.2431908842176199, + "epoch": 2.5455181256556707, + "grad_norm": 1.390625, + "learning_rate": 4.998072782401248e-05, + "loss": 0.3191, + "mean_token_accuracy": 0.8980298042297363, + "num_tokens": 9912476.0, + "step": 10920 + }, + { + "entropy": 0.23238140121102333, + "epoch": 2.5466837626763024, + "grad_norm": 3.671875, + "learning_rate": 4.99806430026295e-05, + "loss": 0.4056, + "mean_token_accuracy": 0.909502238035202, + "num_tokens": 9927690.0, + "step": 10925 + }, + { + "entropy": 0.23011131063103676, + "epoch": 2.5478493996969345, + "grad_norm": 0.94140625, + "learning_rate": 4.9980557995141364e-05, + "loss": 0.569, + "mean_token_accuracy": 0.887377279996872, + "num_tokens": 9945641.0, + "step": 10930 + }, + { + "entropy": 0.27550188899040223, + "epoch": 2.549015036717566, + "grad_norm": 3.078125, + "learning_rate": 4.998047280154934e-05, + "loss": 0.4363, + "mean_token_accuracy": 0.9162426710128784, + "num_tokens": 9959509.0, + "step": 10935 + }, + { + "entropy": 0.29937262982130053, + "epoch": 2.550180673738198, + "grad_norm": 6.25, + "learning_rate": 4.998038742185469e-05, + "loss": 0.5956, + "mean_token_accuracy": 0.9001171708106994, + "num_tokens": 9972523.0, + "step": 10940 + }, + { + "entropy": 0.20945103876292706, + "epoch": 2.55134631075883, + "grad_norm": 2.03125, + "learning_rate": 4.998030185605869e-05, + "loss": 0.2486, + "mean_token_accuracy": 0.9239367246627808, + "num_tokens": 10003818.0, + "step": 10945 + }, + { + "entropy": 0.2474493345245719, + "epoch": 2.5525119477794616, + "grad_norm": 0.46875, + "learning_rate": 4.9980216104162627e-05, + "loss": 0.3619, + "mean_token_accuracy": 0.9106339871883392, + "num_tokens": 10029914.0, + "step": 10950 + }, + { + "entropy": 0.2488720454275608, + "epoch": 2.553677584800093, + "grad_norm": 5.3125, + "learning_rate": 4.998013016616776e-05, + "loss": 0.5409, + "mean_token_accuracy": 0.8993008673191071, + "num_tokens": 10041210.0, + "step": 10955 + }, + { + "entropy": 0.20905282385647297, + "epoch": 2.554843221820725, + "grad_norm": 2.734375, + "learning_rate": 4.998004404207539e-05, + "loss": 0.2344, + "mean_token_accuracy": 0.9224939465522766, + "num_tokens": 10065172.0, + "step": 10960 + }, + { + "entropy": 0.27664083521813154, + "epoch": 2.5560088588413565, + "grad_norm": 5.3125, + "learning_rate": 4.997995773188679e-05, + "loss": 0.443, + "mean_token_accuracy": 0.8938161969184876, + "num_tokens": 10083266.0, + "step": 10965 + }, + { + "entropy": 0.2526406615972519, + "epoch": 2.5571744958619886, + "grad_norm": 4.65625, + "learning_rate": 4.997987123560325e-05, + "loss": 0.5609, + "mean_token_accuracy": 0.8953101098537445, + "num_tokens": 10093397.0, + "step": 10970 + }, + { + "entropy": 0.2602951280772686, + "epoch": 2.5583401328826203, + "grad_norm": 4.75, + "learning_rate": 4.997978455322605e-05, + "loss": 0.5184, + "mean_token_accuracy": 0.8793947219848632, + "num_tokens": 10124435.0, + "step": 10975 + }, + { + "entropy": 0.2883452221751213, + "epoch": 2.559505769903252, + "grad_norm": 1.4140625, + "learning_rate": 4.99796976847565e-05, + "loss": 0.4473, + "mean_token_accuracy": 0.9102245748043061, + "num_tokens": 10147972.0, + "step": 10980 + }, + { + "entropy": 0.25813480094075203, + "epoch": 2.560671406923884, + "grad_norm": 5.1875, + "learning_rate": 4.9979610630195886e-05, + "loss": 0.5154, + "mean_token_accuracy": 0.9058370411396026, + "num_tokens": 10163425.0, + "step": 10985 + }, + { + "entropy": 0.2699072379618883, + "epoch": 2.5618370439445157, + "grad_norm": 0.60546875, + "learning_rate": 4.9979523389545514e-05, + "loss": 0.276, + "mean_token_accuracy": 0.9388366758823394, + "num_tokens": 10189255.0, + "step": 10990 + }, + { + "entropy": 0.19736984223127366, + "epoch": 2.5630026809651474, + "grad_norm": 3.109375, + "learning_rate": 4.9979435962806664e-05, + "loss": 0.2373, + "mean_token_accuracy": 0.9463130176067353, + "num_tokens": 10214635.0, + "step": 10995 + }, + { + "entropy": 0.24606726691126823, + "epoch": 2.5641683179857795, + "grad_norm": 5.15625, + "learning_rate": 4.997934834998067e-05, + "loss": 0.3809, + "mean_token_accuracy": 0.9238426387310028, + "num_tokens": 10234190.0, + "step": 11000 + }, + { + "entropy": 0.3558468287810683, + "epoch": 2.565333955006411, + "grad_norm": 5.6875, + "learning_rate": 4.997926055106881e-05, + "loss": 0.678, + "mean_token_accuracy": 0.8645303964614868, + "num_tokens": 10254717.0, + "step": 11005 + }, + { + "entropy": 0.24228348731994628, + "epoch": 2.566499592027043, + "grad_norm": 1.40625, + "learning_rate": 4.9979172566072404e-05, + "loss": 0.3428, + "mean_token_accuracy": 0.9213858485221863, + "num_tokens": 10280458.0, + "step": 11010 + }, + { + "entropy": 0.1987355647608638, + "epoch": 2.5676652290476745, + "grad_norm": 5.875, + "learning_rate": 4.997908439499277e-05, + "loss": 0.4407, + "mean_token_accuracy": 0.9114084959030151, + "num_tokens": 10302557.0, + "step": 11015 + }, + { + "entropy": 0.25376520678400993, + "epoch": 2.568830866068306, + "grad_norm": 2.890625, + "learning_rate": 4.9978996037831215e-05, + "loss": 0.572, + "mean_token_accuracy": 0.9063213229179382, + "num_tokens": 10312816.0, + "step": 11020 + }, + { + "entropy": 0.1864607885479927, + "epoch": 2.5699965030889382, + "grad_norm": 1.7578125, + "learning_rate": 4.9978907494589066e-05, + "loss": 0.2917, + "mean_token_accuracy": 0.9306336164474487, + "num_tokens": 10337096.0, + "step": 11025 + }, + { + "entropy": 0.21803455986082554, + "epoch": 2.57116214010957, + "grad_norm": 4.1875, + "learning_rate": 4.997881876526763e-05, + "loss": 0.4001, + "mean_token_accuracy": 0.9231861233711243, + "num_tokens": 10360434.0, + "step": 11030 + }, + { + "entropy": 0.37254651412367823, + "epoch": 2.5723277771302016, + "grad_norm": 0.83984375, + "learning_rate": 4.997872984986825e-05, + "loss": 0.5981, + "mean_token_accuracy": 0.8917929172515869, + "num_tokens": 10385687.0, + "step": 11035 + }, + { + "entropy": 0.2153875719755888, + "epoch": 2.5734934141508337, + "grad_norm": 0.58203125, + "learning_rate": 4.997864074839222e-05, + "loss": 0.2672, + "mean_token_accuracy": 0.9379128754138947, + "num_tokens": 10423671.0, + "step": 11040 + }, + { + "entropy": 0.21650544218719006, + "epoch": 2.5746590511714653, + "grad_norm": 3.71875, + "learning_rate": 4.9978551460840895e-05, + "loss": 0.3612, + "mean_token_accuracy": 0.9156126797199249, + "num_tokens": 10443832.0, + "step": 11045 + }, + { + "entropy": 0.24414923898875712, + "epoch": 2.575824688192097, + "grad_norm": 5.3125, + "learning_rate": 4.99784619872156e-05, + "loss": 0.4182, + "mean_token_accuracy": 0.9150607287883759, + "num_tokens": 10464733.0, + "step": 11050 + }, + { + "entropy": 0.2163931004703045, + "epoch": 2.5769903252127286, + "grad_norm": 1.8984375, + "learning_rate": 4.997837232751767e-05, + "loss": 0.313, + "mean_token_accuracy": 0.919309800863266, + "num_tokens": 10483052.0, + "step": 11055 + }, + { + "entropy": 0.22960688844323157, + "epoch": 2.5781559622333603, + "grad_norm": 4.0, + "learning_rate": 4.997828248174844e-05, + "loss": 0.3334, + "mean_token_accuracy": 0.928122466802597, + "num_tokens": 10497890.0, + "step": 11060 + }, + { + "entropy": 0.21383021138608455, + "epoch": 2.5793215992539924, + "grad_norm": 0.875, + "learning_rate": 4.997819244990925e-05, + "loss": 0.2845, + "mean_token_accuracy": 0.929515129327774, + "num_tokens": 10517123.0, + "step": 11065 + }, + { + "entropy": 0.1973729684948921, + "epoch": 2.580487236274624, + "grad_norm": 0.60546875, + "learning_rate": 4.997810223200144e-05, + "loss": 0.3806, + "mean_token_accuracy": 0.9143800556659698, + "num_tokens": 10536675.0, + "step": 11070 + }, + { + "entropy": 0.30555715411901474, + "epoch": 2.5816528732952557, + "grad_norm": 1.5859375, + "learning_rate": 4.997801182802635e-05, + "loss": 0.5375, + "mean_token_accuracy": 0.8875338017940522, + "num_tokens": 10564485.0, + "step": 11075 + }, + { + "entropy": 0.23367904797196387, + "epoch": 2.582818510315888, + "grad_norm": 3.34375, + "learning_rate": 4.997792123798535e-05, + "loss": 0.2932, + "mean_token_accuracy": 0.9282478809356689, + "num_tokens": 10592613.0, + "step": 11080 + }, + { + "entropy": 0.246141454577446, + "epoch": 2.5839841473365195, + "grad_norm": 5.0625, + "learning_rate": 4.997783046187977e-05, + "loss": 0.4879, + "mean_token_accuracy": 0.898841404914856, + "num_tokens": 10620697.0, + "step": 11085 + }, + { + "entropy": 0.3161863937973976, + "epoch": 2.585149784357151, + "grad_norm": 3.96875, + "learning_rate": 4.997773949971097e-05, + "loss": 0.5144, + "mean_token_accuracy": 0.901045823097229, + "num_tokens": 10630312.0, + "step": 11090 + }, + { + "entropy": 0.27218768149614336, + "epoch": 2.586315421377783, + "grad_norm": 0.67578125, + "learning_rate": 4.997764835148031e-05, + "loss": 0.3202, + "mean_token_accuracy": 0.9121147453784942, + "num_tokens": 10652774.0, + "step": 11095 + }, + { + "entropy": 0.31504967212677004, + "epoch": 2.5874810583984145, + "grad_norm": 3.21875, + "learning_rate": 4.997755701718914e-05, + "loss": 0.7055, + "mean_token_accuracy": 0.8863767445087433, + "num_tokens": 10661302.0, + "step": 11100 + }, + { + "entropy": 0.278329698368907, + "epoch": 2.5886466954190466, + "grad_norm": 2.8125, + "learning_rate": 4.9977465496838835e-05, + "loss": 0.5463, + "mean_token_accuracy": 0.8992919802665711, + "num_tokens": 10680681.0, + "step": 11105 + }, + { + "entropy": 0.2613543540239334, + "epoch": 2.5898123324396782, + "grad_norm": 0.9921875, + "learning_rate": 4.997737379043074e-05, + "loss": 0.507, + "mean_token_accuracy": 0.9085290372371674, + "num_tokens": 10693761.0, + "step": 11110 + }, + { + "entropy": 0.26452816352248193, + "epoch": 2.59097796946031, + "grad_norm": 3.65625, + "learning_rate": 4.9977281897966246e-05, + "loss": 0.4636, + "mean_token_accuracy": 0.905288678407669, + "num_tokens": 10706089.0, + "step": 11115 + }, + { + "entropy": 0.25550087746232747, + "epoch": 2.592143606480942, + "grad_norm": 1.0703125, + "learning_rate": 4.997718981944671e-05, + "loss": 0.3773, + "mean_token_accuracy": 0.9075290858745575, + "num_tokens": 10740894.0, + "step": 11120 + }, + { + "entropy": 0.26064078956842424, + "epoch": 2.5933092435015737, + "grad_norm": 0.6484375, + "learning_rate": 4.99770975548735e-05, + "loss": 0.5987, + "mean_token_accuracy": 0.8941443741321564, + "num_tokens": 10758815.0, + "step": 11125 + }, + { + "entropy": 0.222178865224123, + "epoch": 2.5944748805222053, + "grad_norm": 5.71875, + "learning_rate": 4.997700510424801e-05, + "loss": 0.4556, + "mean_token_accuracy": 0.9190146684646606, + "num_tokens": 10775070.0, + "step": 11130 + }, + { + "entropy": 0.33019725382328036, + "epoch": 2.5956405175428374, + "grad_norm": 3.296875, + "learning_rate": 4.9976912467571605e-05, + "loss": 0.5227, + "mean_token_accuracy": 0.8845232129096985, + "num_tokens": 10789487.0, + "step": 11135 + }, + { + "entropy": 0.3958972916007042, + "epoch": 2.596806154563469, + "grad_norm": 4.28125, + "learning_rate": 4.997681964484566e-05, + "loss": 0.8221, + "mean_token_accuracy": 0.8452759385108948, + "num_tokens": 10802456.0, + "step": 11140 + }, + { + "entropy": 0.2777945823967457, + "epoch": 2.5979717915841007, + "grad_norm": 0.625, + "learning_rate": 4.997672663607157e-05, + "loss": 0.3402, + "mean_token_accuracy": 0.9086602687835693, + "num_tokens": 10831872.0, + "step": 11145 + }, + { + "entropy": 0.3942961137741804, + "epoch": 2.5991374286047324, + "grad_norm": 5.5, + "learning_rate": 4.997663344125072e-05, + "loss": 0.6087, + "mean_token_accuracy": 0.8671041011810303, + "num_tokens": 10846779.0, + "step": 11150 + }, + { + "entropy": 0.3075502276420593, + "epoch": 2.600303065625364, + "grad_norm": 1.3359375, + "learning_rate": 4.9976540060384506e-05, + "loss": 0.547, + "mean_token_accuracy": 0.8964202046394348, + "num_tokens": 10862884.0, + "step": 11155 + }, + { + "entropy": 0.27614784575998785, + "epoch": 2.601468702645996, + "grad_norm": 0.6953125, + "learning_rate": 4.997644649347431e-05, + "loss": 0.5464, + "mean_token_accuracy": 0.9018550217151642, + "num_tokens": 10885264.0, + "step": 11160 + }, + { + "entropy": 0.1691011071205139, + "epoch": 2.602634339666628, + "grad_norm": 0.466796875, + "learning_rate": 4.9976352740521536e-05, + "loss": 0.2883, + "mean_token_accuracy": 0.93958500623703, + "num_tokens": 10913916.0, + "step": 11165 + }, + { + "entropy": 0.247661954164505, + "epoch": 2.6037999766872595, + "grad_norm": 3.421875, + "learning_rate": 4.997625880152757e-05, + "loss": 0.347, + "mean_token_accuracy": 0.9147677958011627, + "num_tokens": 10946733.0, + "step": 11170 + }, + { + "entropy": 0.3043744258582592, + "epoch": 2.6049656137078916, + "grad_norm": 3.421875, + "learning_rate": 4.997616467649382e-05, + "loss": 0.6684, + "mean_token_accuracy": 0.8797974646091461, + "num_tokens": 10958671.0, + "step": 11175 + }, + { + "entropy": 0.16336182728409768, + "epoch": 2.6061312507285233, + "grad_norm": 1.234375, + "learning_rate": 4.99760703654217e-05, + "loss": 0.1688, + "mean_token_accuracy": 0.9447501301765442, + "num_tokens": 10984420.0, + "step": 11180 + }, + { + "entropy": 0.2802146412432194, + "epoch": 2.607296887749155, + "grad_norm": 6.375, + "learning_rate": 4.997597586831259e-05, + "loss": 0.4913, + "mean_token_accuracy": 0.8942979097366333, + "num_tokens": 10998949.0, + "step": 11185 + }, + { + "entropy": 0.34349353760480883, + "epoch": 2.6084625247697866, + "grad_norm": 2.03125, + "learning_rate": 4.9975881185167926e-05, + "loss": 0.5006, + "mean_token_accuracy": 0.9019432067871094, + "num_tokens": 11017593.0, + "step": 11190 + }, + { + "entropy": 0.26689242795109747, + "epoch": 2.6096281617904182, + "grad_norm": 1.96875, + "learning_rate": 4.99757863159891e-05, + "loss": 0.5055, + "mean_token_accuracy": 0.908399498462677, + "num_tokens": 11034394.0, + "step": 11195 + }, + { + "entropy": 0.15737572349607945, + "epoch": 2.6107937988110503, + "grad_norm": 0.7421875, + "learning_rate": 4.997569126077754e-05, + "loss": 0.2025, + "mean_token_accuracy": 0.9473043262958527, + "num_tokens": 11080961.0, + "step": 11200 + }, + { + "entropy": 0.19748403411358595, + "epoch": 2.611959435831682, + "grad_norm": 0.828125, + "learning_rate": 4.9975596019534666e-05, + "loss": 0.2864, + "mean_token_accuracy": 0.9375809073448181, + "num_tokens": 11102483.0, + "step": 11205 + }, + { + "entropy": 0.30650331676006315, + "epoch": 2.6131250728523137, + "grad_norm": 7.21875, + "learning_rate": 4.997550059226188e-05, + "loss": 0.7507, + "mean_token_accuracy": 0.8718282222747803, + "num_tokens": 11121897.0, + "step": 11210 + }, + { + "entropy": 0.24957393128424882, + "epoch": 2.6142907098729458, + "grad_norm": 5.21875, + "learning_rate": 4.9975404978960626e-05, + "loss": 0.4278, + "mean_token_accuracy": 0.902422821521759, + "num_tokens": 11142371.0, + "step": 11215 + }, + { + "entropy": 0.2951029822230339, + "epoch": 2.6154563468935774, + "grad_norm": 4.84375, + "learning_rate": 4.997530917963231e-05, + "loss": 0.6542, + "mean_token_accuracy": 0.8763157844543457, + "num_tokens": 11151119.0, + "step": 11220 + }, + { + "entropy": 0.2446484286338091, + "epoch": 2.616621983914209, + "grad_norm": 7.6875, + "learning_rate": 4.997521319427838e-05, + "loss": 0.5281, + "mean_token_accuracy": 0.9024474620819092, + "num_tokens": 11173901.0, + "step": 11225 + }, + { + "entropy": 0.2768650382757187, + "epoch": 2.6177876209348407, + "grad_norm": 5.25, + "learning_rate": 4.9975117022900256e-05, + "loss": 0.543, + "mean_token_accuracy": 0.9006280660629272, + "num_tokens": 11192978.0, + "step": 11230 + }, + { + "entropy": 0.2325539279729128, + "epoch": 2.6189532579554724, + "grad_norm": 5.0, + "learning_rate": 4.997502066549936e-05, + "loss": 0.4164, + "mean_token_accuracy": 0.9057820856571197, + "num_tokens": 11209052.0, + "step": 11235 + }, + { + "entropy": 0.2782548785209656, + "epoch": 2.6201188949761045, + "grad_norm": 4.5625, + "learning_rate": 4.9974924122077154e-05, + "loss": 0.6466, + "mean_token_accuracy": 0.8900192558765412, + "num_tokens": 11224304.0, + "step": 11240 + }, + { + "entropy": 0.18933181073516608, + "epoch": 2.621284531996736, + "grad_norm": 4.25, + "learning_rate": 4.9974827392635064e-05, + "loss": 0.2213, + "mean_token_accuracy": 0.9298280954360962, + "num_tokens": 11252668.0, + "step": 11245 + }, + { + "entropy": 0.2963331826031208, + "epoch": 2.622450169017368, + "grad_norm": 2.125, + "learning_rate": 4.997473047717454e-05, + "loss": 0.6452, + "mean_token_accuracy": 0.8773207426071167, + "num_tokens": 11262336.0, + "step": 11250 + }, + { + "entropy": 0.3698550321161747, + "epoch": 2.623615806038, + "grad_norm": 2.109375, + "learning_rate": 4.997463337569701e-05, + "loss": 0.4824, + "mean_token_accuracy": 0.8727827727794647, + "num_tokens": 11283605.0, + "step": 11255 + }, + { + "entropy": 0.29769267812371253, + "epoch": 2.6247814430586316, + "grad_norm": 0.50390625, + "learning_rate": 4.997453608820394e-05, + "loss": 0.4903, + "mean_token_accuracy": 0.8827204525470733, + "num_tokens": 11311531.0, + "step": 11260 + }, + { + "entropy": 0.33199090249836444, + "epoch": 2.6259470800792633, + "grad_norm": 7.5625, + "learning_rate": 4.9974438614696775e-05, + "loss": 0.4359, + "mean_token_accuracy": 0.9025272786617279, + "num_tokens": 11335227.0, + "step": 11265 + }, + { + "entropy": 0.2078126695007086, + "epoch": 2.6271127170998954, + "grad_norm": 0.78125, + "learning_rate": 4.997434095517697e-05, + "loss": 0.3611, + "mean_token_accuracy": 0.9238122761249542, + "num_tokens": 11370878.0, + "step": 11270 + }, + { + "entropy": 0.2649641171097755, + "epoch": 2.628278354120527, + "grad_norm": 1.953125, + "learning_rate": 4.9974243109645966e-05, + "loss": 0.4839, + "mean_token_accuracy": 0.9140565156936645, + "num_tokens": 11386737.0, + "step": 11275 + }, + { + "entropy": 0.26932487562298774, + "epoch": 2.6294439911411587, + "grad_norm": 6.71875, + "learning_rate": 4.997414507810525e-05, + "loss": 0.494, + "mean_token_accuracy": 0.9060956716537476, + "num_tokens": 11398109.0, + "step": 11280 + }, + { + "entropy": 0.3336785912513733, + "epoch": 2.6306096281617903, + "grad_norm": 5.96875, + "learning_rate": 4.997404686055626e-05, + "loss": 0.6023, + "mean_token_accuracy": 0.8886629402637481, + "num_tokens": 11418540.0, + "step": 11285 + }, + { + "entropy": 0.27329583168029786, + "epoch": 2.631775265182422, + "grad_norm": 2.9375, + "learning_rate": 4.9973948457000476e-05, + "loss": 0.5387, + "mean_token_accuracy": 0.895511794090271, + "num_tokens": 11430073.0, + "step": 11290 + }, + { + "entropy": 0.3408983126282692, + "epoch": 2.632940902203054, + "grad_norm": 10.625, + "learning_rate": 4.9973849867439346e-05, + "loss": 0.6893, + "mean_token_accuracy": 0.8785698235034942, + "num_tokens": 11449043.0, + "step": 11295 + }, + { + "entropy": 0.2924422096461058, + "epoch": 2.6341065392236858, + "grad_norm": 7.5, + "learning_rate": 4.997375109187437e-05, + "loss": 0.5025, + "mean_token_accuracy": 0.8872777104377747, + "num_tokens": 11464992.0, + "step": 11300 + }, + { + "entropy": 0.2665913224220276, + "epoch": 2.6352721762443174, + "grad_norm": 6.59375, + "learning_rate": 4.9973652130306994e-05, + "loss": 0.7234, + "mean_token_accuracy": 0.8719295203685761, + "num_tokens": 11474815.0, + "step": 11305 + }, + { + "entropy": 0.288311780244112, + "epoch": 2.6364378132649495, + "grad_norm": 1.5234375, + "learning_rate": 4.9973552982738705e-05, + "loss": 0.4171, + "mean_token_accuracy": 0.9102599263191223, + "num_tokens": 11490700.0, + "step": 11310 + }, + { + "entropy": 0.26549833118915556, + "epoch": 2.637603450285581, + "grad_norm": 0.69140625, + "learning_rate": 4.9973453649170974e-05, + "loss": 0.4156, + "mean_token_accuracy": 0.9002930223941803, + "num_tokens": 11512555.0, + "step": 11315 + }, + { + "entropy": 0.2690326914191246, + "epoch": 2.638769087306213, + "grad_norm": 1.3828125, + "learning_rate": 4.9973354129605296e-05, + "loss": 0.5101, + "mean_token_accuracy": 0.9137054443359375, + "num_tokens": 11526262.0, + "step": 11320 + }, + { + "entropy": 0.23037639632821083, + "epoch": 2.6399347243268445, + "grad_norm": 2.6875, + "learning_rate": 4.9973254424043144e-05, + "loss": 0.4401, + "mean_token_accuracy": 0.9020182192325592, + "num_tokens": 11541236.0, + "step": 11325 + }, + { + "entropy": 0.18799546770751477, + "epoch": 2.641100361347476, + "grad_norm": 5.28125, + "learning_rate": 4.997315453248601e-05, + "loss": 0.2115, + "mean_token_accuracy": 0.9374987721443176, + "num_tokens": 11575904.0, + "step": 11330 + }, + { + "entropy": 0.21427544951438904, + "epoch": 2.6422659983681083, + "grad_norm": 3.390625, + "learning_rate": 4.997305445493538e-05, + "loss": 0.3937, + "mean_token_accuracy": 0.9206536293029786, + "num_tokens": 11597724.0, + "step": 11335 + }, + { + "entropy": 0.2719633191823959, + "epoch": 2.64343163538874, + "grad_norm": 0.51953125, + "learning_rate": 4.997295419139274e-05, + "loss": 0.3489, + "mean_token_accuracy": 0.9026310503482818, + "num_tokens": 11624071.0, + "step": 11340 + }, + { + "entropy": 0.2608350694179535, + "epoch": 2.6445972724093716, + "grad_norm": 1.375, + "learning_rate": 4.99728537418596e-05, + "loss": 0.5964, + "mean_token_accuracy": 0.8827403604984283, + "num_tokens": 11637682.0, + "step": 11345 + }, + { + "entropy": 0.2973989363759756, + "epoch": 2.6457629094300037, + "grad_norm": 5.03125, + "learning_rate": 4.997275310633745e-05, + "loss": 0.4141, + "mean_token_accuracy": 0.9107711672782898, + "num_tokens": 11661335.0, + "step": 11350 + }, + { + "entropy": 0.2372315250337124, + "epoch": 2.6469285464506354, + "grad_norm": 1.1875, + "learning_rate": 4.997265228482779e-05, + "loss": 0.2801, + "mean_token_accuracy": 0.9254857778549195, + "num_tokens": 11680929.0, + "step": 11355 + }, + { + "entropy": 0.2743291571736336, + "epoch": 2.648094183471267, + "grad_norm": 6.84375, + "learning_rate": 4.997255127733212e-05, + "loss": 0.6052, + "mean_token_accuracy": 0.8960570693016052, + "num_tokens": 11691031.0, + "step": 11360 + }, + { + "entropy": 0.18517222441732883, + "epoch": 2.6492598204918987, + "grad_norm": 3.0625, + "learning_rate": 4.9972450083851965e-05, + "loss": 0.3446, + "mean_token_accuracy": 0.9325447082519531, + "num_tokens": 11710257.0, + "step": 11365 + }, + { + "entropy": 0.31101489067077637, + "epoch": 2.6504254575125303, + "grad_norm": 1.09375, + "learning_rate": 4.9972348704388805e-05, + "loss": 0.5734, + "mean_token_accuracy": 0.897227269411087, + "num_tokens": 11720796.0, + "step": 11370 + }, + { + "entropy": 0.2728283330798149, + "epoch": 2.6515910945331624, + "grad_norm": 3.578125, + "learning_rate": 4.997224713894417e-05, + "loss": 0.638, + "mean_token_accuracy": 0.8861160099506378, + "num_tokens": 11738627.0, + "step": 11375 + }, + { + "entropy": 0.16300744675099849, + "epoch": 2.652756731553794, + "grad_norm": 4.59375, + "learning_rate": 4.997214538751958e-05, + "loss": 0.2844, + "mean_token_accuracy": 0.9426489353179932, + "num_tokens": 11785879.0, + "step": 11380 + }, + { + "entropy": 0.25653175823390484, + "epoch": 2.6539223685744258, + "grad_norm": 3.3125, + "learning_rate": 4.997204345011653e-05, + "loss": 0.3171, + "mean_token_accuracy": 0.8947100043296814, + "num_tokens": 11824201.0, + "step": 11385 + }, + { + "entropy": 0.34817626476287844, + "epoch": 2.655088005595058, + "grad_norm": 4.40625, + "learning_rate": 4.997194132673656e-05, + "loss": 0.7191, + "mean_token_accuracy": 0.8866174995899201, + "num_tokens": 11832982.0, + "step": 11390 + }, + { + "entropy": 0.28169769085943697, + "epoch": 2.6562536426156895, + "grad_norm": 0.71875, + "learning_rate": 4.9971839017381185e-05, + "loss": 0.3534, + "mean_token_accuracy": 0.9082461953163147, + "num_tokens": 11854447.0, + "step": 11395 + }, + { + "entropy": 0.2898616187274456, + "epoch": 2.657419279636321, + "grad_norm": 1.3203125, + "learning_rate": 4.997173652205193e-05, + "loss": 0.3934, + "mean_token_accuracy": 0.9070635378360749, + "num_tokens": 11877699.0, + "step": 11400 + }, + { + "entropy": 0.3388654485344887, + "epoch": 2.6585849166569533, + "grad_norm": 5.59375, + "learning_rate": 4.997163384075033e-05, + "loss": 0.7418, + "mean_token_accuracy": 0.8552682518959045, + "num_tokens": 11889240.0, + "step": 11405 + }, + { + "entropy": 0.3616279847919941, + "epoch": 2.659750553677585, + "grad_norm": 7.875, + "learning_rate": 4.997153097347791e-05, + "loss": 0.5852, + "mean_token_accuracy": 0.8689880043268203, + "num_tokens": 11919570.0, + "step": 11410 + }, + { + "entropy": 0.2243417389690876, + "epoch": 2.6609161906982166, + "grad_norm": 1.1015625, + "learning_rate": 4.99714279202362e-05, + "loss": 0.5047, + "mean_token_accuracy": 0.9167357087135315, + "num_tokens": 11938429.0, + "step": 11415 + }, + { + "entropy": 0.25835750699043275, + "epoch": 2.6620818277188483, + "grad_norm": 3.78125, + "learning_rate": 4.997132468102674e-05, + "loss": 0.4717, + "mean_token_accuracy": 0.889584195613861, + "num_tokens": 11950186.0, + "step": 11420 + }, + { + "entropy": 0.2650266006588936, + "epoch": 2.66324746473948, + "grad_norm": 2.71875, + "learning_rate": 4.997122125585108e-05, + "loss": 0.5369, + "mean_token_accuracy": 0.8962382078170776, + "num_tokens": 11969804.0, + "step": 11425 + }, + { + "entropy": 0.34565363321453335, + "epoch": 2.664413101760112, + "grad_norm": 0.5625, + "learning_rate": 4.9971117644710745e-05, + "loss": 0.5914, + "mean_token_accuracy": 0.861111444234848, + "num_tokens": 12004415.0, + "step": 11430 + }, + { + "entropy": 0.3819424480199814, + "epoch": 2.6655787387807437, + "grad_norm": 3.53125, + "learning_rate": 4.99710138476073e-05, + "loss": 0.7971, + "mean_token_accuracy": 0.8668591558933259, + "num_tokens": 12014170.0, + "step": 11435 + }, + { + "entropy": 0.22254518344998359, + "epoch": 2.6667443758013754, + "grad_norm": 4.75, + "learning_rate": 4.997090986454227e-05, + "loss": 0.4434, + "mean_token_accuracy": 0.927557897567749, + "num_tokens": 12029565.0, + "step": 11440 + }, + { + "entropy": 0.25394832119345667, + "epoch": 2.6679100128220075, + "grad_norm": 0.388671875, + "learning_rate": 4.997080569551721e-05, + "loss": 0.3403, + "mean_token_accuracy": 0.9082087457180024, + "num_tokens": 12055656.0, + "step": 11445 + }, + { + "entropy": 0.24122598469257356, + "epoch": 2.669075649842639, + "grad_norm": 1.09375, + "learning_rate": 4.9970701340533694e-05, + "loss": 0.3257, + "mean_token_accuracy": 0.9278246402740479, + "num_tokens": 12073906.0, + "step": 11450 + }, + { + "entropy": 0.22207538709044455, + "epoch": 2.670241286863271, + "grad_norm": 1.1796875, + "learning_rate": 4.997059679959326e-05, + "loss": 0.3995, + "mean_token_accuracy": 0.9133785367012024, + "num_tokens": 12086888.0, + "step": 11455 + }, + { + "entropy": 0.2285417139530182, + "epoch": 2.6714069238839024, + "grad_norm": 1.015625, + "learning_rate": 4.997049207269747e-05, + "loss": 0.378, + "mean_token_accuracy": 0.9328377544879913, + "num_tokens": 12112474.0, + "step": 11460 + }, + { + "entropy": 0.23440224751830102, + "epoch": 2.672572560904534, + "grad_norm": 2.171875, + "learning_rate": 4.997038715984789e-05, + "loss": 0.3692, + "mean_token_accuracy": 0.920467472076416, + "num_tokens": 12134446.0, + "step": 11465 + }, + { + "entropy": 0.34350576922297477, + "epoch": 2.673738197925166, + "grad_norm": 6.71875, + "learning_rate": 4.997028206104607e-05, + "loss": 0.6887, + "mean_token_accuracy": 0.8498728811740875, + "num_tokens": 12152362.0, + "step": 11470 + }, + { + "entropy": 0.2824136093258858, + "epoch": 2.674903834945798, + "grad_norm": 3.0, + "learning_rate": 4.99701767762936e-05, + "loss": 0.4878, + "mean_token_accuracy": 0.9034132361412048, + "num_tokens": 12162776.0, + "step": 11475 + }, + { + "entropy": 0.22825291007757187, + "epoch": 2.6760694719664295, + "grad_norm": 0.890625, + "learning_rate": 4.997007130559203e-05, + "loss": 0.507, + "mean_token_accuracy": 0.9103431522846221, + "num_tokens": 12178504.0, + "step": 11480 + }, + { + "entropy": 0.2933326005935669, + "epoch": 2.6772351089870616, + "grad_norm": 2.8125, + "learning_rate": 4.9969965648942944e-05, + "loss": 0.644, + "mean_token_accuracy": 0.8840182542800903, + "num_tokens": 12187928.0, + "step": 11485 + }, + { + "entropy": 0.2727994412183762, + "epoch": 2.6784007460076933, + "grad_norm": 0.69921875, + "learning_rate": 4.996985980634792e-05, + "loss": 0.4404, + "mean_token_accuracy": 0.898718672990799, + "num_tokens": 12208235.0, + "step": 11490 + }, + { + "entropy": 0.24456576406955718, + "epoch": 2.679566383028325, + "grad_norm": 4.4375, + "learning_rate": 4.9969753777808524e-05, + "loss": 0.358, + "mean_token_accuracy": 0.9193500220775604, + "num_tokens": 12233805.0, + "step": 11495 + }, + { + "entropy": 0.24613263513892888, + "epoch": 2.6807320200489566, + "grad_norm": 1.7265625, + "learning_rate": 4.996964756332634e-05, + "loss": 0.5134, + "mean_token_accuracy": 0.9112427711486817, + "num_tokens": 12254031.0, + "step": 11500 + }, + { + "entropy": 0.33642361164093015, + "epoch": 2.6818976570695883, + "grad_norm": 8.3125, + "learning_rate": 4.9969541162902964e-05, + "loss": 0.5481, + "mean_token_accuracy": 0.8862937986850739, + "num_tokens": 12272378.0, + "step": 11505 + }, + { + "entropy": 0.2044012688100338, + "epoch": 2.6830632940902204, + "grad_norm": 1.8671875, + "learning_rate": 4.996943457653997e-05, + "loss": 0.3709, + "mean_token_accuracy": 0.9268832981586457, + "num_tokens": 12291946.0, + "step": 11510 + }, + { + "entropy": 0.24931253343820572, + "epoch": 2.684228931110852, + "grad_norm": 1.40625, + "learning_rate": 4.9969327804238956e-05, + "loss": 0.4929, + "mean_token_accuracy": 0.9052222788333892, + "num_tokens": 12303818.0, + "step": 11515 + }, + { + "entropy": 0.20881590992212296, + "epoch": 2.6853945681314837, + "grad_norm": 2.46875, + "learning_rate": 4.99692208460015e-05, + "loss": 0.2622, + "mean_token_accuracy": 0.9323864638805389, + "num_tokens": 12324390.0, + "step": 11520 + }, + { + "entropy": 0.2012042384594679, + "epoch": 2.686560205152116, + "grad_norm": 0.59765625, + "learning_rate": 4.996911370182921e-05, + "loss": 0.1416, + "mean_token_accuracy": 0.9535678446292877, + "num_tokens": 12359545.0, + "step": 11525 + }, + { + "entropy": 0.3009677939116955, + "epoch": 2.6877258421727475, + "grad_norm": 1.171875, + "learning_rate": 4.996900637172369e-05, + "loss": 0.5281, + "mean_token_accuracy": 0.875782185792923, + "num_tokens": 12373919.0, + "step": 11530 + }, + { + "entropy": 0.22892187163233757, + "epoch": 2.688891479193379, + "grad_norm": 4.625, + "learning_rate": 4.996889885568652e-05, + "loss": 0.4858, + "mean_token_accuracy": 0.8961142063140869, + "num_tokens": 12389644.0, + "step": 11535 + }, + { + "entropy": 0.1797600243240595, + "epoch": 2.6900571162140112, + "grad_norm": 0.92578125, + "learning_rate": 4.996879115371931e-05, + "loss": 0.3608, + "mean_token_accuracy": 0.9254206895828248, + "num_tokens": 12427754.0, + "step": 11540 + }, + { + "entropy": 0.23216586112976073, + "epoch": 2.691222753234643, + "grad_norm": 5.15625, + "learning_rate": 4.996868326582368e-05, + "loss": 0.4581, + "mean_token_accuracy": 0.9157405376434327, + "num_tokens": 12449553.0, + "step": 11545 + }, + { + "entropy": 0.27715420797467233, + "epoch": 2.6923883902552745, + "grad_norm": 4.0, + "learning_rate": 4.996857519200122e-05, + "loss": 0.3604, + "mean_token_accuracy": 0.909223598241806, + "num_tokens": 12469756.0, + "step": 11550 + }, + { + "entropy": 0.2619450569152832, + "epoch": 2.693554027275906, + "grad_norm": 5.90625, + "learning_rate": 4.9968466932253564e-05, + "loss": 0.5006, + "mean_token_accuracy": 0.9076269268989563, + "num_tokens": 12490124.0, + "step": 11555 + }, + { + "entropy": 0.2845116063952446, + "epoch": 2.694719664296538, + "grad_norm": 0.90234375, + "learning_rate": 4.99683584865823e-05, + "loss": 0.3196, + "mean_token_accuracy": 0.9181459307670593, + "num_tokens": 12512201.0, + "step": 11560 + }, + { + "entropy": 0.3422266826033592, + "epoch": 2.69588530131717, + "grad_norm": 4.75, + "learning_rate": 4.9968249854989054e-05, + "loss": 0.6682, + "mean_token_accuracy": 0.8535523563623428, + "num_tokens": 12528970.0, + "step": 11565 + }, + { + "entropy": 0.47459706813097, + "epoch": 2.6970509383378016, + "grad_norm": 0.703125, + "learning_rate": 4.996814103747546e-05, + "loss": 0.8362, + "mean_token_accuracy": 0.8407033622264862, + "num_tokens": 12556540.0, + "step": 11570 + }, + { + "entropy": 0.22336104661226272, + "epoch": 2.6982165753584333, + "grad_norm": 1.2578125, + "learning_rate": 4.996803203404313e-05, + "loss": 0.305, + "mean_token_accuracy": 0.9277842044830322, + "num_tokens": 12579634.0, + "step": 11575 + }, + { + "entropy": 0.30175293795764446, + "epoch": 2.6993822123790654, + "grad_norm": 9.1875, + "learning_rate": 4.996792284469368e-05, + "loss": 0.4639, + "mean_token_accuracy": 0.9062645494937897, + "num_tokens": 12595242.0, + "step": 11580 + }, + { + "entropy": 0.22965750470757484, + "epoch": 2.700547849399697, + "grad_norm": 3.6875, + "learning_rate": 4.996781346942875e-05, + "loss": 0.4088, + "mean_token_accuracy": 0.9102014899253845, + "num_tokens": 12622902.0, + "step": 11585 + }, + { + "entropy": 0.3243948698043823, + "epoch": 2.7017134864203287, + "grad_norm": 5.40625, + "learning_rate": 4.996770390824998e-05, + "loss": 0.5887, + "mean_token_accuracy": 0.8744899332523346, + "num_tokens": 12633690.0, + "step": 11590 + }, + { + "entropy": 0.23037907853722572, + "epoch": 2.7028791234409604, + "grad_norm": 2.609375, + "learning_rate": 4.996759416115898e-05, + "loss": 0.4174, + "mean_token_accuracy": 0.9255319714546204, + "num_tokens": 12645802.0, + "step": 11595 + }, + { + "entropy": 0.3350222710520029, + "epoch": 2.704044760461592, + "grad_norm": 0.640625, + "learning_rate": 4.99674842281574e-05, + "loss": 0.4574, + "mean_token_accuracy": 0.9005424678325653, + "num_tokens": 12671417.0, + "step": 11600 + }, + { + "entropy": 0.20586702935397624, + "epoch": 2.705210397482224, + "grad_norm": 3.71875, + "learning_rate": 4.996737410924688e-05, + "loss": 0.3393, + "mean_token_accuracy": 0.9213223278522491, + "num_tokens": 12697041.0, + "step": 11605 + }, + { + "entropy": 0.21617397107183933, + "epoch": 2.706376034502856, + "grad_norm": 0.703125, + "learning_rate": 4.996726380442906e-05, + "loss": 0.3141, + "mean_token_accuracy": 0.9104293942451477, + "num_tokens": 12727529.0, + "step": 11610 + }, + { + "entropy": 0.20006494000554084, + "epoch": 2.7075416715234875, + "grad_norm": 2.1875, + "learning_rate": 4.996715331370558e-05, + "loss": 0.2495, + "mean_token_accuracy": 0.9327197790145874, + "num_tokens": 12756256.0, + "step": 11615 + }, + { + "entropy": 0.16451627649366857, + "epoch": 2.7087073085441196, + "grad_norm": 1.7109375, + "learning_rate": 4.9967042637078104e-05, + "loss": 0.279, + "mean_token_accuracy": 0.9395867764949799, + "num_tokens": 12777458.0, + "step": 11620 + }, + { + "entropy": 0.17803059592843057, + "epoch": 2.7098729455647512, + "grad_norm": 2.609375, + "learning_rate": 4.996693177454827e-05, + "loss": 0.2705, + "mean_token_accuracy": 0.9386874735355377, + "num_tokens": 12794989.0, + "step": 11625 + }, + { + "entropy": 0.3340757980942726, + "epoch": 2.711038582585383, + "grad_norm": 6.6875, + "learning_rate": 4.996682072611772e-05, + "loss": 0.578, + "mean_token_accuracy": 0.8854405045509338, + "num_tokens": 12806814.0, + "step": 11630 + }, + { + "entropy": 0.30254448503255843, + "epoch": 2.7122042196060145, + "grad_norm": 6.9375, + "learning_rate": 4.996670949178813e-05, + "loss": 0.5859, + "mean_token_accuracy": 0.8814816117286682, + "num_tokens": 12819725.0, + "step": 11635 + }, + { + "entropy": 0.2472503509372473, + "epoch": 2.713369856626646, + "grad_norm": 1.296875, + "learning_rate": 4.996659807156115e-05, + "loss": 0.2973, + "mean_token_accuracy": 0.9119050920009613, + "num_tokens": 12843042.0, + "step": 11640 + }, + { + "entropy": 0.25286680161952974, + "epoch": 2.7145354936472783, + "grad_norm": 4.6875, + "learning_rate": 4.9966486465438437e-05, + "loss": 0.4863, + "mean_token_accuracy": 0.9123594641685486, + "num_tokens": 12854570.0, + "step": 11645 + }, + { + "entropy": 0.13867413848638535, + "epoch": 2.71570113066791, + "grad_norm": 0.9921875, + "learning_rate": 4.9966374673421665e-05, + "loss": 0.1569, + "mean_token_accuracy": 0.9393254101276398, + "num_tokens": 12888729.0, + "step": 11650 + }, + { + "entropy": 0.27010712325572966, + "epoch": 2.7168667676885416, + "grad_norm": 2.015625, + "learning_rate": 4.9966262695512494e-05, + "loss": 0.4142, + "mean_token_accuracy": 0.906298041343689, + "num_tokens": 12914069.0, + "step": 11655 + }, + { + "entropy": 0.2274789983406663, + "epoch": 2.7180324047091737, + "grad_norm": 2.625, + "learning_rate": 4.99661505317126e-05, + "loss": 0.469, + "mean_token_accuracy": 0.8987535774707794, + "num_tokens": 12934396.0, + "step": 11660 + }, + { + "entropy": 0.23609692528843879, + "epoch": 2.7191980417298054, + "grad_norm": 1.78125, + "learning_rate": 4.9966038182023646e-05, + "loss": 0.4503, + "mean_token_accuracy": 0.9209963023662567, + "num_tokens": 12950142.0, + "step": 11665 + }, + { + "entropy": 0.21072908565402032, + "epoch": 2.720363678750437, + "grad_norm": 1.1484375, + "learning_rate": 4.9965925646447316e-05, + "loss": 0.2943, + "mean_token_accuracy": 0.9385083317756653, + "num_tokens": 12986129.0, + "step": 11670 + }, + { + "entropy": 0.2646354716271162, + "epoch": 2.721529315771069, + "grad_norm": 2.375, + "learning_rate": 4.996581292498528e-05, + "loss": 0.3633, + "mean_token_accuracy": 0.9122869968414307, + "num_tokens": 13011072.0, + "step": 11675 + }, + { + "entropy": 0.3415803790092468, + "epoch": 2.722694952791701, + "grad_norm": 4.375, + "learning_rate": 4.9965700017639226e-05, + "loss": 0.5222, + "mean_token_accuracy": 0.8990362405776977, + "num_tokens": 13029584.0, + "step": 11680 + }, + { + "entropy": 0.24723002538084984, + "epoch": 2.7238605898123325, + "grad_norm": 4.375, + "learning_rate": 4.996558692441084e-05, + "loss": 0.4214, + "mean_token_accuracy": 0.9199587941169739, + "num_tokens": 13056668.0, + "step": 11685 + }, + { + "entropy": 0.16878330241888762, + "epoch": 2.725026226832964, + "grad_norm": 0.45703125, + "learning_rate": 4.99654736453018e-05, + "loss": 0.3618, + "mean_token_accuracy": 0.9300837516784668, + "num_tokens": 13081977.0, + "step": 11690 + }, + { + "entropy": 0.25726300925016404, + "epoch": 2.726191863853596, + "grad_norm": 3.921875, + "learning_rate": 4.9965360180313804e-05, + "loss": 0.4976, + "mean_token_accuracy": 0.9045258402824402, + "num_tokens": 13092587.0, + "step": 11695 + }, + { + "entropy": 0.4012131579220295, + "epoch": 2.727357500874228, + "grad_norm": 4.6875, + "learning_rate": 4.996524652944853e-05, + "loss": 0.6169, + "mean_token_accuracy": 0.8621207654476166, + "num_tokens": 13115618.0, + "step": 11700 + }, + { + "entropy": 0.26291630491614343, + "epoch": 2.7285231378948596, + "grad_norm": 5.71875, + "learning_rate": 4.9965132692707686e-05, + "loss": 0.5346, + "mean_token_accuracy": 0.9023758828639984, + "num_tokens": 13126747.0, + "step": 11705 + }, + { + "entropy": 0.2871785953640938, + "epoch": 2.7296887749154912, + "grad_norm": 3.796875, + "learning_rate": 4.996501867009296e-05, + "loss": 0.5377, + "mean_token_accuracy": 0.908946031332016, + "num_tokens": 13136640.0, + "step": 11710 + }, + { + "entropy": 0.34161866158246995, + "epoch": 2.7308544119361233, + "grad_norm": 4.46875, + "learning_rate": 4.9964904461606066e-05, + "loss": 0.6964, + "mean_token_accuracy": 0.8712542295455933, + "num_tokens": 13144769.0, + "step": 11715 + }, + { + "entropy": 0.24447038322687148, + "epoch": 2.732020048956755, + "grad_norm": 9.6875, + "learning_rate": 4.996479006724869e-05, + "loss": 0.5576, + "mean_token_accuracy": 0.9012001335620881, + "num_tokens": 13165742.0, + "step": 11720 + }, + { + "entropy": 0.19883279278874397, + "epoch": 2.7331856859773866, + "grad_norm": 7.59375, + "learning_rate": 4.996467548702255e-05, + "loss": 0.4156, + "mean_token_accuracy": 0.9189273893833161, + "num_tokens": 13179850.0, + "step": 11725 + }, + { + "entropy": 0.1922806277871132, + "epoch": 2.7343513229980183, + "grad_norm": 1.5390625, + "learning_rate": 4.9964560720929355e-05, + "loss": 0.2315, + "mean_token_accuracy": 0.9458309173583984, + "num_tokens": 13207309.0, + "step": 11730 + }, + { + "entropy": 0.2514540385454893, + "epoch": 2.73551696001865, + "grad_norm": 0.70703125, + "learning_rate": 4.99644457689708e-05, + "loss": 0.3651, + "mean_token_accuracy": 0.9023769974708558, + "num_tokens": 13239563.0, + "step": 11735 + }, + { + "entropy": 0.19213453009724618, + "epoch": 2.736682597039282, + "grad_norm": 4.25, + "learning_rate": 4.996433063114862e-05, + "loss": 0.3256, + "mean_token_accuracy": 0.9336883068084717, + "num_tokens": 13262413.0, + "step": 11740 + }, + { + "entropy": 0.3445202559232712, + "epoch": 2.7378482340599137, + "grad_norm": 6.0625, + "learning_rate": 4.996421530746452e-05, + "loss": 0.5881, + "mean_token_accuracy": 0.8524076044559479, + "num_tokens": 13278522.0, + "step": 11745 + }, + { + "entropy": 0.17778736725449562, + "epoch": 2.7390138710805454, + "grad_norm": 1.421875, + "learning_rate": 4.9964099797920224e-05, + "loss": 0.2799, + "mean_token_accuracy": 0.9420393466949463, + "num_tokens": 13299599.0, + "step": 11750 + }, + { + "entropy": 0.33492958918213844, + "epoch": 2.7401795081011775, + "grad_norm": 1.6328125, + "learning_rate": 4.9963984102517456e-05, + "loss": 0.4627, + "mean_token_accuracy": 0.8947258800268173, + "num_tokens": 13327734.0, + "step": 11755 + }, + { + "entropy": 0.24462985396385192, + "epoch": 2.741345145121809, + "grad_norm": 1.8828125, + "learning_rate": 4.996386822125794e-05, + "loss": 0.3535, + "mean_token_accuracy": 0.9099364161491394, + "num_tokens": 13344731.0, + "step": 11760 + }, + { + "entropy": 0.18358724601566792, + "epoch": 2.742510782142441, + "grad_norm": 1.6484375, + "learning_rate": 4.996375215414339e-05, + "loss": 0.2308, + "mean_token_accuracy": 0.9195159077644348, + "num_tokens": 13379085.0, + "step": 11765 + }, + { + "entropy": 0.15232093259692192, + "epoch": 2.7436764191630725, + "grad_norm": 2.703125, + "learning_rate": 4.996363590117556e-05, + "loss": 0.2483, + "mean_token_accuracy": 0.9433568239212036, + "num_tokens": 13397236.0, + "step": 11770 + }, + { + "entropy": 0.3235445529222488, + "epoch": 2.744842056183704, + "grad_norm": 15.1875, + "learning_rate": 4.996351946235616e-05, + "loss": 0.7299, + "mean_token_accuracy": 0.8768341243267059, + "num_tokens": 13404816.0, + "step": 11775 + }, + { + "entropy": 0.27746813744306564, + "epoch": 2.7460076932043362, + "grad_norm": 3.53125, + "learning_rate": 4.996340283768695e-05, + "loss": 0.3638, + "mean_token_accuracy": 0.8914911150932312, + "num_tokens": 13434499.0, + "step": 11780 + }, + { + "entropy": 0.2395051196217537, + "epoch": 2.747173330224968, + "grad_norm": 0.609375, + "learning_rate": 4.996328602716965e-05, + "loss": 0.295, + "mean_token_accuracy": 0.9167731881141663, + "num_tokens": 13462582.0, + "step": 11785 + }, + { + "entropy": 0.20807163119316102, + "epoch": 2.7483389672455996, + "grad_norm": 0.7578125, + "learning_rate": 4.996316903080602e-05, + "loss": 0.25, + "mean_token_accuracy": 0.933128297328949, + "num_tokens": 13481898.0, + "step": 11790 + }, + { + "entropy": 0.27841252386569976, + "epoch": 2.7495046042662317, + "grad_norm": 2.09375, + "learning_rate": 4.9963051848597785e-05, + "loss": 0.3034, + "mean_token_accuracy": 0.8982349395751953, + "num_tokens": 13505194.0, + "step": 11795 + }, + { + "entropy": 0.22490386068820953, + "epoch": 2.7506702412868633, + "grad_norm": 3.859375, + "learning_rate": 4.996293448054671e-05, + "loss": 0.424, + "mean_token_accuracy": 0.9229201734066009, + "num_tokens": 13523744.0, + "step": 11800 + }, + { + "entropy": 0.23477228432893754, + "epoch": 2.751835878307495, + "grad_norm": 6.25, + "learning_rate": 4.9962816926654525e-05, + "loss": 0.4653, + "mean_token_accuracy": 0.9090815782546997, + "num_tokens": 13535843.0, + "step": 11805 + }, + { + "entropy": 0.2388323299586773, + "epoch": 2.753001515328127, + "grad_norm": 1.3125, + "learning_rate": 4.9962699186923e-05, + "loss": 0.2872, + "mean_token_accuracy": 0.9178419411182404, + "num_tokens": 13585893.0, + "step": 11810 + }, + { + "entropy": 0.19899034015834333, + "epoch": 2.7541671523487588, + "grad_norm": 2.328125, + "learning_rate": 4.996258126135388e-05, + "loss": 0.2889, + "mean_token_accuracy": 0.9384098529815674, + "num_tokens": 13608252.0, + "step": 11815 + }, + { + "entropy": 0.28082182705402375, + "epoch": 2.7553327893693904, + "grad_norm": 5.875, + "learning_rate": 4.996246314994894e-05, + "loss": 0.5502, + "mean_token_accuracy": 0.8987651646137238, + "num_tokens": 13618617.0, + "step": 11820 + }, + { + "entropy": 0.28569440320134165, + "epoch": 2.756498426390022, + "grad_norm": 3.046875, + "learning_rate": 4.9962344852709926e-05, + "loss": 0.6317, + "mean_token_accuracy": 0.8887876272201538, + "num_tokens": 13633228.0, + "step": 11825 + }, + { + "entropy": 0.21487718708813192, + "epoch": 2.7576640634106537, + "grad_norm": 2.28125, + "learning_rate": 4.9962226369638604e-05, + "loss": 0.4465, + "mean_token_accuracy": 0.9066687405109406, + "num_tokens": 13655881.0, + "step": 11830 + }, + { + "entropy": 0.29074698835611346, + "epoch": 2.758829700431286, + "grad_norm": 2.15625, + "learning_rate": 4.996210770073674e-05, + "loss": 0.3076, + "mean_token_accuracy": 0.9369310677051544, + "num_tokens": 13678361.0, + "step": 11835 + }, + { + "entropy": 0.2757992595434189, + "epoch": 2.7599953374519175, + "grad_norm": 2.109375, + "learning_rate": 4.996198884600611e-05, + "loss": 0.4553, + "mean_token_accuracy": 0.9155687928199768, + "num_tokens": 13689044.0, + "step": 11840 + }, + { + "entropy": 0.23771463334560394, + "epoch": 2.761160974472549, + "grad_norm": 4.59375, + "learning_rate": 4.9961869805448483e-05, + "loss": 0.28, + "mean_token_accuracy": 0.9290770351886749, + "num_tokens": 13702931.0, + "step": 11845 + }, + { + "entropy": 0.21810585632920265, + "epoch": 2.7623266114931813, + "grad_norm": 0.91796875, + "learning_rate": 4.996175057906563e-05, + "loss": 0.2698, + "mean_token_accuracy": 0.9399433553218841, + "num_tokens": 13744305.0, + "step": 11850 + }, + { + "entropy": 0.20287301018834114, + "epoch": 2.763492248513813, + "grad_norm": 1.015625, + "learning_rate": 4.996163116685933e-05, + "loss": 0.2539, + "mean_token_accuracy": 0.9406480073928833, + "num_tokens": 13763052.0, + "step": 11855 + }, + { + "entropy": 0.22470155581831933, + "epoch": 2.7646578855344446, + "grad_norm": 5.03125, + "learning_rate": 4.996151156883137e-05, + "loss": 0.417, + "mean_token_accuracy": 0.9212198674678802, + "num_tokens": 13781973.0, + "step": 11860 + }, + { + "entropy": 0.20656557697802783, + "epoch": 2.7658235225550762, + "grad_norm": 1.4765625, + "learning_rate": 4.996139178498353e-05, + "loss": 0.283, + "mean_token_accuracy": 0.9304579198360443, + "num_tokens": 13811417.0, + "step": 11865 + }, + { + "entropy": 0.22034696750342847, + "epoch": 2.766989159575708, + "grad_norm": 0.5234375, + "learning_rate": 4.9961271815317594e-05, + "loss": 0.3176, + "mean_token_accuracy": 0.91060671210289, + "num_tokens": 13835697.0, + "step": 11870 + }, + { + "entropy": 0.2568760313093662, + "epoch": 2.76815479659634, + "grad_norm": 5.25, + "learning_rate": 4.996115165983535e-05, + "loss": 0.3898, + "mean_token_accuracy": 0.9200484037399292, + "num_tokens": 13849117.0, + "step": 11875 + }, + { + "entropy": 0.3268521726131439, + "epoch": 2.7693204336169717, + "grad_norm": 5.46875, + "learning_rate": 4.99610313185386e-05, + "loss": 0.5835, + "mean_token_accuracy": 0.8949032425880432, + "num_tokens": 13857116.0, + "step": 11880 + }, + { + "entropy": 0.2411774557083845, + "epoch": 2.7704860706376033, + "grad_norm": 0.86328125, + "learning_rate": 4.996091079142913e-05, + "loss": 0.3025, + "mean_token_accuracy": 0.9226352453231812, + "num_tokens": 13897630.0, + "step": 11885 + }, + { + "entropy": 0.19736510664224624, + "epoch": 2.7716517076582354, + "grad_norm": 2.4375, + "learning_rate": 4.996079007850873e-05, + "loss": 0.2608, + "mean_token_accuracy": 0.9343535602092743, + "num_tokens": 13921097.0, + "step": 11890 + }, + { + "entropy": 0.2812109630554914, + "epoch": 2.772817344678867, + "grad_norm": 6.5, + "learning_rate": 4.9960669179779205e-05, + "loss": 0.3654, + "mean_token_accuracy": 0.9012946605682373, + "num_tokens": 13944408.0, + "step": 11895 + }, + { + "entropy": 0.2541120745241642, + "epoch": 2.7739829816994988, + "grad_norm": 2.828125, + "learning_rate": 4.996054809524237e-05, + "loss": 0.4858, + "mean_token_accuracy": 0.9016520738601684, + "num_tokens": 13959937.0, + "step": 11900 + }, + { + "entropy": 0.2620885193347931, + "epoch": 2.7751486187201304, + "grad_norm": 4.375, + "learning_rate": 4.996042682490002e-05, + "loss": 0.3841, + "mean_token_accuracy": 0.910321581363678, + "num_tokens": 13975779.0, + "step": 11905 + }, + { + "entropy": 0.2132838958874345, + "epoch": 2.776314255740762, + "grad_norm": 6.0, + "learning_rate": 4.996030536875396e-05, + "loss": 0.3785, + "mean_token_accuracy": 0.9210842311382293, + "num_tokens": 13997478.0, + "step": 11910 + }, + { + "entropy": 0.21150433290749787, + "epoch": 2.777479892761394, + "grad_norm": 4.5, + "learning_rate": 4.996018372680601e-05, + "loss": 0.3659, + "mean_token_accuracy": 0.9263874173164368, + "num_tokens": 14017546.0, + "step": 11915 + }, + { + "entropy": 0.24220035672187806, + "epoch": 2.778645529782026, + "grad_norm": 7.0625, + "learning_rate": 4.996006189905798e-05, + "loss": 0.3332, + "mean_token_accuracy": 0.9200899660587311, + "num_tokens": 14038106.0, + "step": 11920 + }, + { + "entropy": 0.24553507566452026, + "epoch": 2.7798111668026575, + "grad_norm": 4.78125, + "learning_rate": 4.995993988551168e-05, + "loss": 0.3851, + "mean_token_accuracy": 0.9134992480278015, + "num_tokens": 14059942.0, + "step": 11925 + }, + { + "entropy": 0.19736606925725936, + "epoch": 2.7809768038232896, + "grad_norm": 0.57421875, + "learning_rate": 4.9959817686168945e-05, + "loss": 0.3491, + "mean_token_accuracy": 0.9388951897621155, + "num_tokens": 14084931.0, + "step": 11930 + }, + { + "entropy": 0.36639479398727415, + "epoch": 2.7821424408439213, + "grad_norm": 7.1875, + "learning_rate": 4.995969530103158e-05, + "loss": 0.5662, + "mean_token_accuracy": 0.8725821018218994, + "num_tokens": 14100686.0, + "step": 11935 + }, + { + "entropy": 0.2210806304588914, + "epoch": 2.783308077864553, + "grad_norm": 5.09375, + "learning_rate": 4.9959572730101416e-05, + "loss": 0.3382, + "mean_token_accuracy": 0.9269168257713318, + "num_tokens": 14127188.0, + "step": 11940 + }, + { + "entropy": 0.2088917564600706, + "epoch": 2.784473714885185, + "grad_norm": 3.84375, + "learning_rate": 4.995944997338029e-05, + "loss": 0.3704, + "mean_token_accuracy": 0.9320711851119995, + "num_tokens": 14145313.0, + "step": 11945 + }, + { + "entropy": 0.2135708898305893, + "epoch": 2.7856393519058167, + "grad_norm": 1.046875, + "learning_rate": 4.9959327030870016e-05, + "loss": 0.3219, + "mean_token_accuracy": 0.9356307864189148, + "num_tokens": 14168427.0, + "step": 11950 + }, + { + "entropy": 0.20443826280534266, + "epoch": 2.7868049889264483, + "grad_norm": 1.7890625, + "learning_rate": 4.9959203902572446e-05, + "loss": 0.2976, + "mean_token_accuracy": 0.9165285766124726, + "num_tokens": 14188997.0, + "step": 11955 + }, + { + "entropy": 0.23184158205986022, + "epoch": 2.78797062594708, + "grad_norm": 2.171875, + "learning_rate": 4.99590805884894e-05, + "loss": 0.3338, + "mean_token_accuracy": 0.923123162984848, + "num_tokens": 14218749.0, + "step": 11960 + }, + { + "entropy": 0.30974789895117283, + "epoch": 2.7891362629677117, + "grad_norm": 0.53515625, + "learning_rate": 4.995895708862272e-05, + "loss": 0.6093, + "mean_token_accuracy": 0.8700804620981216, + "num_tokens": 14247068.0, + "step": 11965 + }, + { + "entropy": 0.2754995569586754, + "epoch": 2.7903018999883438, + "grad_norm": 4.875, + "learning_rate": 4.9958833402974255e-05, + "loss": 0.7198, + "mean_token_accuracy": 0.8932251870632172, + "num_tokens": 14260782.0, + "step": 11970 + }, + { + "entropy": 0.2595931053161621, + "epoch": 2.7914675370089754, + "grad_norm": 3.546875, + "learning_rate": 4.995870953154585e-05, + "loss": 0.5462, + "mean_token_accuracy": 0.9064160704612731, + "num_tokens": 14270942.0, + "step": 11975 + }, + { + "entropy": 0.3655819445848465, + "epoch": 2.792633174029607, + "grad_norm": 5.5, + "learning_rate": 4.995858547433934e-05, + "loss": 0.7201, + "mean_token_accuracy": 0.8726738154888153, + "num_tokens": 14279499.0, + "step": 11980 + }, + { + "entropy": 0.23394382521510124, + "epoch": 2.793798811050239, + "grad_norm": 0.380859375, + "learning_rate": 4.995846123135658e-05, + "loss": 0.405, + "mean_token_accuracy": 0.9207181870937348, + "num_tokens": 14301846.0, + "step": 11985 + }, + { + "entropy": 0.29523569345474243, + "epoch": 2.794964448070871, + "grad_norm": 1.640625, + "learning_rate": 4.9958336802599426e-05, + "loss": 0.572, + "mean_token_accuracy": 0.8870051562786102, + "num_tokens": 14314057.0, + "step": 11990 + }, + { + "entropy": 0.1730364289134741, + "epoch": 2.7961300850915025, + "grad_norm": 3.609375, + "learning_rate": 4.995821218806973e-05, + "loss": 0.2169, + "mean_token_accuracy": 0.9259053111076355, + "num_tokens": 14334832.0, + "step": 11995 + }, + { + "entropy": 0.24064609967172146, + "epoch": 2.797295722112134, + "grad_norm": 5.625, + "learning_rate": 4.995808738776936e-05, + "loss": 0.3937, + "mean_token_accuracy": 0.9231860220432282, + "num_tokens": 14349962.0, + "step": 12000 + }, + { + "entropy": 0.20124710947275162, + "epoch": 2.798461359132766, + "grad_norm": 5.125, + "learning_rate": 4.9957962401700165e-05, + "loss": 0.3264, + "mean_token_accuracy": 0.9315138995647431, + "num_tokens": 14375096.0, + "step": 12005 + }, + { + "entropy": 0.28214637413620947, + "epoch": 2.799626996153398, + "grad_norm": 3.734375, + "learning_rate": 4.9957837229864006e-05, + "loss": 0.4613, + "mean_token_accuracy": 0.9064045310020447, + "num_tokens": 14390327.0, + "step": 12010 + }, + { + "entropy": 0.2217831529676914, + "epoch": 2.8007926331740296, + "grad_norm": 4.4375, + "learning_rate": 4.995771187226277e-05, + "loss": 0.3759, + "mean_token_accuracy": 0.9254004776477813, + "num_tokens": 14408761.0, + "step": 12015 + }, + { + "entropy": 0.22049227058887483, + "epoch": 2.8019582701946613, + "grad_norm": 6.28125, + "learning_rate": 4.99575863288983e-05, + "loss": 0.4114, + "mean_token_accuracy": 0.9224657356739044, + "num_tokens": 14419772.0, + "step": 12020 + }, + { + "entropy": 0.19187535513192416, + "epoch": 2.8031239072152934, + "grad_norm": 0.6484375, + "learning_rate": 4.9957460599772484e-05, + "loss": 0.256, + "mean_token_accuracy": 0.9346370995044708, + "num_tokens": 14445762.0, + "step": 12025 + }, + { + "entropy": 0.1792112410068512, + "epoch": 2.804289544235925, + "grad_norm": 4.90625, + "learning_rate": 4.9957334684887195e-05, + "loss": 0.2564, + "mean_token_accuracy": 0.9480877816677094, + "num_tokens": 14466302.0, + "step": 12030 + }, + { + "entropy": 0.23121217228472232, + "epoch": 2.8054551812565567, + "grad_norm": 5.53125, + "learning_rate": 4.995720858424431e-05, + "loss": 0.3067, + "mean_token_accuracy": 0.9157763183116913, + "num_tokens": 14485887.0, + "step": 12035 + }, + { + "entropy": 0.2949991822242737, + "epoch": 2.8066208182771883, + "grad_norm": 3.34375, + "learning_rate": 4.9957082297845706e-05, + "loss": 0.4807, + "mean_token_accuracy": 0.9019329190254212, + "num_tokens": 14499730.0, + "step": 12040 + }, + { + "entropy": 0.2044808973558247, + "epoch": 2.80778645529782, + "grad_norm": 7.1875, + "learning_rate": 4.9956955825693267e-05, + "loss": 0.3778, + "mean_token_accuracy": 0.9192876040935516, + "num_tokens": 14530684.0, + "step": 12045 + }, + { + "entropy": 0.2982170686125755, + "epoch": 2.808952092318452, + "grad_norm": 5.59375, + "learning_rate": 4.995682916778889e-05, + "loss": 0.5481, + "mean_token_accuracy": 0.8952968299388886, + "num_tokens": 14546286.0, + "step": 12050 + }, + { + "entropy": 0.2020443793386221, + "epoch": 2.8101177293390838, + "grad_norm": 3.03125, + "learning_rate": 4.995670232413444e-05, + "loss": 0.378, + "mean_token_accuracy": 0.9278689801692963, + "num_tokens": 14561116.0, + "step": 12055 + }, + { + "entropy": 0.42926357612013816, + "epoch": 2.8112833663597154, + "grad_norm": 1.3125, + "learning_rate": 4.9956575294731836e-05, + "loss": 0.6813, + "mean_token_accuracy": 0.8541025578975677, + "num_tokens": 14586798.0, + "step": 12060 + }, + { + "entropy": 0.24253510124981403, + "epoch": 2.8124490033803475, + "grad_norm": 7.25, + "learning_rate": 4.9956448079582946e-05, + "loss": 0.5191, + "mean_token_accuracy": 0.9105861961841584, + "num_tokens": 14606560.0, + "step": 12065 + }, + { + "entropy": 0.32413134127855303, + "epoch": 2.813614640400979, + "grad_norm": 5.3125, + "learning_rate": 4.995632067868969e-05, + "loss": 0.3997, + "mean_token_accuracy": 0.8935954630374908, + "num_tokens": 14631434.0, + "step": 12070 + }, + { + "entropy": 0.19401369988918304, + "epoch": 2.814780277421611, + "grad_norm": 5.9375, + "learning_rate": 4.995619309205395e-05, + "loss": 0.4599, + "mean_token_accuracy": 0.9210140883922577, + "num_tokens": 14653039.0, + "step": 12075 + }, + { + "entropy": 0.2677511714398861, + "epoch": 2.815945914442243, + "grad_norm": 5.28125, + "learning_rate": 4.995606531967764e-05, + "loss": 0.5118, + "mean_token_accuracy": 0.9019227385520935, + "num_tokens": 14663412.0, + "step": 12080 + }, + { + "entropy": 0.21213569901883603, + "epoch": 2.8171115514628746, + "grad_norm": 5.125, + "learning_rate": 4.995593736156266e-05, + "loss": 0.2504, + "mean_token_accuracy": 0.9331638395786286, + "num_tokens": 14694986.0, + "step": 12085 + }, + { + "entropy": 0.22414507642388343, + "epoch": 2.8182771884835063, + "grad_norm": 4.3125, + "learning_rate": 4.995580921771091e-05, + "loss": 0.4155, + "mean_token_accuracy": 0.9196042895317078, + "num_tokens": 14707633.0, + "step": 12090 + }, + { + "entropy": 0.31253494918346403, + "epoch": 2.819442825504138, + "grad_norm": 6.03125, + "learning_rate": 4.9955680888124324e-05, + "loss": 0.735, + "mean_token_accuracy": 0.8750035762786865, + "num_tokens": 14716172.0, + "step": 12095 + }, + { + "entropy": 0.2718751896172762, + "epoch": 2.8206084625247696, + "grad_norm": 6.1875, + "learning_rate": 4.9955552372804796e-05, + "loss": 0.36, + "mean_token_accuracy": 0.9038479328155518, + "num_tokens": 14746025.0, + "step": 12100 + }, + { + "entropy": 0.25656766891479493, + "epoch": 2.8217740995454017, + "grad_norm": 5.21875, + "learning_rate": 4.9955423671754254e-05, + "loss": 0.4172, + "mean_token_accuracy": 0.9210475564002991, + "num_tokens": 14758139.0, + "step": 12105 + }, + { + "entropy": 0.22321480922400952, + "epoch": 2.8229397365660334, + "grad_norm": 5.5, + "learning_rate": 4.995529478497461e-05, + "loss": 0.4913, + "mean_token_accuracy": 0.9010767698287964, + "num_tokens": 14777336.0, + "step": 12110 + }, + { + "entropy": 0.25825431272387506, + "epoch": 2.824105373586665, + "grad_norm": 3.25, + "learning_rate": 4.9955165712467774e-05, + "loss": 0.5626, + "mean_token_accuracy": 0.8835008561611175, + "num_tokens": 14797899.0, + "step": 12115 + }, + { + "entropy": 0.3582488939166069, + "epoch": 2.825271010607297, + "grad_norm": 5.84375, + "learning_rate": 4.995503645423569e-05, + "loss": 0.8463, + "mean_token_accuracy": 0.8668466031551361, + "num_tokens": 14807031.0, + "step": 12120 + }, + { + "entropy": 0.2489135056734085, + "epoch": 2.826436647627929, + "grad_norm": 5.15625, + "learning_rate": 4.995490701028028e-05, + "loss": 0.4786, + "mean_token_accuracy": 0.9056442618370056, + "num_tokens": 14819392.0, + "step": 12125 + }, + { + "entropy": 0.19951955564320087, + "epoch": 2.8276022846485604, + "grad_norm": 6.1875, + "learning_rate": 4.9954777380603476e-05, + "loss": 0.3804, + "mean_token_accuracy": 0.922314727306366, + "num_tokens": 14837431.0, + "step": 12130 + }, + { + "entropy": 0.26699568033218385, + "epoch": 2.828767921669192, + "grad_norm": 6.0, + "learning_rate": 4.995464756520721e-05, + "loss": 0.4694, + "mean_token_accuracy": 0.918603527545929, + "num_tokens": 14848180.0, + "step": 12135 + }, + { + "entropy": 0.46731987968087196, + "epoch": 2.8299335586898238, + "grad_norm": 6.21875, + "learning_rate": 4.9954517564093406e-05, + "loss": 0.8151, + "mean_token_accuracy": 0.8681573092937469, + "num_tokens": 14867566.0, + "step": 12140 + }, + { + "entropy": 0.30520070940256117, + "epoch": 2.831099195710456, + "grad_norm": 5.71875, + "learning_rate": 4.9954387377264024e-05, + "loss": 0.6726, + "mean_token_accuracy": 0.8926642715930939, + "num_tokens": 14876554.0, + "step": 12145 + }, + { + "entropy": 0.26313832104206086, + "epoch": 2.8322648327310875, + "grad_norm": 6.65625, + "learning_rate": 4.995425700472098e-05, + "loss": 0.6288, + "mean_token_accuracy": 0.880269593000412, + "num_tokens": 14893948.0, + "step": 12150 + }, + { + "entropy": 0.26269229091703894, + "epoch": 2.833430469751719, + "grad_norm": 0.64453125, + "learning_rate": 4.995412644646625e-05, + "loss": 0.3725, + "mean_token_accuracy": 0.9041543662548065, + "num_tokens": 14928840.0, + "step": 12155 + }, + { + "entropy": 0.24252827167510987, + "epoch": 2.8345961067723513, + "grad_norm": 3.0625, + "learning_rate": 4.9953995702501746e-05, + "loss": 0.3369, + "mean_token_accuracy": 0.9155629277229309, + "num_tokens": 14945946.0, + "step": 12160 + }, + { + "entropy": 0.22055025771260262, + "epoch": 2.835761743792983, + "grad_norm": 1.234375, + "learning_rate": 4.9953864772829444e-05, + "loss": 0.3357, + "mean_token_accuracy": 0.901597386598587, + "num_tokens": 14965542.0, + "step": 12165 + }, + { + "entropy": 0.18117874898016453, + "epoch": 2.8369273808136146, + "grad_norm": 1.125, + "learning_rate": 4.9953733657451286e-05, + "loss": 0.2497, + "mean_token_accuracy": 0.9453995823860168, + "num_tokens": 14983251.0, + "step": 12170 + }, + { + "entropy": 0.18524248637259005, + "epoch": 2.8380930178342463, + "grad_norm": 0.59765625, + "learning_rate": 4.9953602356369225e-05, + "loss": 0.3304, + "mean_token_accuracy": 0.9263292074203491, + "num_tokens": 15008932.0, + "step": 12175 + }, + { + "entropy": 0.2279165990650654, + "epoch": 2.839258654854878, + "grad_norm": 1.7421875, + "learning_rate": 4.995347086958522e-05, + "loss": 0.3454, + "mean_token_accuracy": 0.9205220639705658, + "num_tokens": 15038147.0, + "step": 12180 + }, + { + "entropy": 0.23292369097471238, + "epoch": 2.84042429187551, + "grad_norm": 11.1875, + "learning_rate": 4.9953339197101235e-05, + "loss": 0.5381, + "mean_token_accuracy": 0.9021409928798676, + "num_tokens": 15050882.0, + "step": 12185 + }, + { + "entropy": 0.17848289832472802, + "epoch": 2.8415899288961417, + "grad_norm": 0.515625, + "learning_rate": 4.9953207338919235e-05, + "loss": 0.2198, + "mean_token_accuracy": 0.94959676861763, + "num_tokens": 15077393.0, + "step": 12190 + }, + { + "entropy": 0.2969540163874626, + "epoch": 2.8427555659167734, + "grad_norm": 7.15625, + "learning_rate": 4.995307529504117e-05, + "loss": 0.6852, + "mean_token_accuracy": 0.8878288805484772, + "num_tokens": 15091979.0, + "step": 12195 + }, + { + "entropy": 0.20284539386630057, + "epoch": 2.8439212029374055, + "grad_norm": 3.53125, + "learning_rate": 4.995294306546904e-05, + "loss": 0.3946, + "mean_token_accuracy": 0.9282336890697479, + "num_tokens": 15114542.0, + "step": 12200 + }, + { + "entropy": 0.20276891775429248, + "epoch": 2.845086839958037, + "grad_norm": 7.0625, + "learning_rate": 4.995281065020479e-05, + "loss": 0.3412, + "mean_token_accuracy": 0.9302069902420044, + "num_tokens": 15138680.0, + "step": 12205 + }, + { + "entropy": 0.19186981171369552, + "epoch": 2.846252476978669, + "grad_norm": 7.4375, + "learning_rate": 4.99526780492504e-05, + "loss": 0.524, + "mean_token_accuracy": 0.91390061378479, + "num_tokens": 15169710.0, + "step": 12210 + }, + { + "entropy": 0.2559481278061867, + "epoch": 2.847418113999301, + "grad_norm": 2.328125, + "learning_rate": 4.995254526260786e-05, + "loss": 0.5895, + "mean_token_accuracy": 0.8948764860630035, + "num_tokens": 15180608.0, + "step": 12215 + }, + { + "entropy": 0.2940599586814642, + "epoch": 2.8485837510199326, + "grad_norm": 5.1875, + "learning_rate": 4.995241229027913e-05, + "loss": 0.4582, + "mean_token_accuracy": 0.892432689666748, + "num_tokens": 15213859.0, + "step": 12220 + }, + { + "entropy": 0.22507907301187516, + "epoch": 2.849749388040564, + "grad_norm": 1.9296875, + "learning_rate": 4.995227913226621e-05, + "loss": 0.3666, + "mean_token_accuracy": 0.9292806625366211, + "num_tokens": 15235352.0, + "step": 12225 + }, + { + "entropy": 0.23611784502863883, + "epoch": 2.850915025061196, + "grad_norm": 7.0625, + "learning_rate": 4.9952145788571074e-05, + "loss": 0.311, + "mean_token_accuracy": 0.9268670618534088, + "num_tokens": 15264026.0, + "step": 12230 + }, + { + "entropy": 0.22443998456001282, + "epoch": 2.8520806620818275, + "grad_norm": 4.96875, + "learning_rate": 4.995201225919572e-05, + "loss": 0.4376, + "mean_token_accuracy": 0.9172143876552582, + "num_tokens": 15274274.0, + "step": 12235 + }, + { + "entropy": 0.2179919883608818, + "epoch": 2.8532462991024596, + "grad_norm": 3.9375, + "learning_rate": 4.995187854414213e-05, + "loss": 0.3341, + "mean_token_accuracy": 0.9146465182304382, + "num_tokens": 15291454.0, + "step": 12240 + }, + { + "entropy": 0.12379531040787697, + "epoch": 2.8544119361230913, + "grad_norm": 0.9296875, + "learning_rate": 4.9951744643412304e-05, + "loss": 0.1584, + "mean_token_accuracy": 0.964534604549408, + "num_tokens": 15315876.0, + "step": 12245 + }, + { + "entropy": 0.14848388805985452, + "epoch": 2.855577573143723, + "grad_norm": 1.34375, + "learning_rate": 4.995161055700824e-05, + "loss": 0.1625, + "mean_token_accuracy": 0.954338264465332, + "num_tokens": 15343677.0, + "step": 12250 + }, + { + "entropy": 0.2934903770685196, + "epoch": 2.856743210164355, + "grad_norm": 2.453125, + "learning_rate": 4.995147628493193e-05, + "loss": 0.5294, + "mean_token_accuracy": 0.9005158245563507, + "num_tokens": 15353076.0, + "step": 12255 + }, + { + "entropy": 0.18981227725744249, + "epoch": 2.8579088471849867, + "grad_norm": 4.09375, + "learning_rate": 4.995134182718538e-05, + "loss": 0.2938, + "mean_token_accuracy": 0.9368978083133698, + "num_tokens": 15372999.0, + "step": 12260 + }, + { + "entropy": 0.21589052006602288, + "epoch": 2.8590744842056184, + "grad_norm": 0.6328125, + "learning_rate": 4.99512071837706e-05, + "loss": 0.3413, + "mean_token_accuracy": 0.9222669243812561, + "num_tokens": 15394936.0, + "step": 12265 + }, + { + "entropy": 0.41172001510858536, + "epoch": 2.86024012122625, + "grad_norm": 5.3125, + "learning_rate": 4.9951072354689585e-05, + "loss": 0.6215, + "mean_token_accuracy": 0.8982397258281708, + "num_tokens": 15428941.0, + "step": 12270 + }, + { + "entropy": 0.21318286955356597, + "epoch": 2.8614057582468817, + "grad_norm": 5.53125, + "learning_rate": 4.9950937339944365e-05, + "loss": 0.4136, + "mean_token_accuracy": 0.926407641172409, + "num_tokens": 15446441.0, + "step": 12275 + }, + { + "entropy": 0.2053906600922346, + "epoch": 2.862571395267514, + "grad_norm": 1.75, + "learning_rate": 4.9950802139536937e-05, + "loss": 0.3031, + "mean_token_accuracy": 0.9017199754714966, + "num_tokens": 15468009.0, + "step": 12280 + }, + { + "entropy": 0.11248552985489368, + "epoch": 2.8637370322881455, + "grad_norm": 1.296875, + "learning_rate": 4.9950666753469325e-05, + "loss": 0.141, + "mean_token_accuracy": 0.967561000585556, + "num_tokens": 15517507.0, + "step": 12285 + }, + { + "entropy": 0.30170712172985076, + "epoch": 2.864902669308777, + "grad_norm": 6.0625, + "learning_rate": 4.995053118174353e-05, + "loss": 0.5021, + "mean_token_accuracy": 0.882989478111267, + "num_tokens": 15528528.0, + "step": 12290 + }, + { + "entropy": 0.35192948803305624, + "epoch": 2.8660683063294092, + "grad_norm": 1.4140625, + "learning_rate": 4.9950395424361604e-05, + "loss": 0.6302, + "mean_token_accuracy": 0.8598722696304322, + "num_tokens": 15557410.0, + "step": 12295 + }, + { + "entropy": 0.32548448368906974, + "epoch": 2.867233943350041, + "grad_norm": 2.953125, + "learning_rate": 4.9950259481325554e-05, + "loss": 0.6216, + "mean_token_accuracy": 0.8807716012001038, + "num_tokens": 15579910.0, + "step": 12300 + }, + { + "entropy": 0.27001427859067917, + "epoch": 2.8683995803706726, + "grad_norm": 3.46875, + "learning_rate": 4.99501233526374e-05, + "loss": 0.4658, + "mean_token_accuracy": 0.8968069672584533, + "num_tokens": 15602595.0, + "step": 12305 + }, + { + "entropy": 0.22677491679787637, + "epoch": 2.869565217391304, + "grad_norm": 7.28125, + "learning_rate": 4.994998703829919e-05, + "loss": 0.5024, + "mean_token_accuracy": 0.9079581201076508, + "num_tokens": 15614836.0, + "step": 12310 + }, + { + "entropy": 0.22498192228376865, + "epoch": 2.870730854411936, + "grad_norm": 4.53125, + "learning_rate": 4.994985053831295e-05, + "loss": 0.4057, + "mean_token_accuracy": 0.9227579593658447, + "num_tokens": 15629028.0, + "step": 12315 + }, + { + "entropy": 0.21907037869095802, + "epoch": 2.871896491432568, + "grad_norm": 1.90625, + "learning_rate": 4.99497138526807e-05, + "loss": 0.2842, + "mean_token_accuracy": 0.9375577330589294, + "num_tokens": 15670225.0, + "step": 12320 + }, + { + "entropy": 0.21589868012815713, + "epoch": 2.8730621284531996, + "grad_norm": 3.078125, + "learning_rate": 4.99495769814045e-05, + "loss": 0.3292, + "mean_token_accuracy": 0.9117425501346588, + "num_tokens": 15700279.0, + "step": 12325 + }, + { + "entropy": 0.19196809008717536, + "epoch": 2.8742277654738313, + "grad_norm": 6.0, + "learning_rate": 4.994943992448638e-05, + "loss": 0.3625, + "mean_token_accuracy": 0.9180894255638122, + "num_tokens": 15714776.0, + "step": 12330 + }, + { + "entropy": 0.19620308466255665, + "epoch": 2.8753934024944634, + "grad_norm": 2.890625, + "learning_rate": 4.994930268192839e-05, + "loss": 0.3987, + "mean_token_accuracy": 0.9193916916847229, + "num_tokens": 15741913.0, + "step": 12335 + }, + { + "entropy": 0.28176852613687514, + "epoch": 2.876559039515095, + "grad_norm": 1.0, + "learning_rate": 4.9949165253732565e-05, + "loss": 0.4444, + "mean_token_accuracy": 0.904612535238266, + "num_tokens": 15763861.0, + "step": 12340 + }, + { + "entropy": 0.18242901414632798, + "epoch": 2.8777246765357267, + "grad_norm": 6.21875, + "learning_rate": 4.994902763990097e-05, + "loss": 0.4322, + "mean_token_accuracy": 0.911017370223999, + "num_tokens": 15778174.0, + "step": 12345 + }, + { + "entropy": 0.29833044596016406, + "epoch": 2.878890313556359, + "grad_norm": 0.99609375, + "learning_rate": 4.994888984043564e-05, + "loss": 0.3599, + "mean_token_accuracy": 0.9066588163375855, + "num_tokens": 15797512.0, + "step": 12350 + }, + { + "entropy": 0.19635452032089235, + "epoch": 2.8800559505769905, + "grad_norm": 2.765625, + "learning_rate": 4.994875185533864e-05, + "loss": 0.254, + "mean_token_accuracy": 0.9306394994258881, + "num_tokens": 15827727.0, + "step": 12355 + }, + { + "entropy": 0.27425159960985185, + "epoch": 2.881221587597622, + "grad_norm": 3.671875, + "learning_rate": 4.994861368461203e-05, + "loss": 0.5238, + "mean_token_accuracy": 0.9004498362541199, + "num_tokens": 15841398.0, + "step": 12360 + }, + { + "entropy": 0.2255168441683054, + "epoch": 2.882387224618254, + "grad_norm": 4.53125, + "learning_rate": 4.994847532825786e-05, + "loss": 0.4827, + "mean_token_accuracy": 0.9102801382541656, + "num_tokens": 15858611.0, + "step": 12365 + }, + { + "entropy": 0.25632177069783213, + "epoch": 2.8835528616388855, + "grad_norm": 0.77734375, + "learning_rate": 4.9948336786278204e-05, + "loss": 0.5552, + "mean_token_accuracy": 0.9066780388355256, + "num_tokens": 15878703.0, + "step": 12370 + }, + { + "entropy": 0.27798714153468607, + "epoch": 2.8847184986595176, + "grad_norm": 5.90625, + "learning_rate": 4.994819805867512e-05, + "loss": 0.4013, + "mean_token_accuracy": 0.9049887299537659, + "num_tokens": 15903115.0, + "step": 12375 + }, + { + "entropy": 0.27522594928741456, + "epoch": 2.8858841356801492, + "grad_norm": 7.125, + "learning_rate": 4.994805914545068e-05, + "loss": 0.5589, + "mean_token_accuracy": 0.8919727146625519, + "num_tokens": 15912967.0, + "step": 12380 + }, + { + "entropy": 0.2859261706471443, + "epoch": 2.887049772700781, + "grad_norm": 4.34375, + "learning_rate": 4.994792004660696e-05, + "loss": 0.6367, + "mean_token_accuracy": 0.8898451209068299, + "num_tokens": 15922730.0, + "step": 12385 + }, + { + "entropy": 0.21028335131704806, + "epoch": 2.888215409721413, + "grad_norm": 4.71875, + "learning_rate": 4.9947780762146024e-05, + "loss": 0.2133, + "mean_token_accuracy": 0.9456324815750122, + "num_tokens": 15952867.0, + "step": 12390 + }, + { + "entropy": 0.2555417686700821, + "epoch": 2.8893810467420447, + "grad_norm": 0.59375, + "learning_rate": 4.994764129206996e-05, + "loss": 0.4049, + "mean_token_accuracy": 0.9144223809242249, + "num_tokens": 15974845.0, + "step": 12395 + }, + { + "entropy": 0.2397582620382309, + "epoch": 2.8905466837626763, + "grad_norm": 1.7734375, + "learning_rate": 4.994750163638084e-05, + "loss": 0.2899, + "mean_token_accuracy": 0.9156318068504333, + "num_tokens": 15993164.0, + "step": 12400 + }, + { + "entropy": 0.2651869673281908, + "epoch": 2.891712320783308, + "grad_norm": 0.87109375, + "learning_rate": 4.9947361795080746e-05, + "loss": 0.3185, + "mean_token_accuracy": 0.9095015168190003, + "num_tokens": 16026317.0, + "step": 12405 + }, + { + "entropy": 0.19921966940164565, + "epoch": 2.8928779578039396, + "grad_norm": 3.21875, + "learning_rate": 4.994722176817177e-05, + "loss": 0.2915, + "mean_token_accuracy": 0.9439963281154633, + "num_tokens": 16043945.0, + "step": 12410 + }, + { + "entropy": 0.27657449916005133, + "epoch": 2.8940435948245717, + "grad_norm": 8.3125, + "learning_rate": 4.994708155565599e-05, + "loss": 0.2701, + "mean_token_accuracy": 0.9275880873203277, + "num_tokens": 16071942.0, + "step": 12415 + }, + { + "entropy": 0.22481039054691793, + "epoch": 2.8952092318452034, + "grad_norm": 0.57421875, + "learning_rate": 4.9946941157535504e-05, + "loss": 0.3073, + "mean_token_accuracy": 0.9280721783638001, + "num_tokens": 16104638.0, + "step": 12420 + }, + { + "entropy": 0.1833694539964199, + "epoch": 2.896374868865835, + "grad_norm": 4.125, + "learning_rate": 4.994680057381241e-05, + "loss": 0.3048, + "mean_token_accuracy": 0.9380892634391784, + "num_tokens": 16128815.0, + "step": 12425 + }, + { + "entropy": 0.2773961193859577, + "epoch": 2.897540505886467, + "grad_norm": 1.125, + "learning_rate": 4.994665980448879e-05, + "loss": 0.4982, + "mean_token_accuracy": 0.9037495434284211, + "num_tokens": 16149828.0, + "step": 12430 + }, + { + "entropy": 0.3096873864531517, + "epoch": 2.898706142907099, + "grad_norm": 3.71875, + "learning_rate": 4.994651884956675e-05, + "loss": 0.4956, + "mean_token_accuracy": 0.8928325831890106, + "num_tokens": 16167468.0, + "step": 12435 + }, + { + "entropy": 0.3326524205505848, + "epoch": 2.8998717799277305, + "grad_norm": 0.6875, + "learning_rate": 4.9946377709048404e-05, + "loss": 0.5563, + "mean_token_accuracy": 0.8875405877828598, + "num_tokens": 16195599.0, + "step": 12440 + }, + { + "entropy": 0.23935375213623047, + "epoch": 2.901037416948362, + "grad_norm": 1.3359375, + "learning_rate": 4.994623638293584e-05, + "loss": 0.4938, + "mean_token_accuracy": 0.9130324602127076, + "num_tokens": 16212825.0, + "step": 12445 + }, + { + "entropy": 0.20335310474038124, + "epoch": 2.902203053968994, + "grad_norm": 0.828125, + "learning_rate": 4.994609487123118e-05, + "loss": 0.2376, + "mean_token_accuracy": 0.9485228300094605, + "num_tokens": 16248299.0, + "step": 12450 + }, + { + "entropy": 0.20197730883955956, + "epoch": 2.903368690989626, + "grad_norm": 6.5625, + "learning_rate": 4.994595317393651e-05, + "loss": 0.3259, + "mean_token_accuracy": 0.9246832191944122, + "num_tokens": 16269044.0, + "step": 12455 + }, + { + "entropy": 0.2346881277859211, + "epoch": 2.9045343280102576, + "grad_norm": 4.4375, + "learning_rate": 4.994581129105397e-05, + "loss": 0.3763, + "mean_token_accuracy": 0.9140804708003998, + "num_tokens": 16281323.0, + "step": 12460 + }, + { + "entropy": 0.16434445828199387, + "epoch": 2.9056999650308892, + "grad_norm": 0.87890625, + "learning_rate": 4.9945669222585656e-05, + "loss": 0.2848, + "mean_token_accuracy": 0.9422541975975036, + "num_tokens": 16312296.0, + "step": 12465 + }, + { + "entropy": 0.24725492149591446, + "epoch": 2.9068656020515213, + "grad_norm": 5.96875, + "learning_rate": 4.9945526968533694e-05, + "loss": 0.5421, + "mean_token_accuracy": 0.9044016242027283, + "num_tokens": 16323044.0, + "step": 12470 + }, + { + "entropy": 0.2235550694167614, + "epoch": 2.908031239072153, + "grad_norm": 6.40625, + "learning_rate": 4.9945384528900214e-05, + "loss": 0.4786, + "mean_token_accuracy": 0.9164490222930908, + "num_tokens": 16335856.0, + "step": 12475 + }, + { + "entropy": 0.23978302478790284, + "epoch": 2.9091968760927847, + "grad_norm": 6.21875, + "learning_rate": 4.994524190368733e-05, + "loss": 0.303, + "mean_token_accuracy": 0.9199592411518097, + "num_tokens": 16358188.0, + "step": 12480 + }, + { + "entropy": 0.2748368114233017, + "epoch": 2.9103625131134163, + "grad_norm": 4.71875, + "learning_rate": 4.994509909289716e-05, + "loss": 0.4885, + "mean_token_accuracy": 0.9092333793640137, + "num_tokens": 16368468.0, + "step": 12485 + }, + { + "entropy": 0.21827991865575314, + "epoch": 2.9115281501340484, + "grad_norm": 3.9375, + "learning_rate": 4.9944956096531856e-05, + "loss": 0.263, + "mean_token_accuracy": 0.9356384515762329, + "num_tokens": 16415219.0, + "step": 12490 + }, + { + "entropy": 0.24743718206882476, + "epoch": 2.91269378715468, + "grad_norm": 2.9375, + "learning_rate": 4.994481291459354e-05, + "loss": 0.5131, + "mean_token_accuracy": 0.8959756314754486, + "num_tokens": 16427255.0, + "step": 12495 + }, + { + "entropy": 0.2174059823155403, + "epoch": 2.9138594241753117, + "grad_norm": 9.6875, + "learning_rate": 4.9944669547084335e-05, + "loss": 0.4839, + "mean_token_accuracy": 0.9092626631259918, + "num_tokens": 16447024.0, + "step": 12500 + }, + { + "entropy": 0.18768667057156563, + "epoch": 2.9150250611959434, + "grad_norm": 2.484375, + "learning_rate": 4.9944525994006395e-05, + "loss": 0.2285, + "mean_token_accuracy": 0.9509588420391083, + "num_tokens": 16464340.0, + "step": 12505 + }, + { + "entropy": 0.1717726208269596, + "epoch": 2.9161906982165755, + "grad_norm": 5.625, + "learning_rate": 4.994438225536185e-05, + "loss": 0.2924, + "mean_token_accuracy": 0.9374354660511017, + "num_tokens": 16482817.0, + "step": 12510 + }, + { + "entropy": 0.2385638602077961, + "epoch": 2.917356335237207, + "grad_norm": 7.59375, + "learning_rate": 4.9944238331152856e-05, + "loss": 0.2932, + "mean_token_accuracy": 0.9031573534011841, + "num_tokens": 16508927.0, + "step": 12515 + }, + { + "entropy": 0.15317164026200772, + "epoch": 2.918521972257839, + "grad_norm": 1.6796875, + "learning_rate": 4.9944094221381546e-05, + "loss": 0.1075, + "mean_token_accuracy": 0.9532822012901306, + "num_tokens": 16543572.0, + "step": 12520 + }, + { + "entropy": 0.23142515420913695, + "epoch": 2.919687609278471, + "grad_norm": 0.80859375, + "learning_rate": 4.9943949926050085e-05, + "loss": 0.2399, + "mean_token_accuracy": 0.9371400833129883, + "num_tokens": 16576364.0, + "step": 12525 + }, + { + "entropy": 0.2312719516456127, + "epoch": 2.9208532462991026, + "grad_norm": 1.21875, + "learning_rate": 4.99438054451606e-05, + "loss": 0.3978, + "mean_token_accuracy": 0.9079957783222199, + "num_tokens": 16606211.0, + "step": 12530 + }, + { + "entropy": 0.19937541633844375, + "epoch": 2.9220188833197343, + "grad_norm": 2.703125, + "learning_rate": 4.994366077871526e-05, + "loss": 0.4725, + "mean_token_accuracy": 0.9072546362876892, + "num_tokens": 16627977.0, + "step": 12535 + }, + { + "entropy": 0.306121326982975, + "epoch": 2.923184520340366, + "grad_norm": 8.375, + "learning_rate": 4.9943515926716225e-05, + "loss": 0.4601, + "mean_token_accuracy": 0.8869144976139068, + "num_tokens": 16650660.0, + "step": 12540 + }, + { + "entropy": 0.2306956544518471, + "epoch": 2.9243501573609976, + "grad_norm": 4.96875, + "learning_rate": 4.994337088916566e-05, + "loss": 0.3131, + "mean_token_accuracy": 0.9148689210414886, + "num_tokens": 16675489.0, + "step": 12545 + }, + { + "entropy": 0.21608939282596112, + "epoch": 2.9255157943816297, + "grad_norm": 1.0546875, + "learning_rate": 4.994322566606571e-05, + "loss": 0.3405, + "mean_token_accuracy": 0.9272489190101624, + "num_tokens": 16707904.0, + "step": 12550 + }, + { + "entropy": 0.23308916240930558, + "epoch": 2.9266814314022613, + "grad_norm": 1.5078125, + "learning_rate": 4.994308025741855e-05, + "loss": 0.2674, + "mean_token_accuracy": 0.9193484723567963, + "num_tokens": 16742210.0, + "step": 12555 + }, + { + "entropy": 0.37350448220968246, + "epoch": 2.927847068422893, + "grad_norm": 3.046875, + "learning_rate": 4.994293466322635e-05, + "loss": 0.7759, + "mean_token_accuracy": 0.8571257382631302, + "num_tokens": 16761736.0, + "step": 12560 + }, + { + "entropy": 0.2372276846319437, + "epoch": 2.929012705443525, + "grad_norm": 5.40625, + "learning_rate": 4.994278888349128e-05, + "loss": 0.4803, + "mean_token_accuracy": 0.9188945889472961, + "num_tokens": 16776070.0, + "step": 12565 + }, + { + "entropy": 0.26332382038235663, + "epoch": 2.9301783424641568, + "grad_norm": 8.25, + "learning_rate": 4.99426429182155e-05, + "loss": 0.396, + "mean_token_accuracy": 0.908951860666275, + "num_tokens": 16797430.0, + "step": 12570 + }, + { + "entropy": 0.23919696919620037, + "epoch": 2.9313439794847884, + "grad_norm": 1.3515625, + "learning_rate": 4.994249676740121e-05, + "loss": 0.3972, + "mean_token_accuracy": 0.9126874089241028, + "num_tokens": 16818531.0, + "step": 12575 + }, + { + "entropy": 0.16979910265654324, + "epoch": 2.93250961650542, + "grad_norm": 0.65625, + "learning_rate": 4.994235043105058e-05, + "loss": 0.3149, + "mean_token_accuracy": 0.9449944615364074, + "num_tokens": 16843185.0, + "step": 12580 + }, + { + "entropy": 0.26157992482185366, + "epoch": 2.9336752535260517, + "grad_norm": 2.640625, + "learning_rate": 4.994220390916579e-05, + "loss": 0.4379, + "mean_token_accuracy": 0.9097245037555695, + "num_tokens": 16856276.0, + "step": 12585 + }, + { + "entropy": 0.3574902083724737, + "epoch": 2.934840890546684, + "grad_norm": 6.84375, + "learning_rate": 4.9942057201749024e-05, + "loss": 0.8333, + "mean_token_accuracy": 0.8867159873247147, + "num_tokens": 16897168.0, + "step": 12590 + }, + { + "entropy": 0.26900804713368415, + "epoch": 2.9360065275673155, + "grad_norm": 1.46875, + "learning_rate": 4.9941910308802474e-05, + "loss": 0.4726, + "mean_token_accuracy": 0.911369127035141, + "num_tokens": 16913957.0, + "step": 12595 + }, + { + "entropy": 0.20102691166102887, + "epoch": 2.937172164587947, + "grad_norm": 1.1640625, + "learning_rate": 4.9941763230328325e-05, + "loss": 0.4644, + "mean_token_accuracy": 0.9198138236999511, + "num_tokens": 16933970.0, + "step": 12600 + }, + { + "entropy": 0.2640152137726545, + "epoch": 2.9383378016085793, + "grad_norm": 3.71875, + "learning_rate": 4.994161596632877e-05, + "loss": 0.2972, + "mean_token_accuracy": 0.9337838649749756, + "num_tokens": 16955313.0, + "step": 12605 + }, + { + "entropy": 0.22733655273914338, + "epoch": 2.939503438629211, + "grad_norm": 6.53125, + "learning_rate": 4.9941468516806015e-05, + "loss": 0.4618, + "mean_token_accuracy": 0.9171272695064545, + "num_tokens": 16967898.0, + "step": 12610 + }, + { + "entropy": 0.1735091220587492, + "epoch": 2.9406690756498426, + "grad_norm": 3.9375, + "learning_rate": 4.9941320881762244e-05, + "loss": 0.3008, + "mean_token_accuracy": 0.9320429384708404, + "num_tokens": 16993312.0, + "step": 12615 + }, + { + "entropy": 0.23960115239024163, + "epoch": 2.9418347126704743, + "grad_norm": 4.0, + "learning_rate": 4.994117306119967e-05, + "loss": 0.442, + "mean_token_accuracy": 0.9177091896533967, + "num_tokens": 17005693.0, + "step": 12620 + }, + { + "entropy": 0.23045891597867013, + "epoch": 2.9430003496911064, + "grad_norm": 0.91015625, + "learning_rate": 4.9941025055120496e-05, + "loss": 0.3941, + "mean_token_accuracy": 0.9129961967468262, + "num_tokens": 17027734.0, + "step": 12625 + }, + { + "entropy": 0.23398528508841993, + "epoch": 2.944165986711738, + "grad_norm": 1.2109375, + "learning_rate": 4.994087686352692e-05, + "loss": 0.3152, + "mean_token_accuracy": 0.9212701916694641, + "num_tokens": 17056712.0, + "step": 12630 + }, + { + "entropy": 0.2694935627281666, + "epoch": 2.9453316237323697, + "grad_norm": 5.75, + "learning_rate": 4.994072848642116e-05, + "loss": 0.4259, + "mean_token_accuracy": 0.8920988500118255, + "num_tokens": 17070954.0, + "step": 12635 + }, + { + "entropy": 0.20029729194939136, + "epoch": 2.9464972607530013, + "grad_norm": 7.5625, + "learning_rate": 4.994057992380542e-05, + "loss": 0.4291, + "mean_token_accuracy": 0.9236281037330627, + "num_tokens": 17091261.0, + "step": 12640 + }, + { + "entropy": 0.23001744896173476, + "epoch": 2.9476628977736334, + "grad_norm": 0.5546875, + "learning_rate": 4.994043117568194e-05, + "loss": 0.3814, + "mean_token_accuracy": 0.9207939684391022, + "num_tokens": 17112862.0, + "step": 12645 + }, + { + "entropy": 0.22075748890638353, + "epoch": 2.948828534794265, + "grad_norm": 1.5546875, + "learning_rate": 4.99402822420529e-05, + "loss": 0.5102, + "mean_token_accuracy": 0.9173259139060974, + "num_tokens": 17125175.0, + "step": 12650 + }, + { + "entropy": 0.21131634674966335, + "epoch": 2.9499941718148968, + "grad_norm": 7.71875, + "learning_rate": 4.994013312292055e-05, + "loss": 0.4822, + "mean_token_accuracy": 0.9125283896923065, + "num_tokens": 17142485.0, + "step": 12655 + }, + { + "entropy": 0.21084157526493072, + "epoch": 2.951159808835529, + "grad_norm": 0.94140625, + "learning_rate": 4.99399838182871e-05, + "loss": 0.4204, + "mean_token_accuracy": 0.9137578189373017, + "num_tokens": 17170566.0, + "step": 12660 + }, + { + "entropy": 0.34762868024408816, + "epoch": 2.9523254458561605, + "grad_norm": 8.0625, + "learning_rate": 4.9939834328154786e-05, + "loss": 0.5129, + "mean_token_accuracy": 0.8656055808067322, + "num_tokens": 17193691.0, + "step": 12665 + }, + { + "entropy": 0.2598606400191784, + "epoch": 2.953491082876792, + "grad_norm": 2.4375, + "learning_rate": 4.993968465252583e-05, + "loss": 0.3879, + "mean_token_accuracy": 0.9147687435150147, + "num_tokens": 17207788.0, + "step": 12670 + }, + { + "entropy": 0.2572662975639105, + "epoch": 2.954656719897424, + "grad_norm": 4.4375, + "learning_rate": 4.9939534791402464e-05, + "loss": 0.4912, + "mean_token_accuracy": 0.8920653343200684, + "num_tokens": 17234544.0, + "step": 12675 + }, + { + "entropy": 0.2845245450735092, + "epoch": 2.9558223569180555, + "grad_norm": 6.03125, + "learning_rate": 4.9939384744786924e-05, + "loss": 0.4841, + "mean_token_accuracy": 0.8954847276210784, + "num_tokens": 17254682.0, + "step": 12680 + }, + { + "entropy": 0.20575838908553123, + "epoch": 2.9569879939386876, + "grad_norm": 6.9375, + "learning_rate": 4.993923451268144e-05, + "loss": 0.4489, + "mean_token_accuracy": 0.9146290481090545, + "num_tokens": 17270762.0, + "step": 12685 + }, + { + "entropy": 0.23746194317936897, + "epoch": 2.9581536309593193, + "grad_norm": 5.375, + "learning_rate": 4.993908409508827e-05, + "loss": 0.4141, + "mean_token_accuracy": 0.9214969217777252, + "num_tokens": 17282257.0, + "step": 12690 + }, + { + "entropy": 0.25392072908580304, + "epoch": 2.959319267979951, + "grad_norm": 6.28125, + "learning_rate": 4.9938933492009646e-05, + "loss": 0.3244, + "mean_token_accuracy": 0.930941891670227, + "num_tokens": 17296030.0, + "step": 12695 + }, + { + "entropy": 0.30570143423974516, + "epoch": 2.960484905000583, + "grad_norm": 7.9375, + "learning_rate": 4.993878270344781e-05, + "loss": 0.4522, + "mean_token_accuracy": 0.9098774671554566, + "num_tokens": 17317480.0, + "step": 12700 + }, + { + "entropy": 0.4143160991370678, + "epoch": 2.9616505420212147, + "grad_norm": 5.28125, + "learning_rate": 4.993863172940502e-05, + "loss": 0.7403, + "mean_token_accuracy": 0.8491365373134613, + "num_tokens": 17342663.0, + "step": 12705 + }, + { + "entropy": 0.2186424985527992, + "epoch": 2.9628161790418464, + "grad_norm": 2.34375, + "learning_rate": 4.9938480569883525e-05, + "loss": 0.3605, + "mean_token_accuracy": 0.932968944311142, + "num_tokens": 17355353.0, + "step": 12710 + }, + { + "entropy": 0.14407789390534162, + "epoch": 2.963981816062478, + "grad_norm": 0.7578125, + "learning_rate": 4.993832922488557e-05, + "loss": 0.1612, + "mean_token_accuracy": 0.9588046312332154, + "num_tokens": 17394368.0, + "step": 12715 + }, + { + "entropy": 0.20839739665389062, + "epoch": 2.9651474530831097, + "grad_norm": 3.6875, + "learning_rate": 4.993817769441341e-05, + "loss": 0.4005, + "mean_token_accuracy": 0.9120141923427582, + "num_tokens": 17406822.0, + "step": 12720 + }, + { + "entropy": 0.18153918944299222, + "epoch": 2.966313090103742, + "grad_norm": 0.63671875, + "learning_rate": 4.993802597846933e-05, + "loss": 0.2279, + "mean_token_accuracy": 0.9474944174289703, + "num_tokens": 17428079.0, + "step": 12725 + }, + { + "entropy": 0.19154103100299835, + "epoch": 2.9674787271243734, + "grad_norm": 4.09375, + "learning_rate": 4.993787407705556e-05, + "loss": 0.2736, + "mean_token_accuracy": 0.9374960124492645, + "num_tokens": 17447557.0, + "step": 12730 + }, + { + "entropy": 0.1938954271376133, + "epoch": 2.968644364145005, + "grad_norm": 1.4609375, + "learning_rate": 4.993772199017439e-05, + "loss": 0.2667, + "mean_token_accuracy": 0.9410579323768615, + "num_tokens": 17463243.0, + "step": 12735 + }, + { + "entropy": 0.261495478451252, + "epoch": 2.969810001165637, + "grad_norm": 5.53125, + "learning_rate": 4.993756971782807e-05, + "loss": 0.5759, + "mean_token_accuracy": 0.896853107213974, + "num_tokens": 17472768.0, + "step": 12740 + }, + { + "entropy": 0.187485178001225, + "epoch": 2.970975638186269, + "grad_norm": 4.5625, + "learning_rate": 4.993741726001888e-05, + "loss": 0.4988, + "mean_token_accuracy": 0.9167073547840119, + "num_tokens": 17496526.0, + "step": 12745 + }, + { + "entropy": 0.18627914749085903, + "epoch": 2.9721412752069005, + "grad_norm": 0.7734375, + "learning_rate": 4.99372646167491e-05, + "loss": 0.2441, + "mean_token_accuracy": 0.9456724762916565, + "num_tokens": 17534475.0, + "step": 12750 + }, + { + "entropy": 0.1453916199505329, + "epoch": 2.973306912227532, + "grad_norm": 4.46875, + "learning_rate": 4.993711178802099e-05, + "loss": 0.253, + "mean_token_accuracy": 0.9423149466514588, + "num_tokens": 17559219.0, + "step": 12755 + }, + { + "entropy": 0.19363697096705437, + "epoch": 2.9744725492481643, + "grad_norm": 1.4296875, + "learning_rate": 4.993695877383684e-05, + "loss": 0.3023, + "mean_token_accuracy": 0.9373092234134675, + "num_tokens": 17576680.0, + "step": 12760 + }, + { + "entropy": 0.2151080034673214, + "epoch": 2.975638186268796, + "grad_norm": 0.451171875, + "learning_rate": 4.9936805574198925e-05, + "loss": 0.2392, + "mean_token_accuracy": 0.9359769761562348, + "num_tokens": 17606807.0, + "step": 12765 + }, + { + "entropy": 0.2870198667049408, + "epoch": 2.9768038232894276, + "grad_norm": 4.28125, + "learning_rate": 4.993665218910953e-05, + "loss": 0.5378, + "mean_token_accuracy": 0.8911948084831238, + "num_tokens": 17624236.0, + "step": 12770 + }, + { + "entropy": 0.24478442054241895, + "epoch": 2.9779694603100593, + "grad_norm": 0.54296875, + "learning_rate": 4.9936498618570956e-05, + "loss": 0.4086, + "mean_token_accuracy": 0.9145197212696076, + "num_tokens": 17661090.0, + "step": 12775 + }, + { + "entropy": 0.22513844072818756, + "epoch": 2.9791350973306914, + "grad_norm": 8.875, + "learning_rate": 4.9936344862585474e-05, + "loss": 0.4294, + "mean_token_accuracy": 0.9144389748573303, + "num_tokens": 17673989.0, + "step": 12780 + }, + { + "entropy": 0.2467608630657196, + "epoch": 2.980300734351323, + "grad_norm": 7.8125, + "learning_rate": 4.993619092115539e-05, + "loss": 0.4108, + "mean_token_accuracy": 0.9199643731117249, + "num_tokens": 17694785.0, + "step": 12785 + }, + { + "entropy": 0.19479428604245186, + "epoch": 2.9814663713719547, + "grad_norm": 2.890625, + "learning_rate": 4.9936036794282984e-05, + "loss": 0.3532, + "mean_token_accuracy": 0.9233082294464111, + "num_tokens": 17708483.0, + "step": 12790 + }, + { + "entropy": 0.2902163352817297, + "epoch": 2.982632008392587, + "grad_norm": 14.625, + "learning_rate": 4.993588248197058e-05, + "loss": 0.4875, + "mean_token_accuracy": 0.8970877230167389, + "num_tokens": 17722973.0, + "step": 12795 + }, + { + "entropy": 0.2668305665254593, + "epoch": 2.9837976454132185, + "grad_norm": 9.0625, + "learning_rate": 4.993572798422045e-05, + "loss": 0.6425, + "mean_token_accuracy": 0.8881903946399688, + "num_tokens": 17732394.0, + "step": 12800 + }, + { + "entropy": 0.2405627638101578, + "epoch": 2.98496328243385, + "grad_norm": 6.9375, + "learning_rate": 4.9935573301034916e-05, + "loss": 0.4664, + "mean_token_accuracy": 0.9050011456012725, + "num_tokens": 17745099.0, + "step": 12805 + }, + { + "entropy": 0.3110461011528969, + "epoch": 2.986128919454482, + "grad_norm": 7.4375, + "learning_rate": 4.993541843241628e-05, + "loss": 0.5163, + "mean_token_accuracy": 0.9056003808975219, + "num_tokens": 17757821.0, + "step": 12810 + }, + { + "entropy": 0.19741073474287987, + "epoch": 2.9872945564751134, + "grad_norm": 2.6875, + "learning_rate": 4.9935263378366854e-05, + "loss": 0.1996, + "mean_token_accuracy": 0.9366816639900207, + "num_tokens": 17783464.0, + "step": 12815 + }, + { + "entropy": 0.2888368025422096, + "epoch": 2.9884601934957455, + "grad_norm": 5.28125, + "learning_rate": 4.993510813888894e-05, + "loss": 0.5255, + "mean_token_accuracy": 0.9123888671398163, + "num_tokens": 17791795.0, + "step": 12820 + }, + { + "entropy": 0.20527897477149964, + "epoch": 2.989625830516377, + "grad_norm": 1.59375, + "learning_rate": 4.9934952713984865e-05, + "loss": 0.2343, + "mean_token_accuracy": 0.9191898763179779, + "num_tokens": 17815861.0, + "step": 12825 + }, + { + "entropy": 0.23771159574389458, + "epoch": 2.990791467537009, + "grad_norm": 8.125, + "learning_rate": 4.993479710365695e-05, + "loss": 0.4917, + "mean_token_accuracy": 0.9103189170360565, + "num_tokens": 17834933.0, + "step": 12830 + }, + { + "entropy": 0.2657750204205513, + "epoch": 2.991957104557641, + "grad_norm": 10.5, + "learning_rate": 4.9934641307907495e-05, + "loss": 0.5174, + "mean_token_accuracy": 0.8834215998649597, + "num_tokens": 17845860.0, + "step": 12835 + }, + { + "entropy": 0.16792013086378574, + "epoch": 2.9931227415782726, + "grad_norm": 1.3984375, + "learning_rate": 4.993448532673884e-05, + "loss": 0.1993, + "mean_token_accuracy": 0.9521631598472595, + "num_tokens": 17866162.0, + "step": 12840 + }, + { + "entropy": 0.22297072932124137, + "epoch": 2.9942883785989043, + "grad_norm": 4.75, + "learning_rate": 4.99343291601533e-05, + "loss": 0.3531, + "mean_token_accuracy": 0.9260120809078216, + "num_tokens": 17879511.0, + "step": 12845 + }, + { + "entropy": 0.19128753133118154, + "epoch": 2.995454015619536, + "grad_norm": 3.328125, + "learning_rate": 4.9934172808153224e-05, + "loss": 0.2676, + "mean_token_accuracy": 0.9386341869831085, + "num_tokens": 17899042.0, + "step": 12850 + }, + { + "entropy": 0.20055814646184444, + "epoch": 2.9966196526401676, + "grad_norm": 3.5625, + "learning_rate": 4.993401627074092e-05, + "loss": 0.3542, + "mean_token_accuracy": 0.9243052423000335, + "num_tokens": 17919406.0, + "step": 12855 + }, + { + "entropy": 0.25509237721562383, + "epoch": 2.9977852896607997, + "grad_norm": 6.375, + "learning_rate": 4.993385954791873e-05, + "loss": 0.5408, + "mean_token_accuracy": 0.8878538310527802, + "num_tokens": 17934902.0, + "step": 12860 + }, + { + "entropy": 0.2672757588326931, + "epoch": 2.9989509266814314, + "grad_norm": 6.0, + "learning_rate": 4.9933702639689e-05, + "loss": 0.487, + "mean_token_accuracy": 0.9051804602146148, + "num_tokens": 17950700.0, + "step": 12865 + }, + { + "entropy": 0.1652403457297219, + "epoch": 3.0, + "grad_norm": 0.609375, + "learning_rate": 4.993354554605405e-05, + "loss": 0.2207, + "mean_token_accuracy": 0.9246720737881131, + "num_tokens": 17980750.0, + "step": 12870 + }, + { + "entropy": 0.1433550551533699, + "epoch": 3.0011656370206317, + "grad_norm": 4.75, + "learning_rate": 4.9933388267016246e-05, + "loss": 0.1786, + "mean_token_accuracy": 0.9587549924850464, + "num_tokens": 17995622.0, + "step": 12875 + }, + { + "entropy": 0.21168565675616263, + "epoch": 3.0023312740412638, + "grad_norm": 4.46875, + "learning_rate": 4.993323080257792e-05, + "loss": 0.3623, + "mean_token_accuracy": 0.9260616600513458, + "num_tokens": 18015010.0, + "step": 12880 + }, + { + "entropy": 0.212300780788064, + "epoch": 3.0034969110618954, + "grad_norm": 9.5, + "learning_rate": 4.993307315274142e-05, + "loss": 0.3162, + "mean_token_accuracy": 0.9363721132278442, + "num_tokens": 18029427.0, + "step": 12885 + }, + { + "entropy": 0.1385662153363228, + "epoch": 3.004662548082527, + "grad_norm": 8.1875, + "learning_rate": 4.99329153175091e-05, + "loss": 0.2069, + "mean_token_accuracy": 0.942263925075531, + "num_tokens": 18051017.0, + "step": 12890 + }, + { + "entropy": 0.12007400654256344, + "epoch": 3.0058281851031587, + "grad_norm": 2.296875, + "learning_rate": 4.9932757296883306e-05, + "loss": 0.2103, + "mean_token_accuracy": 0.9540059566497803, + "num_tokens": 18065526.0, + "step": 12895 + }, + { + "entropy": 0.13780878372490407, + "epoch": 3.006993822123791, + "grad_norm": 2.40625, + "learning_rate": 4.993259909086641e-05, + "loss": 0.1755, + "mean_token_accuracy": 0.9513382017612457, + "num_tokens": 18082091.0, + "step": 12900 + }, + { + "entropy": 0.09654690567404031, + "epoch": 3.0081594591444225, + "grad_norm": 0.71875, + "learning_rate": 4.993244069946076e-05, + "loss": 0.1812, + "mean_token_accuracy": 0.9618777215480805, + "num_tokens": 18117232.0, + "step": 12905 + }, + { + "entropy": 0.0864265939220786, + "epoch": 3.009325096165054, + "grad_norm": 0.8046875, + "learning_rate": 4.9932282122668715e-05, + "loss": 0.0861, + "mean_token_accuracy": 0.9753880679607392, + "num_tokens": 18151282.0, + "step": 12910 + }, + { + "entropy": 0.1271039350889623, + "epoch": 3.010490733185686, + "grad_norm": 0.326171875, + "learning_rate": 4.993212336049265e-05, + "loss": 0.2512, + "mean_token_accuracy": 0.9457149386405945, + "num_tokens": 18170371.0, + "step": 12915 + }, + { + "entropy": 0.1285300085321069, + "epoch": 3.011656370206318, + "grad_norm": 3.109375, + "learning_rate": 4.993196441293492e-05, + "loss": 0.1651, + "mean_token_accuracy": 0.9519018471240998, + "num_tokens": 18191983.0, + "step": 12920 + }, + { + "entropy": 0.11733712311834096, + "epoch": 3.0128220072269496, + "grad_norm": 1.03125, + "learning_rate": 4.993180527999791e-05, + "loss": 0.064, + "mean_token_accuracy": 0.958944684267044, + "num_tokens": 18223670.0, + "step": 12925 + }, + { + "entropy": 0.11386382738128305, + "epoch": 3.0139876442475813, + "grad_norm": 0.8515625, + "learning_rate": 4.9931645961683984e-05, + "loss": 0.1375, + "mean_token_accuracy": 0.956270831823349, + "num_tokens": 18251671.0, + "step": 12930 + }, + { + "entropy": 0.10516616962850094, + "epoch": 3.015153281268213, + "grad_norm": 1.0078125, + "learning_rate": 4.9931486457995515e-05, + "loss": 0.1278, + "mean_token_accuracy": 0.9688599169254303, + "num_tokens": 18271125.0, + "step": 12935 + }, + { + "entropy": 0.08430785313248634, + "epoch": 3.016318918288845, + "grad_norm": 5.5, + "learning_rate": 4.993132676893488e-05, + "loss": 0.1188, + "mean_token_accuracy": 0.9666883111000061, + "num_tokens": 18312079.0, + "step": 12940 + }, + { + "entropy": 0.16214878372848035, + "epoch": 3.0174845553094767, + "grad_norm": 5.625, + "learning_rate": 4.993116689450447e-05, + "loss": 0.1848, + "mean_token_accuracy": 0.9511231660842896, + "num_tokens": 18323144.0, + "step": 12945 + }, + { + "entropy": 0.12150358017534017, + "epoch": 3.0186501923301083, + "grad_norm": 4.1875, + "learning_rate": 4.993100683470667e-05, + "loss": 0.1386, + "mean_token_accuracy": 0.9623299181461334, + "num_tokens": 18343224.0, + "step": 12950 + }, + { + "entropy": 0.17119587287306787, + "epoch": 3.01981582935074, + "grad_norm": 7.5, + "learning_rate": 4.9930846589543855e-05, + "loss": 0.2687, + "mean_token_accuracy": 0.9386284410953522, + "num_tokens": 18383414.0, + "step": 12955 + }, + { + "entropy": 0.10301913348957896, + "epoch": 3.020981466371372, + "grad_norm": 4.25, + "learning_rate": 4.993068615901842e-05, + "loss": 0.1244, + "mean_token_accuracy": 0.9650148451328278, + "num_tokens": 18406781.0, + "step": 12960 + }, + { + "entropy": 0.11102181086316705, + "epoch": 3.0221471033920038, + "grad_norm": 3.78125, + "learning_rate": 4.993052554313275e-05, + "loss": 0.1959, + "mean_token_accuracy": 0.9532433927059174, + "num_tokens": 18426553.0, + "step": 12965 + }, + { + "entropy": 0.1458127958700061, + "epoch": 3.0233127404126354, + "grad_norm": 13.9375, + "learning_rate": 4.9930364741889264e-05, + "loss": 0.2835, + "mean_token_accuracy": 0.9430015027523041, + "num_tokens": 18446721.0, + "step": 12970 + }, + { + "entropy": 0.10947179980576038, + "epoch": 3.024478377433267, + "grad_norm": 7.34375, + "learning_rate": 4.9930203755290334e-05, + "loss": 0.1352, + "mean_token_accuracy": 0.9588992714881897, + "num_tokens": 18467326.0, + "step": 12975 + }, + { + "entropy": 0.18623995631933213, + "epoch": 3.025644014453899, + "grad_norm": 4.9375, + "learning_rate": 4.993004258333837e-05, + "loss": 0.2801, + "mean_token_accuracy": 0.9345187544822693, + "num_tokens": 18485722.0, + "step": 12980 + }, + { + "entropy": 0.10628864876925945, + "epoch": 3.026809651474531, + "grad_norm": 5.25, + "learning_rate": 4.992988122603578e-05, + "loss": 0.1406, + "mean_token_accuracy": 0.963499253988266, + "num_tokens": 18509525.0, + "step": 12985 + }, + { + "entropy": 0.1489834614098072, + "epoch": 3.0279752884951625, + "grad_norm": 1.828125, + "learning_rate": 4.992971968338496e-05, + "loss": 0.1771, + "mean_token_accuracy": 0.9506646573543549, + "num_tokens": 18541551.0, + "step": 12990 + }, + { + "entropy": 0.11751662027090788, + "epoch": 3.029140925515794, + "grad_norm": 0.609375, + "learning_rate": 4.992955795538832e-05, + "loss": 0.1912, + "mean_token_accuracy": 0.9536070585250854, + "num_tokens": 18559322.0, + "step": 12995 + }, + { + "entropy": 0.12452268935739993, + "epoch": 3.0303065625364263, + "grad_norm": 5.59375, + "learning_rate": 4.992939604204828e-05, + "loss": 0.1767, + "mean_token_accuracy": 0.9622524440288543, + "num_tokens": 18582602.0, + "step": 13000 + }, + { + "entropy": 0.10873323529958726, + "epoch": 3.031472199557058, + "grad_norm": 1.890625, + "learning_rate": 4.992923394336726e-05, + "loss": 0.1736, + "mean_token_accuracy": 0.9589035868644714, + "num_tokens": 18601698.0, + "step": 13005 + }, + { + "entropy": 0.09877428058534861, + "epoch": 3.0326378365776896, + "grad_norm": 1.53125, + "learning_rate": 4.992907165934766e-05, + "loss": 0.1921, + "mean_token_accuracy": 0.9654108107089996, + "num_tokens": 18629199.0, + "step": 13010 + }, + { + "entropy": 0.16017772555351256, + "epoch": 3.0338034735983217, + "grad_norm": 7.53125, + "learning_rate": 4.99289091899919e-05, + "loss": 0.2393, + "mean_token_accuracy": 0.9472411513328552, + "num_tokens": 18638162.0, + "step": 13015 + }, + { + "entropy": 0.11799342557787895, + "epoch": 3.0349691106189534, + "grad_norm": 6.15625, + "learning_rate": 4.9928746535302415e-05, + "loss": 0.1876, + "mean_token_accuracy": 0.9611060082912445, + "num_tokens": 18652672.0, + "step": 13020 + }, + { + "entropy": 0.12905988562852144, + "epoch": 3.036134747639585, + "grad_norm": 12.6875, + "learning_rate": 4.992858369528163e-05, + "loss": 0.2115, + "mean_token_accuracy": 0.9487624883651733, + "num_tokens": 18668323.0, + "step": 13025 + }, + { + "entropy": 0.11400379352271557, + "epoch": 3.0373003846602167, + "grad_norm": 8.625, + "learning_rate": 4.992842066993196e-05, + "loss": 0.2537, + "mean_token_accuracy": 0.954912292957306, + "num_tokens": 18685966.0, + "step": 13030 + }, + { + "entropy": 0.140830048173666, + "epoch": 3.038466021680849, + "grad_norm": 5.03125, + "learning_rate": 4.992825745925585e-05, + "loss": 0.2546, + "mean_token_accuracy": 0.9469396531581878, + "num_tokens": 18698156.0, + "step": 13035 + }, + { + "entropy": 0.19331585317850114, + "epoch": 3.0396316587014804, + "grad_norm": 5.625, + "learning_rate": 4.9928094063255734e-05, + "loss": 0.3055, + "mean_token_accuracy": 0.9267224192619323, + "num_tokens": 18705380.0, + "step": 13040 + }, + { + "entropy": 0.16906597539782525, + "epoch": 3.040797295722112, + "grad_norm": 8.75, + "learning_rate": 4.9927930481934034e-05, + "loss": 0.2464, + "mean_token_accuracy": 0.9498988151550293, + "num_tokens": 18716545.0, + "step": 13045 + }, + { + "entropy": 0.1345493733882904, + "epoch": 3.0419629327427438, + "grad_norm": 0.75, + "learning_rate": 4.99277667152932e-05, + "loss": 0.2407, + "mean_token_accuracy": 0.9507641017436981, + "num_tokens": 18735728.0, + "step": 13050 + }, + { + "entropy": 0.20635399986058472, + "epoch": 3.043128569763376, + "grad_norm": 12.0625, + "learning_rate": 4.992760276333567e-05, + "loss": 0.2186, + "mean_token_accuracy": 0.9326454043388367, + "num_tokens": 18767891.0, + "step": 13055 + }, + { + "entropy": 0.09413407100364565, + "epoch": 3.0442942067840075, + "grad_norm": 7.1875, + "learning_rate": 4.9927438626063894e-05, + "loss": 0.1287, + "mean_token_accuracy": 0.9676704227924346, + "num_tokens": 18790347.0, + "step": 13060 + }, + { + "entropy": 0.12085700780153275, + "epoch": 3.045459843804639, + "grad_norm": 0.37890625, + "learning_rate": 4.992727430348031e-05, + "loss": 0.1564, + "mean_token_accuracy": 0.9590084195137024, + "num_tokens": 18841046.0, + "step": 13065 + }, + { + "entropy": 0.15510401986539363, + "epoch": 3.046625480825271, + "grad_norm": 4.78125, + "learning_rate": 4.992710979558738e-05, + "loss": 0.2184, + "mean_token_accuracy": 0.9496059834957122, + "num_tokens": 18872274.0, + "step": 13070 + }, + { + "entropy": 0.150761703774333, + "epoch": 3.047791117845903, + "grad_norm": 6.96875, + "learning_rate": 4.992694510238755e-05, + "loss": 0.26, + "mean_token_accuracy": 0.9450468838214874, + "num_tokens": 18892617.0, + "step": 13075 + }, + { + "entropy": 0.14352366626262664, + "epoch": 3.0489567548665346, + "grad_norm": 1.9140625, + "learning_rate": 4.992678022388328e-05, + "loss": 0.21, + "mean_token_accuracy": 0.9523520827293396, + "num_tokens": 18903601.0, + "step": 13080 + }, + { + "entropy": 0.1325456030666828, + "epoch": 3.0501223918871663, + "grad_norm": 8.75, + "learning_rate": 4.992661516007702e-05, + "loss": 0.2799, + "mean_token_accuracy": 0.9441640317440033, + "num_tokens": 18921826.0, + "step": 13085 + }, + { + "entropy": 0.16167889218777418, + "epoch": 3.051288028907798, + "grad_norm": 4.5, + "learning_rate": 4.992644991097125e-05, + "loss": 0.297, + "mean_token_accuracy": 0.9417232036590576, + "num_tokens": 18967125.0, + "step": 13090 + }, + { + "entropy": 0.13939319923520088, + "epoch": 3.05245366592843, + "grad_norm": 1.3125, + "learning_rate": 4.992628447656841e-05, + "loss": 0.1361, + "mean_token_accuracy": 0.9541107892990113, + "num_tokens": 18993343.0, + "step": 13095 + }, + { + "entropy": 0.1326286420226097, + "epoch": 3.0536193029490617, + "grad_norm": 0.99609375, + "learning_rate": 4.9926118856870976e-05, + "loss": 0.1413, + "mean_token_accuracy": 0.9580071270465851, + "num_tokens": 19023224.0, + "step": 13100 + }, + { + "entropy": 0.14778729975223542, + "epoch": 3.0547849399696934, + "grad_norm": 3.84375, + "learning_rate": 4.9925953051881426e-05, + "loss": 0.162, + "mean_token_accuracy": 0.9471778392791748, + "num_tokens": 19037845.0, + "step": 13105 + }, + { + "entropy": 0.16834700480103493, + "epoch": 3.055950576990325, + "grad_norm": 0.8515625, + "learning_rate": 4.9925787061602226e-05, + "loss": 0.2342, + "mean_token_accuracy": 0.9260630905628204, + "num_tokens": 19062642.0, + "step": 13110 + }, + { + "entropy": 0.11567596178501845, + "epoch": 3.057116214010957, + "grad_norm": 5.1875, + "learning_rate": 4.992562088603585e-05, + "loss": 0.2426, + "mean_token_accuracy": 0.9500459372997284, + "num_tokens": 19079460.0, + "step": 13115 + }, + { + "entropy": 0.14092737063765526, + "epoch": 3.058281851031589, + "grad_norm": 10.875, + "learning_rate": 4.992545452518478e-05, + "loss": 0.3468, + "mean_token_accuracy": 0.9354611694812774, + "num_tokens": 19089866.0, + "step": 13120 + }, + { + "entropy": 0.14295823201537133, + "epoch": 3.0594474880522204, + "grad_norm": 6.9375, + "learning_rate": 4.992528797905149e-05, + "loss": 0.2638, + "mean_token_accuracy": 0.9444294333457947, + "num_tokens": 19101734.0, + "step": 13125 + }, + { + "entropy": 0.11939622908830642, + "epoch": 3.060613125072852, + "grad_norm": 7.1875, + "learning_rate": 4.992512124763847e-05, + "loss": 0.176, + "mean_token_accuracy": 0.9504463911056519, + "num_tokens": 19116952.0, + "step": 13130 + }, + { + "entropy": 0.14332516640424728, + "epoch": 3.061778762093484, + "grad_norm": 8.6875, + "learning_rate": 4.99249543309482e-05, + "loss": 0.2215, + "mean_token_accuracy": 0.9468707919120789, + "num_tokens": 19132506.0, + "step": 13135 + }, + { + "entropy": 0.1258048728108406, + "epoch": 3.062944399114116, + "grad_norm": 11.375, + "learning_rate": 4.992478722898318e-05, + "loss": 0.2369, + "mean_token_accuracy": 0.9458638608455658, + "num_tokens": 19144849.0, + "step": 13140 + }, + { + "entropy": 0.10387789681553841, + "epoch": 3.0641100361347475, + "grad_norm": 6.71875, + "learning_rate": 4.9924619941745886e-05, + "loss": 0.133, + "mean_token_accuracy": 0.9705339252948761, + "num_tokens": 19182754.0, + "step": 13145 + }, + { + "entropy": 0.09643221404403449, + "epoch": 3.065275673155379, + "grad_norm": 8.1875, + "learning_rate": 4.9924452469238826e-05, + "loss": 0.1519, + "mean_token_accuracy": 0.964071798324585, + "num_tokens": 19203256.0, + "step": 13150 + }, + { + "entropy": 0.19322482496500015, + "epoch": 3.0664413101760113, + "grad_norm": 2.015625, + "learning_rate": 4.99242848114645e-05, + "loss": 0.3931, + "mean_token_accuracy": 0.9145723462104798, + "num_tokens": 19212959.0, + "step": 13155 + }, + { + "entropy": 0.25037995502352717, + "epoch": 3.067606947196643, + "grad_norm": 8.4375, + "learning_rate": 4.992411696842538e-05, + "loss": 0.4867, + "mean_token_accuracy": 0.9011429846286774, + "num_tokens": 19231303.0, + "step": 13160 + }, + { + "entropy": 0.35061059817671775, + "epoch": 3.0687725842172746, + "grad_norm": 5.34375, + "learning_rate": 4.9923948940124006e-05, + "loss": 0.6799, + "mean_token_accuracy": 0.9077353715896607, + "num_tokens": 19254614.0, + "step": 13165 + }, + { + "entropy": 0.1632023498415947, + "epoch": 3.0699382212379067, + "grad_norm": 1.859375, + "learning_rate": 4.9923780726562856e-05, + "loss": 0.2547, + "mean_token_accuracy": 0.9486920773983002, + "num_tokens": 19274238.0, + "step": 13170 + }, + { + "entropy": 0.15164818074554204, + "epoch": 3.0711038582585384, + "grad_norm": 6.15625, + "learning_rate": 4.9923612327744454e-05, + "loss": 0.238, + "mean_token_accuracy": 0.938932454586029, + "num_tokens": 19296587.0, + "step": 13175 + }, + { + "entropy": 0.13162093460559846, + "epoch": 3.07226949527917, + "grad_norm": 5.0625, + "learning_rate": 4.9923443743671314e-05, + "loss": 0.1807, + "mean_token_accuracy": 0.9539216995239258, + "num_tokens": 19318280.0, + "step": 13180 + }, + { + "entropy": 0.16326101645827293, + "epoch": 3.0734351322998017, + "grad_norm": 0.69921875, + "learning_rate": 4.992327497434593e-05, + "loss": 0.275, + "mean_token_accuracy": 0.9378827571868896, + "num_tokens": 19335662.0, + "step": 13185 + }, + { + "entropy": 0.14299081321805715, + "epoch": 3.074600769320434, + "grad_norm": 0.61328125, + "learning_rate": 4.992310601977084e-05, + "loss": 0.2718, + "mean_token_accuracy": 0.9428001344203949, + "num_tokens": 19364430.0, + "step": 13190 + }, + { + "entropy": 0.11728681959211826, + "epoch": 3.0757664063410655, + "grad_norm": 3.90625, + "learning_rate": 4.992293687994855e-05, + "loss": 0.2007, + "mean_token_accuracy": 0.9604819655418396, + "num_tokens": 19377704.0, + "step": 13195 + }, + { + "entropy": 0.13060683608055115, + "epoch": 3.076932043361697, + "grad_norm": 4.71875, + "learning_rate": 4.992276755488158e-05, + "loss": 0.2601, + "mean_token_accuracy": 0.9461053490638733, + "num_tokens": 19401149.0, + "step": 13200 + }, + { + "entropy": 0.16250937394797801, + "epoch": 3.078097680382329, + "grad_norm": 0.96875, + "learning_rate": 4.992259804457247e-05, + "loss": 0.2137, + "mean_token_accuracy": 0.9518900394439698, + "num_tokens": 19435512.0, + "step": 13205 + }, + { + "entropy": 0.12088603749871255, + "epoch": 3.079263317402961, + "grad_norm": 4.3125, + "learning_rate": 4.992242834902373e-05, + "loss": 0.1753, + "mean_token_accuracy": 0.9508533656597138, + "num_tokens": 19453057.0, + "step": 13210 + }, + { + "entropy": 0.12925427481532098, + "epoch": 3.0804289544235925, + "grad_norm": 2.53125, + "learning_rate": 4.99222584682379e-05, + "loss": 0.165, + "mean_token_accuracy": 0.958453357219696, + "num_tokens": 19473192.0, + "step": 13215 + }, + { + "entropy": 0.13786442931741477, + "epoch": 3.081594591444224, + "grad_norm": 1.71875, + "learning_rate": 4.992208840221751e-05, + "loss": 0.1691, + "mean_token_accuracy": 0.9522160708904266, + "num_tokens": 19495497.0, + "step": 13220 + }, + { + "entropy": 0.11975713893771171, + "epoch": 3.082760228464856, + "grad_norm": 1.4296875, + "learning_rate": 4.9921918150965106e-05, + "loss": 0.1748, + "mean_token_accuracy": 0.9600012302398682, + "num_tokens": 19524516.0, + "step": 13225 + }, + { + "entropy": 0.20472723469138146, + "epoch": 3.083925865485488, + "grad_norm": 10.0, + "learning_rate": 4.9921747714483216e-05, + "loss": 0.3746, + "mean_token_accuracy": 0.9212909162044525, + "num_tokens": 19535087.0, + "step": 13230 + }, + { + "entropy": 0.12324888203293086, + "epoch": 3.0850915025061196, + "grad_norm": 3.671875, + "learning_rate": 4.9921577092774384e-05, + "loss": 0.2137, + "mean_token_accuracy": 0.9506260097026825, + "num_tokens": 19556253.0, + "step": 13235 + }, + { + "entropy": 0.16078415960073472, + "epoch": 3.0862571395267513, + "grad_norm": 0.7578125, + "learning_rate": 4.992140628584116e-05, + "loss": 0.2537, + "mean_token_accuracy": 0.9428843200206757, + "num_tokens": 19578151.0, + "step": 13240 + }, + { + "entropy": 0.1569695755839348, + "epoch": 3.087422776547383, + "grad_norm": 6.3125, + "learning_rate": 4.992123529368608e-05, + "loss": 0.2207, + "mean_token_accuracy": 0.9451090037822724, + "num_tokens": 19587923.0, + "step": 13245 + }, + { + "entropy": 0.12610364258289336, + "epoch": 3.088588413568015, + "grad_norm": 7.1875, + "learning_rate": 4.99210641163117e-05, + "loss": 0.1984, + "mean_token_accuracy": 0.9510172843933106, + "num_tokens": 19615303.0, + "step": 13250 + }, + { + "entropy": 0.12419973127543926, + "epoch": 3.0897540505886467, + "grad_norm": 0.81640625, + "learning_rate": 4.992089275372057e-05, + "loss": 0.1556, + "mean_token_accuracy": 0.9587218821048736, + "num_tokens": 19639739.0, + "step": 13255 + }, + { + "entropy": 0.12563247513026, + "epoch": 3.0909196876092784, + "grad_norm": 2.3125, + "learning_rate": 4.992072120591524e-05, + "loss": 0.1285, + "mean_token_accuracy": 0.9584830939769745, + "num_tokens": 19669972.0, + "step": 13260 + }, + { + "entropy": 0.11786816529929638, + "epoch": 3.09208532462991, + "grad_norm": 5.0, + "learning_rate": 4.992054947289829e-05, + "loss": 0.2424, + "mean_token_accuracy": 0.9408693611621857, + "num_tokens": 19694543.0, + "step": 13265 + }, + { + "entropy": 0.10978089049458503, + "epoch": 3.093250961650542, + "grad_norm": 2.15625, + "learning_rate": 4.992037755467226e-05, + "loss": 0.1406, + "mean_token_accuracy": 0.9633432149887085, + "num_tokens": 19715806.0, + "step": 13270 + }, + { + "entropy": 0.09481975287199021, + "epoch": 3.094416598671174, + "grad_norm": 3.6875, + "learning_rate": 4.992020545123972e-05, + "loss": 0.1058, + "mean_token_accuracy": 0.9747069358825684, + "num_tokens": 19749863.0, + "step": 13275 + }, + { + "entropy": 0.12356273904442787, + "epoch": 3.0955822356918055, + "grad_norm": 1.40625, + "learning_rate": 4.992003316260323e-05, + "loss": 0.1161, + "mean_token_accuracy": 0.9546688258647918, + "num_tokens": 19782518.0, + "step": 13280 + }, + { + "entropy": 0.11005722619593143, + "epoch": 3.096747872712437, + "grad_norm": 0.79296875, + "learning_rate": 4.991986068876537e-05, + "loss": 0.23, + "mean_token_accuracy": 0.9487053215503692, + "num_tokens": 19804363.0, + "step": 13285 + }, + { + "entropy": 0.14497708119452, + "epoch": 3.0979135097330692, + "grad_norm": 1.65625, + "learning_rate": 4.99196880297287e-05, + "loss": 0.1645, + "mean_token_accuracy": 0.9520582973957061, + "num_tokens": 19824122.0, + "step": 13290 + }, + { + "entropy": 0.0952205179259181, + "epoch": 3.099079146753701, + "grad_norm": 4.625, + "learning_rate": 4.991951518549581e-05, + "loss": 0.1178, + "mean_token_accuracy": 0.9668820440769196, + "num_tokens": 19860008.0, + "step": 13295 + }, + { + "entropy": 0.11507775709033012, + "epoch": 3.1002447837743325, + "grad_norm": 0.87109375, + "learning_rate": 4.9919342156069266e-05, + "loss": 0.211, + "mean_token_accuracy": 0.9555065870285034, + "num_tokens": 19877177.0, + "step": 13300 + }, + { + "entropy": 0.13037493210285903, + "epoch": 3.1014104207949647, + "grad_norm": 5.1875, + "learning_rate": 4.991916894145164e-05, + "loss": 0.1271, + "mean_token_accuracy": 0.9565082132816315, + "num_tokens": 19906565.0, + "step": 13305 + }, + { + "entropy": 0.11838086917996407, + "epoch": 3.1025760578155963, + "grad_norm": 5.4375, + "learning_rate": 4.991899554164554e-05, + "loss": 0.2005, + "mean_token_accuracy": 0.960281252861023, + "num_tokens": 19922383.0, + "step": 13310 + }, + { + "entropy": 0.14991481192409992, + "epoch": 3.103741694836228, + "grad_norm": 6.09375, + "learning_rate": 4.991882195665353e-05, + "loss": 0.2267, + "mean_token_accuracy": 0.9503510117530822, + "num_tokens": 19933976.0, + "step": 13315 + }, + { + "entropy": 0.11397014874964953, + "epoch": 3.1049073318568596, + "grad_norm": 3.203125, + "learning_rate": 4.991864818647821e-05, + "loss": 0.1354, + "mean_token_accuracy": 0.9622531473636627, + "num_tokens": 19956871.0, + "step": 13320 + }, + { + "entropy": 0.14367529824376107, + "epoch": 3.1060729688774917, + "grad_norm": 7.6875, + "learning_rate": 4.991847423112217e-05, + "loss": 0.2265, + "mean_token_accuracy": 0.9521259009838104, + "num_tokens": 19968259.0, + "step": 13325 + }, + { + "entropy": 0.11058022249490022, + "epoch": 3.1072386058981234, + "grad_norm": 2.390625, + "learning_rate": 4.991830009058799e-05, + "loss": 0.1461, + "mean_token_accuracy": 0.9625348687171936, + "num_tokens": 19995564.0, + "step": 13330 + }, + { + "entropy": 0.1090804586187005, + "epoch": 3.108404242918755, + "grad_norm": 3.078125, + "learning_rate": 4.991812576487829e-05, + "loss": 0.2279, + "mean_token_accuracy": 0.9549605369567871, + "num_tokens": 20013769.0, + "step": 13335 + }, + { + "entropy": 0.13848309740424156, + "epoch": 3.1095698799393867, + "grad_norm": 10.0, + "learning_rate": 4.991795125399564e-05, + "loss": 0.2078, + "mean_token_accuracy": 0.9510918617248535, + "num_tokens": 20035255.0, + "step": 13340 + }, + { + "entropy": 0.0821663798764348, + "epoch": 3.110735516960019, + "grad_norm": 1.28125, + "learning_rate": 4.9917776557942675e-05, + "loss": 0.0816, + "mean_token_accuracy": 0.9688577592372895, + "num_tokens": 20059178.0, + "step": 13345 + }, + { + "entropy": 0.13422894291579723, + "epoch": 3.1119011539806505, + "grad_norm": 1.0234375, + "learning_rate": 4.991760167672198e-05, + "loss": 0.2109, + "mean_token_accuracy": 0.9463982820510864, + "num_tokens": 20075794.0, + "step": 13350 + }, + { + "entropy": 0.16679993383586406, + "epoch": 3.113066791001282, + "grad_norm": 2.890625, + "learning_rate": 4.9917426610336165e-05, + "loss": 0.1872, + "mean_token_accuracy": 0.9396183788776398, + "num_tokens": 20091022.0, + "step": 13355 + }, + { + "entropy": 0.12409867160022259, + "epoch": 3.114232428021914, + "grad_norm": 1.5234375, + "learning_rate": 4.9917251358787845e-05, + "loss": 0.228, + "mean_token_accuracy": 0.9440020442008972, + "num_tokens": 20104163.0, + "step": 13360 + }, + { + "entropy": 0.1242295227944851, + "epoch": 3.115398065042546, + "grad_norm": 4.0625, + "learning_rate": 4.991707592207963e-05, + "loss": 0.2196, + "mean_token_accuracy": 0.9523638188838959, + "num_tokens": 20123125.0, + "step": 13365 + }, + { + "entropy": 0.19654411226511, + "epoch": 3.1165637020631776, + "grad_norm": 0.703125, + "learning_rate": 4.991690030021413e-05, + "loss": 0.198, + "mean_token_accuracy": 0.9465866386890411, + "num_tokens": 20145750.0, + "step": 13370 + }, + { + "entropy": 0.1174004428088665, + "epoch": 3.1177293390838092, + "grad_norm": 6.8125, + "learning_rate": 4.991672449319398e-05, + "loss": 0.1953, + "mean_token_accuracy": 0.9528234481811524, + "num_tokens": 20168354.0, + "step": 13375 + }, + { + "entropy": 0.11760329715907573, + "epoch": 3.118894976104441, + "grad_norm": 5.125, + "learning_rate": 4.9916548501021784e-05, + "loss": 0.152, + "mean_token_accuracy": 0.9535988092422485, + "num_tokens": 20191537.0, + "step": 13380 + }, + { + "entropy": 0.151387694850564, + "epoch": 3.120060613125073, + "grad_norm": 4.9375, + "learning_rate": 4.991637232370018e-05, + "loss": 0.164, + "mean_token_accuracy": 0.9565434157848358, + "num_tokens": 20211723.0, + "step": 13385 + }, + { + "entropy": 0.13986520580947398, + "epoch": 3.1212262501457047, + "grad_norm": 7.25, + "learning_rate": 4.991619596123178e-05, + "loss": 0.2171, + "mean_token_accuracy": 0.9584638893604278, + "num_tokens": 20238357.0, + "step": 13390 + }, + { + "entropy": 0.14957279935479165, + "epoch": 3.1223918871663363, + "grad_norm": 1.65625, + "learning_rate": 4.991601941361923e-05, + "loss": 0.1502, + "mean_token_accuracy": 0.9497217297554016, + "num_tokens": 20253687.0, + "step": 13395 + }, + { + "entropy": 0.13997821658849716, + "epoch": 3.123557524186968, + "grad_norm": 1.40625, + "learning_rate": 4.991584268086516e-05, + "loss": 0.2023, + "mean_token_accuracy": 0.9500046491622924, + "num_tokens": 20280327.0, + "step": 13400 + }, + { + "entropy": 0.1721389342099428, + "epoch": 3.1247231612076, + "grad_norm": 6.28125, + "learning_rate": 4.9915665762972196e-05, + "loss": 0.3787, + "mean_token_accuracy": 0.9287561297416687, + "num_tokens": 20297460.0, + "step": 13405 + }, + { + "entropy": 0.12821940444409846, + "epoch": 3.1258887982282317, + "grad_norm": 0.62109375, + "learning_rate": 4.991548865994298e-05, + "loss": 0.1801, + "mean_token_accuracy": 0.9497880697250366, + "num_tokens": 20333983.0, + "step": 13410 + }, + { + "entropy": 0.09938725344836712, + "epoch": 3.1270544352488634, + "grad_norm": 2.671875, + "learning_rate": 4.991531137178016e-05, + "loss": 0.1094, + "mean_token_accuracy": 0.971398013830185, + "num_tokens": 20355626.0, + "step": 13415 + }, + { + "entropy": 0.1275652002543211, + "epoch": 3.128220072269495, + "grad_norm": 2.0, + "learning_rate": 4.991513389848637e-05, + "loss": 0.1089, + "mean_token_accuracy": 0.9720631957054138, + "num_tokens": 20375383.0, + "step": 13420 + }, + { + "entropy": 0.13102855496108531, + "epoch": 3.129385709290127, + "grad_norm": 0.388671875, + "learning_rate": 4.991495624006426e-05, + "loss": 0.1487, + "mean_token_accuracy": 0.9604451298713684, + "num_tokens": 20397434.0, + "step": 13425 + }, + { + "entropy": 0.15394360907375812, + "epoch": 3.130551346310759, + "grad_norm": 5.21875, + "learning_rate": 4.9914778396516484e-05, + "loss": 0.2195, + "mean_token_accuracy": 0.9505378365516662, + "num_tokens": 20412780.0, + "step": 13430 + }, + { + "entropy": 0.15660555139184, + "epoch": 3.1317169833313905, + "grad_norm": 6.21875, + "learning_rate": 4.9914600367845685e-05, + "loss": 0.2906, + "mean_token_accuracy": 0.9402198016643524, + "num_tokens": 20422638.0, + "step": 13435 + }, + { + "entropy": 0.17960682585835458, + "epoch": 3.1328826203520226, + "grad_norm": 1.875, + "learning_rate": 4.991442215405452e-05, + "loss": 0.318, + "mean_token_accuracy": 0.9269765436649322, + "num_tokens": 20439090.0, + "step": 13440 + }, + { + "entropy": 0.20290196798741816, + "epoch": 3.1340482573726542, + "grad_norm": 0.72265625, + "learning_rate": 4.991424375514565e-05, + "loss": 0.2952, + "mean_token_accuracy": 0.9280561923980712, + "num_tokens": 20478063.0, + "step": 13445 + }, + { + "entropy": 0.11316476315259934, + "epoch": 3.135213894393286, + "grad_norm": 2.453125, + "learning_rate": 4.991406517112173e-05, + "loss": 0.1718, + "mean_token_accuracy": 0.9608360230922699, + "num_tokens": 20498608.0, + "step": 13450 + }, + { + "entropy": 0.13394585978239776, + "epoch": 3.1363795314139176, + "grad_norm": 8.5625, + "learning_rate": 4.991388640198543e-05, + "loss": 0.1833, + "mean_token_accuracy": 0.9532162606716156, + "num_tokens": 20520011.0, + "step": 13455 + }, + { + "entropy": 0.1493657087907195, + "epoch": 3.1375451684345497, + "grad_norm": 0.9375, + "learning_rate": 4.9913707447739416e-05, + "loss": 0.2914, + "mean_token_accuracy": 0.937654048204422, + "num_tokens": 20535373.0, + "step": 13460 + }, + { + "entropy": 0.15093789100646973, + "epoch": 3.1387108054551813, + "grad_norm": 4.125, + "learning_rate": 4.9913528308386345e-05, + "loss": 0.2278, + "mean_token_accuracy": 0.9449706077575684, + "num_tokens": 20547044.0, + "step": 13465 + }, + { + "entropy": 0.12902355715632438, + "epoch": 3.139876442475813, + "grad_norm": 4.1875, + "learning_rate": 4.99133489839289e-05, + "loss": 0.2036, + "mean_token_accuracy": 0.9510998904705048, + "num_tokens": 20566906.0, + "step": 13470 + }, + { + "entropy": 0.15702604167163373, + "epoch": 3.1410420794964447, + "grad_norm": 3.3125, + "learning_rate": 4.9913169474369754e-05, + "loss": 0.2774, + "mean_token_accuracy": 0.9410692870616912, + "num_tokens": 20578617.0, + "step": 13475 + }, + { + "entropy": 0.14392093010246754, + "epoch": 3.1422077165170768, + "grad_norm": 3.59375, + "learning_rate": 4.991298977971158e-05, + "loss": 0.2114, + "mean_token_accuracy": 0.9481341958045959, + "num_tokens": 20591977.0, + "step": 13480 + }, + { + "entropy": 0.1626926977187395, + "epoch": 3.1433733535377084, + "grad_norm": 2.109375, + "learning_rate": 4.9912809899957055e-05, + "loss": 0.2463, + "mean_token_accuracy": 0.9358658790588379, + "num_tokens": 20613827.0, + "step": 13485 + }, + { + "entropy": 0.08454335257411003, + "epoch": 3.14453899055834, + "grad_norm": 1.5625, + "learning_rate": 4.9912629835108864e-05, + "loss": 0.0539, + "mean_token_accuracy": 0.9670248806476593, + "num_tokens": 20649688.0, + "step": 13490 + }, + { + "entropy": 0.09732284527271987, + "epoch": 3.1457046275789717, + "grad_norm": 2.015625, + "learning_rate": 4.99124495851697e-05, + "loss": 0.1081, + "mean_token_accuracy": 0.960671192407608, + "num_tokens": 20682789.0, + "step": 13495 + }, + { + "entropy": 0.09042824842035771, + "epoch": 3.146870264599604, + "grad_norm": 1.984375, + "learning_rate": 4.991226915014223e-05, + "loss": 0.0762, + "mean_token_accuracy": 0.9731867849826813, + "num_tokens": 20710191.0, + "step": 13500 + }, + { + "entropy": 0.17311727702617646, + "epoch": 3.1480359016202355, + "grad_norm": 1.0859375, + "learning_rate": 4.9912088530029166e-05, + "loss": 0.2479, + "mean_token_accuracy": 0.9498288691043854, + "num_tokens": 20725259.0, + "step": 13505 + }, + { + "entropy": 0.15625217892229556, + "epoch": 3.149201538640867, + "grad_norm": 6.53125, + "learning_rate": 4.9911907724833196e-05, + "loss": 0.2065, + "mean_token_accuracy": 0.954789811372757, + "num_tokens": 20733671.0, + "step": 13510 + }, + { + "entropy": 0.12325193397700787, + "epoch": 3.150367175661499, + "grad_norm": 3.109375, + "learning_rate": 4.991172673455701e-05, + "loss": 0.2265, + "mean_token_accuracy": 0.9545148432254791, + "num_tokens": 20744720.0, + "step": 13515 + }, + { + "entropy": 0.15157404392957688, + "epoch": 3.151532812682131, + "grad_norm": 5.0625, + "learning_rate": 4.9911545559203306e-05, + "loss": 0.2189, + "mean_token_accuracy": 0.9399513125419616, + "num_tokens": 20765379.0, + "step": 13520 + }, + { + "entropy": 0.09883678015321493, + "epoch": 3.1526984497027626, + "grad_norm": 0.7734375, + "learning_rate": 4.991136419877479e-05, + "loss": 0.1561, + "mean_token_accuracy": 0.9537435114383698, + "num_tokens": 20800491.0, + "step": 13525 + }, + { + "entropy": 0.1349384069442749, + "epoch": 3.1538640867233942, + "grad_norm": 4.40625, + "learning_rate": 4.991118265327417e-05, + "loss": 0.1982, + "mean_token_accuracy": 0.9464501976966858, + "num_tokens": 20822363.0, + "step": 13530 + }, + { + "entropy": 0.13095712885260583, + "epoch": 3.155029723744026, + "grad_norm": 2.46875, + "learning_rate": 4.991100092270415e-05, + "loss": 0.2211, + "mean_token_accuracy": 0.9556451499462127, + "num_tokens": 20845713.0, + "step": 13535 + }, + { + "entropy": 0.2365718200802803, + "epoch": 3.156195360764658, + "grad_norm": 4.65625, + "learning_rate": 4.991081900706744e-05, + "loss": 0.3579, + "mean_token_accuracy": 0.8964258372783661, + "num_tokens": 20880390.0, + "step": 13540 + }, + { + "entropy": 0.1382368978112936, + "epoch": 3.1573609977852897, + "grad_norm": 2.5, + "learning_rate": 4.9910636906366746e-05, + "loss": 0.2208, + "mean_token_accuracy": 0.9520070552825928, + "num_tokens": 20890351.0, + "step": 13545 + }, + { + "entropy": 0.12153224758803845, + "epoch": 3.1585266348059213, + "grad_norm": 3.84375, + "learning_rate": 4.9910454620604794e-05, + "loss": 0.2088, + "mean_token_accuracy": 0.9552608907222748, + "num_tokens": 20907586.0, + "step": 13550 + }, + { + "entropy": 0.17177230566740037, + "epoch": 3.159692271826553, + "grad_norm": 9.6875, + "learning_rate": 4.99102721497843e-05, + "loss": 0.3257, + "mean_token_accuracy": 0.9317836463451385, + "num_tokens": 20923892.0, + "step": 13555 + }, + { + "entropy": 0.11779150888323783, + "epoch": 3.160857908847185, + "grad_norm": 1.359375, + "learning_rate": 4.991008949390797e-05, + "loss": 0.1493, + "mean_token_accuracy": 0.9623130559921265, + "num_tokens": 20955137.0, + "step": 13560 + }, + { + "entropy": 0.12503595724701883, + "epoch": 3.1620235458678168, + "grad_norm": 6.71875, + "learning_rate": 4.990990665297855e-05, + "loss": 0.1711, + "mean_token_accuracy": 0.9561767220497132, + "num_tokens": 20971485.0, + "step": 13565 + }, + { + "entropy": 0.15457735285162927, + "epoch": 3.1631891828884484, + "grad_norm": 4.28125, + "learning_rate": 4.9909723626998755e-05, + "loss": 0.3141, + "mean_token_accuracy": 0.9366472482681274, + "num_tokens": 20981271.0, + "step": 13570 + }, + { + "entropy": 0.21778239756822587, + "epoch": 3.1643548199090805, + "grad_norm": 3.734375, + "learning_rate": 4.990954041597131e-05, + "loss": 0.2955, + "mean_token_accuracy": 0.9206785082817077, + "num_tokens": 21002375.0, + "step": 13575 + }, + { + "entropy": 0.0996066389605403, + "epoch": 3.165520456929712, + "grad_norm": 0.7578125, + "learning_rate": 4.990935701989896e-05, + "loss": 0.1601, + "mean_token_accuracy": 0.9643292784690857, + "num_tokens": 21032367.0, + "step": 13580 + }, + { + "entropy": 0.13476105220615864, + "epoch": 3.166686093950344, + "grad_norm": 5.90625, + "learning_rate": 4.990917343878443e-05, + "loss": 0.2373, + "mean_token_accuracy": 0.9470726490020752, + "num_tokens": 21042480.0, + "step": 13585 + }, + { + "entropy": 0.11117649544030428, + "epoch": 3.1678517309709755, + "grad_norm": 4.71875, + "learning_rate": 4.990898967263046e-05, + "loss": 0.1549, + "mean_token_accuracy": 0.9646702468395233, + "num_tokens": 21069694.0, + "step": 13590 + }, + { + "entropy": 0.12479297295212746, + "epoch": 3.1690173679916076, + "grad_norm": 5.65625, + "learning_rate": 4.990880572143979e-05, + "loss": 0.1796, + "mean_token_accuracy": 0.9550163507461548, + "num_tokens": 21080793.0, + "step": 13595 + }, + { + "entropy": 0.12391726523637772, + "epoch": 3.1701830050122393, + "grad_norm": 2.546875, + "learning_rate": 4.9908621585215154e-05, + "loss": 0.1415, + "mean_token_accuracy": 0.9581521868705749, + "num_tokens": 21098312.0, + "step": 13600 + }, + { + "entropy": 0.13280257768929005, + "epoch": 3.171348642032871, + "grad_norm": 0.69921875, + "learning_rate": 4.990843726395932e-05, + "loss": 0.2267, + "mean_token_accuracy": 0.9563954889774322, + "num_tokens": 21119709.0, + "step": 13605 + }, + { + "entropy": 0.12048251070082187, + "epoch": 3.1725142790535026, + "grad_norm": 3.328125, + "learning_rate": 4.990825275767501e-05, + "loss": 0.1771, + "mean_token_accuracy": 0.9502053916454315, + "num_tokens": 21144999.0, + "step": 13610 + }, + { + "entropy": 0.10526430867612362, + "epoch": 3.1736799160741347, + "grad_norm": 3.640625, + "learning_rate": 4.9908068066365e-05, + "loss": 0.2226, + "mean_token_accuracy": 0.9513380050659179, + "num_tokens": 21159827.0, + "step": 13615 + }, + { + "entropy": 0.14900004006922246, + "epoch": 3.1748455530947663, + "grad_norm": 1.015625, + "learning_rate": 4.990788319003202e-05, + "loss": 0.2445, + "mean_token_accuracy": 0.9436604678630829, + "num_tokens": 21179584.0, + "step": 13620 + }, + { + "entropy": 0.1654419906437397, + "epoch": 3.176011190115398, + "grad_norm": 9.9375, + "learning_rate": 4.990769812867885e-05, + "loss": 0.2853, + "mean_token_accuracy": 0.9375516057014466, + "num_tokens": 21190250.0, + "step": 13625 + }, + { + "entropy": 0.16855915486812592, + "epoch": 3.1771768271360297, + "grad_norm": 4.625, + "learning_rate": 4.990751288230824e-05, + "loss": 0.3594, + "mean_token_accuracy": 0.9368915677070617, + "num_tokens": 21200016.0, + "step": 13630 + }, + { + "entropy": 0.1235238466411829, + "epoch": 3.1783424641566618, + "grad_norm": 9.375, + "learning_rate": 4.990732745092294e-05, + "loss": 0.2636, + "mean_token_accuracy": 0.9344924747943878, + "num_tokens": 21217432.0, + "step": 13635 + }, + { + "entropy": 0.11267777048051357, + "epoch": 3.1795081011772934, + "grad_norm": 2.09375, + "learning_rate": 4.9907141834525734e-05, + "loss": 0.2743, + "mean_token_accuracy": 0.9462158858776093, + "num_tokens": 21233878.0, + "step": 13640 + }, + { + "entropy": 0.08724936954677105, + "epoch": 3.180673738197925, + "grad_norm": 7.78125, + "learning_rate": 4.9906956033119376e-05, + "loss": 0.1146, + "mean_token_accuracy": 0.9678235054016113, + "num_tokens": 21258870.0, + "step": 13645 + }, + { + "entropy": 0.13224483132362366, + "epoch": 3.1818393752185568, + "grad_norm": 0.75, + "learning_rate": 4.990677004670665e-05, + "loss": 0.2061, + "mean_token_accuracy": 0.9559370160102845, + "num_tokens": 21279265.0, + "step": 13650 + }, + { + "entropy": 0.12577038034796714, + "epoch": 3.183005012239189, + "grad_norm": 5.8125, + "learning_rate": 4.990658387529032e-05, + "loss": 0.2115, + "mean_token_accuracy": 0.9527509272098541, + "num_tokens": 21292607.0, + "step": 13655 + }, + { + "entropy": 0.09519344065338373, + "epoch": 3.1841706492598205, + "grad_norm": 2.109375, + "learning_rate": 4.990639751887316e-05, + "loss": 0.1431, + "mean_token_accuracy": 0.9555594265460968, + "num_tokens": 21323628.0, + "step": 13660 + }, + { + "entropy": 0.14371351413428785, + "epoch": 3.185336286280452, + "grad_norm": 6.4375, + "learning_rate": 4.9906210977457956e-05, + "loss": 0.2347, + "mean_token_accuracy": 0.9484319508075714, + "num_tokens": 21345259.0, + "step": 13665 + }, + { + "entropy": 0.1446783158928156, + "epoch": 3.186501923301084, + "grad_norm": 9.4375, + "learning_rate": 4.9906024251047486e-05, + "loss": 0.2042, + "mean_token_accuracy": 0.9526873469352722, + "num_tokens": 21371830.0, + "step": 13670 + }, + { + "entropy": 0.11993208415806293, + "epoch": 3.187667560321716, + "grad_norm": 0.87890625, + "learning_rate": 4.990583733964453e-05, + "loss": 0.2079, + "mean_token_accuracy": 0.9530717611312867, + "num_tokens": 21403937.0, + "step": 13675 + }, + { + "entropy": 0.1540200922638178, + "epoch": 3.1888331973423476, + "grad_norm": 1.4140625, + "learning_rate": 4.9905650243251886e-05, + "loss": 0.1084, + "mean_token_accuracy": 0.9452480733394623, + "num_tokens": 21447968.0, + "step": 13680 + }, + { + "entropy": 0.12013146840035915, + "epoch": 3.1899988343629793, + "grad_norm": 1.5546875, + "learning_rate": 4.9905462961872334e-05, + "loss": 0.1844, + "mean_token_accuracy": 0.9520611464977264, + "num_tokens": 21463123.0, + "step": 13685 + }, + { + "entropy": 0.08509744182229043, + "epoch": 3.191164471383611, + "grad_norm": 0.68359375, + "learning_rate": 4.990527549550867e-05, + "loss": 0.1185, + "mean_token_accuracy": 0.9626811742782593, + "num_tokens": 21492569.0, + "step": 13690 + }, + { + "entropy": 0.13770532943308353, + "epoch": 3.192330108404243, + "grad_norm": 2.0, + "learning_rate": 4.990508784416369e-05, + "loss": 0.1866, + "mean_token_accuracy": 0.9388071298599243, + "num_tokens": 21513409.0, + "step": 13695 + }, + { + "entropy": 0.11494051553308963, + "epoch": 3.1934957454248747, + "grad_norm": 1.265625, + "learning_rate": 4.9904900007840195e-05, + "loss": 0.1976, + "mean_token_accuracy": 0.9597686767578125, + "num_tokens": 21528668.0, + "step": 13700 + }, + { + "entropy": 0.12358577623963356, + "epoch": 3.1946613824455063, + "grad_norm": 4.9375, + "learning_rate": 4.990471198654098e-05, + "loss": 0.2058, + "mean_token_accuracy": 0.9521121263504029, + "num_tokens": 21550385.0, + "step": 13705 + }, + { + "entropy": 0.09476232714951038, + "epoch": 3.1958270194661385, + "grad_norm": 2.03125, + "learning_rate": 4.9904523780268854e-05, + "loss": 0.1059, + "mean_token_accuracy": 0.9660671591758728, + "num_tokens": 21572739.0, + "step": 13710 + }, + { + "entropy": 0.13330335095524787, + "epoch": 3.19699265648677, + "grad_norm": 13.0, + "learning_rate": 4.990433538902662e-05, + "loss": 0.271, + "mean_token_accuracy": 0.9459414899349212, + "num_tokens": 21585685.0, + "step": 13715 + }, + { + "entropy": 0.12506634294986724, + "epoch": 3.1981582935074018, + "grad_norm": 3.765625, + "learning_rate": 4.9904146812817087e-05, + "loss": 0.1427, + "mean_token_accuracy": 0.9513930737972259, + "num_tokens": 21603696.0, + "step": 13720 + }, + { + "entropy": 0.09047595858573913, + "epoch": 3.1993239305280334, + "grad_norm": 0.87890625, + "learning_rate": 4.990395805164307e-05, + "loss": 0.0768, + "mean_token_accuracy": 0.9803003549575806, + "num_tokens": 21631611.0, + "step": 13725 + }, + { + "entropy": 0.1679716443642974, + "epoch": 3.2004895675486655, + "grad_norm": 2.625, + "learning_rate": 4.990376910550738e-05, + "loss": 0.1202, + "mean_token_accuracy": 0.9510182023048401, + "num_tokens": 21684068.0, + "step": 13730 + }, + { + "entropy": 0.08525995686650276, + "epoch": 3.201655204569297, + "grad_norm": 2.578125, + "learning_rate": 4.990357997441284e-05, + "loss": 0.1319, + "mean_token_accuracy": 0.9639061510562896, + "num_tokens": 21717942.0, + "step": 13735 + }, + { + "entropy": 0.09524129014462232, + "epoch": 3.202820841589929, + "grad_norm": 3.625, + "learning_rate": 4.990339065836226e-05, + "loss": 0.1859, + "mean_token_accuracy": 0.9589514017105103, + "num_tokens": 21738190.0, + "step": 13740 + }, + { + "entropy": 0.07734522055834532, + "epoch": 3.2039864786105605, + "grad_norm": 1.4921875, + "learning_rate": 4.990320115735848e-05, + "loss": 0.1033, + "mean_token_accuracy": 0.9685503840446472, + "num_tokens": 21763096.0, + "step": 13745 + }, + { + "entropy": 0.0874601237475872, + "epoch": 3.2051521156311926, + "grad_norm": 0.640625, + "learning_rate": 4.9903011471404304e-05, + "loss": 0.1462, + "mean_token_accuracy": 0.9692184507846833, + "num_tokens": 21788974.0, + "step": 13750 + }, + { + "entropy": 0.1801584392786026, + "epoch": 3.2063177526518243, + "grad_norm": 5.15625, + "learning_rate": 4.9902821600502575e-05, + "loss": 0.2558, + "mean_token_accuracy": 0.930813604593277, + "num_tokens": 21800267.0, + "step": 13755 + }, + { + "entropy": 0.11756899803876877, + "epoch": 3.207483389672456, + "grad_norm": 7.96875, + "learning_rate": 4.990263154465612e-05, + "loss": 0.2071, + "mean_token_accuracy": 0.9543874800205231, + "num_tokens": 21811432.0, + "step": 13760 + }, + { + "entropy": 0.11073267944157124, + "epoch": 3.2086490266930876, + "grad_norm": 1.3515625, + "learning_rate": 4.990244130386778e-05, + "loss": 0.126, + "mean_token_accuracy": 0.9537279605865479, + "num_tokens": 21840039.0, + "step": 13765 + }, + { + "entropy": 0.14336490221321582, + "epoch": 3.2098146637137197, + "grad_norm": 3.96875, + "learning_rate": 4.990225087814038e-05, + "loss": 0.165, + "mean_token_accuracy": 0.9559800326824188, + "num_tokens": 21852623.0, + "step": 13770 + }, + { + "entropy": 0.13291695453226565, + "epoch": 3.2109803007343514, + "grad_norm": 3.0625, + "learning_rate": 4.990206026747677e-05, + "loss": 0.1719, + "mean_token_accuracy": 0.9552733778953553, + "num_tokens": 21863643.0, + "step": 13775 + }, + { + "entropy": 0.1144310999661684, + "epoch": 3.212145937754983, + "grad_norm": 2.328125, + "learning_rate": 4.9901869471879784e-05, + "loss": 0.1422, + "mean_token_accuracy": 0.9642179667949676, + "num_tokens": 21886692.0, + "step": 13780 + }, + { + "entropy": 0.19398325234651564, + "epoch": 3.2133115747756147, + "grad_norm": 1.5703125, + "learning_rate": 4.990167849135227e-05, + "loss": 0.2933, + "mean_token_accuracy": 0.9346424221992493, + "num_tokens": 21901027.0, + "step": 13785 + }, + { + "entropy": 0.14228848069906236, + "epoch": 3.214477211796247, + "grad_norm": 0.87890625, + "learning_rate": 4.990148732589708e-05, + "loss": 0.2278, + "mean_token_accuracy": 0.9492959976196289, + "num_tokens": 21924230.0, + "step": 13790 + }, + { + "entropy": 0.11809794455766678, + "epoch": 3.2156428488168785, + "grad_norm": 4.03125, + "learning_rate": 4.990129597551706e-05, + "loss": 0.1691, + "mean_token_accuracy": 0.9494635581970214, + "num_tokens": 21944816.0, + "step": 13795 + }, + { + "entropy": 0.14083162061870097, + "epoch": 3.21680848583751, + "grad_norm": 1.703125, + "learning_rate": 4.9901104440215064e-05, + "loss": 0.203, + "mean_token_accuracy": 0.9425077676773072, + "num_tokens": 21957162.0, + "step": 13800 + }, + { + "entropy": 0.19819015860557557, + "epoch": 3.2179741228581418, + "grad_norm": 3.1875, + "learning_rate": 4.9900912719993944e-05, + "loss": 0.3117, + "mean_token_accuracy": 0.9249706268310547, + "num_tokens": 21970654.0, + "step": 13805 + }, + { + "entropy": 0.10589619725942612, + "epoch": 3.219139759878774, + "grad_norm": 1.703125, + "learning_rate": 4.9900720814856574e-05, + "loss": 0.1353, + "mean_token_accuracy": 0.961876267194748, + "num_tokens": 21989719.0, + "step": 13810 + }, + { + "entropy": 0.14038040973246096, + "epoch": 3.2203053968994055, + "grad_norm": 2.171875, + "learning_rate": 4.990052872480579e-05, + "loss": 0.1447, + "mean_token_accuracy": 0.9591590285301208, + "num_tokens": 22008316.0, + "step": 13815 + }, + { + "entropy": 0.1346709206700325, + "epoch": 3.221471033920037, + "grad_norm": 9.625, + "learning_rate": 4.9900336449844486e-05, + "loss": 0.3254, + "mean_token_accuracy": 0.9380843579769135, + "num_tokens": 22026814.0, + "step": 13820 + }, + { + "entropy": 0.12028101049363613, + "epoch": 3.222636670940669, + "grad_norm": 1.8046875, + "learning_rate": 4.9900143989975515e-05, + "loss": 0.1735, + "mean_token_accuracy": 0.9532825112342834, + "num_tokens": 22047666.0, + "step": 13825 + }, + { + "entropy": 0.16554685439914466, + "epoch": 3.223802307961301, + "grad_norm": 2.21875, + "learning_rate": 4.989995134520173e-05, + "loss": 0.2984, + "mean_token_accuracy": 0.9322503209114075, + "num_tokens": 22068533.0, + "step": 13830 + }, + { + "entropy": 0.09840068519115448, + "epoch": 3.2249679449819326, + "grad_norm": 6.0625, + "learning_rate": 4.9899758515526035e-05, + "loss": 0.1664, + "mean_token_accuracy": 0.9617088854312896, + "num_tokens": 22092569.0, + "step": 13835 + }, + { + "entropy": 0.10480661168694497, + "epoch": 3.2261335820025643, + "grad_norm": 7.96875, + "learning_rate": 4.989956550095129e-05, + "loss": 0.1949, + "mean_token_accuracy": 0.9511193692684173, + "num_tokens": 22110234.0, + "step": 13840 + }, + { + "entropy": 0.13478904515504836, + "epoch": 3.2272992190231964, + "grad_norm": 4.59375, + "learning_rate": 4.989937230148036e-05, + "loss": 0.1947, + "mean_token_accuracy": 0.9495131015777588, + "num_tokens": 22126238.0, + "step": 13845 + }, + { + "entropy": 0.18446614295244218, + "epoch": 3.228464856043828, + "grad_norm": 1.2109375, + "learning_rate": 4.989917891711615e-05, + "loss": 0.33, + "mean_token_accuracy": 0.9250796020030976, + "num_tokens": 22162234.0, + "step": 13850 + }, + { + "entropy": 0.14001993387937545, + "epoch": 3.2296304930644597, + "grad_norm": 4.78125, + "learning_rate": 4.989898534786153e-05, + "loss": 0.2614, + "mean_token_accuracy": 0.945748233795166, + "num_tokens": 22172904.0, + "step": 13855 + }, + { + "entropy": 0.1121131750755012, + "epoch": 3.2307961300850914, + "grad_norm": 0.5703125, + "learning_rate": 4.989879159371939e-05, + "loss": 0.1443, + "mean_token_accuracy": 0.9647936940193176, + "num_tokens": 22200250.0, + "step": 13860 + }, + { + "entropy": 0.17312668301165104, + "epoch": 3.2319617671057235, + "grad_norm": 4.0625, + "learning_rate": 4.989859765469261e-05, + "loss": 0.2362, + "mean_token_accuracy": 0.9496867656707764, + "num_tokens": 22212661.0, + "step": 13865 + }, + { + "entropy": 0.11042605116963386, + "epoch": 3.233127404126355, + "grad_norm": 4.78125, + "learning_rate": 4.989840353078411e-05, + "loss": 0.165, + "mean_token_accuracy": 0.9618758082389831, + "num_tokens": 22230362.0, + "step": 13870 + }, + { + "entropy": 0.12297814935445786, + "epoch": 3.234293041146987, + "grad_norm": 7.96875, + "learning_rate": 4.989820922199675e-05, + "loss": 0.1651, + "mean_token_accuracy": 0.9570007860660553, + "num_tokens": 22266384.0, + "step": 13875 + }, + { + "entropy": 0.09149080365896226, + "epoch": 3.2354586781676185, + "grad_norm": 1.5, + "learning_rate": 4.9898014728333444e-05, + "loss": 0.1237, + "mean_token_accuracy": 0.967199444770813, + "num_tokens": 22296680.0, + "step": 13880 + }, + { + "entropy": 0.1321562185883522, + "epoch": 3.2366243151882506, + "grad_norm": 1.4140625, + "learning_rate": 4.98978200497971e-05, + "loss": 0.1815, + "mean_token_accuracy": 0.9545339822769165, + "num_tokens": 22316879.0, + "step": 13885 + }, + { + "entropy": 0.12758674062788486, + "epoch": 3.237789952208882, + "grad_norm": 11.0, + "learning_rate": 4.989762518639059e-05, + "loss": 0.2647, + "mean_token_accuracy": 0.9446513593196869, + "num_tokens": 22331553.0, + "step": 13890 + }, + { + "entropy": 0.11283358000218868, + "epoch": 3.238955589229514, + "grad_norm": 0.5703125, + "learning_rate": 4.9897430138116863e-05, + "loss": 0.2038, + "mean_token_accuracy": 0.9567483723163605, + "num_tokens": 22350593.0, + "step": 13895 + }, + { + "entropy": 0.1608588956296444, + "epoch": 3.2401212262501455, + "grad_norm": 4.6875, + "learning_rate": 4.9897234904978803e-05, + "loss": 0.2525, + "mean_token_accuracy": 0.9486888587474823, + "num_tokens": 22360472.0, + "step": 13900 + }, + { + "entropy": 0.15052568428218366, + "epoch": 3.2412868632707776, + "grad_norm": 4.46875, + "learning_rate": 4.989703948697932e-05, + "loss": 0.1805, + "mean_token_accuracy": 0.9498693823814393, + "num_tokens": 22375242.0, + "step": 13905 + }, + { + "entropy": 0.1559467740356922, + "epoch": 3.2424525002914093, + "grad_norm": 8.125, + "learning_rate": 4.989684388412133e-05, + "loss": 0.2759, + "mean_token_accuracy": 0.9473506152629853, + "num_tokens": 22385223.0, + "step": 13910 + }, + { + "entropy": 0.12847465351223947, + "epoch": 3.243618137312041, + "grad_norm": 2.53125, + "learning_rate": 4.989664809640775e-05, + "loss": 0.2491, + "mean_token_accuracy": 0.9525147080421448, + "num_tokens": 22397441.0, + "step": 13915 + }, + { + "entropy": 0.13161057084798813, + "epoch": 3.2447837743326726, + "grad_norm": 1.1171875, + "learning_rate": 4.989645212384151e-05, + "loss": 0.199, + "mean_token_accuracy": 0.9566210269927978, + "num_tokens": 22424867.0, + "step": 13920 + }, + { + "entropy": 0.08073518313467502, + "epoch": 3.2459494113533047, + "grad_norm": 7.5, + "learning_rate": 4.9896255966425516e-05, + "loss": 0.1307, + "mean_token_accuracy": 0.9721478164196015, + "num_tokens": 22445947.0, + "step": 13925 + }, + { + "entropy": 0.17617430575191975, + "epoch": 3.2471150483739364, + "grad_norm": 5.0625, + "learning_rate": 4.989605962416271e-05, + "loss": 0.2539, + "mean_token_accuracy": 0.9380265295505523, + "num_tokens": 22461954.0, + "step": 13930 + }, + { + "entropy": 0.09188916571438313, + "epoch": 3.248280685394568, + "grad_norm": 6.09375, + "learning_rate": 4.989586309705599e-05, + "loss": 0.1049, + "mean_token_accuracy": 0.965660160779953, + "num_tokens": 22492788.0, + "step": 13935 + }, + { + "entropy": 0.12470996119081974, + "epoch": 3.2494463224151997, + "grad_norm": 0.67578125, + "learning_rate": 4.989566638510833e-05, + "loss": 0.156, + "mean_token_accuracy": 0.9591312229633331, + "num_tokens": 22517242.0, + "step": 13940 + }, + { + "entropy": 0.124464544467628, + "epoch": 3.250611959435832, + "grad_norm": 6.6875, + "learning_rate": 4.9895469488322624e-05, + "loss": 0.18, + "mean_token_accuracy": 0.9532681405544281, + "num_tokens": 22534897.0, + "step": 13945 + }, + { + "entropy": 0.21237018406391145, + "epoch": 3.2517775964564635, + "grad_norm": 2.8125, + "learning_rate": 4.989527240670183e-05, + "loss": 0.3355, + "mean_token_accuracy": 0.9278428435325623, + "num_tokens": 22566016.0, + "step": 13950 + }, + { + "entropy": 0.16752017810940742, + "epoch": 3.252943233477095, + "grad_norm": 8.0625, + "learning_rate": 4.9895075140248884e-05, + "loss": 0.2726, + "mean_token_accuracy": 0.9438336849212646, + "num_tokens": 22575102.0, + "step": 13955 + }, + { + "entropy": 0.12253744266927243, + "epoch": 3.254108870497727, + "grad_norm": 5.90625, + "learning_rate": 4.9894877688966726e-05, + "loss": 0.2037, + "mean_token_accuracy": 0.9605319619178772, + "num_tokens": 22587999.0, + "step": 13960 + }, + { + "entropy": 0.10806054174900055, + "epoch": 3.255274507518359, + "grad_norm": 5.75, + "learning_rate": 4.989468005285829e-05, + "loss": 0.1474, + "mean_token_accuracy": 0.9701383888721467, + "num_tokens": 22612878.0, + "step": 13965 + }, + { + "entropy": 0.07961149625480175, + "epoch": 3.2564401445389906, + "grad_norm": 2.625, + "learning_rate": 4.989448223192654e-05, + "loss": 0.1128, + "mean_token_accuracy": 0.9735390186309815, + "num_tokens": 22635307.0, + "step": 13970 + }, + { + "entropy": 0.09045897722244263, + "epoch": 3.257605781559622, + "grad_norm": 6.90625, + "learning_rate": 4.989428422617441e-05, + "loss": 0.0872, + "mean_token_accuracy": 0.9661646127700806, + "num_tokens": 22670036.0, + "step": 13975 + }, + { + "entropy": 0.14846278242766858, + "epoch": 3.2587714185802543, + "grad_norm": 1.078125, + "learning_rate": 4.989408603560486e-05, + "loss": 0.2354, + "mean_token_accuracy": 0.9485298454761505, + "num_tokens": 22691930.0, + "step": 13980 + }, + { + "entropy": 0.14384981635957955, + "epoch": 3.259937055600886, + "grad_norm": 5.125, + "learning_rate": 4.989388766022085e-05, + "loss": 0.197, + "mean_token_accuracy": 0.9554014205932617, + "num_tokens": 22711935.0, + "step": 13985 + }, + { + "entropy": 0.1226340863853693, + "epoch": 3.2611026926215176, + "grad_norm": 2.265625, + "learning_rate": 4.989368910002534e-05, + "loss": 0.1169, + "mean_token_accuracy": 0.958068335056305, + "num_tokens": 22736033.0, + "step": 13990 + }, + { + "entropy": 0.09807408321648836, + "epoch": 3.2622683296421493, + "grad_norm": 0.80859375, + "learning_rate": 4.9893490355021275e-05, + "loss": 0.11, + "mean_token_accuracy": 0.9633495807647705, + "num_tokens": 22765974.0, + "step": 13995 + }, + { + "entropy": 0.13382991403341293, + "epoch": 3.263433966662781, + "grad_norm": 4.96875, + "learning_rate": 4.989329142521163e-05, + "loss": 0.2118, + "mean_token_accuracy": 0.9530467092990875, + "num_tokens": 22785509.0, + "step": 14000 + }, + { + "entropy": 0.12002462726086378, + "epoch": 3.264599603683413, + "grad_norm": 4.1875, + "learning_rate": 4.989309231059937e-05, + "loss": 0.2169, + "mean_token_accuracy": 0.9570177972316742, + "num_tokens": 22803325.0, + "step": 14005 + }, + { + "entropy": 0.14383577033877373, + "epoch": 3.2657652407040447, + "grad_norm": 7.59375, + "learning_rate": 4.989289301118746e-05, + "loss": 0.2408, + "mean_token_accuracy": 0.9505583703517914, + "num_tokens": 22812269.0, + "step": 14010 + }, + { + "entropy": 0.15477120950818063, + "epoch": 3.2669308777246764, + "grad_norm": 4.40625, + "learning_rate": 4.989269352697888e-05, + "loss": 0.2265, + "mean_token_accuracy": 0.9491691708564758, + "num_tokens": 22822670.0, + "step": 14015 + }, + { + "entropy": 0.12411811240017415, + "epoch": 3.2680965147453085, + "grad_norm": 1.046875, + "learning_rate": 4.98924938579766e-05, + "loss": 0.1797, + "mean_token_accuracy": 0.9555955529212952, + "num_tokens": 22841556.0, + "step": 14020 + }, + { + "entropy": 0.15641675479710102, + "epoch": 3.26926215176594, + "grad_norm": 6.40625, + "learning_rate": 4.989229400418359e-05, + "loss": 0.2924, + "mean_token_accuracy": 0.9350105941295623, + "num_tokens": 22851956.0, + "step": 14025 + }, + { + "entropy": 0.17089218497276307, + "epoch": 3.270427788786572, + "grad_norm": 1.2421875, + "learning_rate": 4.9892093965602846e-05, + "loss": 0.2689, + "mean_token_accuracy": 0.9299245476722717, + "num_tokens": 22873141.0, + "step": 14030 + }, + { + "entropy": 0.10465769805014133, + "epoch": 3.2715934258072035, + "grad_norm": 6.125, + "learning_rate": 4.9891893742237336e-05, + "loss": 0.1531, + "mean_token_accuracy": 0.9625381529331207, + "num_tokens": 22892855.0, + "step": 14035 + }, + { + "entropy": 0.12475298270583153, + "epoch": 3.2727590628278356, + "grad_norm": 3.015625, + "learning_rate": 4.989169333409006e-05, + "loss": 0.1886, + "mean_token_accuracy": 0.9612498998641967, + "num_tokens": 22908889.0, + "step": 14040 + }, + { + "entropy": 0.12309679705649615, + "epoch": 3.2739246998484672, + "grad_norm": 1.109375, + "learning_rate": 4.9891492741163986e-05, + "loss": 0.1482, + "mean_token_accuracy": 0.9679423987865448, + "num_tokens": 22926291.0, + "step": 14045 + }, + { + "entropy": 0.16394661888480186, + "epoch": 3.275090336869099, + "grad_norm": 3.515625, + "learning_rate": 4.989129196346213e-05, + "loss": 0.2189, + "mean_token_accuracy": 0.939568281173706, + "num_tokens": 22953734.0, + "step": 14050 + }, + { + "entropy": 0.22891110852360724, + "epoch": 3.2762559738897306, + "grad_norm": 3.828125, + "learning_rate": 4.989109100098746e-05, + "loss": 0.2433, + "mean_token_accuracy": 0.9389278292655945, + "num_tokens": 22973890.0, + "step": 14055 + }, + { + "entropy": 0.12039781026542187, + "epoch": 3.2774216109103627, + "grad_norm": 7.0, + "learning_rate": 4.9890889853743e-05, + "loss": 0.2894, + "mean_token_accuracy": 0.9522871553897858, + "num_tokens": 22987566.0, + "step": 14060 + }, + { + "entropy": 0.102488574385643, + "epoch": 3.2785872479309943, + "grad_norm": 0.8828125, + "learning_rate": 4.9890688521731726e-05, + "loss": 0.1132, + "mean_token_accuracy": 0.9691874027252197, + "num_tokens": 23013277.0, + "step": 14065 + }, + { + "entropy": 0.11040122248232365, + "epoch": 3.279752884951626, + "grad_norm": 1.1953125, + "learning_rate": 4.989048700495665e-05, + "loss": 0.1639, + "mean_token_accuracy": 0.9644237160682678, + "num_tokens": 23027335.0, + "step": 14070 + }, + { + "entropy": 0.13591878786683081, + "epoch": 3.280918521972258, + "grad_norm": 1.8125, + "learning_rate": 4.989028530342078e-05, + "loss": 0.1813, + "mean_token_accuracy": 0.9562951922416687, + "num_tokens": 23040430.0, + "step": 14075 + }, + { + "entropy": 0.10207988247275353, + "epoch": 3.2820841589928897, + "grad_norm": 0.7578125, + "learning_rate": 4.989008341712712e-05, + "loss": 0.1693, + "mean_token_accuracy": 0.9547303438186645, + "num_tokens": 23060680.0, + "step": 14080 + }, + { + "entropy": 0.12369899153709411, + "epoch": 3.2832497960135214, + "grad_norm": 1.2265625, + "learning_rate": 4.9889881346078675e-05, + "loss": 0.1882, + "mean_token_accuracy": 0.9508943438529969, + "num_tokens": 23078759.0, + "step": 14085 + }, + { + "entropy": 0.10847660899162292, + "epoch": 3.284415433034153, + "grad_norm": 0.5625, + "learning_rate": 4.9889679090278466e-05, + "loss": 0.1459, + "mean_token_accuracy": 0.9644874215126038, + "num_tokens": 23102055.0, + "step": 14090 + }, + { + "entropy": 0.2650509625673294, + "epoch": 3.2855810700547847, + "grad_norm": 9.625, + "learning_rate": 4.988947664972951e-05, + "loss": 0.4359, + "mean_token_accuracy": 0.9194409489631653, + "num_tokens": 23124775.0, + "step": 14095 + }, + { + "entropy": 0.14016255624592305, + "epoch": 3.286746707075417, + "grad_norm": 3.515625, + "learning_rate": 4.9889274024434826e-05, + "loss": 0.1227, + "mean_token_accuracy": 0.948961591720581, + "num_tokens": 23163545.0, + "step": 14100 + }, + { + "entropy": 0.11483432129025459, + "epoch": 3.2879123440960485, + "grad_norm": 2.78125, + "learning_rate": 4.988907121439742e-05, + "loss": 0.1191, + "mean_token_accuracy": 0.9661029517650604, + "num_tokens": 23188930.0, + "step": 14105 + }, + { + "entropy": 0.12453803811222315, + "epoch": 3.28907798111668, + "grad_norm": 6.0, + "learning_rate": 4.988886821962034e-05, + "loss": 0.1842, + "mean_token_accuracy": 0.9471354961395264, + "num_tokens": 23216209.0, + "step": 14110 + }, + { + "entropy": 0.13249999955296515, + "epoch": 3.2902436181373123, + "grad_norm": 2.046875, + "learning_rate": 4.988866504010659e-05, + "loss": 0.1485, + "mean_token_accuracy": 0.9522889494895935, + "num_tokens": 23233724.0, + "step": 14115 + }, + { + "entropy": 0.1101855117827654, + "epoch": 3.291409255157944, + "grad_norm": 7.875, + "learning_rate": 4.988846167585922e-05, + "loss": 0.1761, + "mean_token_accuracy": 0.9595109105110169, + "num_tokens": 23255614.0, + "step": 14120 + }, + { + "entropy": 0.11480397321283817, + "epoch": 3.2925748921785756, + "grad_norm": 5.34375, + "learning_rate": 4.9888258126881246e-05, + "loss": 0.1468, + "mean_token_accuracy": 0.961454713344574, + "num_tokens": 23275000.0, + "step": 14125 + }, + { + "entropy": 0.11717780642211437, + "epoch": 3.2937405291992072, + "grad_norm": 4.65625, + "learning_rate": 4.9888054393175715e-05, + "loss": 0.1914, + "mean_token_accuracy": 0.9546636343002319, + "num_tokens": 23286380.0, + "step": 14130 + }, + { + "entropy": 0.10818963898345828, + "epoch": 3.294906166219839, + "grad_norm": 1.7890625, + "learning_rate": 4.9887850474745654e-05, + "loss": 0.151, + "mean_token_accuracy": 0.9663783788681031, + "num_tokens": 23306723.0, + "step": 14135 + }, + { + "entropy": 0.241145624127239, + "epoch": 3.296071803240471, + "grad_norm": 0.5625, + "learning_rate": 4.988764637159412e-05, + "loss": 0.4151, + "mean_token_accuracy": 0.9382971048355102, + "num_tokens": 23345607.0, + "step": 14140 + }, + { + "entropy": 0.12085740342736244, + "epoch": 3.2972374402611027, + "grad_norm": 3.390625, + "learning_rate": 4.9887442083724146e-05, + "loss": 0.2171, + "mean_token_accuracy": 0.9586883842945099, + "num_tokens": 23358320.0, + "step": 14145 + }, + { + "entropy": 0.16930502615869045, + "epoch": 3.2984030772817343, + "grad_norm": 7.3125, + "learning_rate": 4.988723761113877e-05, + "loss": 0.2626, + "mean_token_accuracy": 0.9430509269237518, + "num_tokens": 23378309.0, + "step": 14150 + }, + { + "entropy": 0.08466316685080529, + "epoch": 3.2995687143023664, + "grad_norm": 0.79296875, + "learning_rate": 4.988703295384105e-05, + "loss": 0.1232, + "mean_token_accuracy": 0.9719673752784729, + "num_tokens": 23415091.0, + "step": 14155 + }, + { + "entropy": 0.09591401666402817, + "epoch": 3.300734351322998, + "grad_norm": 1.3671875, + "learning_rate": 4.9886828111834046e-05, + "loss": 0.2293, + "mean_token_accuracy": 0.9556362986564636, + "num_tokens": 23433837.0, + "step": 14160 + }, + { + "entropy": 0.2667720340192318, + "epoch": 3.3018999883436297, + "grad_norm": 1.296875, + "learning_rate": 4.98866230851208e-05, + "loss": 0.3849, + "mean_token_accuracy": 0.93751819729805, + "num_tokens": 23455339.0, + "step": 14165 + }, + { + "entropy": 0.12662522951141, + "epoch": 3.3030656253642614, + "grad_norm": 0.498046875, + "learning_rate": 4.988641787370437e-05, + "loss": 0.1087, + "mean_token_accuracy": 0.9500738441944122, + "num_tokens": 23479400.0, + "step": 14170 + }, + { + "entropy": 0.09370872657746077, + "epoch": 3.3042312623848935, + "grad_norm": 1.8359375, + "learning_rate": 4.988621247758782e-05, + "loss": 0.0845, + "mean_token_accuracy": 0.9714587390422821, + "num_tokens": 23501072.0, + "step": 14175 + }, + { + "entropy": 0.0931572213768959, + "epoch": 3.305396899405525, + "grad_norm": 5.21875, + "learning_rate": 4.988600689677422e-05, + "loss": 0.0987, + "mean_token_accuracy": 0.9701979279518127, + "num_tokens": 23531872.0, + "step": 14180 + }, + { + "entropy": 0.17157430276274682, + "epoch": 3.306562536426157, + "grad_norm": 6.40625, + "learning_rate": 4.988580113126662e-05, + "loss": 0.3283, + "mean_token_accuracy": 0.9170258045196533, + "num_tokens": 23550255.0, + "step": 14185 + }, + { + "entropy": 0.12436717860400677, + "epoch": 3.3077281734467885, + "grad_norm": 1.7578125, + "learning_rate": 4.9885595181068094e-05, + "loss": 0.1538, + "mean_token_accuracy": 0.9461873650550843, + "num_tokens": 23591479.0, + "step": 14190 + }, + { + "entropy": 0.11454317420721054, + "epoch": 3.3088938104674206, + "grad_norm": 4.125, + "learning_rate": 4.988538904618172e-05, + "loss": 0.1739, + "mean_token_accuracy": 0.956846272945404, + "num_tokens": 23619937.0, + "step": 14195 + }, + { + "entropy": 0.08084500934928655, + "epoch": 3.3100594474880523, + "grad_norm": 0.79296875, + "learning_rate": 4.988518272661057e-05, + "loss": 0.0701, + "mean_token_accuracy": 0.9723656892776489, + "num_tokens": 23654470.0, + "step": 14200 + }, + { + "entropy": 0.1474765609949827, + "epoch": 3.311225084508684, + "grad_norm": 2.65625, + "learning_rate": 4.988497622235771e-05, + "loss": 0.2552, + "mean_token_accuracy": 0.941499924659729, + "num_tokens": 23674024.0, + "step": 14205 + }, + { + "entropy": 0.14339925050735475, + "epoch": 3.312390721529316, + "grad_norm": 6.96875, + "learning_rate": 4.988476953342623e-05, + "loss": 0.2872, + "mean_token_accuracy": 0.94207683801651, + "num_tokens": 23683173.0, + "step": 14210 + }, + { + "entropy": 0.1142255749553442, + "epoch": 3.3135563585499477, + "grad_norm": 1.21875, + "learning_rate": 4.988456265981921e-05, + "loss": 0.1305, + "mean_token_accuracy": 0.9632910430431366, + "num_tokens": 23708330.0, + "step": 14215 + }, + { + "entropy": 0.09450707472860813, + "epoch": 3.3147219955705793, + "grad_norm": 0.8125, + "learning_rate": 4.988435560153972e-05, + "loss": 0.1312, + "mean_token_accuracy": 0.9680880188941956, + "num_tokens": 23739180.0, + "step": 14220 + }, + { + "entropy": 0.11961795259267091, + "epoch": 3.315887632591211, + "grad_norm": 0.640625, + "learning_rate": 4.988414835859087e-05, + "loss": 0.1746, + "mean_token_accuracy": 0.9507930636405945, + "num_tokens": 23766179.0, + "step": 14225 + }, + { + "entropy": 0.10384149886667729, + "epoch": 3.3170532696118427, + "grad_norm": 4.21875, + "learning_rate": 4.988394093097575e-05, + "loss": 0.1533, + "mean_token_accuracy": 0.9634926319122314, + "num_tokens": 23781500.0, + "step": 14230 + }, + { + "entropy": 0.11352912969887256, + "epoch": 3.3182189066324748, + "grad_norm": 6.875, + "learning_rate": 4.9883733318697436e-05, + "loss": 0.1626, + "mean_token_accuracy": 0.9677087485790252, + "num_tokens": 23795117.0, + "step": 14235 + }, + { + "entropy": 0.14532850068062544, + "epoch": 3.3193845436531064, + "grad_norm": 6.1875, + "learning_rate": 4.9883525521759034e-05, + "loss": 0.2539, + "mean_token_accuracy": 0.9434155464172364, + "num_tokens": 23814943.0, + "step": 14240 + }, + { + "entropy": 0.09582647122442722, + "epoch": 3.320550180673738, + "grad_norm": 3.0, + "learning_rate": 4.9883317540163634e-05, + "loss": 0.0985, + "mean_token_accuracy": 0.9703680038452148, + "num_tokens": 23843871.0, + "step": 14245 + }, + { + "entropy": 0.13182559311389924, + "epoch": 3.32171581769437, + "grad_norm": 5.875, + "learning_rate": 4.988310937391435e-05, + "loss": 0.2936, + "mean_token_accuracy": 0.9366560280323029, + "num_tokens": 23864184.0, + "step": 14250 + }, + { + "entropy": 0.15998778566718103, + "epoch": 3.322881454715002, + "grad_norm": 1.078125, + "learning_rate": 4.9882901023014284e-05, + "loss": 0.249, + "mean_token_accuracy": 0.9381362736225128, + "num_tokens": 23884339.0, + "step": 14255 + }, + { + "entropy": 0.11376429684460163, + "epoch": 3.3240470917356335, + "grad_norm": 2.078125, + "learning_rate": 4.9882692487466534e-05, + "loss": 0.1179, + "mean_token_accuracy": 0.968958580493927, + "num_tokens": 23901021.0, + "step": 14260 + }, + { + "entropy": 0.09357526563107968, + "epoch": 3.325212728756265, + "grad_norm": 4.96875, + "learning_rate": 4.988248376727421e-05, + "loss": 0.0891, + "mean_token_accuracy": 0.9657317698001862, + "num_tokens": 23940219.0, + "step": 14265 + }, + { + "entropy": 0.13367250710725784, + "epoch": 3.326378365776897, + "grad_norm": 6.71875, + "learning_rate": 4.988227486244044e-05, + "loss": 0.2013, + "mean_token_accuracy": 0.9435231447219848, + "num_tokens": 23962461.0, + "step": 14270 + }, + { + "entropy": 0.13902662843465804, + "epoch": 3.327544002797529, + "grad_norm": 1.7109375, + "learning_rate": 4.988206577296832e-05, + "loss": 0.2072, + "mean_token_accuracy": 0.9430397391319275, + "num_tokens": 23979910.0, + "step": 14275 + }, + { + "entropy": 0.11999710947275162, + "epoch": 3.3287096398181606, + "grad_norm": 4.78125, + "learning_rate": 4.9881856498860976e-05, + "loss": 0.1741, + "mean_token_accuracy": 0.9627709746360779, + "num_tokens": 23993337.0, + "step": 14280 + }, + { + "entropy": 0.16236310750246047, + "epoch": 3.3298752768387923, + "grad_norm": 5.03125, + "learning_rate": 4.988164704012153e-05, + "loss": 0.2812, + "mean_token_accuracy": 0.9466241359710693, + "num_tokens": 24011775.0, + "step": 14285 + }, + { + "entropy": 0.13325685746967791, + "epoch": 3.3310409138594244, + "grad_norm": 2.375, + "learning_rate": 4.98814373967531e-05, + "loss": 0.1634, + "mean_token_accuracy": 0.9539561152458191, + "num_tokens": 24038307.0, + "step": 14290 + }, + { + "entropy": 0.1551351621747017, + "epoch": 3.332206550880056, + "grad_norm": 1.90625, + "learning_rate": 4.988122756875881e-05, + "loss": 0.2918, + "mean_token_accuracy": 0.9345535576343537, + "num_tokens": 24058169.0, + "step": 14295 + }, + { + "entropy": 0.11559103038161993, + "epoch": 3.3333721879006877, + "grad_norm": 2.703125, + "learning_rate": 4.988101755614181e-05, + "loss": 0.1565, + "mean_token_accuracy": 0.9542689323425293, + "num_tokens": 24083609.0, + "step": 14300 + }, + { + "entropy": 0.11027558147907257, + "epoch": 3.3345378249213193, + "grad_norm": 0.9140625, + "learning_rate": 4.9880807358905205e-05, + "loss": 0.1631, + "mean_token_accuracy": 0.9572655200958252, + "num_tokens": 24112635.0, + "step": 14305 + }, + { + "entropy": 0.12529565021395683, + "epoch": 3.3357034619419514, + "grad_norm": 6.71875, + "learning_rate": 4.9880596977052146e-05, + "loss": 0.208, + "mean_token_accuracy": 0.9428850889205933, + "num_tokens": 24127007.0, + "step": 14310 + }, + { + "entropy": 0.12923308033496142, + "epoch": 3.336869098962583, + "grad_norm": 0.578125, + "learning_rate": 4.988038641058577e-05, + "loss": 0.1843, + "mean_token_accuracy": 0.9490243077278138, + "num_tokens": 24155835.0, + "step": 14315 + }, + { + "entropy": 0.14287668615579605, + "epoch": 3.3380347359832148, + "grad_norm": 2.453125, + "learning_rate": 4.98801756595092e-05, + "loss": 0.188, + "mean_token_accuracy": 0.9477911353111267, + "num_tokens": 24178011.0, + "step": 14320 + }, + { + "entropy": 0.11310774460434914, + "epoch": 3.3392003730038464, + "grad_norm": 0.99609375, + "learning_rate": 4.9879964723825586e-05, + "loss": 0.1576, + "mean_token_accuracy": 0.9627729594707489, + "num_tokens": 24207402.0, + "step": 14325 + }, + { + "entropy": 0.12244891636073589, + "epoch": 3.3403660100244785, + "grad_norm": 1.203125, + "learning_rate": 4.9879753603538095e-05, + "loss": 0.1238, + "mean_token_accuracy": 0.9554929971694947, + "num_tokens": 24236148.0, + "step": 14330 + }, + { + "entropy": 0.17563813105225562, + "epoch": 3.34153164704511, + "grad_norm": 5.5625, + "learning_rate": 4.987954229864984e-05, + "loss": 0.2167, + "mean_token_accuracy": 0.9393636643886566, + "num_tokens": 24252465.0, + "step": 14335 + }, + { + "entropy": 0.13802102711051703, + "epoch": 3.342697284065742, + "grad_norm": 7.8125, + "learning_rate": 4.9879330809164006e-05, + "loss": 0.2489, + "mean_token_accuracy": 0.9434120118618011, + "num_tokens": 24264730.0, + "step": 14340 + }, + { + "entropy": 0.08878612257540226, + "epoch": 3.343862921086374, + "grad_norm": 0.59765625, + "learning_rate": 4.987911913508372e-05, + "loss": 0.1215, + "mean_token_accuracy": 0.9721697270870209, + "num_tokens": 24300175.0, + "step": 14345 + }, + { + "entropy": 0.10990535113960505, + "epoch": 3.3450285581070056, + "grad_norm": 6.75, + "learning_rate": 4.9878907276412154e-05, + "loss": 0.2078, + "mean_token_accuracy": 0.9509715735912323, + "num_tokens": 24318581.0, + "step": 14350 + }, + { + "entropy": 0.10521319657564163, + "epoch": 3.3461941951276373, + "grad_norm": 7.03125, + "learning_rate": 4.987869523315245e-05, + "loss": 0.1905, + "mean_token_accuracy": 0.9640790045261383, + "num_tokens": 24337633.0, + "step": 14355 + }, + { + "entropy": 0.14670916181057692, + "epoch": 3.347359832148269, + "grad_norm": 7.34375, + "learning_rate": 4.9878483005307793e-05, + "loss": 0.2265, + "mean_token_accuracy": 0.9481269121170044, + "num_tokens": 24355450.0, + "step": 14360 + }, + { + "entropy": 0.1167627077549696, + "epoch": 3.3485254691689006, + "grad_norm": 3.015625, + "learning_rate": 4.987827059288133e-05, + "loss": 0.1541, + "mean_token_accuracy": 0.9560252726078033, + "num_tokens": 24372481.0, + "step": 14365 + }, + { + "entropy": 0.08436639718711376, + "epoch": 3.3496911061895327, + "grad_norm": 2.03125, + "learning_rate": 4.9878057995876235e-05, + "loss": 0.1231, + "mean_token_accuracy": 0.9654347062110901, + "num_tokens": 24393139.0, + "step": 14370 + }, + { + "entropy": 0.15002177990972995, + "epoch": 3.3508567432101644, + "grad_norm": 4.84375, + "learning_rate": 4.9877845214295685e-05, + "loss": 0.1722, + "mean_token_accuracy": 0.9522580683231354, + "num_tokens": 24414782.0, + "step": 14375 + }, + { + "entropy": 0.16212302520871164, + "epoch": 3.352022380230796, + "grad_norm": 7.625, + "learning_rate": 4.987763224814284e-05, + "loss": 0.4094, + "mean_token_accuracy": 0.9197640836238861, + "num_tokens": 24427675.0, + "step": 14380 + }, + { + "entropy": 0.15881211534142495, + "epoch": 3.353188017251428, + "grad_norm": 5.03125, + "learning_rate": 4.987741909742088e-05, + "loss": 0.2321, + "mean_token_accuracy": 0.9404888391494751, + "num_tokens": 24444212.0, + "step": 14385 + }, + { + "entropy": 0.10046768113970757, + "epoch": 3.35435365427206, + "grad_norm": 0.5390625, + "learning_rate": 4.987720576213299e-05, + "loss": 0.1566, + "mean_token_accuracy": 0.9620472431182862, + "num_tokens": 24470465.0, + "step": 14390 + }, + { + "entropy": 0.1034924827516079, + "epoch": 3.3555192912926914, + "grad_norm": 1.2109375, + "learning_rate": 4.987699224228234e-05, + "loss": 0.212, + "mean_token_accuracy": 0.9532204508781433, + "num_tokens": 24483496.0, + "step": 14395 + }, + { + "entropy": 0.1358404979109764, + "epoch": 3.356684928313323, + "grad_norm": 2.0625, + "learning_rate": 4.987677853787212e-05, + "loss": 0.2205, + "mean_token_accuracy": 0.9371620178222656, + "num_tokens": 24494400.0, + "step": 14400 + }, + { + "entropy": 0.09902511592954397, + "epoch": 3.3578505653339548, + "grad_norm": 0.85546875, + "learning_rate": 4.987656464890552e-05, + "loss": 0.1139, + "mean_token_accuracy": 0.9678740739822388, + "num_tokens": 24535866.0, + "step": 14405 + }, + { + "entropy": 0.14913907796144485, + "epoch": 3.359016202354587, + "grad_norm": 6.5625, + "learning_rate": 4.987635057538572e-05, + "loss": 0.2283, + "mean_token_accuracy": 0.9517066776752472, + "num_tokens": 24543669.0, + "step": 14410 + }, + { + "entropy": 0.15391396917402744, + "epoch": 3.3601818393752185, + "grad_norm": 7.0, + "learning_rate": 4.987613631731592e-05, + "loss": 0.1576, + "mean_token_accuracy": 0.941257655620575, + "num_tokens": 24571538.0, + "step": 14415 + }, + { + "entropy": 0.10840295329689979, + "epoch": 3.36134747639585, + "grad_norm": 6.71875, + "learning_rate": 4.987592187469932e-05, + "loss": 0.2069, + "mean_token_accuracy": 0.9558283507823944, + "num_tokens": 24583454.0, + "step": 14420 + }, + { + "entropy": 0.1372427899390459, + "epoch": 3.3625131134164823, + "grad_norm": 0.6953125, + "learning_rate": 4.98757072475391e-05, + "loss": 0.1648, + "mean_token_accuracy": 0.957892256975174, + "num_tokens": 24600439.0, + "step": 14425 + }, + { + "entropy": 0.15529920905828476, + "epoch": 3.363678750437114, + "grad_norm": 8.375, + "learning_rate": 4.987549243583848e-05, + "loss": 0.2872, + "mean_token_accuracy": 0.9361419081687927, + "num_tokens": 24615485.0, + "step": 14430 + }, + { + "entropy": 0.14903669357299804, + "epoch": 3.3648443874577456, + "grad_norm": 0.9921875, + "learning_rate": 4.9875277439600644e-05, + "loss": 0.2364, + "mean_token_accuracy": 0.9379843652248383, + "num_tokens": 24644974.0, + "step": 14435 + }, + { + "entropy": 0.1466895282268524, + "epoch": 3.3660100244783773, + "grad_norm": 4.40625, + "learning_rate": 4.9875062258828815e-05, + "loss": 0.1642, + "mean_token_accuracy": 0.9672025084495545, + "num_tokens": 24669348.0, + "step": 14440 + }, + { + "entropy": 0.1412310428917408, + "epoch": 3.3671756614990094, + "grad_norm": 3.234375, + "learning_rate": 4.98748468935262e-05, + "loss": 0.2244, + "mean_token_accuracy": 0.9454042732715606, + "num_tokens": 24685191.0, + "step": 14445 + }, + { + "entropy": 0.10494980327785015, + "epoch": 3.368341298519641, + "grad_norm": 1.0390625, + "learning_rate": 4.987463134369599e-05, + "loss": 0.1908, + "mean_token_accuracy": 0.9530590176582336, + "num_tokens": 24709482.0, + "step": 14450 + }, + { + "entropy": 0.15416168235242367, + "epoch": 3.3695069355402727, + "grad_norm": 5.28125, + "learning_rate": 4.987441560934142e-05, + "loss": 0.2631, + "mean_token_accuracy": 0.9420225262641907, + "num_tokens": 24718356.0, + "step": 14455 + }, + { + "entropy": 0.14280086159706115, + "epoch": 3.3706725725609044, + "grad_norm": 4.625, + "learning_rate": 4.9874199690465705e-05, + "loss": 0.3272, + "mean_token_accuracy": 0.9363387167453766, + "num_tokens": 24727686.0, + "step": 14460 + }, + { + "entropy": 0.11754573006182908, + "epoch": 3.3718382095815365, + "grad_norm": 3.078125, + "learning_rate": 4.987398358707206e-05, + "loss": 0.1205, + "mean_token_accuracy": 0.9647015869617462, + "num_tokens": 24747975.0, + "step": 14465 + }, + { + "entropy": 0.31885806322097776, + "epoch": 3.373003846602168, + "grad_norm": 5.9375, + "learning_rate": 4.98737672991637e-05, + "loss": 0.6347, + "mean_token_accuracy": 0.8757389962673188, + "num_tokens": 24774500.0, + "step": 14470 + }, + { + "entropy": 0.13117293193936347, + "epoch": 3.3741694836228, + "grad_norm": 8.25, + "learning_rate": 4.987355082674387e-05, + "loss": 0.2232, + "mean_token_accuracy": 0.9562461674213409, + "num_tokens": 24794508.0, + "step": 14475 + }, + { + "entropy": 0.08942113667726517, + "epoch": 3.3753351206434314, + "grad_norm": 1.3046875, + "learning_rate": 4.987333416981578e-05, + "loss": 0.1175, + "mean_token_accuracy": 0.9634603500366211, + "num_tokens": 24815483.0, + "step": 14480 + }, + { + "entropy": 0.09471773691475391, + "epoch": 3.3765007576640635, + "grad_norm": 3.046875, + "learning_rate": 4.987311732838267e-05, + "loss": 0.0834, + "mean_token_accuracy": 0.9728473007678986, + "num_tokens": 24840356.0, + "step": 14485 + }, + { + "entropy": 0.1439421821385622, + "epoch": 3.377666394684695, + "grad_norm": 2.96875, + "learning_rate": 4.9872900302447766e-05, + "loss": 0.1813, + "mean_token_accuracy": 0.9585707783699036, + "num_tokens": 24867551.0, + "step": 14490 + }, + { + "entropy": 0.13271769210696222, + "epoch": 3.378832031705327, + "grad_norm": 3.453125, + "learning_rate": 4.9872683092014315e-05, + "loss": 0.1605, + "mean_token_accuracy": 0.9479785680770874, + "num_tokens": 24886496.0, + "step": 14495 + }, + { + "entropy": 0.11814107708632945, + "epoch": 3.3799976687259585, + "grad_norm": 2.640625, + "learning_rate": 4.987246569708555e-05, + "loss": 0.1814, + "mean_token_accuracy": 0.9549219727516174, + "num_tokens": 24896522.0, + "step": 14500 + }, + { + "entropy": 0.09513367228209972, + "epoch": 3.3811633057465906, + "grad_norm": 2.859375, + "learning_rate": 4.9872248117664706e-05, + "loss": 0.12, + "mean_token_accuracy": 0.9677752256393433, + "num_tokens": 24914316.0, + "step": 14505 + }, + { + "entropy": 0.0879001509398222, + "epoch": 3.3823289427672223, + "grad_norm": 8.75, + "learning_rate": 4.987203035375503e-05, + "loss": 0.1097, + "mean_token_accuracy": 0.9770538091659546, + "num_tokens": 24941795.0, + "step": 14510 + }, + { + "entropy": 0.11058845482766629, + "epoch": 3.383494579787854, + "grad_norm": 0.61328125, + "learning_rate": 4.987181240535978e-05, + "loss": 0.1229, + "mean_token_accuracy": 0.9565304040908813, + "num_tokens": 24980354.0, + "step": 14515 + }, + { + "entropy": 0.11777700698003173, + "epoch": 3.384660216808486, + "grad_norm": 0.328125, + "learning_rate": 4.98715942724822e-05, + "loss": 0.2174, + "mean_token_accuracy": 0.955057030916214, + "num_tokens": 25003226.0, + "step": 14520 + }, + { + "entropy": 0.10959577150642871, + "epoch": 3.3858258538291177, + "grad_norm": 4.03125, + "learning_rate": 4.9871375955125535e-05, + "loss": 0.12, + "mean_token_accuracy": 0.9680540144443512, + "num_tokens": 25031189.0, + "step": 14525 + }, + { + "entropy": 0.16477933339774609, + "epoch": 3.3869914908497494, + "grad_norm": 6.1875, + "learning_rate": 4.9871157453293057e-05, + "loss": 0.2975, + "mean_token_accuracy": 0.9351774334907532, + "num_tokens": 25053474.0, + "step": 14530 + }, + { + "entropy": 0.14881827160716057, + "epoch": 3.388157127870381, + "grad_norm": 0.73046875, + "learning_rate": 4.987093876698801e-05, + "loss": 0.1589, + "mean_token_accuracy": 0.9479979932308197, + "num_tokens": 25098699.0, + "step": 14535 + }, + { + "entropy": 0.14751329086720943, + "epoch": 3.3893227648910127, + "grad_norm": 2.15625, + "learning_rate": 4.9870719896213654e-05, + "loss": 0.2256, + "mean_token_accuracy": 0.9514884114265442, + "num_tokens": 25119598.0, + "step": 14540 + }, + { + "entropy": 0.13617292381823062, + "epoch": 3.390488401911645, + "grad_norm": 5.25, + "learning_rate": 4.987050084097326e-05, + "loss": 0.2124, + "mean_token_accuracy": 0.9466132402420044, + "num_tokens": 25139817.0, + "step": 14545 + }, + { + "entropy": 0.13683177419006826, + "epoch": 3.3916540389322765, + "grad_norm": 1.0625, + "learning_rate": 4.987028160127009e-05, + "loss": 0.287, + "mean_token_accuracy": 0.9442753791809082, + "num_tokens": 25160490.0, + "step": 14550 + }, + { + "entropy": 0.15925323218107224, + "epoch": 3.392819675952908, + "grad_norm": 2.53125, + "learning_rate": 4.9870062177107425e-05, + "loss": 0.1181, + "mean_token_accuracy": 0.9543979823589325, + "num_tokens": 25182754.0, + "step": 14555 + }, + { + "entropy": 0.09092184137552976, + "epoch": 3.3939853129735402, + "grad_norm": 0.5859375, + "learning_rate": 4.986984256848852e-05, + "loss": 0.105, + "mean_token_accuracy": 0.9717929124832153, + "num_tokens": 25211791.0, + "step": 14560 + }, + { + "entropy": 0.10400575455278158, + "epoch": 3.395150949994172, + "grad_norm": 0.8984375, + "learning_rate": 4.986962277541665e-05, + "loss": 0.1191, + "mean_token_accuracy": 0.966571044921875, + "num_tokens": 25232556.0, + "step": 14565 + }, + { + "entropy": 0.10869345776736736, + "epoch": 3.3963165870148035, + "grad_norm": 3.4375, + "learning_rate": 4.986940279789511e-05, + "loss": 0.26, + "mean_token_accuracy": 0.9437666535377502, + "num_tokens": 25243127.0, + "step": 14570 + }, + { + "entropy": 0.14265891797840596, + "epoch": 3.397482224035435, + "grad_norm": 8.8125, + "learning_rate": 4.986918263592717e-05, + "loss": 0.2428, + "mean_token_accuracy": 0.947214013338089, + "num_tokens": 25259552.0, + "step": 14575 + }, + { + "entropy": 0.12260825484991074, + "epoch": 3.3986478610560673, + "grad_norm": 5.34375, + "learning_rate": 4.986896228951611e-05, + "loss": 0.1781, + "mean_token_accuracy": 0.9559799790382385, + "num_tokens": 25282016.0, + "step": 14580 + }, + { + "entropy": 0.16009515076875686, + "epoch": 3.399813498076699, + "grad_norm": 7.4375, + "learning_rate": 4.9868741758665216e-05, + "loss": 0.2073, + "mean_token_accuracy": 0.9487410664558411, + "num_tokens": 25298413.0, + "step": 14585 + }, + { + "entropy": 0.10344462431967258, + "epoch": 3.4009791350973306, + "grad_norm": 1.4296875, + "learning_rate": 4.986852104337778e-05, + "loss": 0.097, + "mean_token_accuracy": 0.9678920030593872, + "num_tokens": 25323048.0, + "step": 14590 + }, + { + "entropy": 0.11637456268072129, + "epoch": 3.4021447721179623, + "grad_norm": 4.25, + "learning_rate": 4.986830014365709e-05, + "loss": 0.1875, + "mean_token_accuracy": 0.9532294034957886, + "num_tokens": 25344447.0, + "step": 14595 + }, + { + "entropy": 0.12720008194446564, + "epoch": 3.4033104091385944, + "grad_norm": 3.5625, + "learning_rate": 4.9868079059506443e-05, + "loss": 0.1928, + "mean_token_accuracy": 0.9561345875263214, + "num_tokens": 25362513.0, + "step": 14600 + }, + { + "entropy": 0.10051075369119644, + "epoch": 3.404476046159226, + "grad_norm": 0.66015625, + "learning_rate": 4.986785779092914e-05, + "loss": 0.1544, + "mean_token_accuracy": 0.9672173917293548, + "num_tokens": 25382170.0, + "step": 14605 + }, + { + "entropy": 0.21919383555650712, + "epoch": 3.4056416831798577, + "grad_norm": 2.125, + "learning_rate": 4.986763633792847e-05, + "loss": 0.2736, + "mean_token_accuracy": 0.931623786687851, + "num_tokens": 25395526.0, + "step": 14610 + }, + { + "entropy": 0.13503132686018943, + "epoch": 3.4068073202004894, + "grad_norm": 8.25, + "learning_rate": 4.986741470050774e-05, + "loss": 0.2221, + "mean_token_accuracy": 0.9475953102111816, + "num_tokens": 25407123.0, + "step": 14615 + }, + { + "entropy": 0.10371251739561557, + "epoch": 3.4079729572211215, + "grad_norm": 3.328125, + "learning_rate": 4.986719287867025e-05, + "loss": 0.1929, + "mean_token_accuracy": 0.9592093110084534, + "num_tokens": 25431016.0, + "step": 14620 + }, + { + "entropy": 0.09616609346121549, + "epoch": 3.409138594241753, + "grad_norm": 0.58984375, + "learning_rate": 4.9866970872419324e-05, + "loss": 0.1478, + "mean_token_accuracy": 0.9624132871627807, + "num_tokens": 25449463.0, + "step": 14625 + }, + { + "entropy": 0.10277366153895855, + "epoch": 3.410304231262385, + "grad_norm": 1.515625, + "learning_rate": 4.986674868175826e-05, + "loss": 0.2068, + "mean_token_accuracy": 0.9529636323451995, + "num_tokens": 25473260.0, + "step": 14630 + }, + { + "entropy": 0.1336559236049652, + "epoch": 3.4114698682830165, + "grad_norm": 4.40625, + "learning_rate": 4.986652630669036e-05, + "loss": 0.1747, + "mean_token_accuracy": 0.9519469976425171, + "num_tokens": 25488285.0, + "step": 14635 + }, + { + "entropy": 0.12686145789921283, + "epoch": 3.4126355053036486, + "grad_norm": 0.5625, + "learning_rate": 4.9866303747218966e-05, + "loss": 0.2085, + "mean_token_accuracy": 0.9547830998897553, + "num_tokens": 25511643.0, + "step": 14640 + }, + { + "entropy": 0.16887935139238835, + "epoch": 3.4138011423242802, + "grad_norm": 0.62109375, + "learning_rate": 4.986608100334738e-05, + "loss": 0.264, + "mean_token_accuracy": 0.9450030744075775, + "num_tokens": 25529182.0, + "step": 14645 + }, + { + "entropy": 0.13547539040446283, + "epoch": 3.414966779344912, + "grad_norm": 2.859375, + "learning_rate": 4.9865858075078925e-05, + "loss": 0.1122, + "mean_token_accuracy": 0.9562264323234558, + "num_tokens": 25549237.0, + "step": 14650 + }, + { + "entropy": 0.10412461049854756, + "epoch": 3.416132416365544, + "grad_norm": 2.140625, + "learning_rate": 4.986563496241693e-05, + "loss": 0.1686, + "mean_token_accuracy": 0.9625400602817535, + "num_tokens": 25572688.0, + "step": 14655 + }, + { + "entropy": 0.20502724535763264, + "epoch": 3.4172980533861756, + "grad_norm": 5.125, + "learning_rate": 4.986541166536471e-05, + "loss": 0.3598, + "mean_token_accuracy": 0.9178643345832824, + "num_tokens": 25601340.0, + "step": 14660 + }, + { + "entropy": 0.1213500926271081, + "epoch": 3.4184636904068073, + "grad_norm": 0.66796875, + "learning_rate": 4.9865188183925614e-05, + "loss": 0.0992, + "mean_token_accuracy": 0.955825799703598, + "num_tokens": 25640563.0, + "step": 14665 + }, + { + "entropy": 0.12477913033217192, + "epoch": 3.419629327427439, + "grad_norm": 6.0625, + "learning_rate": 4.9864964518102955e-05, + "loss": 0.1823, + "mean_token_accuracy": 0.9556873321533204, + "num_tokens": 25665320.0, + "step": 14670 + }, + { + "entropy": 0.11235918961465359, + "epoch": 3.4207949644480706, + "grad_norm": 3.703125, + "learning_rate": 4.986474066790008e-05, + "loss": 0.1455, + "mean_token_accuracy": 0.9635839581489563, + "num_tokens": 25677602.0, + "step": 14675 + }, + { + "entropy": 0.11041145771741867, + "epoch": 3.4219606014687027, + "grad_norm": 2.578125, + "learning_rate": 4.986451663332033e-05, + "loss": 0.1105, + "mean_token_accuracy": 0.9651696503162384, + "num_tokens": 25701403.0, + "step": 14680 + }, + { + "entropy": 0.11710798554122448, + "epoch": 3.4231262384893344, + "grad_norm": 3.171875, + "learning_rate": 4.9864292414367035e-05, + "loss": 0.1645, + "mean_token_accuracy": 0.9639812111854553, + "num_tokens": 25714663.0, + "step": 14685 + }, + { + "entropy": 0.15187625922262668, + "epoch": 3.424291875509966, + "grad_norm": 7.8125, + "learning_rate": 4.986406801104354e-05, + "loss": 0.2054, + "mean_token_accuracy": 0.9400863349437714, + "num_tokens": 25725105.0, + "step": 14690 + }, + { + "entropy": 0.17266749329864978, + "epoch": 3.425457512530598, + "grad_norm": 0.62109375, + "learning_rate": 4.98638434233532e-05, + "loss": 0.2918, + "mean_token_accuracy": 0.9378050565719604, + "num_tokens": 25754734.0, + "step": 14695 + }, + { + "entropy": 0.10930528789758683, + "epoch": 3.42662314955123, + "grad_norm": 2.734375, + "learning_rate": 4.986361865129935e-05, + "loss": 0.0861, + "mean_token_accuracy": 0.9631672918796539, + "num_tokens": 25782665.0, + "step": 14700 + }, + { + "entropy": 0.09926526248455048, + "epoch": 3.4277887865718615, + "grad_norm": 0.99609375, + "learning_rate": 4.986339369488536e-05, + "loss": 0.101, + "mean_token_accuracy": 0.9671981871128082, + "num_tokens": 25804543.0, + "step": 14705 + }, + { + "entropy": 0.12382793109863996, + "epoch": 3.428954423592493, + "grad_norm": 12.375, + "learning_rate": 4.986316855411457e-05, + "loss": 0.2453, + "mean_token_accuracy": 0.9416598856449128, + "num_tokens": 25824472.0, + "step": 14710 + }, + { + "entropy": 0.13590311929583548, + "epoch": 3.4301200606131252, + "grad_norm": 7.21875, + "learning_rate": 4.986294322899035e-05, + "loss": 0.3006, + "mean_token_accuracy": 0.9415552854537964, + "num_tokens": 25841402.0, + "step": 14715 + }, + { + "entropy": 0.1355541491881013, + "epoch": 3.431285697633757, + "grad_norm": 0.58203125, + "learning_rate": 4.986271771951604e-05, + "loss": 0.1188, + "mean_token_accuracy": 0.9523917257785797, + "num_tokens": 25862438.0, + "step": 14720 + }, + { + "entropy": 0.1305895209312439, + "epoch": 3.4324513346543886, + "grad_norm": 1.421875, + "learning_rate": 4.986249202569501e-05, + "loss": 0.2254, + "mean_token_accuracy": 0.9471198916435242, + "num_tokens": 25876083.0, + "step": 14725 + }, + { + "entropy": 0.15410227999091147, + "epoch": 3.4336169716750202, + "grad_norm": 3.671875, + "learning_rate": 4.986226614753064e-05, + "loss": 0.3039, + "mean_token_accuracy": 0.9457458674907684, + "num_tokens": 25883947.0, + "step": 14730 + }, + { + "entropy": 0.13728398755192756, + "epoch": 3.4347826086956523, + "grad_norm": 3.0, + "learning_rate": 4.9862040085026286e-05, + "loss": 0.203, + "mean_token_accuracy": 0.9486221551895142, + "num_tokens": 25901558.0, + "step": 14735 + }, + { + "entropy": 0.08139637187123298, + "epoch": 3.435948245716284, + "grad_norm": 0.6875, + "learning_rate": 4.986181383818532e-05, + "loss": 0.1275, + "mean_token_accuracy": 0.9661568462848663, + "num_tokens": 25936465.0, + "step": 14740 + }, + { + "entropy": 0.21345613710582256, + "epoch": 3.4371138827369156, + "grad_norm": 1.9765625, + "learning_rate": 4.986158740701112e-05, + "loss": 0.2902, + "mean_token_accuracy": 0.9319839298725128, + "num_tokens": 25964623.0, + "step": 14745 + }, + { + "entropy": 0.12739054299890995, + "epoch": 3.4382795197575473, + "grad_norm": 5.21875, + "learning_rate": 4.986136079150705e-05, + "loss": 0.2276, + "mean_token_accuracy": 0.9527396023273468, + "num_tokens": 25974800.0, + "step": 14750 + }, + { + "entropy": 0.12611738108098508, + "epoch": 3.4394451567781794, + "grad_norm": 0.6953125, + "learning_rate": 4.98611339916765e-05, + "loss": 0.1493, + "mean_token_accuracy": 0.964297991991043, + "num_tokens": 25995660.0, + "step": 14755 + }, + { + "entropy": 0.09796325005590915, + "epoch": 3.440610793798811, + "grad_norm": 12.0625, + "learning_rate": 4.9860907007522853e-05, + "loss": 0.1836, + "mean_token_accuracy": 0.9559937536716461, + "num_tokens": 26013294.0, + "step": 14760 + }, + { + "entropy": 0.10678284913301468, + "epoch": 3.4417764308194427, + "grad_norm": 1.4453125, + "learning_rate": 4.98606798390495e-05, + "loss": 0.1968, + "mean_token_accuracy": 0.9488790988922119, + "num_tokens": 26032299.0, + "step": 14765 + }, + { + "entropy": 0.0958446266129613, + "epoch": 3.4429420678400744, + "grad_norm": 1.1015625, + "learning_rate": 4.9860452486259806e-05, + "loss": 0.1872, + "mean_token_accuracy": 0.9606073677539826, + "num_tokens": 26055375.0, + "step": 14770 + }, + { + "entropy": 0.17819878049194812, + "epoch": 3.4441077048607065, + "grad_norm": 12.125, + "learning_rate": 4.9860224949157175e-05, + "loss": 0.2588, + "mean_token_accuracy": 0.9511879086494446, + "num_tokens": 26075362.0, + "step": 14775 + }, + { + "entropy": 0.10445505864918232, + "epoch": 3.445273341881338, + "grad_norm": 0.7421875, + "learning_rate": 4.985999722774501e-05, + "loss": 0.0783, + "mean_token_accuracy": 0.96468505859375, + "num_tokens": 26102862.0, + "step": 14780 + }, + { + "entropy": 0.1406596891582012, + "epoch": 3.44643897890197, + "grad_norm": 14.75, + "learning_rate": 4.985976932202668e-05, + "loss": 0.2339, + "mean_token_accuracy": 0.9482826471328736, + "num_tokens": 26120838.0, + "step": 14785 + }, + { + "entropy": 0.18683748096227645, + "epoch": 3.447604615922602, + "grad_norm": 0.97265625, + "learning_rate": 4.9859541232005616e-05, + "loss": 0.2159, + "mean_token_accuracy": 0.9524526178836823, + "num_tokens": 26134765.0, + "step": 14790 + }, + { + "entropy": 0.11773408502340317, + "epoch": 3.4487702529432336, + "grad_norm": 3.109375, + "learning_rate": 4.985931295768519e-05, + "loss": 0.2107, + "mean_token_accuracy": 0.9583574175834656, + "num_tokens": 26147152.0, + "step": 14795 + }, + { + "entropy": 0.16266305055469274, + "epoch": 3.4499358899638652, + "grad_norm": 2.125, + "learning_rate": 4.985908449906882e-05, + "loss": 0.1388, + "mean_token_accuracy": 0.9389150321483613, + "num_tokens": 26179377.0, + "step": 14800 + }, + { + "entropy": 0.14590467549860478, + "epoch": 3.451101526984497, + "grad_norm": 1.171875, + "learning_rate": 4.985885585615991e-05, + "loss": 0.2239, + "mean_token_accuracy": 0.9484518945217133, + "num_tokens": 26204972.0, + "step": 14805 + }, + { + "entropy": 0.08754877429455518, + "epoch": 3.4522671640051286, + "grad_norm": 1.3359375, + "learning_rate": 4.985862702896188e-05, + "loss": 0.0818, + "mean_token_accuracy": 0.9678203165531158, + "num_tokens": 26229849.0, + "step": 14810 + }, + { + "entropy": 0.1233688484877348, + "epoch": 3.4534328010257607, + "grad_norm": 5.40625, + "learning_rate": 4.985839801747812e-05, + "loss": 0.17, + "mean_token_accuracy": 0.9536608278751373, + "num_tokens": 26253483.0, + "step": 14815 + }, + { + "entropy": 0.12931798473000528, + "epoch": 3.4545984380463923, + "grad_norm": 7.75, + "learning_rate": 4.9858168821712065e-05, + "loss": 0.2308, + "mean_token_accuracy": 0.9489045977592468, + "num_tokens": 26269779.0, + "step": 14820 + }, + { + "entropy": 0.08671133350580931, + "epoch": 3.455764075067024, + "grad_norm": 0.66796875, + "learning_rate": 4.9857939441667125e-05, + "loss": 0.0994, + "mean_token_accuracy": 0.9656893134117126, + "num_tokens": 26300644.0, + "step": 14825 + }, + { + "entropy": 0.09761807462200522, + "epoch": 3.456929712087656, + "grad_norm": 6.125, + "learning_rate": 4.985770987734672e-05, + "loss": 0.1859, + "mean_token_accuracy": 0.9515304028987884, + "num_tokens": 26319996.0, + "step": 14830 + }, + { + "entropy": 0.2023943942040205, + "epoch": 3.4580953491082878, + "grad_norm": 4.6875, + "learning_rate": 4.985748012875427e-05, + "loss": 0.2977, + "mean_token_accuracy": 0.9330312609672546, + "num_tokens": 26341742.0, + "step": 14835 + }, + { + "entropy": 0.19122447371482848, + "epoch": 3.4592609861289194, + "grad_norm": 3.34375, + "learning_rate": 4.985725019589321e-05, + "loss": 0.2801, + "mean_token_accuracy": 0.9373561441898346, + "num_tokens": 26367285.0, + "step": 14840 + }, + { + "entropy": 0.124190529063344, + "epoch": 3.460426623149551, + "grad_norm": 1.9453125, + "learning_rate": 4.985702007876696e-05, + "loss": 0.186, + "mean_token_accuracy": 0.9488656878471374, + "num_tokens": 26378772.0, + "step": 14845 + }, + { + "entropy": 0.13527803476899863, + "epoch": 3.461592260170183, + "grad_norm": 6.375, + "learning_rate": 4.985678977737895e-05, + "loss": 0.208, + "mean_token_accuracy": 0.9437292635440826, + "num_tokens": 26396490.0, + "step": 14850 + }, + { + "entropy": 0.09913870450109244, + "epoch": 3.462757897190815, + "grad_norm": 1.40625, + "learning_rate": 4.985655929173263e-05, + "loss": 0.1981, + "mean_token_accuracy": 0.950927072763443, + "num_tokens": 26413952.0, + "step": 14855 + }, + { + "entropy": 0.10881685335189104, + "epoch": 3.4639235342114465, + "grad_norm": 0.546875, + "learning_rate": 4.985632862183142e-05, + "loss": 0.1164, + "mean_token_accuracy": 0.9676409482955932, + "num_tokens": 26441690.0, + "step": 14860 + }, + { + "entropy": 0.11910134218633175, + "epoch": 3.465089171232078, + "grad_norm": 5.0, + "learning_rate": 4.9856097767678764e-05, + "loss": 0.1886, + "mean_token_accuracy": 0.9555825769901276, + "num_tokens": 26465495.0, + "step": 14865 + }, + { + "entropy": 0.12688817270100117, + "epoch": 3.4662548082527103, + "grad_norm": 11.5625, + "learning_rate": 4.9855866729278114e-05, + "loss": 0.1772, + "mean_token_accuracy": 0.951020085811615, + "num_tokens": 26486507.0, + "step": 14870 + }, + { + "entropy": 0.17504406850785018, + "epoch": 3.467420445273342, + "grad_norm": 4.34375, + "learning_rate": 4.985563550663289e-05, + "loss": 0.2582, + "mean_token_accuracy": 0.9411736011505127, + "num_tokens": 26503076.0, + "step": 14875 + }, + { + "entropy": 0.18092816174030305, + "epoch": 3.4685860822939736, + "grad_norm": 4.5, + "learning_rate": 4.9855404099746574e-05, + "loss": 0.3976, + "mean_token_accuracy": 0.929013067483902, + "num_tokens": 26511528.0, + "step": 14880 + }, + { + "entropy": 0.09570451155304908, + "epoch": 3.4697517193146052, + "grad_norm": 0.65234375, + "learning_rate": 4.985517250862259e-05, + "loss": 0.1375, + "mean_token_accuracy": 0.9654882431030274, + "num_tokens": 26541793.0, + "step": 14885 + }, + { + "entropy": 0.10951367020606995, + "epoch": 3.4709173563352373, + "grad_norm": 8.0, + "learning_rate": 4.98549407332644e-05, + "loss": 0.1434, + "mean_token_accuracy": 0.9649019360542297, + "num_tokens": 26557800.0, + "step": 14890 + }, + { + "entropy": 0.20391902434639633, + "epoch": 3.472082993355869, + "grad_norm": 4.125, + "learning_rate": 4.985470877367546e-05, + "loss": 0.4358, + "mean_token_accuracy": 0.9307071447372437, + "num_tokens": 26584988.0, + "step": 14895 + }, + { + "entropy": 0.1240554254502058, + "epoch": 3.4732486303765007, + "grad_norm": 4.15625, + "learning_rate": 4.985447662985924e-05, + "loss": 0.2078, + "mean_token_accuracy": 0.9551225781440735, + "num_tokens": 26611675.0, + "step": 14900 + }, + { + "entropy": 0.07346545197069645, + "epoch": 3.4744142673971323, + "grad_norm": 2.46875, + "learning_rate": 4.985424430181918e-05, + "loss": 0.0766, + "mean_token_accuracy": 0.9802018702030182, + "num_tokens": 26637716.0, + "step": 14905 + }, + { + "entropy": 0.10290342718362808, + "epoch": 3.4755799044177644, + "grad_norm": 0.7734375, + "learning_rate": 4.9854011789558764e-05, + "loss": 0.13, + "mean_token_accuracy": 0.963810783624649, + "num_tokens": 26661275.0, + "step": 14910 + }, + { + "entropy": 0.1287055004388094, + "epoch": 3.476745541438396, + "grad_norm": 1.75, + "learning_rate": 4.985377909308144e-05, + "loss": 0.1955, + "mean_token_accuracy": 0.9553277611732482, + "num_tokens": 26677624.0, + "step": 14915 + }, + { + "entropy": 0.11266841646283865, + "epoch": 3.4779111784590278, + "grad_norm": 3.546875, + "learning_rate": 4.98535462123907e-05, + "loss": 0.126, + "mean_token_accuracy": 0.9733292937278748, + "num_tokens": 26693952.0, + "step": 14920 + }, + { + "entropy": 0.1386932224035263, + "epoch": 3.47907681547966, + "grad_norm": 1.0078125, + "learning_rate": 4.985331314749e-05, + "loss": 0.2552, + "mean_token_accuracy": 0.9495406329631806, + "num_tokens": 26714106.0, + "step": 14925 + }, + { + "entropy": 0.10363805964589119, + "epoch": 3.4802424525002915, + "grad_norm": 4.0625, + "learning_rate": 4.985307989838282e-05, + "loss": 0.1596, + "mean_token_accuracy": 0.9670630037784577, + "num_tokens": 26737082.0, + "step": 14930 + }, + { + "entropy": 0.08988064369186759, + "epoch": 3.481408089520923, + "grad_norm": 4.34375, + "learning_rate": 4.985284646507264e-05, + "loss": 0.1481, + "mean_token_accuracy": 0.9675386667251586, + "num_tokens": 26760018.0, + "step": 14935 + }, + { + "entropy": 0.0987139768898487, + "epoch": 3.482573726541555, + "grad_norm": 0.6328125, + "learning_rate": 4.9852612847562936e-05, + "loss": 0.1844, + "mean_token_accuracy": 0.9589445412158966, + "num_tokens": 26779761.0, + "step": 14940 + }, + { + "entropy": 0.12049501091241836, + "epoch": 3.4837393635621865, + "grad_norm": 9.5625, + "learning_rate": 4.98523790458572e-05, + "loss": 0.2723, + "mean_token_accuracy": 0.9480070292949676, + "num_tokens": 26789666.0, + "step": 14945 + }, + { + "entropy": 0.08247858472168446, + "epoch": 3.4849050005828186, + "grad_norm": 1.1875, + "learning_rate": 4.985214505995891e-05, + "loss": 0.1239, + "mean_token_accuracy": 0.9717278420925141, + "num_tokens": 26813109.0, + "step": 14950 + }, + { + "entropy": 0.12373597361147404, + "epoch": 3.4860706376034503, + "grad_norm": 0.71484375, + "learning_rate": 4.9851910889871554e-05, + "loss": 0.1562, + "mean_token_accuracy": 0.9588066041469574, + "num_tokens": 26843536.0, + "step": 14955 + }, + { + "entropy": 0.16234209034591912, + "epoch": 3.487236274624082, + "grad_norm": 1.515625, + "learning_rate": 4.985167653559864e-05, + "loss": 0.2294, + "mean_token_accuracy": 0.9414169251918793, + "num_tokens": 26870752.0, + "step": 14960 + }, + { + "entropy": 0.1290282540023327, + "epoch": 3.488401911644714, + "grad_norm": 6.125, + "learning_rate": 4.9851441997143646e-05, + "loss": 0.2052, + "mean_token_accuracy": 0.9569578051567078, + "num_tokens": 26890897.0, + "step": 14965 + }, + { + "entropy": 0.12936341762542725, + "epoch": 3.4895675486653457, + "grad_norm": 6.0625, + "learning_rate": 4.985120727451007e-05, + "loss": 0.1832, + "mean_token_accuracy": 0.9518023371696472, + "num_tokens": 26911675.0, + "step": 14970 + }, + { + "entropy": 0.11797473523765803, + "epoch": 3.4907331856859773, + "grad_norm": 2.9375, + "learning_rate": 4.985097236770142e-05, + "loss": 0.1362, + "mean_token_accuracy": 0.9681229948997497, + "num_tokens": 26931840.0, + "step": 14975 + }, + { + "entropy": 0.12061390187591314, + "epoch": 3.491898822706609, + "grad_norm": 7.0, + "learning_rate": 4.985073727672119e-05, + "loss": 0.2357, + "mean_token_accuracy": 0.9532876312732697, + "num_tokens": 26944348.0, + "step": 14980 + }, + { + "entropy": 0.13178243041038512, + "epoch": 3.493064459727241, + "grad_norm": 6.21875, + "learning_rate": 4.9850502001572905e-05, + "loss": 0.212, + "mean_token_accuracy": 0.9557155132293701, + "num_tokens": 26962502.0, + "step": 14985 + }, + { + "entropy": 0.09238446112722158, + "epoch": 3.4942300967478728, + "grad_norm": 4.4375, + "learning_rate": 4.9850266542260044e-05, + "loss": 0.1369, + "mean_token_accuracy": 0.9672260642051697, + "num_tokens": 26979970.0, + "step": 14990 + }, + { + "entropy": 0.11178217800334096, + "epoch": 3.4953957337685044, + "grad_norm": 0.61328125, + "learning_rate": 4.985003089878614e-05, + "loss": 0.1052, + "mean_token_accuracy": 0.9628475129604339, + "num_tokens": 27000359.0, + "step": 14995 + }, + { + "entropy": 0.08186651319265366, + "epoch": 3.496561370789136, + "grad_norm": 0.390625, + "learning_rate": 4.9849795071154696e-05, + "loss": 0.0904, + "mean_token_accuracy": 0.9762087106704712, + "num_tokens": 27037109.0, + "step": 15000 + }, + { + "entropy": 0.11758413314819335, + "epoch": 3.497727007809768, + "grad_norm": 6.15625, + "learning_rate": 4.9849559059369236e-05, + "loss": 0.1745, + "mean_token_accuracy": 0.9557598829269409, + "num_tokens": 27053484.0, + "step": 15005 + }, + { + "entropy": 0.09997645281255245, + "epoch": 3.4988926448304, + "grad_norm": 1.0546875, + "learning_rate": 4.984932286343327e-05, + "loss": 0.152, + "mean_token_accuracy": 0.9670864582061768, + "num_tokens": 27076982.0, + "step": 15010 + }, + { + "entropy": 0.10639596097171307, + "epoch": 3.5000582818510315, + "grad_norm": 5.59375, + "learning_rate": 4.984908648335033e-05, + "loss": 0.1861, + "mean_token_accuracy": 0.957602858543396, + "num_tokens": 27090182.0, + "step": 15015 + }, + { + "entropy": 0.10824050679802895, + "epoch": 3.5012239188716636, + "grad_norm": 3.765625, + "learning_rate": 4.984884991912394e-05, + "loss": 0.1751, + "mean_token_accuracy": 0.9627082347869873, + "num_tokens": 27101437.0, + "step": 15020 + }, + { + "entropy": 0.1901700021699071, + "epoch": 3.5023895558922953, + "grad_norm": 4.78125, + "learning_rate": 4.984861317075762e-05, + "loss": 0.2803, + "mean_token_accuracy": 0.9506976902484894, + "num_tokens": 27115517.0, + "step": 15025 + }, + { + "entropy": 0.08258118499070406, + "epoch": 3.503555192912927, + "grad_norm": 1.0234375, + "learning_rate": 4.98483762382549e-05, + "loss": 0.0934, + "mean_token_accuracy": 0.9727859675884247, + "num_tokens": 27139907.0, + "step": 15030 + }, + { + "entropy": 0.1136773657053709, + "epoch": 3.5047208299335586, + "grad_norm": 6.59375, + "learning_rate": 4.984813912161932e-05, + "loss": 0.1184, + "mean_token_accuracy": 0.9658208072185517, + "num_tokens": 27170803.0, + "step": 15035 + }, + { + "entropy": 0.1513181956484914, + "epoch": 3.5058864669541903, + "grad_norm": 4.90625, + "learning_rate": 4.984790182085442e-05, + "loss": 0.2491, + "mean_token_accuracy": 0.943081659078598, + "num_tokens": 27201574.0, + "step": 15040 + }, + { + "entropy": 0.0724037921987474, + "epoch": 3.5070521039748224, + "grad_norm": 1.4765625, + "learning_rate": 4.984766433596372e-05, + "loss": 0.0629, + "mean_token_accuracy": 0.9772098004817963, + "num_tokens": 27232309.0, + "step": 15045 + }, + { + "entropy": 0.1291754100471735, + "epoch": 3.508217740995454, + "grad_norm": 5.46875, + "learning_rate": 4.984742666695078e-05, + "loss": 0.2822, + "mean_token_accuracy": 0.9417845368385315, + "num_tokens": 27242969.0, + "step": 15050 + }, + { + "entropy": 0.12059564888477325, + "epoch": 3.5093833780160857, + "grad_norm": 4.34375, + "learning_rate": 4.984718881381914e-05, + "loss": 0.2573, + "mean_token_accuracy": 0.9473318874835968, + "num_tokens": 27256073.0, + "step": 15055 + }, + { + "entropy": 0.11320340782403945, + "epoch": 3.510549015036718, + "grad_norm": 1.3984375, + "learning_rate": 4.984695077657234e-05, + "loss": 0.2205, + "mean_token_accuracy": 0.9602530896663666, + "num_tokens": 27268498.0, + "step": 15060 + }, + { + "entropy": 0.17097560949623586, + "epoch": 3.5117146520573495, + "grad_norm": 6.0, + "learning_rate": 4.984671255521393e-05, + "loss": 0.265, + "mean_token_accuracy": 0.944575160741806, + "num_tokens": 27287806.0, + "step": 15065 + }, + { + "entropy": 0.11076088473200799, + "epoch": 3.512880289077981, + "grad_norm": 4.0625, + "learning_rate": 4.984647414974747e-05, + "loss": 0.0947, + "mean_token_accuracy": 0.9570927262306214, + "num_tokens": 27312827.0, + "step": 15070 + }, + { + "entropy": 0.15406885892152786, + "epoch": 3.5140459260986128, + "grad_norm": 1.390625, + "learning_rate": 4.9846235560176505e-05, + "loss": 0.2675, + "mean_token_accuracy": 0.9428991496562957, + "num_tokens": 27334152.0, + "step": 15075 + }, + { + "entropy": 0.126622261852026, + "epoch": 3.5152115631192444, + "grad_norm": 2.578125, + "learning_rate": 4.98459967865046e-05, + "loss": 0.2088, + "mean_token_accuracy": 0.9512693524360657, + "num_tokens": 27344318.0, + "step": 15080 + }, + { + "entropy": 0.16098451465368271, + "epoch": 3.5163772001398765, + "grad_norm": 3.921875, + "learning_rate": 4.984575782873532e-05, + "loss": 0.2481, + "mean_token_accuracy": 0.9495172142982483, + "num_tokens": 27367074.0, + "step": 15085 + }, + { + "entropy": 0.178832789324224, + "epoch": 3.517542837160508, + "grad_norm": 13.25, + "learning_rate": 4.9845518686872215e-05, + "loss": 0.2985, + "mean_token_accuracy": 0.9411785483360291, + "num_tokens": 27383701.0, + "step": 15090 + }, + { + "entropy": 0.13766136653721334, + "epoch": 3.51870847418114, + "grad_norm": 0.8671875, + "learning_rate": 4.984527936091885e-05, + "loss": 0.2352, + "mean_token_accuracy": 0.9489962339401246, + "num_tokens": 27399990.0, + "step": 15095 + }, + { + "entropy": 0.14877081848680973, + "epoch": 3.519874111201772, + "grad_norm": 10.8125, + "learning_rate": 4.984503985087882e-05, + "loss": 0.21, + "mean_token_accuracy": 0.9492302298545837, + "num_tokens": 27413742.0, + "step": 15100 + }, + { + "entropy": 0.15225835423916578, + "epoch": 3.5210397482224036, + "grad_norm": 10.9375, + "learning_rate": 4.9844800156755665e-05, + "loss": 0.2496, + "mean_token_accuracy": 0.9365932464599609, + "num_tokens": 27426570.0, + "step": 15105 + }, + { + "entropy": 0.11053324565291404, + "epoch": 3.5222053852430353, + "grad_norm": 1.3984375, + "learning_rate": 4.9844560278552976e-05, + "loss": 0.1407, + "mean_token_accuracy": 0.9656516313552856, + "num_tokens": 27454187.0, + "step": 15110 + }, + { + "entropy": 0.0999959884211421, + "epoch": 3.523371022263667, + "grad_norm": 0.80078125, + "learning_rate": 4.9844320216274326e-05, + "loss": 0.1142, + "mean_token_accuracy": 0.9690761923789978, + "num_tokens": 27475207.0, + "step": 15115 + }, + { + "entropy": 0.1034471170976758, + "epoch": 3.5245366592842986, + "grad_norm": 5.46875, + "learning_rate": 4.9844079969923295e-05, + "loss": 0.1016, + "mean_token_accuracy": 0.9620785713195801, + "num_tokens": 27494393.0, + "step": 15120 + }, + { + "entropy": 0.12301951255649328, + "epoch": 3.5257022963049307, + "grad_norm": 2.59375, + "learning_rate": 4.984383953950346e-05, + "loss": 0.1582, + "mean_token_accuracy": 0.9564571857452393, + "num_tokens": 27522349.0, + "step": 15125 + }, + { + "entropy": 0.1520868118852377, + "epoch": 3.5268679333255624, + "grad_norm": 6.875, + "learning_rate": 4.984359892501842e-05, + "loss": 0.32, + "mean_token_accuracy": 0.9273856461048127, + "num_tokens": 27533402.0, + "step": 15130 + }, + { + "entropy": 0.1576168665662408, + "epoch": 3.528033570346194, + "grad_norm": 4.4375, + "learning_rate": 4.9843358126471746e-05, + "loss": 0.2757, + "mean_token_accuracy": 0.9358682692050934, + "num_tokens": 27555411.0, + "step": 15135 + }, + { + "entropy": 0.16003640741109848, + "epoch": 3.529199207366826, + "grad_norm": 8.4375, + "learning_rate": 4.984311714386704e-05, + "loss": 0.2395, + "mean_token_accuracy": 0.9503751337528229, + "num_tokens": 27563017.0, + "step": 15140 + }, + { + "entropy": 0.1150142002850771, + "epoch": 3.530364844387458, + "grad_norm": 0.67578125, + "learning_rate": 4.984287597720789e-05, + "loss": 0.2614, + "mean_token_accuracy": 0.952860701084137, + "num_tokens": 27586379.0, + "step": 15145 + }, + { + "entropy": 0.1206765715032816, + "epoch": 3.5315304814080895, + "grad_norm": 5.84375, + "learning_rate": 4.98426346264979e-05, + "loss": 0.2821, + "mean_token_accuracy": 0.9396198153495788, + "num_tokens": 27598286.0, + "step": 15150 + }, + { + "entropy": 0.14196645841002464, + "epoch": 3.5326961184287216, + "grad_norm": 14.6875, + "learning_rate": 4.984239309174066e-05, + "loss": 0.3251, + "mean_token_accuracy": 0.9355909168720246, + "num_tokens": 27608408.0, + "step": 15155 + }, + { + "entropy": 0.11324986163526773, + "epoch": 3.533861755449353, + "grad_norm": 4.75, + "learning_rate": 4.984215137293978e-05, + "loss": 0.1432, + "mean_token_accuracy": 0.963469636440277, + "num_tokens": 27627909.0, + "step": 15160 + }, + { + "entropy": 0.10810782052576542, + "epoch": 3.535027392469985, + "grad_norm": 2.75, + "learning_rate": 4.984190947009885e-05, + "loss": 0.1521, + "mean_token_accuracy": 0.9573879361152648, + "num_tokens": 27644139.0, + "step": 15165 + }, + { + "entropy": 0.1313298497349024, + "epoch": 3.5361930294906165, + "grad_norm": 1.4453125, + "learning_rate": 4.9841667383221494e-05, + "loss": 0.2283, + "mean_token_accuracy": 0.9522750377655029, + "num_tokens": 27654841.0, + "step": 15170 + }, + { + "entropy": 0.08479751572012902, + "epoch": 3.537358666511248, + "grad_norm": 1.984375, + "learning_rate": 4.984142511231131e-05, + "loss": 0.0905, + "mean_token_accuracy": 0.966051709651947, + "num_tokens": 27679994.0, + "step": 15175 + }, + { + "entropy": 0.10010197125375271, + "epoch": 3.5385243035318803, + "grad_norm": 1.4765625, + "learning_rate": 4.9841182657371913e-05, + "loss": 0.134, + "mean_token_accuracy": 0.96122225522995, + "num_tokens": 27700305.0, + "step": 15180 + }, + { + "entropy": 0.12406671978533268, + "epoch": 3.539689940552512, + "grad_norm": 4.59375, + "learning_rate": 4.984094001840693e-05, + "loss": 0.1602, + "mean_token_accuracy": 0.9643674075603486, + "num_tokens": 27722193.0, + "step": 15185 + }, + { + "entropy": 0.2311142822727561, + "epoch": 3.5408555775731436, + "grad_norm": 2.53125, + "learning_rate": 4.984069719541996e-05, + "loss": 0.4184, + "mean_token_accuracy": 0.9141157269477844, + "num_tokens": 27746084.0, + "step": 15190 + }, + { + "entropy": 0.10267463922500611, + "epoch": 3.5420212145937757, + "grad_norm": 1.25, + "learning_rate": 4.984045418841464e-05, + "loss": 0.1895, + "mean_token_accuracy": 0.9577461659908295, + "num_tokens": 27762974.0, + "step": 15195 + }, + { + "entropy": 0.142679588124156, + "epoch": 3.5431868516144074, + "grad_norm": 2.96875, + "learning_rate": 4.984021099739458e-05, + "loss": 0.2069, + "mean_token_accuracy": 0.9549462914466857, + "num_tokens": 27781410.0, + "step": 15200 + }, + { + "entropy": 0.09814808573573827, + "epoch": 3.544352488635039, + "grad_norm": 4.375, + "learning_rate": 4.983996762236342e-05, + "loss": 0.1375, + "mean_token_accuracy": 0.9615116119384766, + "num_tokens": 27803041.0, + "step": 15205 + }, + { + "entropy": 0.15655723661184312, + "epoch": 3.5455181256556707, + "grad_norm": 3.875, + "learning_rate": 4.9839724063324775e-05, + "loss": 0.2196, + "mean_token_accuracy": 0.9429301500320435, + "num_tokens": 27824317.0, + "step": 15210 + }, + { + "entropy": 0.12263799570500851, + "epoch": 3.5466837626763024, + "grad_norm": 7.53125, + "learning_rate": 4.9839480320282295e-05, + "loss": 0.1773, + "mean_token_accuracy": 0.9546642303466797, + "num_tokens": 27839225.0, + "step": 15215 + }, + { + "entropy": 0.08694457476958632, + "epoch": 3.5478493996969345, + "grad_norm": 0.6640625, + "learning_rate": 4.98392363932396e-05, + "loss": 0.0964, + "mean_token_accuracy": 0.9705213010311127, + "num_tokens": 27874861.0, + "step": 15220 + }, + { + "entropy": 0.13864086512476206, + "epoch": 3.549015036717566, + "grad_norm": 7.25, + "learning_rate": 4.983899228220032e-05, + "loss": 0.2244, + "mean_token_accuracy": 0.9488819360733032, + "num_tokens": 27902160.0, + "step": 15225 + }, + { + "entropy": 0.1462076909840107, + "epoch": 3.550180673738198, + "grad_norm": 4.65625, + "learning_rate": 4.983874798716812e-05, + "loss": 0.1449, + "mean_token_accuracy": 0.9579621255397797, + "num_tokens": 27921363.0, + "step": 15230 + }, + { + "entropy": 0.15108269155025483, + "epoch": 3.55134631075883, + "grad_norm": 10.4375, + "learning_rate": 4.983850350814662e-05, + "loss": 0.2535, + "mean_token_accuracy": 0.9454373240470886, + "num_tokens": 27939954.0, + "step": 15235 + }, + { + "entropy": 0.1413711801171303, + "epoch": 3.5525119477794616, + "grad_norm": 7.78125, + "learning_rate": 4.983825884513948e-05, + "loss": 0.1943, + "mean_token_accuracy": 0.9570164740085602, + "num_tokens": 27960307.0, + "step": 15240 + }, + { + "entropy": 0.16688954643905163, + "epoch": 3.553677584800093, + "grad_norm": 4.40625, + "learning_rate": 4.983801399815035e-05, + "loss": 0.1522, + "mean_token_accuracy": 0.9553694128990173, + "num_tokens": 27971180.0, + "step": 15245 + }, + { + "entropy": 0.11758850514888763, + "epoch": 3.554843221820725, + "grad_norm": 4.3125, + "learning_rate": 4.9837768967182855e-05, + "loss": 0.2191, + "mean_token_accuracy": 0.9551753103733063, + "num_tokens": 27981929.0, + "step": 15250 + }, + { + "entropy": 0.08926377333700657, + "epoch": 3.5560088588413565, + "grad_norm": 2.5625, + "learning_rate": 4.983752375224068e-05, + "loss": 0.1264, + "mean_token_accuracy": 0.970320975780487, + "num_tokens": 27997156.0, + "step": 15255 + }, + { + "entropy": 0.07973122633993626, + "epoch": 3.5571744958619886, + "grad_norm": 5.46875, + "learning_rate": 4.9837278353327466e-05, + "loss": 0.1264, + "mean_token_accuracy": 0.9706411957740784, + "num_tokens": 28015681.0, + "step": 15260 + }, + { + "entropy": 0.12640931233763694, + "epoch": 3.5583401328826203, + "grad_norm": 7.28125, + "learning_rate": 4.9837032770446876e-05, + "loss": 0.257, + "mean_token_accuracy": 0.9483764410018921, + "num_tokens": 28029400.0, + "step": 15265 + }, + { + "entropy": 0.16228528693318367, + "epoch": 3.559505769903252, + "grad_norm": 1.2734375, + "learning_rate": 4.9836787003602575e-05, + "loss": 0.3166, + "mean_token_accuracy": 0.9338264346122742, + "num_tokens": 28045344.0, + "step": 15270 + }, + { + "entropy": 0.11942652426660061, + "epoch": 3.560671406923884, + "grad_norm": 9.375, + "learning_rate": 4.983654105279822e-05, + "loss": 0.1878, + "mean_token_accuracy": 0.9502759516239166, + "num_tokens": 28066127.0, + "step": 15275 + }, + { + "entropy": 0.11859394274652005, + "epoch": 3.5618370439445157, + "grad_norm": 0.55078125, + "learning_rate": 4.983629491803748e-05, + "loss": 0.1743, + "mean_token_accuracy": 0.9581486761569977, + "num_tokens": 28081534.0, + "step": 15280 + }, + { + "entropy": 0.13153154766187072, + "epoch": 3.5630026809651474, + "grad_norm": 6.375, + "learning_rate": 4.983604859932404e-05, + "loss": 0.203, + "mean_token_accuracy": 0.9452811002731323, + "num_tokens": 28098550.0, + "step": 15285 + }, + { + "entropy": 0.11013969406485558, + "epoch": 3.5641683179857795, + "grad_norm": 5.03125, + "learning_rate": 4.983580209666155e-05, + "loss": 0.1825, + "mean_token_accuracy": 0.9638396859169006, + "num_tokens": 28111009.0, + "step": 15290 + }, + { + "entropy": 0.162632117792964, + "epoch": 3.565333955006411, + "grad_norm": 6.71875, + "learning_rate": 4.9835555410053696e-05, + "loss": 0.2456, + "mean_token_accuracy": 0.943684995174408, + "num_tokens": 28129844.0, + "step": 15295 + }, + { + "entropy": 0.158411131054163, + "epoch": 3.566499592027043, + "grad_norm": 3.921875, + "learning_rate": 4.983530853950416e-05, + "loss": 0.2244, + "mean_token_accuracy": 0.9570217251777648, + "num_tokens": 28152006.0, + "step": 15300 + }, + { + "entropy": 0.10298453513532876, + "epoch": 3.5676652290476745, + "grad_norm": 3.625, + "learning_rate": 4.983506148501662e-05, + "loss": 0.154, + "mean_token_accuracy": 0.9592744171619415, + "num_tokens": 28176371.0, + "step": 15305 + }, + { + "entropy": 0.12455929182469845, + "epoch": 3.568830866068306, + "grad_norm": 3.0, + "learning_rate": 4.983481424659476e-05, + "loss": 0.1722, + "mean_token_accuracy": 0.9571922421455383, + "num_tokens": 28194824.0, + "step": 15310 + }, + { + "entropy": 0.17322831777855754, + "epoch": 3.5699965030889382, + "grad_norm": 8.4375, + "learning_rate": 4.983456682424227e-05, + "loss": 0.3505, + "mean_token_accuracy": 0.9319654107093811, + "num_tokens": 28220220.0, + "step": 15315 + }, + { + "entropy": 0.16658639833331107, + "epoch": 3.57116214010957, + "grad_norm": 4.0625, + "learning_rate": 4.983431921796284e-05, + "loss": 0.3279, + "mean_token_accuracy": 0.9347608745098114, + "num_tokens": 28238594.0, + "step": 15320 + }, + { + "entropy": 0.22185911796987057, + "epoch": 3.5723277771302016, + "grad_norm": 1.1484375, + "learning_rate": 4.983407142776015e-05, + "loss": 0.2466, + "mean_token_accuracy": 0.937779426574707, + "num_tokens": 28269567.0, + "step": 15325 + }, + { + "entropy": 0.12034954987466336, + "epoch": 3.5734934141508337, + "grad_norm": 2.484375, + "learning_rate": 4.983382345363791e-05, + "loss": 0.1659, + "mean_token_accuracy": 0.9600392401218414, + "num_tokens": 28286618.0, + "step": 15330 + }, + { + "entropy": 0.13138836286962033, + "epoch": 3.5746590511714653, + "grad_norm": 6.5, + "learning_rate": 4.98335752955998e-05, + "loss": 0.1817, + "mean_token_accuracy": 0.9593860268592834, + "num_tokens": 28314646.0, + "step": 15335 + }, + { + "entropy": 0.0884293843060732, + "epoch": 3.575824688192097, + "grad_norm": 0.65234375, + "learning_rate": 4.983332695364955e-05, + "loss": 0.0927, + "mean_token_accuracy": 0.973773580789566, + "num_tokens": 28348623.0, + "step": 15340 + }, + { + "entropy": 0.12610990218818188, + "epoch": 3.5769903252127286, + "grad_norm": 4.84375, + "learning_rate": 4.983307842779083e-05, + "loss": 0.1848, + "mean_token_accuracy": 0.9506776928901672, + "num_tokens": 28359937.0, + "step": 15345 + }, + { + "entropy": 0.11546969823539258, + "epoch": 3.5781559622333603, + "grad_norm": 2.96875, + "learning_rate": 4.9832829718027364e-05, + "loss": 0.1492, + "mean_token_accuracy": 0.9545735061168671, + "num_tokens": 28371610.0, + "step": 15350 + }, + { + "entropy": 0.14369152104482055, + "epoch": 3.5793215992539924, + "grad_norm": 4.5, + "learning_rate": 4.9832580824362856e-05, + "loss": 0.2731, + "mean_token_accuracy": 0.9323119401931763, + "num_tokens": 28414897.0, + "step": 15355 + }, + { + "entropy": 0.11302361041307449, + "epoch": 3.580487236274624, + "grad_norm": 4.9375, + "learning_rate": 4.9832331746801017e-05, + "loss": 0.1324, + "mean_token_accuracy": 0.961307954788208, + "num_tokens": 28439232.0, + "step": 15360 + }, + { + "entropy": 0.11460042167454958, + "epoch": 3.5816528732952557, + "grad_norm": 1.875, + "learning_rate": 4.9832082485345573e-05, + "loss": 0.1597, + "mean_token_accuracy": 0.9591732680797577, + "num_tokens": 28472770.0, + "step": 15365 + }, + { + "entropy": 0.1156302273273468, + "epoch": 3.582818510315888, + "grad_norm": 9.125, + "learning_rate": 4.983183304000022e-05, + "loss": 0.2583, + "mean_token_accuracy": 0.9537204146385193, + "num_tokens": 28483640.0, + "step": 15370 + }, + { + "entropy": 0.11941173635423183, + "epoch": 3.5839841473365195, + "grad_norm": 0.9140625, + "learning_rate": 4.9831583410768695e-05, + "loss": 0.089, + "mean_token_accuracy": 0.9625566601753235, + "num_tokens": 28515232.0, + "step": 15375 + }, + { + "entropy": 0.16708160936832428, + "epoch": 3.585149784357151, + "grad_norm": 3.96875, + "learning_rate": 4.983133359765471e-05, + "loss": 0.3082, + "mean_token_accuracy": 0.9396808683872223, + "num_tokens": 28525110.0, + "step": 15380 + }, + { + "entropy": 0.10022514667361974, + "epoch": 3.586315421377783, + "grad_norm": 0.59765625, + "learning_rate": 4.9831083600661995e-05, + "loss": 0.116, + "mean_token_accuracy": 0.9687018394470215, + "num_tokens": 28547249.0, + "step": 15385 + }, + { + "entropy": 0.10627367310225963, + "epoch": 3.5874810583984145, + "grad_norm": 0.96875, + "learning_rate": 4.983083341979427e-05, + "loss": 0.151, + "mean_token_accuracy": 0.9632999539375305, + "num_tokens": 28562994.0, + "step": 15390 + }, + { + "entropy": 0.0709003258496523, + "epoch": 3.5886466954190466, + "grad_norm": 0.76953125, + "learning_rate": 4.983058305505528e-05, + "loss": 0.058, + "mean_token_accuracy": 0.9755187392234802, + "num_tokens": 28598074.0, + "step": 15395 + }, + { + "entropy": 0.12664932943880558, + "epoch": 3.5898123324396782, + "grad_norm": 9.875, + "learning_rate": 4.983033250644875e-05, + "loss": 0.1625, + "mean_token_accuracy": 0.9476115763187408, + "num_tokens": 28619669.0, + "step": 15400 + }, + { + "entropy": 0.11317505575716495, + "epoch": 3.59097796946031, + "grad_norm": 3.640625, + "learning_rate": 4.983008177397841e-05, + "loss": 0.1903, + "mean_token_accuracy": 0.9582743942737579, + "num_tokens": 28630736.0, + "step": 15405 + }, + { + "entropy": 0.20718681290745736, + "epoch": 3.592143606480942, + "grad_norm": 5.875, + "learning_rate": 4.9829830857648016e-05, + "loss": 0.3451, + "mean_token_accuracy": 0.9288463413715362, + "num_tokens": 28638695.0, + "step": 15410 + }, + { + "entropy": 0.10314843617379665, + "epoch": 3.5933092435015737, + "grad_norm": 0.79296875, + "learning_rate": 4.98295797574613e-05, + "loss": 0.1336, + "mean_token_accuracy": 0.9661559939384461, + "num_tokens": 28673027.0, + "step": 15415 + }, + { + "entropy": 0.10093957595527173, + "epoch": 3.5944748805222053, + "grad_norm": 5.5625, + "learning_rate": 4.982932847342199e-05, + "loss": 0.1342, + "mean_token_accuracy": 0.9647237479686737, + "num_tokens": 28685524.0, + "step": 15420 + }, + { + "entropy": 0.1997634179890156, + "epoch": 3.5956405175428374, + "grad_norm": 4.0, + "learning_rate": 4.982907700553386e-05, + "loss": 0.3577, + "mean_token_accuracy": 0.9233972787857055, + "num_tokens": 28707734.0, + "step": 15425 + }, + { + "entropy": 0.20143613442778588, + "epoch": 3.596806154563469, + "grad_norm": 5.78125, + "learning_rate": 4.982882535380064e-05, + "loss": 0.248, + "mean_token_accuracy": 0.9392694950103759, + "num_tokens": 28717460.0, + "step": 15430 + }, + { + "entropy": 0.15649377331137657, + "epoch": 3.5979717915841007, + "grad_norm": 0.8046875, + "learning_rate": 4.98285735182261e-05, + "loss": 0.2217, + "mean_token_accuracy": 0.9479660391807556, + "num_tokens": 28739023.0, + "step": 15435 + }, + { + "entropy": 0.10691715404391289, + "epoch": 3.5991374286047324, + "grad_norm": 1.703125, + "learning_rate": 4.982832149881398e-05, + "loss": 0.1454, + "mean_token_accuracy": 0.9623469948768616, + "num_tokens": 28752932.0, + "step": 15440 + }, + { + "entropy": 0.07475876267999411, + "epoch": 3.600303065625364, + "grad_norm": 1.7578125, + "learning_rate": 4.982806929556804e-05, + "loss": 0.1132, + "mean_token_accuracy": 0.9730147421360016, + "num_tokens": 28784051.0, + "step": 15445 + }, + { + "entropy": 0.24029637901112438, + "epoch": 3.601468702645996, + "grad_norm": 11.625, + "learning_rate": 4.982781690849205e-05, + "loss": 0.5745, + "mean_token_accuracy": 0.9266455054283143, + "num_tokens": 28817951.0, + "step": 15450 + }, + { + "entropy": 0.1538819193840027, + "epoch": 3.602634339666628, + "grad_norm": 9.4375, + "learning_rate": 4.9827564337589764e-05, + "loss": 0.264, + "mean_token_accuracy": 0.9450575411319733, + "num_tokens": 28836860.0, + "step": 15455 + }, + { + "entropy": 0.1413545010611415, + "epoch": 3.6037999766872595, + "grad_norm": 1.0546875, + "learning_rate": 4.982731158286495e-05, + "loss": 0.1918, + "mean_token_accuracy": 0.9462571084499359, + "num_tokens": 28868174.0, + "step": 15460 + }, + { + "entropy": 0.08998983614146709, + "epoch": 3.6049656137078916, + "grad_norm": 1.484375, + "learning_rate": 4.982705864432139e-05, + "loss": 0.0978, + "mean_token_accuracy": 0.9719910442829132, + "num_tokens": 28896694.0, + "step": 15465 + }, + { + "entropy": 0.09497287701815367, + "epoch": 3.6061312507285233, + "grad_norm": 2.921875, + "learning_rate": 4.982680552196284e-05, + "loss": 0.1236, + "mean_token_accuracy": 0.9718452095985413, + "num_tokens": 28930529.0, + "step": 15470 + }, + { + "entropy": 0.09434767477214337, + "epoch": 3.607296887749155, + "grad_norm": 4.6875, + "learning_rate": 4.982655221579308e-05, + "loss": 0.1505, + "mean_token_accuracy": 0.9634428501129151, + "num_tokens": 28946170.0, + "step": 15475 + }, + { + "entropy": 0.11451233848929405, + "epoch": 3.6084625247697866, + "grad_norm": 2.234375, + "learning_rate": 4.982629872581588e-05, + "loss": 0.1832, + "mean_token_accuracy": 0.9608821451663971, + "num_tokens": 28969256.0, + "step": 15480 + }, + { + "entropy": 0.09396213125437498, + "epoch": 3.6096281617904182, + "grad_norm": 0.65625, + "learning_rate": 4.9826045052035026e-05, + "loss": 0.1419, + "mean_token_accuracy": 0.9580700159072876, + "num_tokens": 28991020.0, + "step": 15485 + }, + { + "entropy": 0.10509989410638809, + "epoch": 3.6107937988110503, + "grad_norm": 1.3046875, + "learning_rate": 4.9825791194454304e-05, + "loss": 0.1666, + "mean_token_accuracy": 0.9542161762714386, + "num_tokens": 29011662.0, + "step": 15490 + }, + { + "entropy": 0.17095814682543278, + "epoch": 3.611959435831682, + "grad_norm": 3.75, + "learning_rate": 4.9825537153077496e-05, + "loss": 0.2074, + "mean_token_accuracy": 0.936936765909195, + "num_tokens": 29032831.0, + "step": 15495 + }, + { + "entropy": 0.08246345948427916, + "epoch": 3.6131250728523137, + "grad_norm": 3.75, + "learning_rate": 4.982528292790839e-05, + "loss": 0.1081, + "mean_token_accuracy": 0.9697481095790863, + "num_tokens": 29053059.0, + "step": 15500 + }, + { + "entropy": 0.11737008113414049, + "epoch": 3.6142907098729458, + "grad_norm": 0.353515625, + "learning_rate": 4.9825028518950776e-05, + "loss": 0.1684, + "mean_token_accuracy": 0.9596979439258575, + "num_tokens": 29072514.0, + "step": 15505 + }, + { + "entropy": 0.14000843446701766, + "epoch": 3.6154563468935774, + "grad_norm": 4.03125, + "learning_rate": 4.9824773926208455e-05, + "loss": 0.1935, + "mean_token_accuracy": 0.9475890219211578, + "num_tokens": 29088807.0, + "step": 15510 + }, + { + "entropy": 0.08790351636707783, + "epoch": 3.616621983914209, + "grad_norm": 3.3125, + "learning_rate": 4.982451914968521e-05, + "loss": 0.1464, + "mean_token_accuracy": 0.9602734923362732, + "num_tokens": 29102568.0, + "step": 15515 + }, + { + "entropy": 0.12344647385179996, + "epoch": 3.6177876209348407, + "grad_norm": 1.421875, + "learning_rate": 4.982426418938485e-05, + "loss": 0.1908, + "mean_token_accuracy": 0.9598969042301178, + "num_tokens": 29127406.0, + "step": 15520 + }, + { + "entropy": 0.13656870573759078, + "epoch": 3.6189532579554724, + "grad_norm": 1.96875, + "learning_rate": 4.9824009045311174e-05, + "loss": 0.2374, + "mean_token_accuracy": 0.946445894241333, + "num_tokens": 29139309.0, + "step": 15525 + }, + { + "entropy": 0.12615330442786216, + "epoch": 3.6201188949761045, + "grad_norm": 4.1875, + "learning_rate": 4.982375371746799e-05, + "loss": 0.1767, + "mean_token_accuracy": 0.9628724992275238, + "num_tokens": 29158047.0, + "step": 15530 + }, + { + "entropy": 0.13344797156751156, + "epoch": 3.621284531996736, + "grad_norm": 6.34375, + "learning_rate": 4.9823498205859096e-05, + "loss": 0.2194, + "mean_token_accuracy": 0.953150063753128, + "num_tokens": 29172389.0, + "step": 15535 + }, + { + "entropy": 0.1036421962082386, + "epoch": 3.622450169017368, + "grad_norm": 4.875, + "learning_rate": 4.9823242510488315e-05, + "loss": 0.1068, + "mean_token_accuracy": 0.9691031992435455, + "num_tokens": 29195919.0, + "step": 15540 + }, + { + "entropy": 0.11982145886868238, + "epoch": 3.623615806038, + "grad_norm": 1.03125, + "learning_rate": 4.9822986631359455e-05, + "loss": 0.1182, + "mean_token_accuracy": 0.9620106518268585, + "num_tokens": 29225367.0, + "step": 15545 + }, + { + "entropy": 0.10043896548449993, + "epoch": 3.6247814430586316, + "grad_norm": 1.9296875, + "learning_rate": 4.982273056847632e-05, + "loss": 0.1494, + "mean_token_accuracy": 0.962291032075882, + "num_tokens": 29245781.0, + "step": 15550 + }, + { + "entropy": 0.12442281097173691, + "epoch": 3.6259470800792633, + "grad_norm": 5.25, + "learning_rate": 4.982247432184275e-05, + "loss": 0.1443, + "mean_token_accuracy": 0.9543093621730805, + "num_tokens": 29272352.0, + "step": 15555 + }, + { + "entropy": 0.09449044466018677, + "epoch": 3.6271127170998954, + "grad_norm": 3.03125, + "learning_rate": 4.982221789146254e-05, + "loss": 0.1245, + "mean_token_accuracy": 0.965887975692749, + "num_tokens": 29292819.0, + "step": 15560 + }, + { + "entropy": 0.11279590725898743, + "epoch": 3.628278354120527, + "grad_norm": 7.09375, + "learning_rate": 4.982196127733954e-05, + "loss": 0.2447, + "mean_token_accuracy": 0.9535762310028076, + "num_tokens": 29317518.0, + "step": 15565 + }, + { + "entropy": 0.07748936731368303, + "epoch": 3.6294439911411587, + "grad_norm": 2.46875, + "learning_rate": 4.982170447947755e-05, + "loss": 0.0748, + "mean_token_accuracy": 0.9684795379638672, + "num_tokens": 29338480.0, + "step": 15570 + }, + { + "entropy": 0.17385158613324164, + "epoch": 3.6306096281617903, + "grad_norm": 0.90234375, + "learning_rate": 4.982144749788042e-05, + "loss": 0.3076, + "mean_token_accuracy": 0.922554886341095, + "num_tokens": 29357470.0, + "step": 15575 + }, + { + "entropy": 0.11460159979760647, + "epoch": 3.631775265182422, + "grad_norm": 6.5625, + "learning_rate": 4.9821190332551975e-05, + "loss": 0.1568, + "mean_token_accuracy": 0.9668726980686188, + "num_tokens": 29372212.0, + "step": 15580 + }, + { + "entropy": 0.15606100708246232, + "epoch": 3.632940902203054, + "grad_norm": 5.15625, + "learning_rate": 4.982093298349605e-05, + "loss": 0.2635, + "mean_token_accuracy": 0.944490659236908, + "num_tokens": 29394169.0, + "step": 15585 + }, + { + "entropy": 0.0882917718961835, + "epoch": 3.6341065392236858, + "grad_norm": 4.3125, + "learning_rate": 4.9820675450716484e-05, + "loss": 0.1031, + "mean_token_accuracy": 0.9658193111419677, + "num_tokens": 29408975.0, + "step": 15590 + }, + { + "entropy": 0.07895313240587712, + "epoch": 3.6352721762443174, + "grad_norm": 5.8125, + "learning_rate": 4.982041773421712e-05, + "loss": 0.0786, + "mean_token_accuracy": 0.9723894357681274, + "num_tokens": 29434543.0, + "step": 15595 + }, + { + "entropy": 0.15614924132823943, + "epoch": 3.6364378132649495, + "grad_norm": 1.2109375, + "learning_rate": 4.982015983400179e-05, + "loss": 0.3187, + "mean_token_accuracy": 0.9334610044956207, + "num_tokens": 29452989.0, + "step": 15600 + }, + { + "entropy": 0.08418496306985616, + "epoch": 3.637603450285581, + "grad_norm": 1.0546875, + "learning_rate": 4.981990175007435e-05, + "loss": 0.1543, + "mean_token_accuracy": 0.9651324570178985, + "num_tokens": 29471768.0, + "step": 15605 + }, + { + "entropy": 0.099199612531811, + "epoch": 3.638769087306213, + "grad_norm": 1.15625, + "learning_rate": 4.981964348243864e-05, + "loss": 0.1925, + "mean_token_accuracy": 0.9549100041389466, + "num_tokens": 29498761.0, + "step": 15610 + }, + { + "entropy": 0.2713124416768551, + "epoch": 3.6399347243268445, + "grad_norm": 7.75, + "learning_rate": 4.981938503109852e-05, + "loss": 0.5981, + "mean_token_accuracy": 0.9235803484916687, + "num_tokens": 29516944.0, + "step": 15615 + }, + { + "entropy": 0.23586437962949275, + "epoch": 3.641100361347476, + "grad_norm": 1.8984375, + "learning_rate": 4.981912639605784e-05, + "loss": 0.4017, + "mean_token_accuracy": 0.9161858439445496, + "num_tokens": 29533561.0, + "step": 15620 + }, + { + "entropy": 0.13091112915426492, + "epoch": 3.6422659983681083, + "grad_norm": 6.8125, + "learning_rate": 4.981886757732045e-05, + "loss": 0.2171, + "mean_token_accuracy": 0.9459374427795411, + "num_tokens": 29563131.0, + "step": 15625 + }, + { + "entropy": 0.3564877349883318, + "epoch": 3.64343163538874, + "grad_norm": 2.203125, + "learning_rate": 4.9818608574890226e-05, + "loss": 0.4436, + "mean_token_accuracy": 0.9162199199199677, + "num_tokens": 29591151.0, + "step": 15630 + }, + { + "entropy": 0.08597991708666086, + "epoch": 3.6445972724093716, + "grad_norm": 1.2578125, + "learning_rate": 4.981834938877101e-05, + "loss": 0.0575, + "mean_token_accuracy": 0.9764993786811829, + "num_tokens": 29621998.0, + "step": 15635 + }, + { + "entropy": 0.14188469909131526, + "epoch": 3.6457629094300037, + "grad_norm": 1.921875, + "learning_rate": 4.981809001896668e-05, + "loss": 0.2125, + "mean_token_accuracy": 0.9538449347019196, + "num_tokens": 29636259.0, + "step": 15640 + }, + { + "entropy": 0.15098804775625468, + "epoch": 3.6469285464506354, + "grad_norm": 0.5703125, + "learning_rate": 4.9817830465481106e-05, + "loss": 0.1794, + "mean_token_accuracy": 0.9561287760734558, + "num_tokens": 29658153.0, + "step": 15645 + }, + { + "entropy": 0.11412147097289563, + "epoch": 3.648094183471267, + "grad_norm": 1.3984375, + "learning_rate": 4.981757072831815e-05, + "loss": 0.1546, + "mean_token_accuracy": 0.9611155807971954, + "num_tokens": 29675553.0, + "step": 15650 + }, + { + "entropy": 0.11966407895088196, + "epoch": 3.6492598204918987, + "grad_norm": 4.375, + "learning_rate": 4.981731080748169e-05, + "loss": 0.1912, + "mean_token_accuracy": 0.9574562072753906, + "num_tokens": 29698339.0, + "step": 15655 + }, + { + "entropy": 0.1513479059562087, + "epoch": 3.6504254575125303, + "grad_norm": 4.03125, + "learning_rate": 4.981705070297559e-05, + "loss": 0.1603, + "mean_token_accuracy": 0.9487121284008027, + "num_tokens": 29733698.0, + "step": 15660 + }, + { + "entropy": 0.11037451829761266, + "epoch": 3.6515910945331624, + "grad_norm": 2.609375, + "learning_rate": 4.981679041480375e-05, + "loss": 0.207, + "mean_token_accuracy": 0.9578937530517578, + "num_tokens": 29749055.0, + "step": 15665 + }, + { + "entropy": 0.09969578366726636, + "epoch": 3.652756731553794, + "grad_norm": 0.8046875, + "learning_rate": 4.981652994297003e-05, + "loss": 0.0758, + "mean_token_accuracy": 0.97415571808815, + "num_tokens": 29789590.0, + "step": 15670 + }, + { + "entropy": 0.13219497799873353, + "epoch": 3.6539223685744258, + "grad_norm": 7.96875, + "learning_rate": 4.981626928747832e-05, + "loss": 0.2559, + "mean_token_accuracy": 0.9514385044574738, + "num_tokens": 29804384.0, + "step": 15675 + }, + { + "entropy": 0.12863097973167897, + "epoch": 3.655088005595058, + "grad_norm": 2.125, + "learning_rate": 4.981600844833253e-05, + "loss": 0.1296, + "mean_token_accuracy": 0.9587861657142639, + "num_tokens": 29822822.0, + "step": 15680 + }, + { + "entropy": 0.13467212654650212, + "epoch": 3.6562536426156895, + "grad_norm": 6.625, + "learning_rate": 4.9815747425536505e-05, + "loss": 0.2266, + "mean_token_accuracy": 0.953845477104187, + "num_tokens": 29839053.0, + "step": 15685 + }, + { + "entropy": 0.1336902890354395, + "epoch": 3.657419279636321, + "grad_norm": 1.9921875, + "learning_rate": 4.981548621909418e-05, + "loss": 0.1921, + "mean_token_accuracy": 0.9556374251842499, + "num_tokens": 29853597.0, + "step": 15690 + }, + { + "entropy": 0.1312162894755602, + "epoch": 3.6585849166569533, + "grad_norm": 5.34375, + "learning_rate": 4.9815224829009424e-05, + "loss": 0.1911, + "mean_token_accuracy": 0.9566432774066925, + "num_tokens": 29881783.0, + "step": 15695 + }, + { + "entropy": 0.1050132367759943, + "epoch": 3.659750553677585, + "grad_norm": 1.1953125, + "learning_rate": 4.9814963255286144e-05, + "loss": 0.16, + "mean_token_accuracy": 0.9637480795383453, + "num_tokens": 29905353.0, + "step": 15700 + }, + { + "entropy": 0.0904469602741301, + "epoch": 3.6609161906982166, + "grad_norm": 1.640625, + "learning_rate": 4.981470149792824e-05, + "loss": 0.1005, + "mean_token_accuracy": 0.9642527401447296, + "num_tokens": 29931329.0, + "step": 15705 + }, + { + "entropy": 0.09534500148147344, + "epoch": 3.6620818277188483, + "grad_norm": 5.59375, + "learning_rate": 4.981443955693962e-05, + "loss": 0.1104, + "mean_token_accuracy": 0.9741943538188934, + "num_tokens": 29962728.0, + "step": 15710 + }, + { + "entropy": 0.09082780564203859, + "epoch": 3.66324746473948, + "grad_norm": 0.5078125, + "learning_rate": 4.981417743232417e-05, + "loss": 0.1501, + "mean_token_accuracy": 0.9552464723587036, + "num_tokens": 29990387.0, + "step": 15715 + }, + { + "entropy": 0.126339378207922, + "epoch": 3.664413101760112, + "grad_norm": 9.0625, + "learning_rate": 4.981391512408582e-05, + "loss": 0.2424, + "mean_token_accuracy": 0.9529932260513305, + "num_tokens": 30000074.0, + "step": 15720 + }, + { + "entropy": 0.1250433325767517, + "epoch": 3.6655787387807437, + "grad_norm": 6.5625, + "learning_rate": 4.981365263222848e-05, + "loss": 0.141, + "mean_token_accuracy": 0.9634235501289368, + "num_tokens": 30015666.0, + "step": 15725 + }, + { + "entropy": 0.11333463415503502, + "epoch": 3.6667443758013754, + "grad_norm": 3.640625, + "learning_rate": 4.981338995675605e-05, + "loss": 0.2123, + "mean_token_accuracy": 0.9570781469345093, + "num_tokens": 30026972.0, + "step": 15730 + }, + { + "entropy": 0.18440841864794494, + "epoch": 3.6679100128220075, + "grad_norm": 6.3125, + "learning_rate": 4.981312709767246e-05, + "loss": 0.3143, + "mean_token_accuracy": 0.9335676729679108, + "num_tokens": 30058406.0, + "step": 15735 + }, + { + "entropy": 0.171936047822237, + "epoch": 3.669075649842639, + "grad_norm": 7.78125, + "learning_rate": 4.981286405498163e-05, + "loss": 0.23, + "mean_token_accuracy": 0.9359781265258789, + "num_tokens": 30080231.0, + "step": 15740 + }, + { + "entropy": 0.12102098194882274, + "epoch": 3.670241286863271, + "grad_norm": 1.3125, + "learning_rate": 4.981260082868747e-05, + "loss": 0.1511, + "mean_token_accuracy": 0.9653957903385162, + "num_tokens": 30116521.0, + "step": 15745 + }, + { + "entropy": 0.18943593911826612, + "epoch": 3.6714069238839024, + "grad_norm": 0.8828125, + "learning_rate": 4.981233741879391e-05, + "loss": 0.289, + "mean_token_accuracy": 0.9413058817386627, + "num_tokens": 30144200.0, + "step": 15750 + }, + { + "entropy": 0.11288008131086827, + "epoch": 3.672572560904534, + "grad_norm": 5.3125, + "learning_rate": 4.981207382530489e-05, + "loss": 0.1753, + "mean_token_accuracy": 0.9527966439723968, + "num_tokens": 30171349.0, + "step": 15755 + }, + { + "entropy": 0.09640868995338678, + "epoch": 3.673738197925166, + "grad_norm": 6.46875, + "learning_rate": 4.981181004822432e-05, + "loss": 0.1566, + "mean_token_accuracy": 0.9636990368366242, + "num_tokens": 30192008.0, + "step": 15760 + }, + { + "entropy": 0.2270708303898573, + "epoch": 3.674903834945798, + "grad_norm": 4.71875, + "learning_rate": 4.981154608755615e-05, + "loss": 0.4298, + "mean_token_accuracy": 0.912353515625, + "num_tokens": 30217346.0, + "step": 15765 + }, + { + "entropy": 0.08794712722301483, + "epoch": 3.6760694719664295, + "grad_norm": 5.28125, + "learning_rate": 4.981128194330431e-05, + "loss": 0.1921, + "mean_token_accuracy": 0.9609225809574127, + "num_tokens": 30229956.0, + "step": 15770 + }, + { + "entropy": 0.11557548865675926, + "epoch": 3.6772351089870616, + "grad_norm": 4.8125, + "learning_rate": 4.981101761547274e-05, + "loss": 0.1623, + "mean_token_accuracy": 0.9657979011535645, + "num_tokens": 30249241.0, + "step": 15775 + }, + { + "entropy": 0.08836949989199638, + "epoch": 3.6784007460076933, + "grad_norm": 4.78125, + "learning_rate": 4.9810753104065376e-05, + "loss": 0.0928, + "mean_token_accuracy": 0.9697605192661285, + "num_tokens": 30277573.0, + "step": 15780 + }, + { + "entropy": 0.08709455393254757, + "epoch": 3.679566383028325, + "grad_norm": 0.92578125, + "learning_rate": 4.981048840908618e-05, + "loss": 0.1042, + "mean_token_accuracy": 0.9733495712280273, + "num_tokens": 30302216.0, + "step": 15785 + }, + { + "entropy": 0.1160502802580595, + "epoch": 3.6807320200489566, + "grad_norm": 3.515625, + "learning_rate": 4.981022353053907e-05, + "loss": 0.1789, + "mean_token_accuracy": 0.9632501363754272, + "num_tokens": 30313531.0, + "step": 15790 + }, + { + "entropy": 0.09411624427884817, + "epoch": 3.6818976570695883, + "grad_norm": 3.296875, + "learning_rate": 4.9809958468428015e-05, + "loss": 0.0966, + "mean_token_accuracy": 0.9665277242660523, + "num_tokens": 30329907.0, + "step": 15795 + }, + { + "entropy": 0.19231935106217862, + "epoch": 3.6830632940902204, + "grad_norm": 9.25, + "learning_rate": 4.980969322275697e-05, + "loss": 0.3127, + "mean_token_accuracy": 0.940207052230835, + "num_tokens": 30338833.0, + "step": 15800 + }, + { + "entropy": 0.12116588074713945, + "epoch": 3.684228931110852, + "grad_norm": 7.9375, + "learning_rate": 4.980942779352988e-05, + "loss": 0.2015, + "mean_token_accuracy": 0.9503274917602539, + "num_tokens": 30352552.0, + "step": 15805 + }, + { + "entropy": 0.11752695105969906, + "epoch": 3.6853945681314837, + "grad_norm": 7.75, + "learning_rate": 4.9809162180750705e-05, + "loss": 0.2157, + "mean_token_accuracy": 0.9551667273044586, + "num_tokens": 30363063.0, + "step": 15810 + }, + { + "entropy": 0.12046983316540719, + "epoch": 3.686560205152116, + "grad_norm": 4.375, + "learning_rate": 4.980889638442341e-05, + "loss": 0.1791, + "mean_token_accuracy": 0.9582276821136475, + "num_tokens": 30382257.0, + "step": 15815 + }, + { + "entropy": 0.11929085087031126, + "epoch": 3.6877258421727475, + "grad_norm": 7.75, + "learning_rate": 4.980863040455196e-05, + "loss": 0.1394, + "mean_token_accuracy": 0.9591721177101136, + "num_tokens": 30407940.0, + "step": 15820 + }, + { + "entropy": 0.10976488478481769, + "epoch": 3.688891479193379, + "grad_norm": 1.296875, + "learning_rate": 4.9808364241140316e-05, + "loss": 0.1348, + "mean_token_accuracy": 0.965981525182724, + "num_tokens": 30436227.0, + "step": 15825 + }, + { + "entropy": 0.08953040465712547, + "epoch": 3.6900571162140112, + "grad_norm": 1.375, + "learning_rate": 4.980809789419244e-05, + "loss": 0.0962, + "mean_token_accuracy": 0.9648583948612213, + "num_tokens": 30458523.0, + "step": 15830 + }, + { + "entropy": 0.0966216598637402, + "epoch": 3.691222753234643, + "grad_norm": 2.015625, + "learning_rate": 4.980783136371232e-05, + "loss": 0.126, + "mean_token_accuracy": 0.9684146821498871, + "num_tokens": 30475490.0, + "step": 15835 + }, + { + "entropy": 0.10115543827414512, + "epoch": 3.6923883902552745, + "grad_norm": 1.7109375, + "learning_rate": 4.980756464970392e-05, + "loss": 0.1456, + "mean_token_accuracy": 0.9590452373027801, + "num_tokens": 30492963.0, + "step": 15840 + }, + { + "entropy": 0.11767181642353534, + "epoch": 3.693554027275906, + "grad_norm": 8.9375, + "learning_rate": 4.9807297752171225e-05, + "loss": 0.1732, + "mean_token_accuracy": 0.9512511551380157, + "num_tokens": 30520731.0, + "step": 15845 + }, + { + "entropy": 0.07681280169636011, + "epoch": 3.694719664296538, + "grad_norm": 5.34375, + "learning_rate": 4.9807030671118205e-05, + "loss": 0.1331, + "mean_token_accuracy": 0.9656332790851593, + "num_tokens": 30546042.0, + "step": 15850 + }, + { + "entropy": 0.09752353429794311, + "epoch": 3.69588530131717, + "grad_norm": 8.4375, + "learning_rate": 4.980676340654884e-05, + "loss": 0.1193, + "mean_token_accuracy": 0.9660360097885132, + "num_tokens": 30561427.0, + "step": 15855 + }, + { + "entropy": 0.09423494134098291, + "epoch": 3.6970509383378016, + "grad_norm": 7.34375, + "learning_rate": 4.980649595846713e-05, + "loss": 0.1271, + "mean_token_accuracy": 0.958463329076767, + "num_tokens": 30586406.0, + "step": 15860 + }, + { + "entropy": 0.11503859348595143, + "epoch": 3.6982165753584333, + "grad_norm": 2.046875, + "learning_rate": 4.9806228326877056e-05, + "loss": 0.1439, + "mean_token_accuracy": 0.9584307610988617, + "num_tokens": 30596957.0, + "step": 15865 + }, + { + "entropy": 0.09770829975605011, + "epoch": 3.6993822123790654, + "grad_norm": 6.96875, + "learning_rate": 4.980596051178261e-05, + "loss": 0.1714, + "mean_token_accuracy": 0.9600382685661316, + "num_tokens": 30608791.0, + "step": 15870 + }, + { + "entropy": 0.13881364446133376, + "epoch": 3.700547849399697, + "grad_norm": 2.640625, + "learning_rate": 4.9805692513187774e-05, + "loss": 0.2144, + "mean_token_accuracy": 0.9591614782810212, + "num_tokens": 30638592.0, + "step": 15875 + }, + { + "entropy": 0.08609495665878057, + "epoch": 3.7017134864203287, + "grad_norm": 0.703125, + "learning_rate": 4.980542433109656e-05, + "loss": 0.1357, + "mean_token_accuracy": 0.9631729364395142, + "num_tokens": 30659261.0, + "step": 15880 + }, + { + "entropy": 0.08448141608387232, + "epoch": 3.7028791234409604, + "grad_norm": 4.875, + "learning_rate": 4.980515596551296e-05, + "loss": 0.0884, + "mean_token_accuracy": 0.9661660194396973, + "num_tokens": 30687231.0, + "step": 15885 + }, + { + "entropy": 0.08586684390902519, + "epoch": 3.704044760461592, + "grad_norm": 1.7265625, + "learning_rate": 4.980488741644098e-05, + "loss": 0.1603, + "mean_token_accuracy": 0.9657577693462371, + "num_tokens": 30714234.0, + "step": 15890 + }, + { + "entropy": 0.1457499351352453, + "epoch": 3.705210397482224, + "grad_norm": 2.8125, + "learning_rate": 4.980461868388462e-05, + "loss": 0.1848, + "mean_token_accuracy": 0.9565096437931061, + "num_tokens": 30732637.0, + "step": 15895 + }, + { + "entropy": 0.16683135665953158, + "epoch": 3.706376034502856, + "grad_norm": 2.40625, + "learning_rate": 4.9804349767847877e-05, + "loss": 0.2308, + "mean_token_accuracy": 0.9465544700622559, + "num_tokens": 30743422.0, + "step": 15900 + }, + { + "entropy": 0.08053812086582184, + "epoch": 3.7075416715234875, + "grad_norm": 13.3125, + "learning_rate": 4.9804080668334784e-05, + "loss": 0.1439, + "mean_token_accuracy": 0.9672502219676972, + "num_tokens": 30763615.0, + "step": 15905 + }, + { + "entropy": 0.09644758738577366, + "epoch": 3.7087073085441196, + "grad_norm": 3.21875, + "learning_rate": 4.980381138534934e-05, + "loss": 0.0996, + "mean_token_accuracy": 0.9660190284252167, + "num_tokens": 30777896.0, + "step": 15910 + }, + { + "entropy": 0.09025746416300535, + "epoch": 3.7098729455647512, + "grad_norm": 1.46875, + "learning_rate": 4.9803541918895565e-05, + "loss": 0.1656, + "mean_token_accuracy": 0.9610617697238922, + "num_tokens": 30791889.0, + "step": 15915 + }, + { + "entropy": 0.09136622017249466, + "epoch": 3.711038582585383, + "grad_norm": 2.5, + "learning_rate": 4.980327226897747e-05, + "loss": 0.0989, + "mean_token_accuracy": 0.9618954002857208, + "num_tokens": 30811042.0, + "step": 15920 + }, + { + "entropy": 0.0829636923968792, + "epoch": 3.7122042196060145, + "grad_norm": 2.0, + "learning_rate": 4.9803002435599076e-05, + "loss": 0.0862, + "mean_token_accuracy": 0.9666243731975556, + "num_tokens": 30829122.0, + "step": 15925 + }, + { + "entropy": 0.11056047528982163, + "epoch": 3.713369856626646, + "grad_norm": 1.390625, + "learning_rate": 4.980273241876442e-05, + "loss": 0.1123, + "mean_token_accuracy": 0.9589561939239502, + "num_tokens": 30850324.0, + "step": 15930 + }, + { + "entropy": 0.10144733544439077, + "epoch": 3.7145354936472783, + "grad_norm": 0.61328125, + "learning_rate": 4.980246221847751e-05, + "loss": 0.2017, + "mean_token_accuracy": 0.9625826895236969, + "num_tokens": 30873421.0, + "step": 15935 + }, + { + "entropy": 0.11644894815981388, + "epoch": 3.71570113066791, + "grad_norm": 9.5625, + "learning_rate": 4.9802191834742395e-05, + "loss": 0.1696, + "mean_token_accuracy": 0.9588033437728882, + "num_tokens": 30885393.0, + "step": 15940 + }, + { + "entropy": 0.07029726225882768, + "epoch": 3.7168667676885416, + "grad_norm": 2.671875, + "learning_rate": 4.980192126756309e-05, + "loss": 0.0853, + "mean_token_accuracy": 0.965891820192337, + "num_tokens": 30915056.0, + "step": 15945 + }, + { + "entropy": 0.08881367389112711, + "epoch": 3.7180324047091737, + "grad_norm": 0.470703125, + "learning_rate": 4.9801650516943636e-05, + "loss": 0.1229, + "mean_token_accuracy": 0.9748471915721894, + "num_tokens": 30940177.0, + "step": 15950 + }, + { + "entropy": 0.12720200717449187, + "epoch": 3.7191980417298054, + "grad_norm": 2.03125, + "learning_rate": 4.980137958288808e-05, + "loss": 0.1362, + "mean_token_accuracy": 0.9587613344192505, + "num_tokens": 30956685.0, + "step": 15955 + }, + { + "entropy": 0.10853961212560534, + "epoch": 3.720363678750437, + "grad_norm": 6.25, + "learning_rate": 4.980110846540044e-05, + "loss": 0.1353, + "mean_token_accuracy": 0.9610925018787384, + "num_tokens": 30976571.0, + "step": 15960 + }, + { + "entropy": 0.2672706731595099, + "epoch": 3.721529315771069, + "grad_norm": 0.498046875, + "learning_rate": 4.980083716448477e-05, + "loss": 0.4138, + "mean_token_accuracy": 0.9393226504325867, + "num_tokens": 31008250.0, + "step": 15965 + }, + { + "entropy": 0.18283664286136628, + "epoch": 3.722694952791701, + "grad_norm": 1.078125, + "learning_rate": 4.980056568014512e-05, + "loss": 0.2432, + "mean_token_accuracy": 0.9456781387329102, + "num_tokens": 31030726.0, + "step": 15970 + }, + { + "entropy": 0.10231209546327591, + "epoch": 3.7238605898123325, + "grad_norm": 3.703125, + "learning_rate": 4.9800294012385534e-05, + "loss": 0.1536, + "mean_token_accuracy": 0.964515072107315, + "num_tokens": 31044932.0, + "step": 15975 + }, + { + "entropy": 0.09682576023042203, + "epoch": 3.725026226832964, + "grad_norm": 2.125, + "learning_rate": 4.9800022161210066e-05, + "loss": 0.1251, + "mean_token_accuracy": 0.9686741828918457, + "num_tokens": 31070171.0, + "step": 15980 + }, + { + "entropy": 0.09693482723087073, + "epoch": 3.726191863853596, + "grad_norm": 6.0, + "learning_rate": 4.9799750126622766e-05, + "loss": 0.0948, + "mean_token_accuracy": 0.96324702501297, + "num_tokens": 31095575.0, + "step": 15985 + }, + { + "entropy": 0.07029144568368792, + "epoch": 3.727357500874228, + "grad_norm": 0.703125, + "learning_rate": 4.979947790862769e-05, + "loss": 0.0542, + "mean_token_accuracy": 0.9817209541797638, + "num_tokens": 31131076.0, + "step": 15990 + }, + { + "entropy": 0.11360028982162476, + "epoch": 3.7285231378948596, + "grad_norm": 3.84375, + "learning_rate": 4.97992055072289e-05, + "loss": 0.1979, + "mean_token_accuracy": 0.9619390547275544, + "num_tokens": 31161312.0, + "step": 15995 + }, + { + "entropy": 0.09870643690228462, + "epoch": 3.7296887749154912, + "grad_norm": 8.75, + "learning_rate": 4.979893292243045e-05, + "loss": 0.1504, + "mean_token_accuracy": 0.9549363851547241, + "num_tokens": 31177428.0, + "step": 16000 + }, + { + "entropy": 0.09069200027734041, + "epoch": 3.7308544119361233, + "grad_norm": 7.0, + "learning_rate": 4.9798660154236425e-05, + "loss": 0.1552, + "mean_token_accuracy": 0.9646099269390106, + "num_tokens": 31192384.0, + "step": 16005 + }, + { + "entropy": 0.10570933558046818, + "epoch": 3.732020048956755, + "grad_norm": 4.46875, + "learning_rate": 4.979838720265087e-05, + "loss": 0.2227, + "mean_token_accuracy": 0.9609293758869171, + "num_tokens": 31203928.0, + "step": 16010 + }, + { + "entropy": 0.08798787742853165, + "epoch": 3.7331856859773866, + "grad_norm": 1.828125, + "learning_rate": 4.979811406767787e-05, + "loss": 0.0826, + "mean_token_accuracy": 0.9753459453582763, + "num_tokens": 31232416.0, + "step": 16015 + }, + { + "entropy": 0.09456034004688263, + "epoch": 3.7343513229980183, + "grad_norm": 0.4140625, + "learning_rate": 4.979784074932149e-05, + "loss": 0.099, + "mean_token_accuracy": 0.9756748259067536, + "num_tokens": 31260445.0, + "step": 16020 + }, + { + "entropy": 0.08893920592963696, + "epoch": 3.73551696001865, + "grad_norm": 4.71875, + "learning_rate": 4.97975672475858e-05, + "loss": 0.1274, + "mean_token_accuracy": 0.9707845091819763, + "num_tokens": 31280077.0, + "step": 16025 + }, + { + "entropy": 0.11007880065590143, + "epoch": 3.736682597039282, + "grad_norm": 7.15625, + "learning_rate": 4.97972935624749e-05, + "loss": 0.158, + "mean_token_accuracy": 0.967463493347168, + "num_tokens": 31300264.0, + "step": 16030 + }, + { + "entropy": 0.11120393015444278, + "epoch": 3.7378482340599137, + "grad_norm": 1.625, + "learning_rate": 4.979701969399284e-05, + "loss": 0.1794, + "mean_token_accuracy": 0.9625828862190247, + "num_tokens": 31313073.0, + "step": 16035 + }, + { + "entropy": 0.12304305955767632, + "epoch": 3.7390138710805454, + "grad_norm": 5.03125, + "learning_rate": 4.979674564214373e-05, + "loss": 0.2017, + "mean_token_accuracy": 0.9605163991451263, + "num_tokens": 31323788.0, + "step": 16040 + }, + { + "entropy": 0.10260249320417643, + "epoch": 3.7401795081011775, + "grad_norm": 1.3046875, + "learning_rate": 4.979647140693165e-05, + "loss": 0.1287, + "mean_token_accuracy": 0.9643874883651733, + "num_tokens": 31345258.0, + "step": 16045 + }, + { + "entropy": 0.15542761236429214, + "epoch": 3.741345145121809, + "grad_norm": 1.1953125, + "learning_rate": 4.979619698836068e-05, + "loss": 0.2069, + "mean_token_accuracy": 0.9525025904178619, + "num_tokens": 31363352.0, + "step": 16050 + }, + { + "entropy": 0.10133458431810141, + "epoch": 3.742510782142441, + "grad_norm": 4.9375, + "learning_rate": 4.979592238643492e-05, + "loss": 0.143, + "mean_token_accuracy": 0.9657983779907227, + "num_tokens": 31392370.0, + "step": 16055 + }, + { + "entropy": 0.11571516077965498, + "epoch": 3.7436764191630725, + "grad_norm": 0.51171875, + "learning_rate": 4.979564760115846e-05, + "loss": 0.1645, + "mean_token_accuracy": 0.9564596891403199, + "num_tokens": 31417487.0, + "step": 16060 + }, + { + "entropy": 0.10357567742466926, + "epoch": 3.744842056183704, + "grad_norm": 1.3515625, + "learning_rate": 4.979537263253541e-05, + "loss": 0.2739, + "mean_token_accuracy": 0.9473852813243866, + "num_tokens": 31430943.0, + "step": 16065 + }, + { + "entropy": 0.12955602668225766, + "epoch": 3.7460076932043362, + "grad_norm": 7.78125, + "learning_rate": 4.9795097480569854e-05, + "loss": 0.2137, + "mean_token_accuracy": 0.9514732956886292, + "num_tokens": 31439010.0, + "step": 16070 + }, + { + "entropy": 0.1175201378762722, + "epoch": 3.747173330224968, + "grad_norm": 9.75, + "learning_rate": 4.97948221452659e-05, + "loss": 0.1842, + "mean_token_accuracy": 0.9582869231700897, + "num_tokens": 31448832.0, + "step": 16075 + }, + { + "entropy": 0.08986708372831345, + "epoch": 3.7483389672455996, + "grad_norm": 0.77734375, + "learning_rate": 4.979454662662766e-05, + "loss": 0.078, + "mean_token_accuracy": 0.9791659891605378, + "num_tokens": 31474476.0, + "step": 16080 + }, + { + "entropy": 0.12361742258071899, + "epoch": 3.7495046042662317, + "grad_norm": 5.15625, + "learning_rate": 4.979427092465924e-05, + "loss": 0.163, + "mean_token_accuracy": 0.9587664127349853, + "num_tokens": 31497137.0, + "step": 16085 + }, + { + "entropy": 0.09666433380916714, + "epoch": 3.7506702412868633, + "grad_norm": 5.03125, + "learning_rate": 4.9793995039364736e-05, + "loss": 0.1255, + "mean_token_accuracy": 0.9669358968734741, + "num_tokens": 31520290.0, + "step": 16090 + }, + { + "entropy": 0.1968850590288639, + "epoch": 3.751835878307495, + "grad_norm": 3.828125, + "learning_rate": 4.979371897074829e-05, + "loss": 0.3538, + "mean_token_accuracy": 0.9254569470882416, + "num_tokens": 31538203.0, + "step": 16095 + }, + { + "entropy": 0.11581053957343102, + "epoch": 3.753001515328127, + "grad_norm": 6.875, + "learning_rate": 4.9793442718814e-05, + "loss": 0.2938, + "mean_token_accuracy": 0.9382094621658326, + "num_tokens": 31549347.0, + "step": 16100 + }, + { + "entropy": 0.11443403884768485, + "epoch": 3.7541671523487588, + "grad_norm": 6.96875, + "learning_rate": 4.979316628356599e-05, + "loss": 0.1811, + "mean_token_accuracy": 0.9656529247760772, + "num_tokens": 31562419.0, + "step": 16105 + }, + { + "entropy": 0.10604585809633135, + "epoch": 3.7553327893693904, + "grad_norm": 0.390625, + "learning_rate": 4.979288966500837e-05, + "loss": 0.1336, + "mean_token_accuracy": 0.966714721918106, + "num_tokens": 31580920.0, + "step": 16110 + }, + { + "entropy": 0.1127666326239705, + "epoch": 3.756498426390022, + "grad_norm": 6.625, + "learning_rate": 4.9792612863145284e-05, + "loss": 0.1463, + "mean_token_accuracy": 0.9673124790191651, + "num_tokens": 31594740.0, + "step": 16115 + }, + { + "entropy": 0.14349405989050865, + "epoch": 3.7576640634106537, + "grad_norm": 4.40625, + "learning_rate": 4.9792335877980844e-05, + "loss": 0.2122, + "mean_token_accuracy": 0.9504148721694946, + "num_tokens": 31604473.0, + "step": 16120 + }, + { + "entropy": 0.16313403341919183, + "epoch": 3.758829700431286, + "grad_norm": 3.046875, + "learning_rate": 4.9792058709519194e-05, + "loss": 0.2319, + "mean_token_accuracy": 0.9375991344451904, + "num_tokens": 31633345.0, + "step": 16125 + }, + { + "entropy": 0.1955052137374878, + "epoch": 3.7599953374519175, + "grad_norm": 5.03125, + "learning_rate": 4.979178135776446e-05, + "loss": 0.3376, + "mean_token_accuracy": 0.9329494535923004, + "num_tokens": 31642905.0, + "step": 16130 + }, + { + "entropy": 0.12828119061887264, + "epoch": 3.761160974472549, + "grad_norm": 4.75, + "learning_rate": 4.979150382272078e-05, + "loss": 0.1884, + "mean_token_accuracy": 0.9502387046813965, + "num_tokens": 31678138.0, + "step": 16135 + }, + { + "entropy": 0.1014595903456211, + "epoch": 3.7623266114931813, + "grad_norm": 5.8125, + "learning_rate": 4.979122610439229e-05, + "loss": 0.1236, + "mean_token_accuracy": 0.9615878939628602, + "num_tokens": 31705191.0, + "step": 16140 + }, + { + "entropy": 0.09820052236318588, + "epoch": 3.763492248513813, + "grad_norm": 13.0625, + "learning_rate": 4.979094820278312e-05, + "loss": 0.195, + "mean_token_accuracy": 0.9605471730232239, + "num_tokens": 31727042.0, + "step": 16145 + }, + { + "entropy": 0.113966304063797, + "epoch": 3.7646578855344446, + "grad_norm": 4.5625, + "learning_rate": 4.9790670117897434e-05, + "loss": 0.1288, + "mean_token_accuracy": 0.9665563642978668, + "num_tokens": 31747200.0, + "step": 16150 + }, + { + "entropy": 0.18605943005532027, + "epoch": 3.7658235225550762, + "grad_norm": 8.5, + "learning_rate": 4.979039184973937e-05, + "loss": 0.3071, + "mean_token_accuracy": 0.9375176846981048, + "num_tokens": 31767394.0, + "step": 16155 + }, + { + "entropy": 0.10228492282330989, + "epoch": 3.766989159575708, + "grad_norm": 3.859375, + "learning_rate": 4.979011339831308e-05, + "loss": 0.1293, + "mean_token_accuracy": 0.9703926205635071, + "num_tokens": 31782097.0, + "step": 16160 + }, + { + "entropy": 0.13661051522940398, + "epoch": 3.76815479659634, + "grad_norm": 1.125, + "learning_rate": 4.9789834763622715e-05, + "loss": 0.1347, + "mean_token_accuracy": 0.9575334310531616, + "num_tokens": 31810446.0, + "step": 16165 + }, + { + "entropy": 0.0924876093864441, + "epoch": 3.7693204336169717, + "grad_norm": 1.921875, + "learning_rate": 4.9789555945672426e-05, + "loss": 0.1324, + "mean_token_accuracy": 0.9688528537750244, + "num_tokens": 31823541.0, + "step": 16170 + }, + { + "entropy": 0.13186742961406708, + "epoch": 3.7704860706376033, + "grad_norm": 9.5, + "learning_rate": 4.978927694446637e-05, + "loss": 0.249, + "mean_token_accuracy": 0.9531232118606567, + "num_tokens": 31832728.0, + "step": 16175 + }, + { + "entropy": 0.10348946945741773, + "epoch": 3.7716517076582354, + "grad_norm": 2.265625, + "learning_rate": 4.9788997760008713e-05, + "loss": 0.1732, + "mean_token_accuracy": 0.9628577351570129, + "num_tokens": 31850856.0, + "step": 16180 + }, + { + "entropy": 0.1540275054052472, + "epoch": 3.772817344678867, + "grad_norm": 1.5859375, + "learning_rate": 4.9788718392303624e-05, + "loss": 0.2778, + "mean_token_accuracy": 0.9363715410232544, + "num_tokens": 31881657.0, + "step": 16185 + }, + { + "entropy": 0.11113776378333569, + "epoch": 3.7739829816994988, + "grad_norm": 0.59375, + "learning_rate": 4.978843884135525e-05, + "loss": 0.1466, + "mean_token_accuracy": 0.9614432454109192, + "num_tokens": 31905652.0, + "step": 16190 + }, + { + "entropy": 0.11307852622121572, + "epoch": 3.7751486187201304, + "grad_norm": 3.71875, + "learning_rate": 4.9788159107167774e-05, + "loss": 0.1413, + "mean_token_accuracy": 0.9594118535518646, + "num_tokens": 31927717.0, + "step": 16195 + }, + { + "entropy": 0.1015424283221364, + "epoch": 3.776314255740762, + "grad_norm": 0.55078125, + "learning_rate": 4.978787918974536e-05, + "loss": 0.1726, + "mean_token_accuracy": 0.9541314899921417, + "num_tokens": 31948045.0, + "step": 16200 + }, + { + "entropy": 0.09296782370656728, + "epoch": 3.777479892761394, + "grad_norm": 3.90625, + "learning_rate": 4.9787599089092194e-05, + "loss": 0.112, + "mean_token_accuracy": 0.96722252368927, + "num_tokens": 31975131.0, + "step": 16205 + }, + { + "entropy": 0.07211828418076038, + "epoch": 3.778645529782026, + "grad_norm": 3.875, + "learning_rate": 4.9787318805212436e-05, + "loss": 0.0967, + "mean_token_accuracy": 0.9726602792739868, + "num_tokens": 32002367.0, + "step": 16210 + }, + { + "entropy": 0.09185118321329355, + "epoch": 3.7798111668026575, + "grad_norm": 3.59375, + "learning_rate": 4.978703833811028e-05, + "loss": 0.1118, + "mean_token_accuracy": 0.9684987306594849, + "num_tokens": 32026660.0, + "step": 16215 + }, + { + "entropy": 0.0935923472046852, + "epoch": 3.7809768038232896, + "grad_norm": 0.8671875, + "learning_rate": 4.978675768778989e-05, + "loss": 0.1291, + "mean_token_accuracy": 0.9658624827861786, + "num_tokens": 32059305.0, + "step": 16220 + }, + { + "entropy": 0.08868596963584423, + "epoch": 3.7821424408439213, + "grad_norm": 3.0, + "learning_rate": 4.978647685425547e-05, + "loss": 0.1118, + "mean_token_accuracy": 0.973666113615036, + "num_tokens": 32089774.0, + "step": 16225 + }, + { + "entropy": 0.1755599969998002, + "epoch": 3.783308077864553, + "grad_norm": 5.9375, + "learning_rate": 4.97861958375112e-05, + "loss": 0.2921, + "mean_token_accuracy": 0.940058308839798, + "num_tokens": 32102936.0, + "step": 16230 + }, + { + "entropy": 0.08324588984251022, + "epoch": 3.784473714885185, + "grad_norm": 0.53125, + "learning_rate": 4.978591463756127e-05, + "loss": 0.1106, + "mean_token_accuracy": 0.9641181349754333, + "num_tokens": 32136473.0, + "step": 16235 + }, + { + "entropy": 0.10371949858963489, + "epoch": 3.7856393519058167, + "grad_norm": 4.875, + "learning_rate": 4.978563325440988e-05, + "loss": 0.1634, + "mean_token_accuracy": 0.9677753865718841, + "num_tokens": 32147038.0, + "step": 16240 + }, + { + "entropy": 0.08506152108311653, + "epoch": 3.7868049889264483, + "grad_norm": 0.453125, + "learning_rate": 4.978535168806121e-05, + "loss": 0.1288, + "mean_token_accuracy": 0.973295658826828, + "num_tokens": 32173810.0, + "step": 16245 + }, + { + "entropy": 0.10174280107021332, + "epoch": 3.78797062594708, + "grad_norm": 8.375, + "learning_rate": 4.978506993851947e-05, + "loss": 0.1985, + "mean_token_accuracy": 0.9564942598342896, + "num_tokens": 32186356.0, + "step": 16250 + }, + { + "entropy": 0.12382913529872894, + "epoch": 3.7891362629677117, + "grad_norm": 6.28125, + "learning_rate": 4.9784788005788866e-05, + "loss": 0.1887, + "mean_token_accuracy": 0.9540922164916992, + "num_tokens": 32201292.0, + "step": 16255 + }, + { + "entropy": 0.12052395232021809, + "epoch": 3.7903018999883438, + "grad_norm": 7.15625, + "learning_rate": 4.978450588987359e-05, + "loss": 0.1952, + "mean_token_accuracy": 0.9510783553123474, + "num_tokens": 32215733.0, + "step": 16260 + }, + { + "entropy": 0.09282411560416222, + "epoch": 3.7914675370089754, + "grad_norm": 3.15625, + "learning_rate": 4.9784223590777854e-05, + "loss": 0.082, + "mean_token_accuracy": 0.9700927495956421, + "num_tokens": 32236636.0, + "step": 16265 + }, + { + "entropy": 0.1389080710709095, + "epoch": 3.792633174029607, + "grad_norm": 5.46875, + "learning_rate": 4.978394110850587e-05, + "loss": 0.2553, + "mean_token_accuracy": 0.9495690643787384, + "num_tokens": 32245289.0, + "step": 16270 + }, + { + "entropy": 0.10744792697951197, + "epoch": 3.793798811050239, + "grad_norm": 7.4375, + "learning_rate": 4.978365844306184e-05, + "loss": 0.1681, + "mean_token_accuracy": 0.9592471480369568, + "num_tokens": 32262720.0, + "step": 16275 + }, + { + "entropy": 0.10109451450407506, + "epoch": 3.794964448070871, + "grad_norm": 3.734375, + "learning_rate": 4.978337559445e-05, + "loss": 0.1505, + "mean_token_accuracy": 0.9673482775688171, + "num_tokens": 32283036.0, + "step": 16280 + }, + { + "entropy": 0.10058681219816208, + "epoch": 3.7961300850915025, + "grad_norm": 5.03125, + "learning_rate": 4.978309256267455e-05, + "loss": 0.1738, + "mean_token_accuracy": 0.9640094101428985, + "num_tokens": 32293183.0, + "step": 16285 + }, + { + "entropy": 0.07011372428387404, + "epoch": 3.797295722112134, + "grad_norm": 1.0078125, + "learning_rate": 4.978280934773971e-05, + "loss": 0.0912, + "mean_token_accuracy": 0.9802879512310028, + "num_tokens": 32311618.0, + "step": 16290 + }, + { + "entropy": 0.09535004459321499, + "epoch": 3.798461359132766, + "grad_norm": 1.0234375, + "learning_rate": 4.978252594964971e-05, + "loss": 0.1341, + "mean_token_accuracy": 0.9680737018585205, + "num_tokens": 32328529.0, + "step": 16295 + }, + { + "entropy": 0.14605763144791126, + "epoch": 3.799626996153398, + "grad_norm": 1.734375, + "learning_rate": 4.9782242368408775e-05, + "loss": 0.1694, + "mean_token_accuracy": 0.9519593000411988, + "num_tokens": 32365656.0, + "step": 16300 + }, + { + "entropy": 0.09797099642455578, + "epoch": 3.8007926331740296, + "grad_norm": 2.046875, + "learning_rate": 4.978195860402114e-05, + "loss": 0.1491, + "mean_token_accuracy": 0.9651980400085449, + "num_tokens": 32377851.0, + "step": 16305 + }, + { + "entropy": 0.14139844849705696, + "epoch": 3.8019582701946613, + "grad_norm": 4.25, + "learning_rate": 4.9781674656491016e-05, + "loss": 0.2437, + "mean_token_accuracy": 0.9429092288017273, + "num_tokens": 32394288.0, + "step": 16310 + }, + { + "entropy": 0.09763580206781626, + "epoch": 3.8031239072152934, + "grad_norm": 0.71875, + "learning_rate": 4.9781390525822655e-05, + "loss": 0.1644, + "mean_token_accuracy": 0.9650467038154602, + "num_tokens": 32412578.0, + "step": 16315 + }, + { + "entropy": 0.10009733010083437, + "epoch": 3.804289544235925, + "grad_norm": 3.890625, + "learning_rate": 4.9781106212020295e-05, + "loss": 0.1664, + "mean_token_accuracy": 0.9597991704940796, + "num_tokens": 32426556.0, + "step": 16320 + }, + { + "entropy": 0.10506035517901183, + "epoch": 3.8054551812565567, + "grad_norm": 5.03125, + "learning_rate": 4.9780821715088166e-05, + "loss": 0.1962, + "mean_token_accuracy": 0.9597808599472046, + "num_tokens": 32438924.0, + "step": 16325 + }, + { + "entropy": 0.10086954236030579, + "epoch": 3.8066208182771883, + "grad_norm": 3.703125, + "learning_rate": 4.9780537035030515e-05, + "loss": 0.155, + "mean_token_accuracy": 0.9629509508609772, + "num_tokens": 32463099.0, + "step": 16330 + }, + { + "entropy": 0.09417558461427689, + "epoch": 3.80778645529782, + "grad_norm": 3.578125, + "learning_rate": 4.9780252171851584e-05, + "loss": 0.1001, + "mean_token_accuracy": 0.9691354632377625, + "num_tokens": 32481659.0, + "step": 16335 + }, + { + "entropy": 0.09145726338028907, + "epoch": 3.808952092318452, + "grad_norm": 1.2734375, + "learning_rate": 4.9779967125555623e-05, + "loss": 0.169, + "mean_token_accuracy": 0.9573414266109467, + "num_tokens": 32497033.0, + "step": 16340 + }, + { + "entropy": 0.10342365652322769, + "epoch": 3.8101177293390838, + "grad_norm": 1.2421875, + "learning_rate": 4.977968189614688e-05, + "loss": 0.133, + "mean_token_accuracy": 0.9625088393688201, + "num_tokens": 32516572.0, + "step": 16345 + }, + { + "entropy": 0.09053916297852993, + "epoch": 3.8112833663597154, + "grad_norm": 5.8125, + "learning_rate": 4.977939648362961e-05, + "loss": 0.117, + "mean_token_accuracy": 0.965921676158905, + "num_tokens": 32538406.0, + "step": 16350 + }, + { + "entropy": 0.12717281207442283, + "epoch": 3.8124490033803475, + "grad_norm": 0.9609375, + "learning_rate": 4.977911088800807e-05, + "loss": 0.2096, + "mean_token_accuracy": 0.9550489544868469, + "num_tokens": 32572408.0, + "step": 16355 + }, + { + "entropy": 0.10790450349450112, + "epoch": 3.813614640400979, + "grad_norm": 8.625, + "learning_rate": 4.977882510928652e-05, + "loss": 0.1321, + "mean_token_accuracy": 0.9582541406154632, + "num_tokens": 32586738.0, + "step": 16360 + }, + { + "entropy": 0.11157775335013867, + "epoch": 3.814780277421611, + "grad_norm": 5.46875, + "learning_rate": 4.977853914746922e-05, + "loss": 0.1499, + "mean_token_accuracy": 0.9643312692642212, + "num_tokens": 32598309.0, + "step": 16365 + }, + { + "entropy": 0.10543946204707026, + "epoch": 3.815945914442243, + "grad_norm": 7.3125, + "learning_rate": 4.9778253002560434e-05, + "loss": 0.1986, + "mean_token_accuracy": 0.9579822719097137, + "num_tokens": 32626437.0, + "step": 16370 + }, + { + "entropy": 0.07996916975826025, + "epoch": 3.8171115514628746, + "grad_norm": 2.484375, + "learning_rate": 4.9777966674564426e-05, + "loss": 0.128, + "mean_token_accuracy": 0.9667159378528595, + "num_tokens": 32647112.0, + "step": 16375 + }, + { + "entropy": 0.15802016891539097, + "epoch": 3.8182771884835063, + "grad_norm": 3.609375, + "learning_rate": 4.9777680163485465e-05, + "loss": 0.2629, + "mean_token_accuracy": 0.9438641905784607, + "num_tokens": 32663012.0, + "step": 16380 + }, + { + "entropy": 0.08313553333282471, + "epoch": 3.819442825504138, + "grad_norm": 3.390625, + "learning_rate": 4.977739346932783e-05, + "loss": 0.0886, + "mean_token_accuracy": 0.973399305343628, + "num_tokens": 32685324.0, + "step": 16385 + }, + { + "entropy": 0.09466358497738839, + "epoch": 3.8206084625247696, + "grad_norm": 4.0625, + "learning_rate": 4.977710659209579e-05, + "loss": 0.1754, + "mean_token_accuracy": 0.9577878236770629, + "num_tokens": 32703309.0, + "step": 16390 + }, + { + "entropy": 0.09490567035973071, + "epoch": 3.8217740995454017, + "grad_norm": 1.09375, + "learning_rate": 4.977681953179363e-05, + "loss": 0.1239, + "mean_token_accuracy": 0.9690981268882751, + "num_tokens": 32725065.0, + "step": 16395 + }, + { + "entropy": 0.08475140482187271, + "epoch": 3.8229397365660334, + "grad_norm": 3.4375, + "learning_rate": 4.9776532288425616e-05, + "loss": 0.1152, + "mean_token_accuracy": 0.9685661673545838, + "num_tokens": 32742199.0, + "step": 16400 + }, + { + "entropy": 0.1452885389328003, + "epoch": 3.824105373586665, + "grad_norm": 3.21875, + "learning_rate": 4.977624486199605e-05, + "loss": 0.1667, + "mean_token_accuracy": 0.9582402050495148, + "num_tokens": 32753828.0, + "step": 16405 + }, + { + "entropy": 0.12028260957449674, + "epoch": 3.825271010607297, + "grad_norm": 2.6875, + "learning_rate": 4.97759572525092e-05, + "loss": 0.1261, + "mean_token_accuracy": 0.9706174254417419, + "num_tokens": 32782189.0, + "step": 16410 + }, + { + "entropy": 0.07044358663260937, + "epoch": 3.826436647627929, + "grad_norm": 1.0859375, + "learning_rate": 4.9775669459969364e-05, + "loss": 0.0639, + "mean_token_accuracy": 0.9793649017810822, + "num_tokens": 32816185.0, + "step": 16415 + }, + { + "entropy": 0.08048469722270965, + "epoch": 3.8276022846485604, + "grad_norm": 0.5859375, + "learning_rate": 4.977538148438084e-05, + "loss": 0.1187, + "mean_token_accuracy": 0.9718262076377868, + "num_tokens": 32835221.0, + "step": 16420 + }, + { + "entropy": 0.08252397803589702, + "epoch": 3.828767921669192, + "grad_norm": 1.0625, + "learning_rate": 4.977509332574791e-05, + "loss": 0.1253, + "mean_token_accuracy": 0.9690129697322846, + "num_tokens": 32869527.0, + "step": 16425 + }, + { + "entropy": 0.06499332496896386, + "epoch": 3.8299335586898238, + "grad_norm": 0.275390625, + "learning_rate": 4.977480498407488e-05, + "loss": 0.0647, + "mean_token_accuracy": 0.979715633392334, + "num_tokens": 32894850.0, + "step": 16430 + }, + { + "entropy": 0.13496083430945874, + "epoch": 3.831099195710456, + "grad_norm": 6.75, + "learning_rate": 4.977451645936604e-05, + "loss": 0.2355, + "mean_token_accuracy": 0.9405577182769775, + "num_tokens": 32926236.0, + "step": 16435 + }, + { + "entropy": 0.07254137042909861, + "epoch": 3.8322648327310875, + "grad_norm": 1.5078125, + "learning_rate": 4.97742277516257e-05, + "loss": 0.0907, + "mean_token_accuracy": 0.9758977711200714, + "num_tokens": 32950365.0, + "step": 16440 + }, + { + "entropy": 0.11544609442353249, + "epoch": 3.833430469751719, + "grad_norm": 4.125, + "learning_rate": 4.977393886085816e-05, + "loss": 0.1671, + "mean_token_accuracy": 0.9559564173221589, + "num_tokens": 32976927.0, + "step": 16445 + }, + { + "entropy": 0.1054431926459074, + "epoch": 3.8345961067723513, + "grad_norm": 4.625, + "learning_rate": 4.977364978706773e-05, + "loss": 0.1556, + "mean_token_accuracy": 0.9585894703865051, + "num_tokens": 32991700.0, + "step": 16450 + }, + { + "entropy": 0.10639844667166472, + "epoch": 3.835761743792983, + "grad_norm": 0.68359375, + "learning_rate": 4.977336053025873e-05, + "loss": 0.0992, + "mean_token_accuracy": 0.9651355922222138, + "num_tokens": 33013040.0, + "step": 16455 + }, + { + "entropy": 0.09124952163547277, + "epoch": 3.8369273808136146, + "grad_norm": 8.75, + "learning_rate": 4.977307109043546e-05, + "loss": 0.1246, + "mean_token_accuracy": 0.9677917957305908, + "num_tokens": 33033465.0, + "step": 16460 + }, + { + "entropy": 0.09483155831694604, + "epoch": 3.8380930178342463, + "grad_norm": 3.46875, + "learning_rate": 4.9772781467602235e-05, + "loss": 0.1548, + "mean_token_accuracy": 0.9650797605514526, + "num_tokens": 33046989.0, + "step": 16465 + }, + { + "entropy": 0.17525366619229316, + "epoch": 3.839258654854878, + "grad_norm": 0.64453125, + "learning_rate": 4.977249166176339e-05, + "loss": 0.3451, + "mean_token_accuracy": 0.9297631740570068, + "num_tokens": 33061848.0, + "step": 16470 + }, + { + "entropy": 0.08751837406307458, + "epoch": 3.84042429187551, + "grad_norm": 4.1875, + "learning_rate": 4.9772201672923225e-05, + "loss": 0.095, + "mean_token_accuracy": 0.9782902419567108, + "num_tokens": 33080890.0, + "step": 16475 + }, + { + "entropy": 0.07149492613971234, + "epoch": 3.8415899288961417, + "grad_norm": 1.328125, + "learning_rate": 4.977191150108608e-05, + "loss": 0.1256, + "mean_token_accuracy": 0.9729875683784485, + "num_tokens": 33095960.0, + "step": 16480 + }, + { + "entropy": 0.08944402430206537, + "epoch": 3.8427555659167734, + "grad_norm": 0.94921875, + "learning_rate": 4.977162114625627e-05, + "loss": 0.0872, + "mean_token_accuracy": 0.9720249235630035, + "num_tokens": 33126818.0, + "step": 16485 + }, + { + "entropy": 0.09480830989778041, + "epoch": 3.8439212029374055, + "grad_norm": 1.6640625, + "learning_rate": 4.9771330608438136e-05, + "loss": 0.1766, + "mean_token_accuracy": 0.9600850522518158, + "num_tokens": 33139661.0, + "step": 16490 + }, + { + "entropy": 0.08334624543786048, + "epoch": 3.845086839958037, + "grad_norm": 7.75, + "learning_rate": 4.977103988763601e-05, + "loss": 0.1408, + "mean_token_accuracy": 0.9648266136646271, + "num_tokens": 33154190.0, + "step": 16495 + }, + { + "entropy": 0.10106358705088496, + "epoch": 3.846252476978669, + "grad_norm": 1.328125, + "learning_rate": 4.977074898385421e-05, + "loss": 0.1453, + "mean_token_accuracy": 0.9647497057914733, + "num_tokens": 33185980.0, + "step": 16500 + }, + { + "entropy": 0.10884370524436235, + "epoch": 3.847418113999301, + "grad_norm": 0.90625, + "learning_rate": 4.97704578970971e-05, + "loss": 0.0902, + "mean_token_accuracy": 0.9579053461551666, + "num_tokens": 33212368.0, + "step": 16505 + }, + { + "entropy": 0.07987378798425197, + "epoch": 3.8485837510199326, + "grad_norm": 2.921875, + "learning_rate": 4.9770166627369e-05, + "loss": 0.0731, + "mean_token_accuracy": 0.975167590379715, + "num_tokens": 33240176.0, + "step": 16510 + }, + { + "entropy": 0.0864196315407753, + "epoch": 3.849749388040564, + "grad_norm": 1.8359375, + "learning_rate": 4.9769875174674265e-05, + "loss": 0.1128, + "mean_token_accuracy": 0.9740726411342621, + "num_tokens": 33252259.0, + "step": 16515 + }, + { + "entropy": 0.09710243921726942, + "epoch": 3.850915025061196, + "grad_norm": 3.921875, + "learning_rate": 4.976958353901723e-05, + "loss": 0.1266, + "mean_token_accuracy": 0.9676422059535981, + "num_tokens": 33281031.0, + "step": 16520 + }, + { + "entropy": 0.1039828835055232, + "epoch": 3.8520806620818275, + "grad_norm": 3.640625, + "learning_rate": 4.9769291720402254e-05, + "loss": 0.0985, + "mean_token_accuracy": 0.9699716866016388, + "num_tokens": 33308560.0, + "step": 16525 + }, + { + "entropy": 0.11291272137314082, + "epoch": 3.8532462991024596, + "grad_norm": 7.21875, + "learning_rate": 4.976899971883368e-05, + "loss": 0.1184, + "mean_token_accuracy": 0.9514906764030456, + "num_tokens": 33336129.0, + "step": 16530 + }, + { + "entropy": 0.12264008279889822, + "epoch": 3.8544119361230913, + "grad_norm": 4.75, + "learning_rate": 4.976870753431587e-05, + "loss": 0.185, + "mean_token_accuracy": 0.9556744515895843, + "num_tokens": 33354277.0, + "step": 16535 + }, + { + "entropy": 0.09388621672987937, + "epoch": 3.855577573143723, + "grad_norm": 6.34375, + "learning_rate": 4.9768415166853175e-05, + "loss": 0.1608, + "mean_token_accuracy": 0.9654055655002594, + "num_tokens": 33364865.0, + "step": 16540 + }, + { + "entropy": 0.14195291325449944, + "epoch": 3.856743210164355, + "grad_norm": 1.8203125, + "learning_rate": 4.976812261644996e-05, + "loss": 0.2461, + "mean_token_accuracy": 0.9434653699398041, + "num_tokens": 33375626.0, + "step": 16545 + }, + { + "entropy": 0.10139231681823731, + "epoch": 3.8579088471849867, + "grad_norm": 1.28125, + "learning_rate": 4.976782988311058e-05, + "loss": 0.1317, + "mean_token_accuracy": 0.9686038613319397, + "num_tokens": 33392944.0, + "step": 16550 + }, + { + "entropy": 0.10185308828949928, + "epoch": 3.8590744842056184, + "grad_norm": 4.3125, + "learning_rate": 4.9767536966839414e-05, + "loss": 0.1471, + "mean_token_accuracy": 0.9665589034557343, + "num_tokens": 33412158.0, + "step": 16555 + }, + { + "entropy": 0.09140795618295669, + "epoch": 3.86024012122625, + "grad_norm": 1.296875, + "learning_rate": 4.976724386764081e-05, + "loss": 0.1415, + "mean_token_accuracy": 0.9718575298786163, + "num_tokens": 33424867.0, + "step": 16560 + }, + { + "entropy": 0.09763126391917468, + "epoch": 3.8614057582468817, + "grad_norm": 9.1875, + "learning_rate": 4.976695058551916e-05, + "loss": 0.1007, + "mean_token_accuracy": 0.9644618034362793, + "num_tokens": 33443510.0, + "step": 16565 + }, + { + "entropy": 0.15136264748871325, + "epoch": 3.862571395267514, + "grad_norm": 5.4375, + "learning_rate": 4.976665712047882e-05, + "loss": 0.1573, + "mean_token_accuracy": 0.9575160503387451, + "num_tokens": 33452758.0, + "step": 16570 + }, + { + "entropy": 0.0832042837049812, + "epoch": 3.8637370322881455, + "grad_norm": 1.140625, + "learning_rate": 4.976636347252417e-05, + "loss": 0.1049, + "mean_token_accuracy": 0.9735616624355317, + "num_tokens": 33487033.0, + "step": 16575 + }, + { + "entropy": 0.130936024710536, + "epoch": 3.864902669308777, + "grad_norm": 11.75, + "learning_rate": 4.976606964165959e-05, + "loss": 0.3414, + "mean_token_accuracy": 0.9347775518894196, + "num_tokens": 33512831.0, + "step": 16580 + }, + { + "entropy": 0.09509001523256302, + "epoch": 3.8660683063294092, + "grad_norm": 0.85546875, + "learning_rate": 4.9765775627889466e-05, + "loss": 0.0911, + "mean_token_accuracy": 0.9706315398216248, + "num_tokens": 33540628.0, + "step": 16585 + }, + { + "entropy": 0.09184669237583876, + "epoch": 3.867233943350041, + "grad_norm": 3.25, + "learning_rate": 4.9765481431218176e-05, + "loss": 0.1339, + "mean_token_accuracy": 0.9704009413719177, + "num_tokens": 33557555.0, + "step": 16590 + }, + { + "entropy": 0.10518418364226818, + "epoch": 3.8683995803706726, + "grad_norm": 1.3671875, + "learning_rate": 4.976518705165011e-05, + "loss": 0.1223, + "mean_token_accuracy": 0.9689244747161865, + "num_tokens": 33582110.0, + "step": 16595 + }, + { + "entropy": 0.10216858349740505, + "epoch": 3.869565217391304, + "grad_norm": 4.1875, + "learning_rate": 4.976489248918966e-05, + "loss": 0.2179, + "mean_token_accuracy": 0.9570053875446319, + "num_tokens": 33592239.0, + "step": 16600 + }, + { + "entropy": 0.11914208475500346, + "epoch": 3.870730854411936, + "grad_norm": 1.375, + "learning_rate": 4.976459774384121e-05, + "loss": 0.165, + "mean_token_accuracy": 0.9575796544551849, + "num_tokens": 33607680.0, + "step": 16605 + }, + { + "entropy": 0.07731572240591049, + "epoch": 3.871896491432568, + "grad_norm": 1.3671875, + "learning_rate": 4.976430281560917e-05, + "loss": 0.1315, + "mean_token_accuracy": 0.9717673599720001, + "num_tokens": 33626618.0, + "step": 16610 + }, + { + "entropy": 0.1043876113370061, + "epoch": 3.8730621284531996, + "grad_norm": 2.484375, + "learning_rate": 4.976400770449792e-05, + "loss": 0.1537, + "mean_token_accuracy": 0.9649846971035003, + "num_tokens": 33645847.0, + "step": 16615 + }, + { + "entropy": 0.18505688859149813, + "epoch": 3.8742277654738313, + "grad_norm": 0.91796875, + "learning_rate": 4.9763712410511874e-05, + "loss": 0.2365, + "mean_token_accuracy": 0.9463666260242463, + "num_tokens": 33676871.0, + "step": 16620 + }, + { + "entropy": 0.08650575876235962, + "epoch": 3.8753934024944634, + "grad_norm": 4.75, + "learning_rate": 4.9763416933655425e-05, + "loss": 0.1445, + "mean_token_accuracy": 0.967893385887146, + "num_tokens": 33691010.0, + "step": 16625 + }, + { + "entropy": 0.16968498565256596, + "epoch": 3.876559039515095, + "grad_norm": 1.3203125, + "learning_rate": 4.976312127393299e-05, + "loss": 0.1554, + "mean_token_accuracy": 0.9321939647197723, + "num_tokens": 33724208.0, + "step": 16630 + }, + { + "entropy": 0.07792753130197524, + "epoch": 3.8777246765357267, + "grad_norm": 5.0, + "learning_rate": 4.976282543134897e-05, + "loss": 0.1136, + "mean_token_accuracy": 0.9741957724094391, + "num_tokens": 33742948.0, + "step": 16635 + }, + { + "entropy": 0.08758415430784225, + "epoch": 3.878890313556359, + "grad_norm": 1.0390625, + "learning_rate": 4.976252940590778e-05, + "loss": 0.1177, + "mean_token_accuracy": 0.972603303194046, + "num_tokens": 33766005.0, + "step": 16640 + }, + { + "entropy": 0.1088608767837286, + "epoch": 3.8800559505769905, + "grad_norm": 5.0625, + "learning_rate": 4.9762233197613837e-05, + "loss": 0.2298, + "mean_token_accuracy": 0.9552813589572906, + "num_tokens": 33779897.0, + "step": 16645 + }, + { + "entropy": 0.09404038712382316, + "epoch": 3.881221587597622, + "grad_norm": 3.140625, + "learning_rate": 4.976193680647154e-05, + "loss": 0.1389, + "mean_token_accuracy": 0.9661530613899231, + "num_tokens": 33795574.0, + "step": 16650 + }, + { + "entropy": 0.12309183105826378, + "epoch": 3.882387224618254, + "grad_norm": 5.0625, + "learning_rate": 4.9761640232485334e-05, + "loss": 0.2158, + "mean_token_accuracy": 0.9480292320251464, + "num_tokens": 33805200.0, + "step": 16655 + }, + { + "entropy": 0.07593281920999288, + "epoch": 3.8835528616388855, + "grad_norm": 2.703125, + "learning_rate": 4.976134347565963e-05, + "loss": 0.0476, + "mean_token_accuracy": 0.9800086498260498, + "num_tokens": 33846497.0, + "step": 16660 + }, + { + "entropy": 0.07238452583551407, + "epoch": 3.8847184986595176, + "grad_norm": 2.3125, + "learning_rate": 4.976104653599884e-05, + "loss": 0.0828, + "mean_token_accuracy": 0.9750954031944274, + "num_tokens": 33890541.0, + "step": 16665 + }, + { + "entropy": 0.11538115590810775, + "epoch": 3.8858841356801492, + "grad_norm": 4.03125, + "learning_rate": 4.976074941350741e-05, + "loss": 0.1476, + "mean_token_accuracy": 0.9668866634368897, + "num_tokens": 33904145.0, + "step": 16670 + }, + { + "entropy": 0.1241513341665268, + "epoch": 3.887049772700781, + "grad_norm": 0.73046875, + "learning_rate": 4.976045210818977e-05, + "loss": 0.1267, + "mean_token_accuracy": 0.966703736782074, + "num_tokens": 33924578.0, + "step": 16675 + }, + { + "entropy": 0.0924358457326889, + "epoch": 3.888215409721413, + "grad_norm": 8.9375, + "learning_rate": 4.976015462005035e-05, + "loss": 0.1662, + "mean_token_accuracy": 0.9649093568325042, + "num_tokens": 33941729.0, + "step": 16680 + }, + { + "entropy": 0.11906333230435848, + "epoch": 3.8893810467420447, + "grad_norm": 1.4921875, + "learning_rate": 4.975985694909358e-05, + "loss": 0.1623, + "mean_token_accuracy": 0.9645924746990204, + "num_tokens": 33965840.0, + "step": 16685 + }, + { + "entropy": 0.07723705116659403, + "epoch": 3.8905466837626763, + "grad_norm": 2.03125, + "learning_rate": 4.975955909532391e-05, + "loss": 0.0603, + "mean_token_accuracy": 0.9721529066562653, + "num_tokens": 33993315.0, + "step": 16690 + }, + { + "entropy": 0.15692530367523433, + "epoch": 3.891712320783308, + "grad_norm": 0.890625, + "learning_rate": 4.9759261058745756e-05, + "loss": 0.1724, + "mean_token_accuracy": 0.9513945758342743, + "num_tokens": 34021121.0, + "step": 16695 + }, + { + "entropy": 0.06238477006554603, + "epoch": 3.8928779578039396, + "grad_norm": 4.59375, + "learning_rate": 4.97589628393636e-05, + "loss": 0.0723, + "mean_token_accuracy": 0.9769518494606018, + "num_tokens": 34037255.0, + "step": 16700 + }, + { + "entropy": 0.0868049081414938, + "epoch": 3.8940435948245717, + "grad_norm": 3.234375, + "learning_rate": 4.9758664437181856e-05, + "loss": 0.1231, + "mean_token_accuracy": 0.9709369957447052, + "num_tokens": 34049033.0, + "step": 16705 + }, + { + "entropy": 0.12496979609131813, + "epoch": 3.8952092318452034, + "grad_norm": 3.375, + "learning_rate": 4.9758365852205e-05, + "loss": 0.2056, + "mean_token_accuracy": 0.9534848451614379, + "num_tokens": 34059272.0, + "step": 16710 + }, + { + "entropy": 0.08685902096331119, + "epoch": 3.896374868865835, + "grad_norm": 4.21875, + "learning_rate": 4.975806708443746e-05, + "loss": 0.1175, + "mean_token_accuracy": 0.970902019739151, + "num_tokens": 34072449.0, + "step": 16715 + }, + { + "entropy": 0.07332117343321443, + "epoch": 3.897540505886467, + "grad_norm": 5.65625, + "learning_rate": 4.975776813388371e-05, + "loss": 0.118, + "mean_token_accuracy": 0.9707030892372132, + "num_tokens": 34099071.0, + "step": 16720 + }, + { + "entropy": 0.1274700254201889, + "epoch": 3.898706142907099, + "grad_norm": 7.5625, + "learning_rate": 4.9757469000548194e-05, + "loss": 0.2366, + "mean_token_accuracy": 0.9488331198692321, + "num_tokens": 34115790.0, + "step": 16725 + }, + { + "entropy": 0.18221108466386796, + "epoch": 3.8998717799277305, + "grad_norm": 0.84375, + "learning_rate": 4.9757169684435385e-05, + "loss": 0.142, + "mean_token_accuracy": 0.9406643450260163, + "num_tokens": 34150985.0, + "step": 16730 + }, + { + "entropy": 0.10956341233104468, + "epoch": 3.901037416948362, + "grad_norm": 4.6875, + "learning_rate": 4.975687018554974e-05, + "loss": 0.1113, + "mean_token_accuracy": 0.9680880844593048, + "num_tokens": 34177333.0, + "step": 16735 + }, + { + "entropy": 0.093040444329381, + "epoch": 3.902203053968994, + "grad_norm": 7.5625, + "learning_rate": 4.975657050389572e-05, + "loss": 0.0971, + "mean_token_accuracy": 0.9710028231143951, + "num_tokens": 34205723.0, + "step": 16740 + }, + { + "entropy": 0.08593805404379964, + "epoch": 3.903368690989626, + "grad_norm": 6.875, + "learning_rate": 4.9756270639477804e-05, + "loss": 0.1844, + "mean_token_accuracy": 0.9584748089313507, + "num_tokens": 34221572.0, + "step": 16745 + }, + { + "entropy": 0.11587801575660706, + "epoch": 3.9045343280102576, + "grad_norm": 3.90625, + "learning_rate": 4.9755970592300454e-05, + "loss": 0.1936, + "mean_token_accuracy": 0.9567054510116577, + "num_tokens": 34231895.0, + "step": 16750 + }, + { + "entropy": 0.08428452573716641, + "epoch": 3.9056999650308892, + "grad_norm": 2.6875, + "learning_rate": 4.975567036236815e-05, + "loss": 0.2177, + "mean_token_accuracy": 0.9556152582168579, + "num_tokens": 34261270.0, + "step": 16755 + }, + { + "entropy": 0.08318189261481165, + "epoch": 3.9068656020515213, + "grad_norm": 0.90625, + "learning_rate": 4.975536994968537e-05, + "loss": 0.0855, + "mean_token_accuracy": 0.9774995982646942, + "num_tokens": 34288938.0, + "step": 16760 + }, + { + "entropy": 0.07968087457120418, + "epoch": 3.908031239072153, + "grad_norm": 0.6796875, + "learning_rate": 4.975506935425659e-05, + "loss": 0.1019, + "mean_token_accuracy": 0.9670353651046752, + "num_tokens": 34311606.0, + "step": 16765 + }, + { + "entropy": 0.11844033598899842, + "epoch": 3.9091968760927847, + "grad_norm": 0.7421875, + "learning_rate": 4.975476857608629e-05, + "loss": 0.1238, + "mean_token_accuracy": 0.962626975774765, + "num_tokens": 34334720.0, + "step": 16770 + }, + { + "entropy": 0.11227045767009258, + "epoch": 3.9103625131134163, + "grad_norm": 8.3125, + "learning_rate": 4.9754467615178965e-05, + "loss": 0.1875, + "mean_token_accuracy": 0.9611839234828949, + "num_tokens": 34365228.0, + "step": 16775 + }, + { + "entropy": 0.09844578094780446, + "epoch": 3.9115281501340484, + "grad_norm": 5.53125, + "learning_rate": 4.975416647153909e-05, + "loss": 0.1339, + "mean_token_accuracy": 0.9626381635665894, + "num_tokens": 34385551.0, + "step": 16780 + }, + { + "entropy": 0.08762693898752331, + "epoch": 3.91269378715468, + "grad_norm": 1.2578125, + "learning_rate": 4.975386514517116e-05, + "loss": 0.1465, + "mean_token_accuracy": 0.9660818219184876, + "num_tokens": 34426491.0, + "step": 16785 + }, + { + "entropy": 0.11643909402191639, + "epoch": 3.9138594241753117, + "grad_norm": 3.0625, + "learning_rate": 4.975356363607967e-05, + "loss": 0.1443, + "mean_token_accuracy": 0.9647420525550843, + "num_tokens": 34445386.0, + "step": 16790 + }, + { + "entropy": 0.07333908714354039, + "epoch": 3.9150250611959434, + "grad_norm": 2.234375, + "learning_rate": 4.975326194426913e-05, + "loss": 0.0984, + "mean_token_accuracy": 0.9713580191135407, + "num_tokens": 34463617.0, + "step": 16795 + }, + { + "entropy": 0.13890503272414206, + "epoch": 3.9161906982165755, + "grad_norm": 2.921875, + "learning_rate": 4.9752960069744e-05, + "loss": 0.17, + "mean_token_accuracy": 0.9563362002372742, + "num_tokens": 34491601.0, + "step": 16800 + }, + { + "entropy": 0.0995341569185257, + "epoch": 3.917356335237207, + "grad_norm": 9.625, + "learning_rate": 4.975265801250882e-05, + "loss": 0.2026, + "mean_token_accuracy": 0.9588484287261962, + "num_tokens": 34500896.0, + "step": 16805 + }, + { + "entropy": 0.13597904723137616, + "epoch": 3.918521972257839, + "grad_norm": 1.40625, + "learning_rate": 4.9752355772568084e-05, + "loss": 0.2381, + "mean_token_accuracy": 0.9384510278701782, + "num_tokens": 34515025.0, + "step": 16810 + }, + { + "entropy": 0.11005090977996587, + "epoch": 3.919687609278471, + "grad_norm": 8.8125, + "learning_rate": 4.9752053349926284e-05, + "loss": 0.1877, + "mean_token_accuracy": 0.9612878561019897, + "num_tokens": 34535079.0, + "step": 16815 + }, + { + "entropy": 0.1047309897840023, + "epoch": 3.9208532462991026, + "grad_norm": 6.03125, + "learning_rate": 4.9751750744587955e-05, + "loss": 0.1837, + "mean_token_accuracy": 0.9651427209377289, + "num_tokens": 34563356.0, + "step": 16820 + }, + { + "entropy": 0.09921911545097828, + "epoch": 3.9220188833197343, + "grad_norm": 1.8359375, + "learning_rate": 4.975144795655758e-05, + "loss": 0.124, + "mean_token_accuracy": 0.9721204698085785, + "num_tokens": 34576625.0, + "step": 16825 + }, + { + "entropy": 0.15272005051374435, + "epoch": 3.923184520340366, + "grad_norm": 2.234375, + "learning_rate": 4.97511449858397e-05, + "loss": 0.1905, + "mean_token_accuracy": 0.9491037011146546, + "num_tokens": 34615813.0, + "step": 16830 + }, + { + "entropy": 0.08232778124511242, + "epoch": 3.9243501573609976, + "grad_norm": 1.359375, + "learning_rate": 4.975084183243882e-05, + "loss": 0.1059, + "mean_token_accuracy": 0.9762903690338135, + "num_tokens": 34626874.0, + "step": 16835 + }, + { + "entropy": 0.0976740401238203, + "epoch": 3.9255157943816297, + "grad_norm": 5.96875, + "learning_rate": 4.975053849635946e-05, + "loss": 0.1366, + "mean_token_accuracy": 0.9610688865184784, + "num_tokens": 34641904.0, + "step": 16840 + }, + { + "entropy": 0.07497776001691818, + "epoch": 3.9266814314022613, + "grad_norm": 0.61328125, + "learning_rate": 4.9750234977606135e-05, + "loss": 0.1081, + "mean_token_accuracy": 0.9722188532352447, + "num_tokens": 34663203.0, + "step": 16845 + }, + { + "entropy": 0.10160439331084489, + "epoch": 3.927847068422893, + "grad_norm": 0.82421875, + "learning_rate": 4.9749931276183394e-05, + "loss": 0.1481, + "mean_token_accuracy": 0.9629201531410218, + "num_tokens": 34682215.0, + "step": 16850 + }, + { + "entropy": 0.08226745296269655, + "epoch": 3.929012705443525, + "grad_norm": 1.328125, + "learning_rate": 4.974962739209574e-05, + "loss": 0.0816, + "mean_token_accuracy": 0.9744136035442352, + "num_tokens": 34698813.0, + "step": 16855 + }, + { + "entropy": 0.16998137403279542, + "epoch": 3.9301783424641568, + "grad_norm": 1.546875, + "learning_rate": 4.974932332534773e-05, + "loss": 0.1684, + "mean_token_accuracy": 0.9295632243156433, + "num_tokens": 34727365.0, + "step": 16860 + }, + { + "entropy": 0.11339530013501645, + "epoch": 3.9313439794847884, + "grad_norm": 3.453125, + "learning_rate": 4.974901907594388e-05, + "loss": 0.2006, + "mean_token_accuracy": 0.9534684479236603, + "num_tokens": 34737824.0, + "step": 16865 + }, + { + "entropy": 0.08613226562738419, + "epoch": 3.93250961650542, + "grad_norm": 2.046875, + "learning_rate": 4.9748714643888736e-05, + "loss": 0.0918, + "mean_token_accuracy": 0.9665167927742004, + "num_tokens": 34767020.0, + "step": 16870 + }, + { + "entropy": 0.08076264001429082, + "epoch": 3.9336752535260517, + "grad_norm": 2.421875, + "learning_rate": 4.9748410029186824e-05, + "loss": 0.0912, + "mean_token_accuracy": 0.978045392036438, + "num_tokens": 34781216.0, + "step": 16875 + }, + { + "entropy": 0.11103322636336088, + "epoch": 3.934840890546684, + "grad_norm": 6.34375, + "learning_rate": 4.974810523184271e-05, + "loss": 0.1654, + "mean_token_accuracy": 0.9646231949329376, + "num_tokens": 34795320.0, + "step": 16880 + }, + { + "entropy": 0.09688258673995734, + "epoch": 3.9360065275673155, + "grad_norm": 0.8515625, + "learning_rate": 4.974780025186091e-05, + "loss": 0.141, + "mean_token_accuracy": 0.9685312807559967, + "num_tokens": 34816339.0, + "step": 16885 + }, + { + "entropy": 0.07648933194577694, + "epoch": 3.937172164587947, + "grad_norm": 1.6328125, + "learning_rate": 4.9747495089246e-05, + "loss": 0.0967, + "mean_token_accuracy": 0.978608375787735, + "num_tokens": 34833298.0, + "step": 16890 + }, + { + "entropy": 0.08467755373567343, + "epoch": 3.9383378016085793, + "grad_norm": 0.5, + "learning_rate": 4.974718974400251e-05, + "loss": 0.0979, + "mean_token_accuracy": 0.9738011240959168, + "num_tokens": 34862645.0, + "step": 16895 + }, + { + "entropy": 0.11560116838663817, + "epoch": 3.939503438629211, + "grad_norm": 1.25, + "learning_rate": 4.9746884216135e-05, + "loss": 0.1156, + "mean_token_accuracy": 0.9693970024585724, + "num_tokens": 34885171.0, + "step": 16900 + }, + { + "entropy": 0.08131022192537785, + "epoch": 3.9406690756498426, + "grad_norm": 4.03125, + "learning_rate": 4.974657850564802e-05, + "loss": 0.1359, + "mean_token_accuracy": 0.9690242052078247, + "num_tokens": 34902737.0, + "step": 16905 + }, + { + "entropy": 0.08744903868064284, + "epoch": 3.9418347126704743, + "grad_norm": 1.59375, + "learning_rate": 4.974627261254614e-05, + "loss": 0.1904, + "mean_token_accuracy": 0.9598405420780182, + "num_tokens": 34922558.0, + "step": 16910 + }, + { + "entropy": 0.09291955698281526, + "epoch": 3.9430003496911064, + "grad_norm": 5.53125, + "learning_rate": 4.974596653683392e-05, + "loss": 0.1861, + "mean_token_accuracy": 0.9584287703037262, + "num_tokens": 34940507.0, + "step": 16915 + }, + { + "entropy": 0.09390394520014525, + "epoch": 3.944165986711738, + "grad_norm": 4.59375, + "learning_rate": 4.9745660278515916e-05, + "loss": 0.1347, + "mean_token_accuracy": 0.9714871406555176, + "num_tokens": 34960577.0, + "step": 16920 + }, + { + "entropy": 0.13802707754075527, + "epoch": 3.9453316237323697, + "grad_norm": 1.7734375, + "learning_rate": 4.97453538375967e-05, + "loss": 0.1361, + "mean_token_accuracy": 0.9666981935501099, + "num_tokens": 34979635.0, + "step": 16925 + }, + { + "entropy": 0.07457001972943544, + "epoch": 3.9464972607530013, + "grad_norm": 0.6875, + "learning_rate": 4.974504721408084e-05, + "loss": 0.1188, + "mean_token_accuracy": 0.9700695991516113, + "num_tokens": 35005275.0, + "step": 16930 + }, + { + "entropy": 0.09323781207203866, + "epoch": 3.9476628977736334, + "grad_norm": 7.4375, + "learning_rate": 4.974474040797291e-05, + "loss": 0.1387, + "mean_token_accuracy": 0.9643499076366424, + "num_tokens": 35024268.0, + "step": 16935 + }, + { + "entropy": 0.10064857602119445, + "epoch": 3.948828534794265, + "grad_norm": 3.96875, + "learning_rate": 4.974443341927748e-05, + "loss": 0.204, + "mean_token_accuracy": 0.9560014724731445, + "num_tokens": 35034041.0, + "step": 16940 + }, + { + "entropy": 0.11008808445185422, + "epoch": 3.9499941718148968, + "grad_norm": 7.28125, + "learning_rate": 4.974412624799913e-05, + "loss": 0.1293, + "mean_token_accuracy": 0.9637737035751343, + "num_tokens": 35054443.0, + "step": 16945 + }, + { + "entropy": 0.1091458585113287, + "epoch": 3.951159808835529, + "grad_norm": 2.984375, + "learning_rate": 4.9743818894142445e-05, + "loss": 0.1661, + "mean_token_accuracy": 0.9582258284091949, + "num_tokens": 35072895.0, + "step": 16950 + }, + { + "entropy": 0.09630255661904812, + "epoch": 3.9523254458561605, + "grad_norm": 2.28125, + "learning_rate": 4.974351135771201e-05, + "loss": 0.1452, + "mean_token_accuracy": 0.9703953325748443, + "num_tokens": 35084442.0, + "step": 16955 + }, + { + "entropy": 0.08614437934011221, + "epoch": 3.953491082876792, + "grad_norm": 3.1875, + "learning_rate": 4.9743203638712394e-05, + "loss": 0.1202, + "mean_token_accuracy": 0.9727041780948639, + "num_tokens": 35118913.0, + "step": 16960 + }, + { + "entropy": 0.09501389507204294, + "epoch": 3.954656719897424, + "grad_norm": 6.75, + "learning_rate": 4.9742895737148204e-05, + "loss": 0.1839, + "mean_token_accuracy": 0.9623470664024353, + "num_tokens": 35142002.0, + "step": 16965 + }, + { + "entropy": 0.10413766689598561, + "epoch": 3.9558223569180555, + "grad_norm": 3.21875, + "learning_rate": 4.974258765302402e-05, + "loss": 0.1568, + "mean_token_accuracy": 0.9626488566398621, + "num_tokens": 35160878.0, + "step": 16970 + }, + { + "entropy": 0.13350182436406613, + "epoch": 3.9569879939386876, + "grad_norm": 4.6875, + "learning_rate": 4.974227938634444e-05, + "loss": 0.1619, + "mean_token_accuracy": 0.9620459258556366, + "num_tokens": 35170449.0, + "step": 16975 + }, + { + "entropy": 0.09258143836632371, + "epoch": 3.9581536309593193, + "grad_norm": 11.0, + "learning_rate": 4.974197093711407e-05, + "loss": 0.1253, + "mean_token_accuracy": 0.9680348873138428, + "num_tokens": 35195509.0, + "step": 16980 + }, + { + "entropy": 0.12735018711537122, + "epoch": 3.959319267979951, + "grad_norm": 2.53125, + "learning_rate": 4.974166230533749e-05, + "loss": 0.1687, + "mean_token_accuracy": 0.9616983890533447, + "num_tokens": 35209393.0, + "step": 16985 + }, + { + "entropy": 0.08677174393087625, + "epoch": 3.960484905000583, + "grad_norm": 8.25, + "learning_rate": 4.974135349101932e-05, + "loss": 0.1203, + "mean_token_accuracy": 0.9645900964736939, + "num_tokens": 35226604.0, + "step": 16990 + }, + { + "entropy": 0.07679356653243304, + "epoch": 3.9616505420212147, + "grad_norm": 1.640625, + "learning_rate": 4.9741044494164155e-05, + "loss": 0.0642, + "mean_token_accuracy": 0.9766354739665986, + "num_tokens": 35255439.0, + "step": 16995 + }, + { + "entropy": 0.12029009442776442, + "epoch": 3.9628161790418464, + "grad_norm": 2.5625, + "learning_rate": 4.97407353147766e-05, + "loss": 0.1779, + "mean_token_accuracy": 0.9526813089847564, + "num_tokens": 35273550.0, + "step": 17000 + }, + { + "entropy": 0.10844596447423101, + "epoch": 3.963981816062478, + "grad_norm": 4.90625, + "learning_rate": 4.9740425952861275e-05, + "loss": 0.1563, + "mean_token_accuracy": 0.964635568857193, + "num_tokens": 35287014.0, + "step": 17005 + }, + { + "entropy": 0.10331882648169995, + "epoch": 3.9651474530831097, + "grad_norm": 1.078125, + "learning_rate": 4.9740116408422786e-05, + "loss": 0.1502, + "mean_token_accuracy": 0.9674645781517028, + "num_tokens": 35308920.0, + "step": 17010 + }, + { + "entropy": 0.08422896657139063, + "epoch": 3.966313090103742, + "grad_norm": 1.59375, + "learning_rate": 4.973980668146575e-05, + "loss": 0.1126, + "mean_token_accuracy": 0.9708844125270844, + "num_tokens": 35324331.0, + "step": 17015 + }, + { + "entropy": 0.09905137531459332, + "epoch": 3.9674787271243734, + "grad_norm": 4.6875, + "learning_rate": 4.973949677199479e-05, + "loss": 0.1579, + "mean_token_accuracy": 0.9681069612503052, + "num_tokens": 35337312.0, + "step": 17020 + }, + { + "entropy": 0.10593837816268206, + "epoch": 3.968644364145005, + "grad_norm": 8.75, + "learning_rate": 4.9739186680014525e-05, + "loss": 0.1572, + "mean_token_accuracy": 0.9576498866081238, + "num_tokens": 35369925.0, + "step": 17025 + }, + { + "entropy": 0.1549563642591238, + "epoch": 3.969810001165637, + "grad_norm": 3.921875, + "learning_rate": 4.973887640552958e-05, + "loss": 0.2213, + "mean_token_accuracy": 0.9465202629566193, + "num_tokens": 35394038.0, + "step": 17030 + }, + { + "entropy": 0.0999618673697114, + "epoch": 3.970975638186269, + "grad_norm": 0.8203125, + "learning_rate": 4.973856594854457e-05, + "loss": 0.1193, + "mean_token_accuracy": 0.9646108150482178, + "num_tokens": 35422201.0, + "step": 17035 + }, + { + "entropy": 0.0862261445261538, + "epoch": 3.9721412752069005, + "grad_norm": 0.94140625, + "learning_rate": 4.973825530906414e-05, + "loss": 0.0418, + "mean_token_accuracy": 0.9761014401912689, + "num_tokens": 35458470.0, + "step": 17040 + }, + { + "entropy": 0.0714691722765565, + "epoch": 3.973306912227532, + "grad_norm": 0.87109375, + "learning_rate": 4.9737944487092914e-05, + "loss": 0.0984, + "mean_token_accuracy": 0.9735678911209107, + "num_tokens": 35478787.0, + "step": 17045 + }, + { + "entropy": 0.12230119397863745, + "epoch": 3.9744725492481643, + "grad_norm": 0.75, + "learning_rate": 4.973763348263553e-05, + "loss": 0.1428, + "mean_token_accuracy": 0.9668144583702087, + "num_tokens": 35505383.0, + "step": 17050 + }, + { + "entropy": 0.1031965110450983, + "epoch": 3.975638186268796, + "grad_norm": 3.875, + "learning_rate": 4.973732229569662e-05, + "loss": 0.1709, + "mean_token_accuracy": 0.9613557398319245, + "num_tokens": 35518250.0, + "step": 17055 + }, + { + "entropy": 0.09054919630289078, + "epoch": 3.9768038232894276, + "grad_norm": 4.78125, + "learning_rate": 4.9737010926280844e-05, + "loss": 0.0919, + "mean_token_accuracy": 0.968346893787384, + "num_tokens": 35553291.0, + "step": 17060 + }, + { + "entropy": 0.1308944163378328, + "epoch": 3.9779694603100593, + "grad_norm": 0.69921875, + "learning_rate": 4.9736699374392816e-05, + "loss": 0.1891, + "mean_token_accuracy": 0.9522222697734832, + "num_tokens": 35577233.0, + "step": 17065 + }, + { + "entropy": 0.09694569632411003, + "epoch": 3.9791350973306914, + "grad_norm": 5.125, + "learning_rate": 4.9736387640037195e-05, + "loss": 0.1191, + "mean_token_accuracy": 0.969031709432602, + "num_tokens": 35610320.0, + "step": 17070 + }, + { + "entropy": 0.08072161357849836, + "epoch": 3.980300734351323, + "grad_norm": 3.25, + "learning_rate": 4.9736075723218636e-05, + "loss": 0.1532, + "mean_token_accuracy": 0.9647042334079743, + "num_tokens": 35626307.0, + "step": 17075 + }, + { + "entropy": 0.07369415275752544, + "epoch": 3.9814663713719547, + "grad_norm": 1.1015625, + "learning_rate": 4.973576362394178e-05, + "loss": 0.0809, + "mean_token_accuracy": 0.9759980022907258, + "num_tokens": 35644968.0, + "step": 17080 + }, + { + "entropy": 0.10177280581556261, + "epoch": 3.982632008392587, + "grad_norm": 1.9765625, + "learning_rate": 4.973545134221128e-05, + "loss": 0.0818, + "mean_token_accuracy": 0.9739609241485596, + "num_tokens": 35666050.0, + "step": 17085 + }, + { + "entropy": 0.06352849621325732, + "epoch": 3.9837976454132185, + "grad_norm": 0.921875, + "learning_rate": 4.973513887803181e-05, + "loss": 0.0537, + "mean_token_accuracy": 0.9793843626976013, + "num_tokens": 35689090.0, + "step": 17090 + }, + { + "entropy": 0.09791185222566127, + "epoch": 3.98496328243385, + "grad_norm": 4.53125, + "learning_rate": 4.973482623140801e-05, + "loss": 0.1461, + "mean_token_accuracy": 0.9674197316169739, + "num_tokens": 35698247.0, + "step": 17095 + }, + { + "entropy": 0.06252257125452161, + "epoch": 3.986128919454482, + "grad_norm": 0.59375, + "learning_rate": 4.973451340234454e-05, + "loss": 0.076, + "mean_token_accuracy": 0.9785351634025574, + "num_tokens": 35722602.0, + "step": 17100 + }, + { + "entropy": 0.09273346532136202, + "epoch": 3.9872945564751134, + "grad_norm": 0.5546875, + "learning_rate": 4.973420039084608e-05, + "loss": 0.1375, + "mean_token_accuracy": 0.965330445766449, + "num_tokens": 35759998.0, + "step": 17105 + }, + { + "entropy": 0.10029075648635626, + "epoch": 3.9884601934957455, + "grad_norm": 6.0, + "learning_rate": 4.97338871969173e-05, + "loss": 0.1704, + "mean_token_accuracy": 0.962207305431366, + "num_tokens": 35778283.0, + "step": 17110 + }, + { + "entropy": 0.16393357664346694, + "epoch": 3.989625830516377, + "grad_norm": 7.5625, + "learning_rate": 4.973357382056285e-05, + "loss": 0.2825, + "mean_token_accuracy": 0.9334946393966674, + "num_tokens": 35790208.0, + "step": 17115 + }, + { + "entropy": 0.10811770148575306, + "epoch": 3.990791467537009, + "grad_norm": 2.953125, + "learning_rate": 4.9733260261787415e-05, + "loss": 0.1301, + "mean_token_accuracy": 0.9654451310634613, + "num_tokens": 35807988.0, + "step": 17120 + }, + { + "entropy": 0.11802220270037651, + "epoch": 3.991957104557641, + "grad_norm": 0.734375, + "learning_rate": 4.973294652059568e-05, + "loss": 0.121, + "mean_token_accuracy": 0.9620232224464417, + "num_tokens": 35830113.0, + "step": 17125 + }, + { + "entropy": 0.12250925246626139, + "epoch": 3.9931227415782726, + "grad_norm": 5.1875, + "learning_rate": 4.9732632596992304e-05, + "loss": 0.1242, + "mean_token_accuracy": 0.9520814001560212, + "num_tokens": 35852219.0, + "step": 17130 + }, + { + "entropy": 0.11308533251285553, + "epoch": 3.9942883785989043, + "grad_norm": 5.9375, + "learning_rate": 4.973231849098197e-05, + "loss": 0.1257, + "mean_token_accuracy": 0.9568483293056488, + "num_tokens": 35869300.0, + "step": 17135 + }, + { + "entropy": 0.0839522771537304, + "epoch": 3.995454015619536, + "grad_norm": 3.046875, + "learning_rate": 4.973200420256938e-05, + "loss": 0.1611, + "mean_token_accuracy": 0.9654144763946533, + "num_tokens": 35886692.0, + "step": 17140 + }, + { + "entropy": 0.09023104514926672, + "epoch": 3.9966196526401676, + "grad_norm": 4.96875, + "learning_rate": 4.9731689731759204e-05, + "loss": 0.0774, + "mean_token_accuracy": 0.9785473227500916, + "num_tokens": 35910204.0, + "step": 17145 + }, + { + "entropy": 0.11231829710304737, + "epoch": 3.9977852896607997, + "grad_norm": 2.640625, + "learning_rate": 4.973137507855614e-05, + "loss": 0.0889, + "mean_token_accuracy": 0.9724850535392762, + "num_tokens": 35927582.0, + "step": 17150 + }, + { + "entropy": 0.0904229398816824, + "epoch": 3.9989509266814314, + "grad_norm": 3.125, + "learning_rate": 4.9731060242964875e-05, + "loss": 0.1323, + "mean_token_accuracy": 0.967452985048294, + "num_tokens": 35941857.0, + "step": 17155 + }, + { + "entropy": 0.09246301154295604, + "epoch": 4.0, + "grad_norm": 2.234375, + "learning_rate": 4.97307452249901e-05, + "loss": 0.107, + "mean_token_accuracy": 0.9710255795054965, + "num_tokens": 35961500.0, + "step": 17160 + }, + { + "entropy": 0.07402282971888781, + "epoch": 4.001165637020632, + "grad_norm": 1.84375, + "learning_rate": 4.973043002463653e-05, + "loss": 0.0689, + "mean_token_accuracy": 0.9799567878246307, + "num_tokens": 35977206.0, + "step": 17165 + }, + { + "entropy": 0.054261915292590855, + "epoch": 4.002331274041263, + "grad_norm": 2.078125, + "learning_rate": 4.973011464190884e-05, + "loss": 0.0509, + "mean_token_accuracy": 0.9852329611778259, + "num_tokens": 35999402.0, + "step": 17170 + }, + { + "entropy": 0.07321383291855454, + "epoch": 4.003496911061895, + "grad_norm": 1.34375, + "learning_rate": 4.9729799076811756e-05, + "loss": 0.0939, + "mean_token_accuracy": 0.9778148233890533, + "num_tokens": 36018866.0, + "step": 17175 + }, + { + "entropy": 0.07864432521164418, + "epoch": 4.0046625480825275, + "grad_norm": 4.71875, + "learning_rate": 4.972948332934997e-05, + "loss": 0.1028, + "mean_token_accuracy": 0.9760511696338654, + "num_tokens": 36037205.0, + "step": 17180 + }, + { + "entropy": 0.04691657172515988, + "epoch": 4.005828185103159, + "grad_norm": 0.337890625, + "learning_rate": 4.972916739952819e-05, + "loss": 0.0351, + "mean_token_accuracy": 0.9869137167930603, + "num_tokens": 36064889.0, + "step": 17185 + }, + { + "entropy": 0.0748681684024632, + "epoch": 4.006993822123791, + "grad_norm": 5.0625, + "learning_rate": 4.972885128735113e-05, + "loss": 0.0701, + "mean_token_accuracy": 0.9769394755363464, + "num_tokens": 36098320.0, + "step": 17190 + }, + { + "entropy": 0.08668388687074184, + "epoch": 4.008159459144422, + "grad_norm": 2.75, + "learning_rate": 4.972853499282351e-05, + "loss": 0.1585, + "mean_token_accuracy": 0.9724346101284027, + "num_tokens": 36117696.0, + "step": 17195 + }, + { + "entropy": 0.07840844243764877, + "epoch": 4.009325096165054, + "grad_norm": 3.40625, + "learning_rate": 4.9728218515950034e-05, + "loss": 0.0892, + "mean_token_accuracy": 0.9731666326522828, + "num_tokens": 36134581.0, + "step": 17200 + }, + { + "entropy": 0.07370718382298946, + "epoch": 4.010490733185686, + "grad_norm": 0.734375, + "learning_rate": 4.972790185673544e-05, + "loss": 0.0803, + "mean_token_accuracy": 0.9789455652236938, + "num_tokens": 36153047.0, + "step": 17205 + }, + { + "entropy": 0.083255111053586, + "epoch": 4.0116563702063175, + "grad_norm": 3.453125, + "learning_rate": 4.972758501518442e-05, + "loss": 0.0689, + "mean_token_accuracy": 0.9771815776824951, + "num_tokens": 36185090.0, + "step": 17210 + }, + { + "entropy": 0.05564054492861033, + "epoch": 4.01282200722695, + "grad_norm": 0.97265625, + "learning_rate": 4.972726799130173e-05, + "loss": 0.0613, + "mean_token_accuracy": 0.9788758993148804, + "num_tokens": 36198374.0, + "step": 17215 + }, + { + "entropy": 0.061925210803747174, + "epoch": 4.013987644247582, + "grad_norm": 0.8828125, + "learning_rate": 4.9726950785092074e-05, + "loss": 0.077, + "mean_token_accuracy": 0.9766283810138703, + "num_tokens": 36219825.0, + "step": 17220 + }, + { + "entropy": 0.05524555239826441, + "epoch": 4.015153281268213, + "grad_norm": 3.859375, + "learning_rate": 4.9726633396560195e-05, + "loss": 0.0552, + "mean_token_accuracy": 0.9815401554107666, + "num_tokens": 36243277.0, + "step": 17225 + }, + { + "entropy": 0.05170870460569858, + "epoch": 4.016318918288845, + "grad_norm": 0.86328125, + "learning_rate": 4.9726315825710824e-05, + "loss": 0.0542, + "mean_token_accuracy": 0.9843271493911743, + "num_tokens": 36258311.0, + "step": 17230 + }, + { + "entropy": 0.08237220542505383, + "epoch": 4.017484555309476, + "grad_norm": 1.1953125, + "learning_rate": 4.972599807254869e-05, + "loss": 0.0775, + "mean_token_accuracy": 0.9783424258232116, + "num_tokens": 36277676.0, + "step": 17235 + }, + { + "entropy": 0.09206105470657348, + "epoch": 4.018650192330108, + "grad_norm": 2.28125, + "learning_rate": 4.972568013707854e-05, + "loss": 0.115, + "mean_token_accuracy": 0.9744808137416839, + "num_tokens": 36285581.0, + "step": 17240 + }, + { + "entropy": 0.061206897348165513, + "epoch": 4.01981582935074, + "grad_norm": 2.5625, + "learning_rate": 4.9725362019305113e-05, + "loss": 0.0575, + "mean_token_accuracy": 0.9826982498168946, + "num_tokens": 36299733.0, + "step": 17245 + }, + { + "entropy": 0.07337401024997234, + "epoch": 4.020981466371372, + "grad_norm": 3.25, + "learning_rate": 4.9725043719233145e-05, + "loss": 0.1017, + "mean_token_accuracy": 0.9742031693458557, + "num_tokens": 36309070.0, + "step": 17250 + }, + { + "entropy": 0.07909293696284295, + "epoch": 4.022147103392004, + "grad_norm": 1.046875, + "learning_rate": 4.972472523686739e-05, + "loss": 0.095, + "mean_token_accuracy": 0.9729432463645935, + "num_tokens": 36320642.0, + "step": 17255 + }, + { + "entropy": 0.07407073359936475, + "epoch": 4.023312740412636, + "grad_norm": 3.109375, + "learning_rate": 4.972440657221259e-05, + "loss": 0.0553, + "mean_token_accuracy": 0.9820687472820282, + "num_tokens": 36339974.0, + "step": 17260 + }, + { + "entropy": 0.05830348208546639, + "epoch": 4.024478377433267, + "grad_norm": 0.97265625, + "learning_rate": 4.9724087725273504e-05, + "loss": 0.0615, + "mean_token_accuracy": 0.9839694261550903, + "num_tokens": 36352595.0, + "step": 17265 + }, + { + "entropy": 0.07738137934356928, + "epoch": 4.025644014453899, + "grad_norm": 0.451171875, + "learning_rate": 4.972376869605489e-05, + "loss": 0.0682, + "mean_token_accuracy": 0.9771198868751526, + "num_tokens": 36371942.0, + "step": 17270 + }, + { + "entropy": 0.07826981553807855, + "epoch": 4.02680965147453, + "grad_norm": 1.9765625, + "learning_rate": 4.972344948456149e-05, + "loss": 0.0744, + "mean_token_accuracy": 0.9794186532497406, + "num_tokens": 36402138.0, + "step": 17275 + }, + { + "entropy": 0.07734978701919318, + "epoch": 4.0279752884951625, + "grad_norm": 0.59375, + "learning_rate": 4.9723130090798083e-05, + "loss": 0.0745, + "mean_token_accuracy": 0.9787839591503144, + "num_tokens": 36425296.0, + "step": 17280 + }, + { + "entropy": 0.06703787222504616, + "epoch": 4.029140925515795, + "grad_norm": 0.734375, + "learning_rate": 4.972281051476941e-05, + "loss": 0.0682, + "mean_token_accuracy": 0.981098610162735, + "num_tokens": 36439694.0, + "step": 17285 + }, + { + "entropy": 0.06402443414554, + "epoch": 4.030306562536426, + "grad_norm": 0.205078125, + "learning_rate": 4.9722490756480256e-05, + "loss": 0.0877, + "mean_token_accuracy": 0.9755900740623474, + "num_tokens": 36469001.0, + "step": 17290 + }, + { + "entropy": 0.05503675839863718, + "epoch": 4.031472199557058, + "grad_norm": 4.875, + "learning_rate": 4.972217081593538e-05, + "loss": 0.0577, + "mean_token_accuracy": 0.982936006784439, + "num_tokens": 36506712.0, + "step": 17295 + }, + { + "entropy": 0.05822970187291503, + "epoch": 4.03263783657769, + "grad_norm": 0.60546875, + "learning_rate": 4.9721850693139555e-05, + "loss": 0.0667, + "mean_token_accuracy": 0.9810732364654541, + "num_tokens": 36533546.0, + "step": 17300 + }, + { + "entropy": 0.060767234954983, + "epoch": 4.033803473598321, + "grad_norm": 1.1171875, + "learning_rate": 4.972153038809755e-05, + "loss": 0.067, + "mean_token_accuracy": 0.9782026529312133, + "num_tokens": 36552953.0, + "step": 17305 + }, + { + "entropy": 0.10037664864212274, + "epoch": 4.034969110618953, + "grad_norm": 0.8359375, + "learning_rate": 4.9721209900814144e-05, + "loss": 0.0946, + "mean_token_accuracy": 0.9746870577335358, + "num_tokens": 36582455.0, + "step": 17310 + }, + { + "entropy": 0.07259051175788045, + "epoch": 4.0361347476395855, + "grad_norm": 5.28125, + "learning_rate": 4.972088923129412e-05, + "loss": 0.0942, + "mean_token_accuracy": 0.9792055189609528, + "num_tokens": 36600081.0, + "step": 17315 + }, + { + "entropy": 0.0708609121851623, + "epoch": 4.037300384660217, + "grad_norm": 4.71875, + "learning_rate": 4.972056837954226e-05, + "loss": 0.1032, + "mean_token_accuracy": 0.97678844332695, + "num_tokens": 36616329.0, + "step": 17320 + }, + { + "entropy": 0.07505875267088413, + "epoch": 4.038466021680849, + "grad_norm": 3.09375, + "learning_rate": 4.972024734556334e-05, + "loss": 0.1161, + "mean_token_accuracy": 0.9763146281242371, + "num_tokens": 36636315.0, + "step": 17325 + }, + { + "entropy": 0.07826881892979146, + "epoch": 4.03963165870148, + "grad_norm": 2.109375, + "learning_rate": 4.971992612936215e-05, + "loss": 0.0653, + "mean_token_accuracy": 0.981414407491684, + "num_tokens": 36651632.0, + "step": 17330 + }, + { + "entropy": 0.06351137179881335, + "epoch": 4.040797295722112, + "grad_norm": 0.47265625, + "learning_rate": 4.9719604730943485e-05, + "loss": 0.0714, + "mean_token_accuracy": 0.9803904712200164, + "num_tokens": 36678307.0, + "step": 17335 + }, + { + "entropy": 0.05514317499473691, + "epoch": 4.041962932742744, + "grad_norm": 3.28125, + "learning_rate": 4.971928315031213e-05, + "loss": 0.0582, + "mean_token_accuracy": 0.9824005544185639, + "num_tokens": 36705678.0, + "step": 17340 + }, + { + "entropy": 0.08399317860603332, + "epoch": 4.043128569763375, + "grad_norm": 3.71875, + "learning_rate": 4.971896138747289e-05, + "loss": 0.1178, + "mean_token_accuracy": 0.9715703725814819, + "num_tokens": 36717184.0, + "step": 17345 + }, + { + "entropy": 0.06782543286681175, + "epoch": 4.0442942067840075, + "grad_norm": 1.5078125, + "learning_rate": 4.971863944243055e-05, + "loss": 0.0828, + "mean_token_accuracy": 0.9791005492210388, + "num_tokens": 36728421.0, + "step": 17350 + }, + { + "entropy": 0.06589771201834083, + "epoch": 4.04545984380464, + "grad_norm": 2.03125, + "learning_rate": 4.9718317315189926e-05, + "loss": 0.0638, + "mean_token_accuracy": 0.9831419050693512, + "num_tokens": 36745939.0, + "step": 17355 + }, + { + "entropy": 0.06487294095568359, + "epoch": 4.046625480825271, + "grad_norm": 0.37109375, + "learning_rate": 4.971799500575581e-05, + "loss": 0.0922, + "mean_token_accuracy": 0.9786526322364807, + "num_tokens": 36768446.0, + "step": 17360 + }, + { + "entropy": 0.0821248460561037, + "epoch": 4.047791117845903, + "grad_norm": 6.84375, + "learning_rate": 4.971767251413301e-05, + "loss": 0.1065, + "mean_token_accuracy": 0.9760565519332886, + "num_tokens": 36781290.0, + "step": 17365 + }, + { + "entropy": 0.08352366182953119, + "epoch": 4.048956754866534, + "grad_norm": 0.5703125, + "learning_rate": 4.9717349840326344e-05, + "loss": 0.0809, + "mean_token_accuracy": 0.977725625038147, + "num_tokens": 36803795.0, + "step": 17370 + }, + { + "entropy": 0.07390208840370179, + "epoch": 4.050122391887166, + "grad_norm": 1.9453125, + "learning_rate": 4.971702698434061e-05, + "loss": 0.0779, + "mean_token_accuracy": 0.9757682621479035, + "num_tokens": 36816091.0, + "step": 17375 + }, + { + "entropy": 0.0756676783785224, + "epoch": 4.051288028907798, + "grad_norm": 7.28125, + "learning_rate": 4.9716703946180626e-05, + "loss": 0.0949, + "mean_token_accuracy": 0.9772179245948791, + "num_tokens": 36827385.0, + "step": 17380 + }, + { + "entropy": 0.06811714279465378, + "epoch": 4.05245366592843, + "grad_norm": 5.96875, + "learning_rate": 4.971638072585121e-05, + "loss": 0.0769, + "mean_token_accuracy": 0.9762770175933838, + "num_tokens": 36846237.0, + "step": 17385 + }, + { + "entropy": 0.07330656386911868, + "epoch": 4.053619302949062, + "grad_norm": 2.578125, + "learning_rate": 4.971605732335719e-05, + "loss": 0.0601, + "mean_token_accuracy": 0.9823174595832824, + "num_tokens": 36863604.0, + "step": 17390 + }, + { + "entropy": 0.08408041261136531, + "epoch": 4.054784939969694, + "grad_norm": 5.4375, + "learning_rate": 4.971573373870338e-05, + "loss": 0.0875, + "mean_token_accuracy": 0.9787028014659882, + "num_tokens": 36873623.0, + "step": 17395 + }, + { + "entropy": 0.09213002622127534, + "epoch": 4.055950576990325, + "grad_norm": 6.6875, + "learning_rate": 4.97154099718946e-05, + "loss": 0.1429, + "mean_token_accuracy": 0.9639403820037842, + "num_tokens": 36882941.0, + "step": 17400 + }, + { + "entropy": 0.04812620813027024, + "epoch": 4.057116214010957, + "grad_norm": 1.3203125, + "learning_rate": 4.971508602293569e-05, + "loss": 0.057, + "mean_token_accuracy": 0.9856364369392395, + "num_tokens": 36906085.0, + "step": 17405 + }, + { + "entropy": 0.08739807661622763, + "epoch": 4.058281851031588, + "grad_norm": 3.53125, + "learning_rate": 4.971476189183148e-05, + "loss": 0.1254, + "mean_token_accuracy": 0.9713817596435547, + "num_tokens": 36930703.0, + "step": 17410 + }, + { + "entropy": 0.07933282367885112, + "epoch": 4.05944748805222, + "grad_norm": 1.9765625, + "learning_rate": 4.971443757858679e-05, + "loss": 0.0951, + "mean_token_accuracy": 0.9795554041862488, + "num_tokens": 36939229.0, + "step": 17415 + }, + { + "entropy": 0.054528007935732606, + "epoch": 4.0606131250728525, + "grad_norm": 1.546875, + "learning_rate": 4.971411308320646e-05, + "loss": 0.0366, + "mean_token_accuracy": 0.987425422668457, + "num_tokens": 36965884.0, + "step": 17420 + }, + { + "entropy": 0.06623607650399208, + "epoch": 4.061778762093484, + "grad_norm": 1.8046875, + "learning_rate": 4.971378840569534e-05, + "loss": 0.0669, + "mean_token_accuracy": 0.9831463754177093, + "num_tokens": 36983099.0, + "step": 17425 + }, + { + "entropy": 0.06812118962407113, + "epoch": 4.062944399114116, + "grad_norm": 2.0, + "learning_rate": 4.971346354605826e-05, + "loss": 0.0707, + "mean_token_accuracy": 0.9800431668758393, + "num_tokens": 37001866.0, + "step": 17430 + }, + { + "entropy": 0.05892649330198765, + "epoch": 4.064110036134748, + "grad_norm": 1.6953125, + "learning_rate": 4.971313850430007e-05, + "loss": 0.0749, + "mean_token_accuracy": 0.9826079845428467, + "num_tokens": 37019444.0, + "step": 17435 + }, + { + "entropy": 0.06661005020141601, + "epoch": 4.065275673155379, + "grad_norm": 1.328125, + "learning_rate": 4.971281328042562e-05, + "loss": 0.0779, + "mean_token_accuracy": 0.9750029504299164, + "num_tokens": 37041691.0, + "step": 17440 + }, + { + "entropy": 0.07503235340118408, + "epoch": 4.066441310176011, + "grad_norm": 1.1328125, + "learning_rate": 4.971248787443975e-05, + "loss": 0.0753, + "mean_token_accuracy": 0.9808429062366486, + "num_tokens": 37064095.0, + "step": 17445 + }, + { + "entropy": 0.05903323907405138, + "epoch": 4.067606947196643, + "grad_norm": 1.2421875, + "learning_rate": 4.971216228634732e-05, + "loss": 0.0432, + "mean_token_accuracy": 0.9815847158432007, + "num_tokens": 37084047.0, + "step": 17450 + }, + { + "entropy": 0.07304359413683414, + "epoch": 4.068772584217275, + "grad_norm": 1.1015625, + "learning_rate": 4.971183651615318e-05, + "loss": 0.0475, + "mean_token_accuracy": 0.9817767202854156, + "num_tokens": 37108418.0, + "step": 17455 + }, + { + "entropy": 0.06335212141275406, + "epoch": 4.069938221237907, + "grad_norm": 2.109375, + "learning_rate": 4.971151056386219e-05, + "loss": 0.0475, + "mean_token_accuracy": 0.9812131524085999, + "num_tokens": 37141312.0, + "step": 17460 + }, + { + "entropy": 0.10107212541624903, + "epoch": 4.071103858258538, + "grad_norm": 4.40625, + "learning_rate": 4.9711184429479215e-05, + "loss": 0.1212, + "mean_token_accuracy": 0.9684840321540833, + "num_tokens": 37154069.0, + "step": 17465 + }, + { + "entropy": 0.09124523922801017, + "epoch": 4.07226949527917, + "grad_norm": 4.5625, + "learning_rate": 4.971085811300911e-05, + "loss": 0.1407, + "mean_token_accuracy": 0.9691009283065796, + "num_tokens": 37163700.0, + "step": 17470 + }, + { + "entropy": 0.05337847024202347, + "epoch": 4.073435132299802, + "grad_norm": 0.8984375, + "learning_rate": 4.971053161445674e-05, + "loss": 0.0633, + "mean_token_accuracy": 0.9758882224559784, + "num_tokens": 37192961.0, + "step": 17475 + }, + { + "entropy": 0.0389715145342052, + "epoch": 4.074600769320433, + "grad_norm": 2.875, + "learning_rate": 4.971020493382698e-05, + "loss": 0.0377, + "mean_token_accuracy": 0.9893102467060089, + "num_tokens": 37223557.0, + "step": 17480 + }, + { + "entropy": 0.09736668840050697, + "epoch": 4.0757664063410655, + "grad_norm": 3.5, + "learning_rate": 4.97098780711247e-05, + "loss": 0.0921, + "mean_token_accuracy": 0.9757306635379791, + "num_tokens": 37241031.0, + "step": 17485 + }, + { + "entropy": 0.07404592633247375, + "epoch": 4.076932043361698, + "grad_norm": 1.828125, + "learning_rate": 4.9709551026354775e-05, + "loss": 0.0784, + "mean_token_accuracy": 0.9740024745464325, + "num_tokens": 37274714.0, + "step": 17490 + }, + { + "entropy": 0.06818081270903349, + "epoch": 4.078097680382329, + "grad_norm": 4.1875, + "learning_rate": 4.970922379952208e-05, + "loss": 0.0871, + "mean_token_accuracy": 0.9766332447528839, + "num_tokens": 37285581.0, + "step": 17495 + }, + { + "entropy": 0.056024549342691896, + "epoch": 4.079263317402961, + "grad_norm": 0.38671875, + "learning_rate": 4.97088963906315e-05, + "loss": 0.0482, + "mean_token_accuracy": 0.9804753720760345, + "num_tokens": 37305223.0, + "step": 17500 + }, + { + "entropy": 0.06913188584148884, + "epoch": 4.080428954423592, + "grad_norm": 7.875, + "learning_rate": 4.97085687996879e-05, + "loss": 0.097, + "mean_token_accuracy": 0.9744450330734253, + "num_tokens": 37325723.0, + "step": 17505 + }, + { + "entropy": 0.04517441475763917, + "epoch": 4.081594591444224, + "grad_norm": 1.609375, + "learning_rate": 4.9708241026696186e-05, + "loss": 0.0373, + "mean_token_accuracy": 0.9858837187290191, + "num_tokens": 37353645.0, + "step": 17510 + }, + { + "entropy": 0.06553271301090717, + "epoch": 4.082760228464856, + "grad_norm": 2.34375, + "learning_rate": 4.9707913071661225e-05, + "loss": 0.0637, + "mean_token_accuracy": 0.983744639158249, + "num_tokens": 37375003.0, + "step": 17515 + }, + { + "entropy": 0.06103522032499313, + "epoch": 4.0839258654854875, + "grad_norm": 2.4375, + "learning_rate": 4.970758493458793e-05, + "loss": 0.0575, + "mean_token_accuracy": 0.9773332476615906, + "num_tokens": 37398649.0, + "step": 17520 + }, + { + "entropy": 0.06978270523250103, + "epoch": 4.08509150250612, + "grad_norm": 2.640625, + "learning_rate": 4.970725661548118e-05, + "loss": 0.0539, + "mean_token_accuracy": 0.9787450671195984, + "num_tokens": 37421294.0, + "step": 17525 + }, + { + "entropy": 0.06557549051940441, + "epoch": 4.086257139526752, + "grad_norm": 0.291015625, + "learning_rate": 4.970692811434587e-05, + "loss": 0.0543, + "mean_token_accuracy": 0.9738318443298339, + "num_tokens": 37458108.0, + "step": 17530 + }, + { + "entropy": 0.0798257420770824, + "epoch": 4.087422776547383, + "grad_norm": 3.21875, + "learning_rate": 4.97065994311869e-05, + "loss": 0.0796, + "mean_token_accuracy": 0.9754477143287659, + "num_tokens": 37477809.0, + "step": 17535 + }, + { + "entropy": 0.04820524742826819, + "epoch": 4.088588413568015, + "grad_norm": 0.51953125, + "learning_rate": 4.9706270566009174e-05, + "loss": 0.0464, + "mean_token_accuracy": 0.9842685222625732, + "num_tokens": 37509032.0, + "step": 17540 + }, + { + "entropy": 0.07802566029131412, + "epoch": 4.089754050588646, + "grad_norm": 0.6328125, + "learning_rate": 4.9705941518817594e-05, + "loss": 0.1019, + "mean_token_accuracy": 0.9741903364658355, + "num_tokens": 37532273.0, + "step": 17545 + }, + { + "entropy": 0.06755186822265387, + "epoch": 4.090919687609278, + "grad_norm": 0.6796875, + "learning_rate": 4.970561228961707e-05, + "loss": 0.0649, + "mean_token_accuracy": 0.9817712068557739, + "num_tokens": 37546529.0, + "step": 17550 + }, + { + "entropy": 0.06730409227311611, + "epoch": 4.0920853246299105, + "grad_norm": 0.462890625, + "learning_rate": 4.970528287841251e-05, + "loss": 0.0784, + "mean_token_accuracy": 0.9731522858142853, + "num_tokens": 37567044.0, + "step": 17555 + }, + { + "entropy": 0.05120256347581744, + "epoch": 4.093250961650542, + "grad_norm": 1.0078125, + "learning_rate": 4.9704953285208825e-05, + "loss": 0.055, + "mean_token_accuracy": 0.983589905500412, + "num_tokens": 37594124.0, + "step": 17560 + }, + { + "entropy": 0.04717886643484235, + "epoch": 4.094416598671174, + "grad_norm": 0.8828125, + "learning_rate": 4.9704623510010926e-05, + "loss": 0.0396, + "mean_token_accuracy": 0.9903761804103851, + "num_tokens": 37624057.0, + "step": 17565 + }, + { + "entropy": 0.0668092598207295, + "epoch": 4.095582235691806, + "grad_norm": 0.357421875, + "learning_rate": 4.9704293552823736e-05, + "loss": 0.0836, + "mean_token_accuracy": 0.975571358203888, + "num_tokens": 37645861.0, + "step": 17570 + }, + { + "entropy": 0.08568244017660617, + "epoch": 4.096747872712437, + "grad_norm": 2.4375, + "learning_rate": 4.970396341365217e-05, + "loss": 0.1166, + "mean_token_accuracy": 0.9714062631130218, + "num_tokens": 37655194.0, + "step": 17575 + }, + { + "entropy": 0.08841509446501732, + "epoch": 4.097913509733069, + "grad_norm": 3.578125, + "learning_rate": 4.970363309250115e-05, + "loss": 0.1411, + "mean_token_accuracy": 0.9614146292209625, + "num_tokens": 37664421.0, + "step": 17580 + }, + { + "entropy": 0.07518529873341322, + "epoch": 4.099079146753701, + "grad_norm": 1.4765625, + "learning_rate": 4.970330258937561e-05, + "loss": 0.066, + "mean_token_accuracy": 0.9781446993350983, + "num_tokens": 37686842.0, + "step": 17585 + }, + { + "entropy": 0.061594282276928425, + "epoch": 4.1002447837743325, + "grad_norm": 0.484375, + "learning_rate": 4.970297190428047e-05, + "loss": 0.0369, + "mean_token_accuracy": 0.9831397414207459, + "num_tokens": 37710171.0, + "step": 17590 + }, + { + "entropy": 0.075823188200593, + "epoch": 4.101410420794965, + "grad_norm": 2.40625, + "learning_rate": 4.9702641037220674e-05, + "loss": 0.1047, + "mean_token_accuracy": 0.9743071496486664, + "num_tokens": 37720106.0, + "step": 17595 + }, + { + "entropy": 0.04717852883040905, + "epoch": 4.102576057815596, + "grad_norm": 0.61328125, + "learning_rate": 4.970230998820114e-05, + "loss": 0.0397, + "mean_token_accuracy": 0.9836111307144165, + "num_tokens": 37761881.0, + "step": 17600 + }, + { + "entropy": 0.07047794573009014, + "epoch": 4.103741694836228, + "grad_norm": 0.7578125, + "learning_rate": 4.970197875722681e-05, + "loss": 0.0958, + "mean_token_accuracy": 0.9757523596286773, + "num_tokens": 37779264.0, + "step": 17605 + }, + { + "entropy": 0.06992449425160885, + "epoch": 4.10490733185686, + "grad_norm": 4.53125, + "learning_rate": 4.9701647344302624e-05, + "loss": 0.082, + "mean_token_accuracy": 0.9756997048854827, + "num_tokens": 37808887.0, + "step": 17610 + }, + { + "entropy": 0.05605229511857033, + "epoch": 4.106072968877491, + "grad_norm": 1.3203125, + "learning_rate": 4.970131574943352e-05, + "loss": 0.0675, + "mean_token_accuracy": 0.9793569207191467, + "num_tokens": 37832410.0, + "step": 17615 + }, + { + "entropy": 0.07954696863889694, + "epoch": 4.107238605898123, + "grad_norm": 2.5, + "learning_rate": 4.970098397262445e-05, + "loss": 0.0884, + "mean_token_accuracy": 0.9773578584194184, + "num_tokens": 37852432.0, + "step": 17620 + }, + { + "entropy": 0.06925328876823186, + "epoch": 4.1084042429187555, + "grad_norm": 0.8125, + "learning_rate": 4.970065201388036e-05, + "loss": 0.0791, + "mean_token_accuracy": 0.9751325309276581, + "num_tokens": 37866193.0, + "step": 17625 + }, + { + "entropy": 0.0733806163072586, + "epoch": 4.109569879939387, + "grad_norm": 1.328125, + "learning_rate": 4.97003198732062e-05, + "loss": 0.1035, + "mean_token_accuracy": 0.972558718919754, + "num_tokens": 37875542.0, + "step": 17630 + }, + { + "entropy": 0.06492809355258941, + "epoch": 4.110735516960019, + "grad_norm": 0.2041015625, + "learning_rate": 4.9699987550606916e-05, + "loss": 0.0683, + "mean_token_accuracy": 0.9816878855228424, + "num_tokens": 37915793.0, + "step": 17635 + }, + { + "entropy": 0.06843568123877049, + "epoch": 4.11190115398065, + "grad_norm": 3.046875, + "learning_rate": 4.969965504608747e-05, + "loss": 0.0727, + "mean_token_accuracy": 0.980006742477417, + "num_tokens": 37934429.0, + "step": 17640 + }, + { + "entropy": 0.07521101422607898, + "epoch": 4.113066791001282, + "grad_norm": 1.4609375, + "learning_rate": 4.969932235965281e-05, + "loss": 0.0788, + "mean_token_accuracy": 0.9817327499389649, + "num_tokens": 37948166.0, + "step": 17645 + }, + { + "entropy": 0.07668667826801538, + "epoch": 4.114232428021914, + "grad_norm": 0.6328125, + "learning_rate": 4.969898949130791e-05, + "loss": 0.0861, + "mean_token_accuracy": 0.976869261264801, + "num_tokens": 37964025.0, + "step": 17650 + }, + { + "entropy": 0.0827770703472197, + "epoch": 4.1153980650425455, + "grad_norm": 1.609375, + "learning_rate": 4.969865644105773e-05, + "loss": 0.0854, + "mean_token_accuracy": 0.9789432466030121, + "num_tokens": 37982005.0, + "step": 17655 + }, + { + "entropy": 0.06624055663123726, + "epoch": 4.116563702063178, + "grad_norm": 0.400390625, + "learning_rate": 4.969832320890724e-05, + "loss": 0.0507, + "mean_token_accuracy": 0.9831128180027008, + "num_tokens": 38002010.0, + "step": 17660 + }, + { + "entropy": 0.06770137306302786, + "epoch": 4.11772933908381, + "grad_norm": 2.296875, + "learning_rate": 4.96979897948614e-05, + "loss": 0.0553, + "mean_token_accuracy": 0.9802413940429687, + "num_tokens": 38018709.0, + "step": 17665 + }, + { + "entropy": 0.08137813555076719, + "epoch": 4.118894976104441, + "grad_norm": 2.484375, + "learning_rate": 4.969765619892518e-05, + "loss": 0.0963, + "mean_token_accuracy": 0.9716586530208587, + "num_tokens": 38040955.0, + "step": 17670 + }, + { + "entropy": 0.06633272357285022, + "epoch": 4.120060613125073, + "grad_norm": 1.4765625, + "learning_rate": 4.9697322421103564e-05, + "loss": 0.0787, + "mean_token_accuracy": 0.9814836859703064, + "num_tokens": 38063011.0, + "step": 17675 + }, + { + "entropy": 0.0662471629679203, + "epoch": 4.121226250145704, + "grad_norm": 0.56640625, + "learning_rate": 4.969698846140152e-05, + "loss": 0.0929, + "mean_token_accuracy": 0.9769213080406189, + "num_tokens": 38077718.0, + "step": 17680 + }, + { + "entropy": 0.08769634570926428, + "epoch": 4.122391887166336, + "grad_norm": 4.21875, + "learning_rate": 4.969665431982404e-05, + "loss": 0.0851, + "mean_token_accuracy": 0.9764888525009155, + "num_tokens": 38090501.0, + "step": 17685 + }, + { + "entropy": 0.051108538545668124, + "epoch": 4.123557524186968, + "grad_norm": 5.9375, + "learning_rate": 4.96963199963761e-05, + "loss": 0.0593, + "mean_token_accuracy": 0.9822099328041076, + "num_tokens": 38115330.0, + "step": 17690 + }, + { + "entropy": 0.06495916079729795, + "epoch": 4.1247231612076, + "grad_norm": 0.34375, + "learning_rate": 4.9695985491062674e-05, + "loss": 0.0726, + "mean_token_accuracy": 0.9816126704216004, + "num_tokens": 38129691.0, + "step": 17695 + }, + { + "entropy": 0.07993561625480652, + "epoch": 4.125888798228232, + "grad_norm": 0.6796875, + "learning_rate": 4.9695650803888764e-05, + "loss": 0.0707, + "mean_token_accuracy": 0.9806294143199921, + "num_tokens": 38148764.0, + "step": 17700 + }, + { + "entropy": 0.06839036452583969, + "epoch": 4.127054435248864, + "grad_norm": 3.03125, + "learning_rate": 4.969531593485937e-05, + "loss": 0.0456, + "mean_token_accuracy": 0.9780635833740234, + "num_tokens": 38196213.0, + "step": 17705 + }, + { + "entropy": 0.08365984680131078, + "epoch": 4.128220072269495, + "grad_norm": 2.53125, + "learning_rate": 4.969498088397946e-05, + "loss": 0.0896, + "mean_token_accuracy": 0.9761638879776001, + "num_tokens": 38211434.0, + "step": 17710 + }, + { + "entropy": 0.063323774933815, + "epoch": 4.129385709290127, + "grad_norm": 2.0625, + "learning_rate": 4.969464565125404e-05, + "loss": 0.0562, + "mean_token_accuracy": 0.9819753110408783, + "num_tokens": 38228816.0, + "step": 17715 + }, + { + "entropy": 0.08252546712756156, + "epoch": 4.130551346310758, + "grad_norm": 2.203125, + "learning_rate": 4.969431023668812e-05, + "loss": 0.1077, + "mean_token_accuracy": 0.9741408407688141, + "num_tokens": 38238471.0, + "step": 17720 + }, + { + "entropy": 0.06147185619920492, + "epoch": 4.1317169833313905, + "grad_norm": 4.0, + "learning_rate": 4.969397464028669e-05, + "loss": 0.0671, + "mean_token_accuracy": 0.9822175681591034, + "num_tokens": 38256366.0, + "step": 17725 + }, + { + "entropy": 0.057942528277635574, + "epoch": 4.132882620352023, + "grad_norm": 1.703125, + "learning_rate": 4.969363886205476e-05, + "loss": 0.0512, + "mean_token_accuracy": 0.9869576156139374, + "num_tokens": 38283809.0, + "step": 17730 + }, + { + "entropy": 0.0631272815167904, + "epoch": 4.134048257372654, + "grad_norm": 2.53125, + "learning_rate": 4.969330290199733e-05, + "loss": 0.0823, + "mean_token_accuracy": 0.981304931640625, + "num_tokens": 38296792.0, + "step": 17735 + }, + { + "entropy": 0.21083402447402477, + "epoch": 4.135213894393286, + "grad_norm": 1.171875, + "learning_rate": 4.969296676011941e-05, + "loss": 0.4323, + "mean_token_accuracy": 0.95402672290802, + "num_tokens": 38323429.0, + "step": 17740 + }, + { + "entropy": 0.05955973602831364, + "epoch": 4.136379531413918, + "grad_norm": 1.8203125, + "learning_rate": 4.969263043642602e-05, + "loss": 0.0549, + "mean_token_accuracy": 0.9837433099746704, + "num_tokens": 38355751.0, + "step": 17745 + }, + { + "entropy": 0.07743876222521066, + "epoch": 4.137545168434549, + "grad_norm": 1.3515625, + "learning_rate": 4.969229393092218e-05, + "loss": 0.0934, + "mean_token_accuracy": 0.9781845211982727, + "num_tokens": 38374521.0, + "step": 17750 + }, + { + "entropy": 0.059616895485669376, + "epoch": 4.138710805455181, + "grad_norm": 3.875, + "learning_rate": 4.969195724361289e-05, + "loss": 0.0718, + "mean_token_accuracy": 0.9815534234046936, + "num_tokens": 38401132.0, + "step": 17755 + }, + { + "entropy": 0.07294055828824639, + "epoch": 4.139876442475813, + "grad_norm": 2.484375, + "learning_rate": 4.969162037450318e-05, + "loss": 0.1015, + "mean_token_accuracy": 0.9737964630126953, + "num_tokens": 38423254.0, + "step": 17760 + }, + { + "entropy": 0.09278177451342344, + "epoch": 4.141042079496445, + "grad_norm": 3.484375, + "learning_rate": 4.969128332359808e-05, + "loss": 0.0911, + "mean_token_accuracy": 0.9769309103488922, + "num_tokens": 38453027.0, + "step": 17765 + }, + { + "entropy": 0.08387061543762683, + "epoch": 4.142207716517077, + "grad_norm": 1.359375, + "learning_rate": 4.969094609090261e-05, + "loss": 0.1126, + "mean_token_accuracy": 0.9712442636489869, + "num_tokens": 38462528.0, + "step": 17770 + }, + { + "entropy": 0.07613757401704788, + "epoch": 4.143373353537708, + "grad_norm": 5.34375, + "learning_rate": 4.969060867642179e-05, + "loss": 0.0924, + "mean_token_accuracy": 0.97659130692482, + "num_tokens": 38473163.0, + "step": 17775 + }, + { + "entropy": 0.09136496745049953, + "epoch": 4.14453899055834, + "grad_norm": 3.75, + "learning_rate": 4.969027108016065e-05, + "loss": 0.1467, + "mean_token_accuracy": 0.9635896980762482, + "num_tokens": 38493707.0, + "step": 17780 + }, + { + "entropy": 0.1104026323184371, + "epoch": 4.145704627578972, + "grad_norm": 1.734375, + "learning_rate": 4.9689933302124255e-05, + "loss": 0.1351, + "mean_token_accuracy": 0.9675468266010284, + "num_tokens": 38515217.0, + "step": 17785 + }, + { + "entropy": 0.0909642169252038, + "epoch": 4.146870264599603, + "grad_norm": 1.625, + "learning_rate": 4.968959534231761e-05, + "loss": 0.1052, + "mean_token_accuracy": 0.9722971677780151, + "num_tokens": 38533758.0, + "step": 17790 + }, + { + "entropy": 0.09088087901473045, + "epoch": 4.1480359016202355, + "grad_norm": 1.3046875, + "learning_rate": 4.968925720074576e-05, + "loss": 0.1113, + "mean_token_accuracy": 0.9731328010559082, + "num_tokens": 38563161.0, + "step": 17795 + }, + { + "entropy": 0.08471710272133351, + "epoch": 4.149201538640868, + "grad_norm": 2.25, + "learning_rate": 4.9688918877413756e-05, + "loss": 0.1387, + "mean_token_accuracy": 0.9690379083156586, + "num_tokens": 38572656.0, + "step": 17800 + }, + { + "entropy": 0.08337556011974812, + "epoch": 4.150367175661499, + "grad_norm": 1.7421875, + "learning_rate": 4.968858037232663e-05, + "loss": 0.1503, + "mean_token_accuracy": 0.9631303191184998, + "num_tokens": 38582815.0, + "step": 17805 + }, + { + "entropy": 0.07923096120357513, + "epoch": 4.151532812682131, + "grad_norm": 4.625, + "learning_rate": 4.968824168548945e-05, + "loss": 0.0877, + "mean_token_accuracy": 0.9760537326335907, + "num_tokens": 38602476.0, + "step": 17810 + }, + { + "entropy": 0.07994878720492124, + "epoch": 4.152698449702762, + "grad_norm": 4.5625, + "learning_rate": 4.968790281690725e-05, + "loss": 0.081, + "mean_token_accuracy": 0.9776585280895234, + "num_tokens": 38616516.0, + "step": 17815 + }, + { + "entropy": 0.069281514454633, + "epoch": 4.153864086723394, + "grad_norm": 0.5546875, + "learning_rate": 4.9687563766585086e-05, + "loss": 0.1041, + "mean_token_accuracy": 0.9738664746284484, + "num_tokens": 38643610.0, + "step": 17820 + }, + { + "entropy": 0.06928181694820523, + "epoch": 4.155029723744026, + "grad_norm": 3.421875, + "learning_rate": 4.9687224534528015e-05, + "loss": 0.0961, + "mean_token_accuracy": 0.9771531283855438, + "num_tokens": 38665170.0, + "step": 17825 + }, + { + "entropy": 0.08383396286517382, + "epoch": 4.156195360764658, + "grad_norm": 0.79296875, + "learning_rate": 4.96868851207411e-05, + "loss": 0.0858, + "mean_token_accuracy": 0.9715451300144196, + "num_tokens": 38683037.0, + "step": 17830 + }, + { + "entropy": 0.1132634285837412, + "epoch": 4.15736099778529, + "grad_norm": 4.25, + "learning_rate": 4.968654552522939e-05, + "loss": 0.1926, + "mean_token_accuracy": 0.9579161942005158, + "num_tokens": 38702652.0, + "step": 17835 + }, + { + "entropy": 0.08075078241527081, + "epoch": 4.158526634805922, + "grad_norm": 3.484375, + "learning_rate": 4.968620574799796e-05, + "loss": 0.0914, + "mean_token_accuracy": 0.9716982245445251, + "num_tokens": 38725463.0, + "step": 17840 + }, + { + "entropy": 0.060110028833150864, + "epoch": 4.159692271826553, + "grad_norm": 4.53125, + "learning_rate": 4.968586578905188e-05, + "loss": 0.0584, + "mean_token_accuracy": 0.9838882625102997, + "num_tokens": 38744794.0, + "step": 17845 + }, + { + "entropy": 0.051263061724603175, + "epoch": 4.160857908847185, + "grad_norm": 0.330078125, + "learning_rate": 4.9685525648396205e-05, + "loss": 0.0322, + "mean_token_accuracy": 0.9860285639762878, + "num_tokens": 38773689.0, + "step": 17850 + }, + { + "entropy": 0.04591662436723709, + "epoch": 4.162023545867816, + "grad_norm": 0.197265625, + "learning_rate": 4.968518532603601e-05, + "loss": 0.0386, + "mean_token_accuracy": 0.9850606024265289, + "num_tokens": 38799162.0, + "step": 17855 + }, + { + "entropy": 0.091443323623389, + "epoch": 4.163189182888448, + "grad_norm": 10.6875, + "learning_rate": 4.968484482197639e-05, + "loss": 0.1426, + "mean_token_accuracy": 0.9635275840759278, + "num_tokens": 38814549.0, + "step": 17860 + }, + { + "entropy": 0.05765872802585363, + "epoch": 4.1643548199090805, + "grad_norm": 1.71875, + "learning_rate": 4.9684504136222386e-05, + "loss": 0.0688, + "mean_token_accuracy": 0.982217013835907, + "num_tokens": 38837038.0, + "step": 17865 + }, + { + "entropy": 0.06157895103096962, + "epoch": 4.165520456929712, + "grad_norm": 0.73046875, + "learning_rate": 4.968416326877911e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.9825385868549347, + "num_tokens": 38855466.0, + "step": 17870 + }, + { + "entropy": 0.05540731241926551, + "epoch": 4.166686093950344, + "grad_norm": 1.078125, + "learning_rate": 4.9683822219651636e-05, + "loss": 0.0447, + "mean_token_accuracy": 0.9826449275016784, + "num_tokens": 38884459.0, + "step": 17875 + }, + { + "entropy": 0.13959228973835708, + "epoch": 4.167851730970976, + "grad_norm": 2.328125, + "learning_rate": 4.9683480988845045e-05, + "loss": 0.1112, + "mean_token_accuracy": 0.9676654815673829, + "num_tokens": 38916660.0, + "step": 17880 + }, + { + "entropy": 0.07438411880284548, + "epoch": 4.169017367991607, + "grad_norm": 0.91015625, + "learning_rate": 4.968313957636442e-05, + "loss": 0.0582, + "mean_token_accuracy": 0.9798523426055908, + "num_tokens": 38929734.0, + "step": 17885 + }, + { + "entropy": 0.07969954237341881, + "epoch": 4.170183005012239, + "grad_norm": 5.125, + "learning_rate": 4.968279798221487e-05, + "loss": 0.1093, + "mean_token_accuracy": 0.9738732576370239, + "num_tokens": 38943465.0, + "step": 17890 + }, + { + "entropy": 0.06127149565145373, + "epoch": 4.171348642032871, + "grad_norm": 2.9375, + "learning_rate": 4.9682456206401476e-05, + "loss": 0.0771, + "mean_token_accuracy": 0.9770678341388702, + "num_tokens": 38962693.0, + "step": 17895 + }, + { + "entropy": 0.07192683843895793, + "epoch": 4.172514279053503, + "grad_norm": 1.9609375, + "learning_rate": 4.968211424892934e-05, + "loss": 0.049, + "mean_token_accuracy": 0.9862643003463745, + "num_tokens": 38986065.0, + "step": 17900 + }, + { + "entropy": 0.06356288734823465, + "epoch": 4.173679916074135, + "grad_norm": 0.6484375, + "learning_rate": 4.968177210980355e-05, + "loss": 0.0678, + "mean_token_accuracy": 0.9798180103302002, + "num_tokens": 39013369.0, + "step": 17905 + }, + { + "entropy": 0.0715787623077631, + "epoch": 4.174845553094766, + "grad_norm": 3.09375, + "learning_rate": 4.9681429789029216e-05, + "loss": 0.0793, + "mean_token_accuracy": 0.9760327398777008, + "num_tokens": 39028300.0, + "step": 17910 + }, + { + "entropy": 0.06488925497978926, + "epoch": 4.176011190115398, + "grad_norm": 0.5859375, + "learning_rate": 4.9681087286611445e-05, + "loss": 0.072, + "mean_token_accuracy": 0.9803975045680999, + "num_tokens": 39044881.0, + "step": 17915 + }, + { + "entropy": 0.05183368735015392, + "epoch": 4.17717682713603, + "grad_norm": 0.69140625, + "learning_rate": 4.968074460255534e-05, + "loss": 0.0576, + "mean_token_accuracy": 0.9789083540439606, + "num_tokens": 39066214.0, + "step": 17920 + }, + { + "entropy": 0.06136147491633892, + "epoch": 4.178342464156661, + "grad_norm": 0.9375, + "learning_rate": 4.968040173686601e-05, + "loss": 0.0694, + "mean_token_accuracy": 0.9841551721096039, + "num_tokens": 39094170.0, + "step": 17925 + }, + { + "entropy": 0.048913817014545204, + "epoch": 4.179508101177293, + "grad_norm": 2.515625, + "learning_rate": 4.968005868954857e-05, + "loss": 0.0519, + "mean_token_accuracy": 0.9821430623531342, + "num_tokens": 39121933.0, + "step": 17930 + }, + { + "entropy": 0.08498697010800242, + "epoch": 4.1806737381979255, + "grad_norm": 5.34375, + "learning_rate": 4.967971546060814e-05, + "loss": 0.1469, + "mean_token_accuracy": 0.9696698248386383, + "num_tokens": 39155746.0, + "step": 17935 + }, + { + "entropy": 0.06178603731095791, + "epoch": 4.181839375218557, + "grad_norm": 0.5859375, + "learning_rate": 4.967937205004983e-05, + "loss": 0.0313, + "mean_token_accuracy": 0.9859042346477509, + "num_tokens": 39182421.0, + "step": 17940 + }, + { + "entropy": 0.07212042324244976, + "epoch": 4.183005012239189, + "grad_norm": 4.8125, + "learning_rate": 4.9679028457878764e-05, + "loss": 0.0923, + "mean_token_accuracy": 0.9786004006862641, + "num_tokens": 39202742.0, + "step": 17945 + }, + { + "entropy": 0.07441701972857118, + "epoch": 4.18417064925982, + "grad_norm": 0.5859375, + "learning_rate": 4.967868468410006e-05, + "loss": 0.0483, + "mean_token_accuracy": 0.9796132147312164, + "num_tokens": 39247317.0, + "step": 17950 + }, + { + "entropy": 0.07962618097662925, + "epoch": 4.185336286280452, + "grad_norm": 3.21875, + "learning_rate": 4.967834072871886e-05, + "loss": 0.0798, + "mean_token_accuracy": 0.978544956445694, + "num_tokens": 39258240.0, + "step": 17955 + }, + { + "entropy": 0.06471439627930523, + "epoch": 4.186501923301084, + "grad_norm": 4.03125, + "learning_rate": 4.9677996591740277e-05, + "loss": 0.0547, + "mean_token_accuracy": 0.9785925149917603, + "num_tokens": 39281630.0, + "step": 17960 + }, + { + "entropy": 0.06293684775009752, + "epoch": 4.1876675603217155, + "grad_norm": 0.7109375, + "learning_rate": 4.967765227316945e-05, + "loss": 0.0562, + "mean_token_accuracy": 0.9819996774196624, + "num_tokens": 39308837.0, + "step": 17965 + }, + { + "entropy": 0.06763023445382714, + "epoch": 4.188833197342348, + "grad_norm": 0.482421875, + "learning_rate": 4.96773077730115e-05, + "loss": 0.0221, + "mean_token_accuracy": 0.9898829996585846, + "num_tokens": 39345381.0, + "step": 17970 + }, + { + "entropy": 0.0727597183547914, + "epoch": 4.18999883436298, + "grad_norm": 0.337890625, + "learning_rate": 4.967696309127159e-05, + "loss": 0.0545, + "mean_token_accuracy": 0.9819984555244445, + "num_tokens": 39364933.0, + "step": 17975 + }, + { + "entropy": 0.0631346826441586, + "epoch": 4.191164471383611, + "grad_norm": 1.4921875, + "learning_rate": 4.967661822795485e-05, + "loss": 0.0549, + "mean_token_accuracy": 0.9786123156547546, + "num_tokens": 39384010.0, + "step": 17980 + }, + { + "entropy": 0.054526901617646215, + "epoch": 4.192330108404243, + "grad_norm": 0.48828125, + "learning_rate": 4.96762731830664e-05, + "loss": 0.053, + "mean_token_accuracy": 0.9847536087036133, + "num_tokens": 39411097.0, + "step": 17985 + }, + { + "entropy": 0.09764510169625282, + "epoch": 4.193495745424874, + "grad_norm": 3.484375, + "learning_rate": 4.9675927956611415e-05, + "loss": 0.1006, + "mean_token_accuracy": 0.9744187951087951, + "num_tokens": 39419977.0, + "step": 17990 + }, + { + "entropy": 0.05835923943668604, + "epoch": 4.194661382445506, + "grad_norm": 0.29296875, + "learning_rate": 4.9675582548595024e-05, + "loss": 0.0573, + "mean_token_accuracy": 0.9823970139026642, + "num_tokens": 39442667.0, + "step": 17995 + }, + { + "entropy": 0.06732565024867654, + "epoch": 4.1958270194661385, + "grad_norm": 2.4375, + "learning_rate": 4.9675236959022385e-05, + "loss": 0.0605, + "mean_token_accuracy": 0.9767836272716522, + "num_tokens": 39474692.0, + "step": 18000 + }, + { + "entropy": 0.06998653383925557, + "epoch": 4.19699265648677, + "grad_norm": 3.421875, + "learning_rate": 4.967489118789866e-05, + "loss": 0.0964, + "mean_token_accuracy": 0.9786720693111419, + "num_tokens": 39493889.0, + "step": 18005 + }, + { + "entropy": 0.055152688175439835, + "epoch": 4.198158293507402, + "grad_norm": 3.296875, + "learning_rate": 4.967454523522898e-05, + "loss": 0.0858, + "mean_token_accuracy": 0.9816052854061127, + "num_tokens": 39510892.0, + "step": 18010 + }, + { + "entropy": 0.059907327964901926, + "epoch": 4.199323930528034, + "grad_norm": 0.65625, + "learning_rate": 4.967419910101853e-05, + "loss": 0.0483, + "mean_token_accuracy": 0.9820407211780549, + "num_tokens": 39536981.0, + "step": 18015 + }, + { + "entropy": 0.13438040837645532, + "epoch": 4.200489567548665, + "grad_norm": 4.25, + "learning_rate": 4.9673852785272456e-05, + "loss": 0.2446, + "mean_token_accuracy": 0.9476132929325104, + "num_tokens": 39555531.0, + "step": 18020 + }, + { + "entropy": 0.059680545888841155, + "epoch": 4.201655204569297, + "grad_norm": 2.34375, + "learning_rate": 4.9673506287995926e-05, + "loss": 0.0636, + "mean_token_accuracy": 0.9831001102924347, + "num_tokens": 39572444.0, + "step": 18025 + }, + { + "entropy": 0.07699852250516415, + "epoch": 4.202820841589929, + "grad_norm": 4.71875, + "learning_rate": 4.967315960919411e-05, + "loss": 0.1543, + "mean_token_accuracy": 0.9660988092422486, + "num_tokens": 39584117.0, + "step": 18030 + }, + { + "entropy": 0.08583300039172173, + "epoch": 4.2039864786105605, + "grad_norm": 2.015625, + "learning_rate": 4.967281274887217e-05, + "loss": 0.0709, + "mean_token_accuracy": 0.9798553586006165, + "num_tokens": 39594900.0, + "step": 18035 + }, + { + "entropy": 0.0706296787597239, + "epoch": 4.205152115631193, + "grad_norm": 0.28515625, + "learning_rate": 4.967246570703529e-05, + "loss": 0.0989, + "mean_token_accuracy": 0.9756020784378052, + "num_tokens": 39616958.0, + "step": 18040 + }, + { + "entropy": 0.05387895340099931, + "epoch": 4.206317752651824, + "grad_norm": 0.5859375, + "learning_rate": 4.967211848368863e-05, + "loss": 0.0769, + "mean_token_accuracy": 0.9803040981292724, + "num_tokens": 39635355.0, + "step": 18045 + }, + { + "entropy": 0.07473008846864104, + "epoch": 4.207483389672456, + "grad_norm": 1.375, + "learning_rate": 4.967177107883738e-05, + "loss": 0.0567, + "mean_token_accuracy": 0.9801276922225952, + "num_tokens": 39659844.0, + "step": 18050 + }, + { + "entropy": 0.10034794714301824, + "epoch": 4.208649026693088, + "grad_norm": 3.4375, + "learning_rate": 4.967142349248671e-05, + "loss": 0.1122, + "mean_token_accuracy": 0.964007842540741, + "num_tokens": 39690318.0, + "step": 18055 + }, + { + "entropy": 0.07286425046622753, + "epoch": 4.209814663713719, + "grad_norm": 1.3203125, + "learning_rate": 4.967107572464182e-05, + "loss": 0.0714, + "mean_token_accuracy": 0.9797383069992065, + "num_tokens": 39701986.0, + "step": 18060 + }, + { + "entropy": 0.047049607150256634, + "epoch": 4.210980300734351, + "grad_norm": 1.40625, + "learning_rate": 4.967072777530788e-05, + "loss": 0.069, + "mean_token_accuracy": 0.9827369391918183, + "num_tokens": 39723435.0, + "step": 18065 + }, + { + "entropy": 0.054467049427330494, + "epoch": 4.2121459377549835, + "grad_norm": 1.734375, + "learning_rate": 4.967037964449008e-05, + "loss": 0.037, + "mean_token_accuracy": 0.9811189293861389, + "num_tokens": 39741765.0, + "step": 18070 + }, + { + "entropy": 0.06904594264924527, + "epoch": 4.213311574775615, + "grad_norm": 1.109375, + "learning_rate": 4.967003133219361e-05, + "loss": 0.0568, + "mean_token_accuracy": 0.9788180887699127, + "num_tokens": 39760359.0, + "step": 18075 + }, + { + "entropy": 0.07464666366577148, + "epoch": 4.214477211796247, + "grad_norm": 2.703125, + "learning_rate": 4.966968283842368e-05, + "loss": 0.0931, + "mean_token_accuracy": 0.9809688687324524, + "num_tokens": 39775682.0, + "step": 18080 + }, + { + "entropy": 0.2211771672591567, + "epoch": 4.215642848816878, + "grad_norm": 2.90625, + "learning_rate": 4.9669334163185466e-05, + "loss": 0.3894, + "mean_token_accuracy": 0.9507070541381836, + "num_tokens": 39802482.0, + "step": 18085 + }, + { + "entropy": 0.0948345759883523, + "epoch": 4.21680848583751, + "grad_norm": 0.59375, + "learning_rate": 4.9668985306484175e-05, + "loss": 0.1101, + "mean_token_accuracy": 0.9713608205318451, + "num_tokens": 39811315.0, + "step": 18090 + }, + { + "entropy": 0.11147435549646616, + "epoch": 4.217974122858142, + "grad_norm": 3.21875, + "learning_rate": 4.966863626832502e-05, + "loss": 0.1236, + "mean_token_accuracy": 0.9693335950374603, + "num_tokens": 39821734.0, + "step": 18095 + }, + { + "entropy": 0.05048245368525386, + "epoch": 4.219139759878773, + "grad_norm": 2.109375, + "learning_rate": 4.966828704871319e-05, + "loss": 0.0381, + "mean_token_accuracy": 0.9865420877933502, + "num_tokens": 39846251.0, + "step": 18100 + }, + { + "entropy": 0.09908029530197382, + "epoch": 4.2203053968994055, + "grad_norm": 2.4375, + "learning_rate": 4.9667937647653894e-05, + "loss": 0.1254, + "mean_token_accuracy": 0.9632340371608734, + "num_tokens": 39880560.0, + "step": 18105 + }, + { + "entropy": 0.07764818742871285, + "epoch": 4.221471033920038, + "grad_norm": 2.609375, + "learning_rate": 4.966758806515235e-05, + "loss": 0.0853, + "mean_token_accuracy": 0.9731331884860992, + "num_tokens": 39902255.0, + "step": 18110 + }, + { + "entropy": 0.061224596202373506, + "epoch": 4.222636670940669, + "grad_norm": 2.375, + "learning_rate": 4.966723830121377e-05, + "loss": 0.0735, + "mean_token_accuracy": 0.9794311761856079, + "num_tokens": 39921371.0, + "step": 18115 + }, + { + "entropy": 0.07678266037255525, + "epoch": 4.223802307961301, + "grad_norm": 4.28125, + "learning_rate": 4.966688835584336e-05, + "loss": 0.1015, + "mean_token_accuracy": 0.9721475660800933, + "num_tokens": 39940535.0, + "step": 18120 + }, + { + "entropy": 0.08287368789315223, + "epoch": 4.224967944981932, + "grad_norm": 5.25, + "learning_rate": 4.966653822904634e-05, + "loss": 0.1251, + "mean_token_accuracy": 0.9700684547424316, + "num_tokens": 39949215.0, + "step": 18125 + }, + { + "entropy": 0.0665527991950512, + "epoch": 4.226133582002564, + "grad_norm": 3.984375, + "learning_rate": 4.966618792082794e-05, + "loss": 0.0779, + "mean_token_accuracy": 0.980407428741455, + "num_tokens": 39963174.0, + "step": 18130 + }, + { + "entropy": 0.09068935289978981, + "epoch": 4.227299219023196, + "grad_norm": 1.71875, + "learning_rate": 4.9665837431193387e-05, + "loss": 0.0771, + "mean_token_accuracy": 0.9779536366462708, + "num_tokens": 39975323.0, + "step": 18135 + }, + { + "entropy": 0.0727505961433053, + "epoch": 4.228464856043828, + "grad_norm": 3.046875, + "learning_rate": 4.9665486760147895e-05, + "loss": 0.073, + "mean_token_accuracy": 0.9798406302928925, + "num_tokens": 40001191.0, + "step": 18140 + }, + { + "entropy": 0.06337338108569383, + "epoch": 4.22963049306446, + "grad_norm": 2.46875, + "learning_rate": 4.96651359076967e-05, + "loss": 0.0603, + "mean_token_accuracy": 0.9807958424091339, + "num_tokens": 40015378.0, + "step": 18145 + }, + { + "entropy": 0.061356103606522085, + "epoch": 4.230796130085092, + "grad_norm": 2.46875, + "learning_rate": 4.9664784873845025e-05, + "loss": 0.075, + "mean_token_accuracy": 0.9822879672050476, + "num_tokens": 40029798.0, + "step": 18150 + }, + { + "entropy": 0.06835889825597405, + "epoch": 4.231961767105723, + "grad_norm": 1.5390625, + "learning_rate": 4.966443365859812e-05, + "loss": 0.0821, + "mean_token_accuracy": 0.9771220684051514, + "num_tokens": 40044355.0, + "step": 18155 + }, + { + "entropy": 0.07135962946340442, + "epoch": 4.233127404126355, + "grad_norm": 0.38671875, + "learning_rate": 4.96640822619612e-05, + "loss": 0.067, + "mean_token_accuracy": 0.9825757741928101, + "num_tokens": 40061672.0, + "step": 18160 + }, + { + "entropy": 0.05592022901400924, + "epoch": 4.234293041146987, + "grad_norm": 0.55859375, + "learning_rate": 4.9663730683939524e-05, + "loss": 0.0706, + "mean_token_accuracy": 0.984505432844162, + "num_tokens": 40085250.0, + "step": 18165 + }, + { + "entropy": 0.08148605488240719, + "epoch": 4.2354586781676185, + "grad_norm": 2.09375, + "learning_rate": 4.966337892453833e-05, + "loss": 0.0727, + "mean_token_accuracy": 0.9765782058238983, + "num_tokens": 40114042.0, + "step": 18170 + }, + { + "entropy": 0.0687420979142189, + "epoch": 4.2366243151882506, + "grad_norm": 1.484375, + "learning_rate": 4.9663026983762855e-05, + "loss": 0.0751, + "mean_token_accuracy": 0.981961190700531, + "num_tokens": 40129373.0, + "step": 18175 + }, + { + "entropy": 0.07640882469713688, + "epoch": 4.237789952208882, + "grad_norm": 10.4375, + "learning_rate": 4.966267486161836e-05, + "loss": 0.1158, + "mean_token_accuracy": 0.9757577538490295, + "num_tokens": 40149001.0, + "step": 18180 + }, + { + "entropy": 0.067472736351192, + "epoch": 4.238955589229514, + "grad_norm": 6.15625, + "learning_rate": 4.9662322558110084e-05, + "loss": 0.0897, + "mean_token_accuracy": 0.9813668429851532, + "num_tokens": 40162262.0, + "step": 18185 + }, + { + "entropy": 0.051194218918681145, + "epoch": 4.240121226250146, + "grad_norm": 3.4375, + "learning_rate": 4.966197007324329e-05, + "loss": 0.0542, + "mean_token_accuracy": 0.9833743572235107, + "num_tokens": 40189423.0, + "step": 18190 + }, + { + "entropy": 0.06599115077406167, + "epoch": 4.241286863270777, + "grad_norm": 2.59375, + "learning_rate": 4.966161740702323e-05, + "loss": 0.0717, + "mean_token_accuracy": 0.9787083387374877, + "num_tokens": 40212858.0, + "step": 18195 + }, + { + "entropy": 0.09262879192829132, + "epoch": 4.242452500291409, + "grad_norm": 2.015625, + "learning_rate": 4.966126455945516e-05, + "loss": 0.0772, + "mean_token_accuracy": 0.9747631430625916, + "num_tokens": 40231966.0, + "step": 18200 + }, + { + "entropy": 0.06321851387619973, + "epoch": 4.243618137312041, + "grad_norm": 4.625, + "learning_rate": 4.966091153054434e-05, + "loss": 0.0869, + "mean_token_accuracy": 0.9782408058643342, + "num_tokens": 40243075.0, + "step": 18205 + }, + { + "entropy": 0.07191295428201556, + "epoch": 4.244783774332673, + "grad_norm": 4.21875, + "learning_rate": 4.9660558320296045e-05, + "loss": 0.0713, + "mean_token_accuracy": 0.9761281490325928, + "num_tokens": 40268059.0, + "step": 18210 + }, + { + "entropy": 0.06418008059263229, + "epoch": 4.245949411353305, + "grad_norm": 3.171875, + "learning_rate": 4.966020492871553e-05, + "loss": 0.0995, + "mean_token_accuracy": 0.9752312481403351, + "num_tokens": 40288817.0, + "step": 18215 + }, + { + "entropy": 0.0693417014554143, + "epoch": 4.247115048373936, + "grad_norm": 2.421875, + "learning_rate": 4.9659851355808076e-05, + "loss": 0.0671, + "mean_token_accuracy": 0.9787382364273072, + "num_tokens": 40318263.0, + "step": 18220 + }, + { + "entropy": 0.06724477596580983, + "epoch": 4.248280685394568, + "grad_norm": 1.125, + "learning_rate": 4.965949760157894e-05, + "loss": 0.0847, + "mean_token_accuracy": 0.9804198741912842, + "num_tokens": 40328010.0, + "step": 18225 + }, + { + "entropy": 0.1094695933163166, + "epoch": 4.2494463224152, + "grad_norm": 5.09375, + "learning_rate": 4.9659143666033416e-05, + "loss": 0.1179, + "mean_token_accuracy": 0.9737821757793427, + "num_tokens": 40340341.0, + "step": 18230 + }, + { + "entropy": 0.053947434015572074, + "epoch": 4.250611959435831, + "grad_norm": 0.7578125, + "learning_rate": 4.965878954917676e-05, + "loss": 0.0439, + "mean_token_accuracy": 0.9878208816051484, + "num_tokens": 40379611.0, + "step": 18235 + }, + { + "entropy": 0.06869078604504467, + "epoch": 4.2517775964564635, + "grad_norm": 0.2734375, + "learning_rate": 4.965843525101427e-05, + "loss": 0.0668, + "mean_token_accuracy": 0.9799235224723816, + "num_tokens": 40396991.0, + "step": 18240 + }, + { + "entropy": 0.06360419914126396, + "epoch": 4.252943233477096, + "grad_norm": 0.88671875, + "learning_rate": 4.965808077155123e-05, + "loss": 0.0698, + "mean_token_accuracy": 0.9827209532260894, + "num_tokens": 40411549.0, + "step": 18245 + }, + { + "entropy": 0.21973246708512306, + "epoch": 4.254108870497727, + "grad_norm": 0.59375, + "learning_rate": 4.9657726110792914e-05, + "loss": 0.3485, + "mean_token_accuracy": 0.9430919229984284, + "num_tokens": 40453795.0, + "step": 18250 + }, + { + "entropy": 0.07258025612682104, + "epoch": 4.255274507518359, + "grad_norm": 0.546875, + "learning_rate": 4.965737126874461e-05, + "loss": 0.0478, + "mean_token_accuracy": 0.987526661157608, + "num_tokens": 40470031.0, + "step": 18255 + }, + { + "entropy": 0.06506192674860359, + "epoch": 4.25644014453899, + "grad_norm": 0.89453125, + "learning_rate": 4.9657016245411614e-05, + "loss": 0.07, + "mean_token_accuracy": 0.9759842813014984, + "num_tokens": 40494247.0, + "step": 18260 + }, + { + "entropy": 0.07043304983526469, + "epoch": 4.257605781559622, + "grad_norm": 0.51171875, + "learning_rate": 4.965666104079923e-05, + "loss": 0.0668, + "mean_token_accuracy": 0.9803885996341706, + "num_tokens": 40507863.0, + "step": 18265 + }, + { + "entropy": 0.08717799168080091, + "epoch": 4.258771418580254, + "grad_norm": 0.478515625, + "learning_rate": 4.965630565491274e-05, + "loss": 0.0781, + "mean_token_accuracy": 0.9728128552436829, + "num_tokens": 40523582.0, + "step": 18270 + }, + { + "entropy": 0.09992110282182694, + "epoch": 4.2599370556008855, + "grad_norm": 1.546875, + "learning_rate": 4.965595008775745e-05, + "loss": 0.1153, + "mean_token_accuracy": 0.9688679814338684, + "num_tokens": 40534326.0, + "step": 18275 + }, + { + "entropy": 0.060247833095490935, + "epoch": 4.261102692621518, + "grad_norm": 1.4609375, + "learning_rate": 4.9655594339338654e-05, + "loss": 0.0546, + "mean_token_accuracy": 0.9846014022827149, + "num_tokens": 40552515.0, + "step": 18280 + }, + { + "entropy": 0.07441337686032057, + "epoch": 4.26226832964215, + "grad_norm": 2.71875, + "learning_rate": 4.965523840966167e-05, + "loss": 0.0724, + "mean_token_accuracy": 0.9776188433170319, + "num_tokens": 40574150.0, + "step": 18285 + }, + { + "entropy": 0.08602970764040947, + "epoch": 4.263433966662781, + "grad_norm": 1.640625, + "learning_rate": 4.96548822987318e-05, + "loss": 0.1114, + "mean_token_accuracy": 0.9711106956005097, + "num_tokens": 40583114.0, + "step": 18290 + }, + { + "entropy": 0.05823810677975416, + "epoch": 4.264599603683413, + "grad_norm": 6.0, + "learning_rate": 4.965452600655435e-05, + "loss": 0.0864, + "mean_token_accuracy": 0.9759131968021393, + "num_tokens": 40611013.0, + "step": 18295 + }, + { + "entropy": 0.06771047422662377, + "epoch": 4.265765240704045, + "grad_norm": 2.09375, + "learning_rate": 4.965416953313463e-05, + "loss": 0.0728, + "mean_token_accuracy": 0.9816241145133973, + "num_tokens": 40627138.0, + "step": 18300 + }, + { + "entropy": 0.059956477768719194, + "epoch": 4.266930877724676, + "grad_norm": 2.40625, + "learning_rate": 4.9653812878477976e-05, + "loss": 0.0676, + "mean_token_accuracy": 0.9846004903316498, + "num_tokens": 40650411.0, + "step": 18305 + }, + { + "entropy": 0.061690607108175755, + "epoch": 4.2680965147453085, + "grad_norm": 1.0234375, + "learning_rate": 4.965345604258968e-05, + "loss": 0.0511, + "mean_token_accuracy": 0.9836044013500214, + "num_tokens": 40676939.0, + "step": 18310 + }, + { + "entropy": 0.06359901251271367, + "epoch": 4.26926215176594, + "grad_norm": 0.2158203125, + "learning_rate": 4.9653099025475076e-05, + "loss": 0.0546, + "mean_token_accuracy": 0.9798293471336365, + "num_tokens": 40711483.0, + "step": 18315 + }, + { + "entropy": 0.08764311112463474, + "epoch": 4.270427788786572, + "grad_norm": 0.203125, + "learning_rate": 4.965274182713949e-05, + "loss": 0.1347, + "mean_token_accuracy": 0.9690339505672455, + "num_tokens": 40730163.0, + "step": 18320 + }, + { + "entropy": 0.07748756930232048, + "epoch": 4.271593425807204, + "grad_norm": 1.171875, + "learning_rate": 4.965238444758824e-05, + "loss": 0.0988, + "mean_token_accuracy": 0.9777417302131652, + "num_tokens": 40739083.0, + "step": 18325 + }, + { + "entropy": 0.06385232815518975, + "epoch": 4.272759062827835, + "grad_norm": 0.396484375, + "learning_rate": 4.9652026886826666e-05, + "loss": 0.0722, + "mean_token_accuracy": 0.9804993093013763, + "num_tokens": 40764097.0, + "step": 18330 + }, + { + "entropy": 0.06116134990006685, + "epoch": 4.273924699848467, + "grad_norm": 0.44921875, + "learning_rate": 4.965166914486008e-05, + "loss": 0.065, + "mean_token_accuracy": 0.9793049991130829, + "num_tokens": 40785512.0, + "step": 18335 + }, + { + "entropy": 0.08398058190941811, + "epoch": 4.275090336869099, + "grad_norm": 4.03125, + "learning_rate": 4.9651311221693845e-05, + "loss": 0.0871, + "mean_token_accuracy": 0.9712727069854736, + "num_tokens": 40806382.0, + "step": 18340 + }, + { + "entropy": 0.17974007017910482, + "epoch": 4.2762559738897306, + "grad_norm": 4.03125, + "learning_rate": 4.9650953117333275e-05, + "loss": 0.2688, + "mean_token_accuracy": 0.9447404503822326, + "num_tokens": 40828370.0, + "step": 18345 + }, + { + "entropy": 0.059681543800979855, + "epoch": 4.277421610910363, + "grad_norm": 0.388671875, + "learning_rate": 4.9650594831783724e-05, + "loss": 0.0428, + "mean_token_accuracy": 0.9817675530910492, + "num_tokens": 40866610.0, + "step": 18350 + }, + { + "entropy": 0.09934973865747451, + "epoch": 4.278587247930994, + "grad_norm": 7.09375, + "learning_rate": 4.9650236365050525e-05, + "loss": 0.1247, + "mean_token_accuracy": 0.9634607434272766, + "num_tokens": 40879155.0, + "step": 18355 + }, + { + "entropy": 0.06509402473457157, + "epoch": 4.279752884951626, + "grad_norm": 0.341796875, + "learning_rate": 4.9649877717139026e-05, + "loss": 0.0763, + "mean_token_accuracy": 0.9754965901374817, + "num_tokens": 40898689.0, + "step": 18360 + }, + { + "entropy": 0.0826711606234312, + "epoch": 4.280918521972258, + "grad_norm": 2.46875, + "learning_rate": 4.964951888805458e-05, + "loss": 0.0876, + "mean_token_accuracy": 0.9787150621414185, + "num_tokens": 40917534.0, + "step": 18365 + }, + { + "entropy": 0.13180441725999117, + "epoch": 4.282084158992889, + "grad_norm": 2.453125, + "learning_rate": 4.9649159877802524e-05, + "loss": 0.1874, + "mean_token_accuracy": 0.9560307621955871, + "num_tokens": 40944410.0, + "step": 18370 + }, + { + "entropy": 0.06450794208794833, + "epoch": 4.283249796013521, + "grad_norm": 2.453125, + "learning_rate": 4.964880068638823e-05, + "loss": 0.0735, + "mean_token_accuracy": 0.9780965685844422, + "num_tokens": 40960755.0, + "step": 18375 + }, + { + "entropy": 0.04702851264737547, + "epoch": 4.2844154330341535, + "grad_norm": 0.2119140625, + "learning_rate": 4.964844131381704e-05, + "loss": 0.0457, + "mean_token_accuracy": 0.986723518371582, + "num_tokens": 40990086.0, + "step": 18380 + }, + { + "entropy": 0.11209305711090564, + "epoch": 4.285581070054785, + "grad_norm": 1.59375, + "learning_rate": 4.9648081760094324e-05, + "loss": 0.1346, + "mean_token_accuracy": 0.967307448387146, + "num_tokens": 41010686.0, + "step": 18385 + }, + { + "entropy": 0.08510778667405247, + "epoch": 4.286746707075417, + "grad_norm": 0.69921875, + "learning_rate": 4.964772202522543e-05, + "loss": 0.1131, + "mean_token_accuracy": 0.9747722148895264, + "num_tokens": 41029033.0, + "step": 18390 + }, + { + "entropy": 0.06896943571045995, + "epoch": 4.287912344096048, + "grad_norm": 5.53125, + "learning_rate": 4.9647362109215735e-05, + "loss": 0.0983, + "mean_token_accuracy": 0.9716754376888275, + "num_tokens": 41046478.0, + "step": 18395 + }, + { + "entropy": 0.04878347143530846, + "epoch": 4.28907798111668, + "grad_norm": 0.482421875, + "learning_rate": 4.96470020120706e-05, + "loss": 0.0798, + "mean_token_accuracy": 0.9811624825000763, + "num_tokens": 41078644.0, + "step": 18400 + }, + { + "entropy": 0.05125298034399748, + "epoch": 4.290243618137312, + "grad_norm": 1.4609375, + "learning_rate": 4.964664173379539e-05, + "loss": 0.0611, + "mean_token_accuracy": 0.9861126601696014, + "num_tokens": 41099848.0, + "step": 18405 + }, + { + "entropy": 0.07087703254073859, + "epoch": 4.2914092551579435, + "grad_norm": 0.53515625, + "learning_rate": 4.9646281274395484e-05, + "loss": 0.0763, + "mean_token_accuracy": 0.9810361623764038, + "num_tokens": 41127755.0, + "step": 18410 + }, + { + "entropy": 0.06897338693961501, + "epoch": 4.292574892178576, + "grad_norm": 0.3671875, + "learning_rate": 4.964592063387625e-05, + "loss": 0.0599, + "mean_token_accuracy": 0.9793107509613037, + "num_tokens": 41151591.0, + "step": 18415 + }, + { + "entropy": 0.17628241926431656, + "epoch": 4.293740529199208, + "grad_norm": 0.74609375, + "learning_rate": 4.964555981224308e-05, + "loss": 0.192, + "mean_token_accuracy": 0.937102484703064, + "num_tokens": 41186067.0, + "step": 18420 + }, + { + "entropy": 0.07771440567448736, + "epoch": 4.294906166219839, + "grad_norm": 4.8125, + "learning_rate": 4.964519880950134e-05, + "loss": 0.0837, + "mean_token_accuracy": 0.9757640421390533, + "num_tokens": 41206154.0, + "step": 18425 + }, + { + "entropy": 0.047894996032118796, + "epoch": 4.296071803240471, + "grad_norm": 2.03125, + "learning_rate": 4.9644837625656425e-05, + "loss": 0.0532, + "mean_token_accuracy": 0.9868818938732147, + "num_tokens": 41224421.0, + "step": 18430 + }, + { + "entropy": 0.05917087513953447, + "epoch": 4.297237440261103, + "grad_norm": 0.5078125, + "learning_rate": 4.964447626071371e-05, + "loss": 0.039, + "mean_token_accuracy": 0.9861246168613433, + "num_tokens": 41260158.0, + "step": 18435 + }, + { + "entropy": 0.06869417782872915, + "epoch": 4.298403077281734, + "grad_norm": 3.609375, + "learning_rate": 4.964411471467859e-05, + "loss": 0.0962, + "mean_token_accuracy": 0.9750606179237366, + "num_tokens": 41276685.0, + "step": 18440 + }, + { + "entropy": 0.050992142967879774, + "epoch": 4.299568714302366, + "grad_norm": 0.296875, + "learning_rate": 4.964375298755645e-05, + "loss": 0.0411, + "mean_token_accuracy": 0.9854860126972198, + "num_tokens": 41313001.0, + "step": 18445 + }, + { + "entropy": 0.05416600527241826, + "epoch": 4.300734351322998, + "grad_norm": 3.3125, + "learning_rate": 4.9643391079352684e-05, + "loss": 0.0433, + "mean_token_accuracy": 0.9865663528442383, + "num_tokens": 41350294.0, + "step": 18450 + }, + { + "entropy": 0.06011302322149277, + "epoch": 4.30189998834363, + "grad_norm": 1.3046875, + "learning_rate": 4.96430289900727e-05, + "loss": 0.0603, + "mean_token_accuracy": 0.9830720722675323, + "num_tokens": 41372750.0, + "step": 18455 + }, + { + "entropy": 0.07380872648209333, + "epoch": 4.303065625364262, + "grad_norm": 2.0, + "learning_rate": 4.964266671972189e-05, + "loss": 0.0509, + "mean_token_accuracy": 0.9841031551361084, + "num_tokens": 41393418.0, + "step": 18460 + }, + { + "entropy": 0.06437693070620298, + "epoch": 4.304231262384893, + "grad_norm": 2.5625, + "learning_rate": 4.964230426830564e-05, + "loss": 0.0539, + "mean_token_accuracy": 0.9840012729167938, + "num_tokens": 41413683.0, + "step": 18465 + }, + { + "entropy": 0.08422975167632103, + "epoch": 4.305396899405525, + "grad_norm": 0.828125, + "learning_rate": 4.9641941635829384e-05, + "loss": 0.0663, + "mean_token_accuracy": 0.9783717930316925, + "num_tokens": 41437122.0, + "step": 18470 + }, + { + "entropy": 0.0769905123859644, + "epoch": 4.306562536426157, + "grad_norm": 1.78125, + "learning_rate": 4.964157882229852e-05, + "loss": 0.1124, + "mean_token_accuracy": 0.9747823476791382, + "num_tokens": 41446558.0, + "step": 18475 + }, + { + "entropy": 0.0603149157948792, + "epoch": 4.3077281734467885, + "grad_norm": 4.46875, + "learning_rate": 4.9641215827718444e-05, + "loss": 0.078, + "mean_token_accuracy": 0.9764757096767426, + "num_tokens": 41466108.0, + "step": 18480 + }, + { + "entropy": 0.05723539115861058, + "epoch": 4.308893810467421, + "grad_norm": 0.50390625, + "learning_rate": 4.9640852652094586e-05, + "loss": 0.0428, + "mean_token_accuracy": 0.9779875040054321, + "num_tokens": 41494076.0, + "step": 18485 + }, + { + "entropy": 0.08054400235414505, + "epoch": 4.310059447488052, + "grad_norm": 0.546875, + "learning_rate": 4.964048929543235e-05, + "loss": 0.093, + "mean_token_accuracy": 0.9737628102302551, + "num_tokens": 41517453.0, + "step": 18490 + }, + { + "entropy": 0.07607092708349228, + "epoch": 4.311225084508684, + "grad_norm": 0.70703125, + "learning_rate": 4.9640125757737156e-05, + "loss": 0.0751, + "mean_token_accuracy": 0.9802489399909973, + "num_tokens": 41545604.0, + "step": 18495 + }, + { + "entropy": 0.06756752827204764, + "epoch": 4.312390721529316, + "grad_norm": 0.1767578125, + "learning_rate": 4.9639762039014434e-05, + "loss": 0.0619, + "mean_token_accuracy": 0.9819507598876953, + "num_tokens": 41571454.0, + "step": 18500 + }, + { + "entropy": 0.07994569651782513, + "epoch": 4.313556358549947, + "grad_norm": 3.953125, + "learning_rate": 4.9639398139269597e-05, + "loss": 0.105, + "mean_token_accuracy": 0.9740553617477417, + "num_tokens": 41586939.0, + "step": 18505 + }, + { + "entropy": 0.07430811729282141, + "epoch": 4.314721995570579, + "grad_norm": 0.97265625, + "learning_rate": 4.963903405850807e-05, + "loss": 0.1093, + "mean_token_accuracy": 0.9766282320022583, + "num_tokens": 41598356.0, + "step": 18510 + }, + { + "entropy": 0.0700147021561861, + "epoch": 4.315887632591211, + "grad_norm": 3.328125, + "learning_rate": 4.9638669796735295e-05, + "loss": 0.0764, + "mean_token_accuracy": 0.9803567469120026, + "num_tokens": 41611133.0, + "step": 18515 + }, + { + "entropy": 0.08196193277835846, + "epoch": 4.317053269611843, + "grad_norm": 0.74609375, + "learning_rate": 4.9638305353956694e-05, + "loss": 0.083, + "mean_token_accuracy": 0.975335168838501, + "num_tokens": 41632870.0, + "step": 18520 + }, + { + "entropy": 0.07142581269145012, + "epoch": 4.318218906632475, + "grad_norm": 4.3125, + "learning_rate": 4.96379407301777e-05, + "loss": 0.0885, + "mean_token_accuracy": 0.9793270707130433, + "num_tokens": 41654489.0, + "step": 18525 + }, + { + "entropy": 0.05450365114957094, + "epoch": 4.319384543653106, + "grad_norm": 0.76953125, + "learning_rate": 4.9637575925403755e-05, + "loss": 0.059, + "mean_token_accuracy": 0.9803845226764679, + "num_tokens": 41680852.0, + "step": 18530 + }, + { + "entropy": 0.08670583032071591, + "epoch": 4.320550180673738, + "grad_norm": 3.25, + "learning_rate": 4.96372109396403e-05, + "loss": 0.0813, + "mean_token_accuracy": 0.9761635422706604, + "num_tokens": 41699379.0, + "step": 18535 + }, + { + "entropy": 0.08022739067673683, + "epoch": 4.32171581769437, + "grad_norm": 1.1171875, + "learning_rate": 4.963684577289277e-05, + "loss": 0.0909, + "mean_token_accuracy": 0.9803108692169189, + "num_tokens": 41710174.0, + "step": 18540 + }, + { + "entropy": 0.05595881547778845, + "epoch": 4.322881454715001, + "grad_norm": 2.71875, + "learning_rate": 4.963648042516661e-05, + "loss": 0.0616, + "mean_token_accuracy": 0.9813150882720947, + "num_tokens": 41738717.0, + "step": 18545 + }, + { + "entropy": 0.06690775705501437, + "epoch": 4.3240470917356335, + "grad_norm": 2.546875, + "learning_rate": 4.963611489646728e-05, + "loss": 0.1002, + "mean_token_accuracy": 0.9740199089050293, + "num_tokens": 41755906.0, + "step": 18550 + }, + { + "entropy": 0.07001672107726335, + "epoch": 4.325212728756266, + "grad_norm": 0.640625, + "learning_rate": 4.9635749186800225e-05, + "loss": 0.0494, + "mean_token_accuracy": 0.9815467417240142, + "num_tokens": 41784497.0, + "step": 18555 + }, + { + "entropy": 0.053368914686143396, + "epoch": 4.326378365776897, + "grad_norm": 1.5625, + "learning_rate": 4.963538329617089e-05, + "loss": 0.0466, + "mean_token_accuracy": 0.9881910800933837, + "num_tokens": 41803148.0, + "step": 18560 + }, + { + "entropy": 0.07704963702708482, + "epoch": 4.327544002797529, + "grad_norm": 1.265625, + "learning_rate": 4.963501722458474e-05, + "loss": 0.0593, + "mean_token_accuracy": 0.9782488167285919, + "num_tokens": 41823408.0, + "step": 18565 + }, + { + "entropy": 0.07265666145831347, + "epoch": 4.328709639818161, + "grad_norm": 0.416015625, + "learning_rate": 4.9634650972047235e-05, + "loss": 0.0633, + "mean_token_accuracy": 0.9832420825958252, + "num_tokens": 41841017.0, + "step": 18570 + }, + { + "entropy": 0.06052091708406806, + "epoch": 4.329875276838792, + "grad_norm": 3.328125, + "learning_rate": 4.963428453856383e-05, + "loss": 0.046, + "mean_token_accuracy": 0.9867750465869903, + "num_tokens": 41875414.0, + "step": 18575 + }, + { + "entropy": 0.08009387571364641, + "epoch": 4.331040913859424, + "grad_norm": 2.5625, + "learning_rate": 4.963391792413999e-05, + "loss": 0.0915, + "mean_token_accuracy": 0.9757187187671661, + "num_tokens": 41902076.0, + "step": 18580 + }, + { + "entropy": 0.08106917953118682, + "epoch": 4.332206550880056, + "grad_norm": 1.296875, + "learning_rate": 4.9633551128781186e-05, + "loss": 0.0679, + "mean_token_accuracy": 0.980489581823349, + "num_tokens": 41933448.0, + "step": 18585 + }, + { + "entropy": 0.09040620159357786, + "epoch": 4.333372187900688, + "grad_norm": 3.53125, + "learning_rate": 4.963318415249289e-05, + "loss": 0.1102, + "mean_token_accuracy": 0.9753665149211883, + "num_tokens": 41953266.0, + "step": 18590 + }, + { + "entropy": 0.06375911897048354, + "epoch": 4.33453782492132, + "grad_norm": 6.1875, + "learning_rate": 4.963281699528055e-05, + "loss": 0.0859, + "mean_token_accuracy": 0.9811161577701568, + "num_tokens": 41983894.0, + "step": 18595 + }, + { + "entropy": 0.08802753314375877, + "epoch": 4.335703461941951, + "grad_norm": 3.421875, + "learning_rate": 4.963244965714968e-05, + "loss": 0.0661, + "mean_token_accuracy": 0.9772323071956635, + "num_tokens": 41997012.0, + "step": 18600 + }, + { + "entropy": 0.060184755455702545, + "epoch": 4.336869098962583, + "grad_norm": 1.2734375, + "learning_rate": 4.9632082138105726e-05, + "loss": 0.0496, + "mean_token_accuracy": 0.983251416683197, + "num_tokens": 42017776.0, + "step": 18605 + }, + { + "entropy": 0.06990799438208342, + "epoch": 4.338034735983215, + "grad_norm": 3.484375, + "learning_rate": 4.963171443815418e-05, + "loss": 0.0899, + "mean_token_accuracy": 0.9801129817962646, + "num_tokens": 42051407.0, + "step": 18610 + }, + { + "entropy": 0.06780791487544775, + "epoch": 4.339200373003846, + "grad_norm": 4.21875, + "learning_rate": 4.963134655730053e-05, + "loss": 0.0529, + "mean_token_accuracy": 0.9823335826396942, + "num_tokens": 42077316.0, + "step": 18615 + }, + { + "entropy": 0.09103262685239315, + "epoch": 4.3403660100244785, + "grad_norm": 0.7109375, + "learning_rate": 4.963097849555025e-05, + "loss": 0.0988, + "mean_token_accuracy": 0.9731744289398193, + "num_tokens": 42097680.0, + "step": 18620 + }, + { + "entropy": 0.06253818608820438, + "epoch": 4.34153164704511, + "grad_norm": 3.375, + "learning_rate": 4.963061025290884e-05, + "loss": 0.0494, + "mean_token_accuracy": 0.9833706498146058, + "num_tokens": 42118703.0, + "step": 18625 + }, + { + "entropy": 0.08283813260495662, + "epoch": 4.342697284065742, + "grad_norm": 2.234375, + "learning_rate": 4.963024182938179e-05, + "loss": 0.0689, + "mean_token_accuracy": 0.9750663280487061, + "num_tokens": 42135563.0, + "step": 18630 + }, + { + "entropy": 0.07473634798079729, + "epoch": 4.343862921086374, + "grad_norm": 2.390625, + "learning_rate": 4.962987322497458e-05, + "loss": 0.0715, + "mean_token_accuracy": 0.9804068505764008, + "num_tokens": 42154972.0, + "step": 18635 + }, + { + "entropy": 0.08067142516374588, + "epoch": 4.345028558107005, + "grad_norm": 1.5078125, + "learning_rate": 4.9629504439692717e-05, + "loss": 0.1013, + "mean_token_accuracy": 0.977970826625824, + "num_tokens": 42166016.0, + "step": 18640 + }, + { + "entropy": 0.045968357706442475, + "epoch": 4.346194195127637, + "grad_norm": 1.1171875, + "learning_rate": 4.96291354735417e-05, + "loss": 0.0356, + "mean_token_accuracy": 0.9882686376571655, + "num_tokens": 42192263.0, + "step": 18645 + }, + { + "entropy": 0.05396311990916729, + "epoch": 4.347359832148269, + "grad_norm": 0.73828125, + "learning_rate": 4.962876632652703e-05, + "loss": 0.045, + "mean_token_accuracy": 0.984005081653595, + "num_tokens": 42225107.0, + "step": 18650 + }, + { + "entropy": 0.07691260352730751, + "epoch": 4.348525469168901, + "grad_norm": 2.640625, + "learning_rate": 4.962839699865421e-05, + "loss": 0.0457, + "mean_token_accuracy": 0.9825477838516236, + "num_tokens": 42247555.0, + "step": 18655 + }, + { + "entropy": 0.07492669681087136, + "epoch": 4.349691106189533, + "grad_norm": 1.3671875, + "learning_rate": 4.962802748992875e-05, + "loss": 0.1, + "mean_token_accuracy": 0.9735464215278625, + "num_tokens": 42266583.0, + "step": 18660 + }, + { + "entropy": 0.04886567974463105, + "epoch": 4.350856743210164, + "grad_norm": 1.734375, + "learning_rate": 4.962765780035616e-05, + "loss": 0.0496, + "mean_token_accuracy": 0.9842284440994262, + "num_tokens": 42289094.0, + "step": 18665 + }, + { + "entropy": 0.08082684567198158, + "epoch": 4.352022380230796, + "grad_norm": 0.703125, + "learning_rate": 4.962728792994196e-05, + "loss": 0.0916, + "mean_token_accuracy": 0.9775077998638153, + "num_tokens": 42320993.0, + "step": 18670 + }, + { + "entropy": 0.14611582197248935, + "epoch": 4.353188017251428, + "grad_norm": 2.015625, + "learning_rate": 4.962691787869164e-05, + "loss": 0.2335, + "mean_token_accuracy": 0.9521386444568634, + "num_tokens": 42350358.0, + "step": 18675 + }, + { + "entropy": 0.057045502867549655, + "epoch": 4.354353654272059, + "grad_norm": 0.318359375, + "learning_rate": 4.962654764661074e-05, + "loss": 0.0453, + "mean_token_accuracy": 0.9859548568725586, + "num_tokens": 42391759.0, + "step": 18680 + }, + { + "entropy": 0.0750160675495863, + "epoch": 4.355519291292691, + "grad_norm": 0.30078125, + "learning_rate": 4.962617723370478e-05, + "loss": 0.0515, + "mean_token_accuracy": 0.9814619541168212, + "num_tokens": 42420280.0, + "step": 18685 + }, + { + "entropy": 0.06027145287953317, + "epoch": 4.3566849283133235, + "grad_norm": 0.30859375, + "learning_rate": 4.962580663997928e-05, + "loss": 0.0426, + "mean_token_accuracy": 0.9794057428836822, + "num_tokens": 42453195.0, + "step": 18690 + }, + { + "entropy": 0.0616011893376708, + "epoch": 4.357850565333955, + "grad_norm": 0.66796875, + "learning_rate": 4.9625435865439756e-05, + "loss": 0.0592, + "mean_token_accuracy": 0.9814781188964844, + "num_tokens": 42469008.0, + "step": 18695 + }, + { + "entropy": 0.06871538469567895, + "epoch": 4.359016202354587, + "grad_norm": 4.3125, + "learning_rate": 4.9625064910091753e-05, + "loss": 0.1116, + "mean_token_accuracy": 0.9721579551696777, + "num_tokens": 42484841.0, + "step": 18700 + }, + { + "entropy": 0.08792795054614544, + "epoch": 4.360181839375219, + "grad_norm": 7.78125, + "learning_rate": 4.962469377394079e-05, + "loss": 0.1013, + "mean_token_accuracy": 0.9731840312480926, + "num_tokens": 42513259.0, + "step": 18705 + }, + { + "entropy": 0.12590097589418292, + "epoch": 4.36134747639585, + "grad_norm": 0.5625, + "learning_rate": 4.962432245699241e-05, + "loss": 0.1736, + "mean_token_accuracy": 0.9640897929668426, + "num_tokens": 42537247.0, + "step": 18710 + }, + { + "entropy": 0.0786399308592081, + "epoch": 4.362513113416482, + "grad_norm": 2.625, + "learning_rate": 4.962395095925214e-05, + "loss": 0.104, + "mean_token_accuracy": 0.9724407613277435, + "num_tokens": 42555543.0, + "step": 18715 + }, + { + "entropy": 0.0821190400980413, + "epoch": 4.3636787504371135, + "grad_norm": 0.390625, + "learning_rate": 4.962357928072553e-05, + "loss": 0.0986, + "mean_token_accuracy": 0.9734818339347839, + "num_tokens": 42578885.0, + "step": 18720 + }, + { + "entropy": 0.08184778075665236, + "epoch": 4.364844387457746, + "grad_norm": 2.046875, + "learning_rate": 4.962320742141812e-05, + "loss": 0.0699, + "mean_token_accuracy": 0.9773243069648743, + "num_tokens": 42604667.0, + "step": 18725 + }, + { + "entropy": 0.07144193323329091, + "epoch": 4.366010024478378, + "grad_norm": 0.412109375, + "learning_rate": 4.962283538133545e-05, + "loss": 0.0712, + "mean_token_accuracy": 0.9800425589084625, + "num_tokens": 42618866.0, + "step": 18730 + }, + { + "entropy": 0.04785723239183426, + "epoch": 4.367175661499009, + "grad_norm": 3.78125, + "learning_rate": 4.962246316048307e-05, + "loss": 0.0507, + "mean_token_accuracy": 0.9854453265666961, + "num_tokens": 42638384.0, + "step": 18735 + }, + { + "entropy": 0.08500594049692153, + "epoch": 4.368341298519641, + "grad_norm": 2.875, + "learning_rate": 4.9622090758866534e-05, + "loss": 0.1412, + "mean_token_accuracy": 0.9683994054794312, + "num_tokens": 42646730.0, + "step": 18740 + }, + { + "entropy": 0.06168932262808084, + "epoch": 4.369506935540273, + "grad_norm": 4.0, + "learning_rate": 4.962171817649139e-05, + "loss": 0.0715, + "mean_token_accuracy": 0.9813813984394073, + "num_tokens": 42665053.0, + "step": 18745 + }, + { + "entropy": 0.06118791922926903, + "epoch": 4.370672572560904, + "grad_norm": 3.796875, + "learning_rate": 4.962134541336319e-05, + "loss": 0.0952, + "mean_token_accuracy": 0.9771844387054444, + "num_tokens": 42675690.0, + "step": 18750 + }, + { + "entropy": 0.08084941320121289, + "epoch": 4.3718382095815365, + "grad_norm": 1.8984375, + "learning_rate": 4.9620972469487515e-05, + "loss": 0.1111, + "mean_token_accuracy": 0.9746397316455842, + "num_tokens": 42689042.0, + "step": 18755 + }, + { + "entropy": 0.061972387880086896, + "epoch": 4.373003846602168, + "grad_norm": 2.265625, + "learning_rate": 4.96205993448699e-05, + "loss": 0.0656, + "mean_token_accuracy": 0.9821341514587403, + "num_tokens": 42711886.0, + "step": 18760 + }, + { + "entropy": 0.06485986206680536, + "epoch": 4.3741694836228, + "grad_norm": 0.6015625, + "learning_rate": 4.962022603951592e-05, + "loss": 0.0781, + "mean_token_accuracy": 0.9823011875152587, + "num_tokens": 42743918.0, + "step": 18765 + }, + { + "entropy": 0.06315469332039356, + "epoch": 4.375335120643432, + "grad_norm": 0.52734375, + "learning_rate": 4.961985255343113e-05, + "loss": 0.0627, + "mean_token_accuracy": 0.9816456615924836, + "num_tokens": 42758201.0, + "step": 18770 + }, + { + "entropy": 0.06303026769310235, + "epoch": 4.376500757664063, + "grad_norm": 1.71875, + "learning_rate": 4.961947888662112e-05, + "loss": 0.0685, + "mean_token_accuracy": 0.9812012195587159, + "num_tokens": 42775902.0, + "step": 18775 + }, + { + "entropy": 0.05505295917391777, + "epoch": 4.377666394684695, + "grad_norm": 0.609375, + "learning_rate": 4.961910503909145e-05, + "loss": 0.0615, + "mean_token_accuracy": 0.9768405258655548, + "num_tokens": 42796697.0, + "step": 18780 + }, + { + "entropy": 0.060315626300871374, + "epoch": 4.378832031705327, + "grad_norm": 1.0546875, + "learning_rate": 4.96187310108477e-05, + "loss": 0.065, + "mean_token_accuracy": 0.9849414944648742, + "num_tokens": 42813530.0, + "step": 18785 + }, + { + "entropy": 0.11221035532653331, + "epoch": 4.3799976687259585, + "grad_norm": 3.953125, + "learning_rate": 4.961835680189543e-05, + "loss": 0.1438, + "mean_token_accuracy": 0.9659707307815552, + "num_tokens": 42835504.0, + "step": 18790 + }, + { + "entropy": 0.054355106130242345, + "epoch": 4.381163305746591, + "grad_norm": 3.734375, + "learning_rate": 4.961798241224024e-05, + "loss": 0.0747, + "mean_token_accuracy": 0.9837758183479309, + "num_tokens": 42851192.0, + "step": 18795 + }, + { + "entropy": 0.05948843127116561, + "epoch": 4.382328942767222, + "grad_norm": 5.34375, + "learning_rate": 4.9617607841887707e-05, + "loss": 0.0572, + "mean_token_accuracy": 0.9820643961429596, + "num_tokens": 42873119.0, + "step": 18800 + }, + { + "entropy": 0.07751112319529056, + "epoch": 4.383494579787854, + "grad_norm": 1.21875, + "learning_rate": 4.961723309084341e-05, + "loss": 0.1139, + "mean_token_accuracy": 0.9708074569702149, + "num_tokens": 42882581.0, + "step": 18805 + }, + { + "entropy": 0.08554555289447308, + "epoch": 4.384660216808486, + "grad_norm": 1.6328125, + "learning_rate": 4.961685815911295e-05, + "loss": 0.1082, + "mean_token_accuracy": 0.9752114892005921, + "num_tokens": 42891543.0, + "step": 18810 + }, + { + "entropy": 0.062315055076032876, + "epoch": 4.385825853829117, + "grad_norm": 0.6953125, + "learning_rate": 4.961648304670191e-05, + "loss": 0.0535, + "mean_token_accuracy": 0.9814134836196899, + "num_tokens": 42919010.0, + "step": 18815 + }, + { + "entropy": 0.068437035754323, + "epoch": 4.386991490849749, + "grad_norm": 2.71875, + "learning_rate": 4.961610775361588e-05, + "loss": 0.0919, + "mean_token_accuracy": 0.9802298843860626, + "num_tokens": 42933957.0, + "step": 18820 + }, + { + "entropy": 0.07225360907614231, + "epoch": 4.3881571278703815, + "grad_norm": 2.65625, + "learning_rate": 4.961573227986045e-05, + "loss": 0.0631, + "mean_token_accuracy": 0.9835760712623596, + "num_tokens": 42962988.0, + "step": 18825 + }, + { + "entropy": 0.06295080268755555, + "epoch": 4.389322764891013, + "grad_norm": 0.330078125, + "learning_rate": 4.9615356625441246e-05, + "loss": 0.1016, + "mean_token_accuracy": 0.9730377972126008, + "num_tokens": 42983746.0, + "step": 18830 + }, + { + "entropy": 0.06006499025970698, + "epoch": 4.390488401911645, + "grad_norm": 2.359375, + "learning_rate": 4.961498079036384e-05, + "loss": 0.0629, + "mean_token_accuracy": 0.980323189496994, + "num_tokens": 43002500.0, + "step": 18835 + }, + { + "entropy": 0.06451915800571442, + "epoch": 4.391654038932277, + "grad_norm": 3.734375, + "learning_rate": 4.961460477463385e-05, + "loss": 0.0428, + "mean_token_accuracy": 0.9831142783164978, + "num_tokens": 43030396.0, + "step": 18840 + }, + { + "entropy": 0.10234779641032218, + "epoch": 4.392819675952908, + "grad_norm": 3.9375, + "learning_rate": 4.961422857825689e-05, + "loss": 0.1293, + "mean_token_accuracy": 0.9682917714118957, + "num_tokens": 43037539.0, + "step": 18845 + }, + { + "entropy": 0.1020116938278079, + "epoch": 4.39398531297354, + "grad_norm": 1.4921875, + "learning_rate": 4.961385220123855e-05, + "loss": 0.1034, + "mean_token_accuracy": 0.9642068803310394, + "num_tokens": 43050680.0, + "step": 18850 + }, + { + "entropy": 0.04995893612504006, + "epoch": 4.395150949994171, + "grad_norm": 0.43359375, + "learning_rate": 4.961347564358446e-05, + "loss": 0.0376, + "mean_token_accuracy": 0.9891486525535583, + "num_tokens": 43078610.0, + "step": 18855 + }, + { + "entropy": 0.07869170345366001, + "epoch": 4.3963165870148035, + "grad_norm": 2.796875, + "learning_rate": 4.961309890530023e-05, + "loss": 0.0852, + "mean_token_accuracy": 0.9785373866558075, + "num_tokens": 43098034.0, + "step": 18860 + }, + { + "entropy": 0.06370147401466966, + "epoch": 4.397482224035436, + "grad_norm": 2.53125, + "learning_rate": 4.9612721986391474e-05, + "loss": 0.0721, + "mean_token_accuracy": 0.9796221852302551, + "num_tokens": 43115615.0, + "step": 18865 + }, + { + "entropy": 0.06276164511218667, + "epoch": 4.398647861056067, + "grad_norm": 0.3515625, + "learning_rate": 4.961234488686382e-05, + "loss": 0.0751, + "mean_token_accuracy": 0.9809258460998536, + "num_tokens": 43138300.0, + "step": 18870 + }, + { + "entropy": 0.07407207041978836, + "epoch": 4.399813498076699, + "grad_norm": 1.5859375, + "learning_rate": 4.961196760672288e-05, + "loss": 0.0753, + "mean_token_accuracy": 0.9747194409370422, + "num_tokens": 43161841.0, + "step": 18875 + }, + { + "entropy": 0.05361275505274534, + "epoch": 4.400979135097331, + "grad_norm": 3.609375, + "learning_rate": 4.961159014597428e-05, + "loss": 0.0564, + "mean_token_accuracy": 0.9803205966949463, + "num_tokens": 43179908.0, + "step": 18880 + }, + { + "entropy": 0.08528228178620338, + "epoch": 4.402144772117962, + "grad_norm": 1.984375, + "learning_rate": 4.961121250462366e-05, + "loss": 0.1197, + "mean_token_accuracy": 0.9708484292030335, + "num_tokens": 43188893.0, + "step": 18885 + }, + { + "entropy": 0.0671801614575088, + "epoch": 4.403310409138594, + "grad_norm": 1.75, + "learning_rate": 4.961083468267664e-05, + "loss": 0.0874, + "mean_token_accuracy": 0.9767676532268524, + "num_tokens": 43227304.0, + "step": 18890 + }, + { + "entropy": 0.053513195179402825, + "epoch": 4.404476046159226, + "grad_norm": 0.80078125, + "learning_rate": 4.961045668013887e-05, + "loss": 0.0373, + "mean_token_accuracy": 0.98809694647789, + "num_tokens": 43252473.0, + "step": 18895 + }, + { + "entropy": 0.08175447061657906, + "epoch": 4.405641683179858, + "grad_norm": 2.765625, + "learning_rate": 4.961007849701596e-05, + "loss": 0.1035, + "mean_token_accuracy": 0.9751053869724273, + "num_tokens": 43264002.0, + "step": 18900 + }, + { + "entropy": 0.08174102194607258, + "epoch": 4.40680732020049, + "grad_norm": 1.1171875, + "learning_rate": 4.960970013331358e-05, + "loss": 0.0854, + "mean_token_accuracy": 0.9750945091247558, + "num_tokens": 43278951.0, + "step": 18905 + }, + { + "entropy": 0.07742302156984807, + "epoch": 4.407972957221121, + "grad_norm": 2.1875, + "learning_rate": 4.960932158903735e-05, + "loss": 0.0814, + "mean_token_accuracy": 0.9777226269245147, + "num_tokens": 43292822.0, + "step": 18910 + }, + { + "entropy": 0.048268615175038575, + "epoch": 4.409138594241753, + "grad_norm": 0.5, + "learning_rate": 4.960894286419293e-05, + "loss": 0.0507, + "mean_token_accuracy": 0.9873724281787872, + "num_tokens": 43326562.0, + "step": 18915 + }, + { + "entropy": 0.05129559133201837, + "epoch": 4.410304231262385, + "grad_norm": 0.4375, + "learning_rate": 4.9608563958785945e-05, + "loss": 0.0574, + "mean_token_accuracy": 0.9859373986721038, + "num_tokens": 43359792.0, + "step": 18920 + }, + { + "entropy": 0.06564451195299625, + "epoch": 4.4114698682830165, + "grad_norm": 5.90625, + "learning_rate": 4.9608184872822065e-05, + "loss": 0.0946, + "mean_token_accuracy": 0.978387588262558, + "num_tokens": 43383462.0, + "step": 18925 + }, + { + "entropy": 0.08527331128716469, + "epoch": 4.412635505303649, + "grad_norm": 2.28125, + "learning_rate": 4.960780560630694e-05, + "loss": 0.0894, + "mean_token_accuracy": 0.9752187550067901, + "num_tokens": 43414584.0, + "step": 18930 + }, + { + "entropy": 0.07463353797793389, + "epoch": 4.41380114232428, + "grad_norm": 3.15625, + "learning_rate": 4.9607426159246226e-05, + "loss": 0.0691, + "mean_token_accuracy": 0.9808345139026642, + "num_tokens": 43445164.0, + "step": 18935 + }, + { + "entropy": 0.06146889589726925, + "epoch": 4.414966779344912, + "grad_norm": 1.4296875, + "learning_rate": 4.960704653164557e-05, + "loss": 0.051, + "mean_token_accuracy": 0.9783270239830018, + "num_tokens": 43463080.0, + "step": 18940 + }, + { + "entropy": 0.07272496372461319, + "epoch": 4.416132416365544, + "grad_norm": 1.046875, + "learning_rate": 4.960666672351064e-05, + "loss": 0.1171, + "mean_token_accuracy": 0.971141928434372, + "num_tokens": 43472428.0, + "step": 18945 + }, + { + "entropy": 0.05846181372180581, + "epoch": 4.417298053386175, + "grad_norm": 0.6640625, + "learning_rate": 4.96062867348471e-05, + "loss": 0.0519, + "mean_token_accuracy": 0.9783796727657318, + "num_tokens": 43496040.0, + "step": 18950 + }, + { + "entropy": 0.07526832111179829, + "epoch": 4.418463690406807, + "grad_norm": 0.51953125, + "learning_rate": 4.960590656566062e-05, + "loss": 0.0657, + "mean_token_accuracy": 0.9777651190757751, + "num_tokens": 43515069.0, + "step": 18955 + }, + { + "entropy": 0.06531362514942884, + "epoch": 4.419629327427439, + "grad_norm": 1.03125, + "learning_rate": 4.960552621595686e-05, + "loss": 0.0776, + "mean_token_accuracy": 0.9797972917556763, + "num_tokens": 43534735.0, + "step": 18960 + }, + { + "entropy": 0.07656772956252098, + "epoch": 4.420794964448071, + "grad_norm": 2.171875, + "learning_rate": 4.96051456857415e-05, + "loss": 0.0941, + "mean_token_accuracy": 0.9756794929504394, + "num_tokens": 43543891.0, + "step": 18965 + }, + { + "entropy": 0.06007896903902292, + "epoch": 4.421960601468703, + "grad_norm": 0.2470703125, + "learning_rate": 4.960476497502021e-05, + "loss": 0.0712, + "mean_token_accuracy": 0.9770024001598359, + "num_tokens": 43567670.0, + "step": 18970 + }, + { + "entropy": 0.07055467199534178, + "epoch": 4.423126238489335, + "grad_norm": 1.734375, + "learning_rate": 4.960438408379867e-05, + "loss": 0.0692, + "mean_token_accuracy": 0.9769983887672424, + "num_tokens": 43583601.0, + "step": 18975 + }, + { + "entropy": 0.0902567700482905, + "epoch": 4.424291875509966, + "grad_norm": 1.7109375, + "learning_rate": 4.960400301208255e-05, + "loss": 0.0891, + "mean_token_accuracy": 0.9708002507686615, + "num_tokens": 43609479.0, + "step": 18980 + }, + { + "entropy": 0.07092046402394772, + "epoch": 4.425457512530598, + "grad_norm": 6.65625, + "learning_rate": 4.9603621759877544e-05, + "loss": 0.0864, + "mean_token_accuracy": 0.9759444415569305, + "num_tokens": 43631194.0, + "step": 18985 + }, + { + "entropy": 0.09915064247325063, + "epoch": 4.426623149551229, + "grad_norm": 7.53125, + "learning_rate": 4.9603240327189335e-05, + "loss": 0.0668, + "mean_token_accuracy": 0.9716208279132843, + "num_tokens": 43653279.0, + "step": 18990 + }, + { + "entropy": 0.05771286068484187, + "epoch": 4.4277887865718615, + "grad_norm": 1.5546875, + "learning_rate": 4.9602858714023605e-05, + "loss": 0.0702, + "mean_token_accuracy": 0.9814035415649414, + "num_tokens": 43673620.0, + "step": 18995 + }, + { + "entropy": 0.06412562178447842, + "epoch": 4.428954423592494, + "grad_norm": 6.0, + "learning_rate": 4.960247692038605e-05, + "loss": 0.1015, + "mean_token_accuracy": 0.9743063271045684, + "num_tokens": 43690001.0, + "step": 19000 + }, + { + "entropy": 0.07472214587032795, + "epoch": 4.430120060613125, + "grad_norm": 3.359375, + "learning_rate": 4.960209494628236e-05, + "loss": 0.0712, + "mean_token_accuracy": 0.9787097334861755, + "num_tokens": 43704465.0, + "step": 19005 + }, + { + "entropy": 0.06662010606378317, + "epoch": 4.431285697633757, + "grad_norm": 0.84375, + "learning_rate": 4.960171279171823e-05, + "loss": 0.0735, + "mean_token_accuracy": 0.9809480428695678, + "num_tokens": 43716993.0, + "step": 19010 + }, + { + "entropy": 0.08009600536897779, + "epoch": 4.432451334654389, + "grad_norm": 0.25, + "learning_rate": 4.9601330456699366e-05, + "loss": 0.0857, + "mean_token_accuracy": 0.9711295068264008, + "num_tokens": 43750588.0, + "step": 19015 + }, + { + "entropy": 0.09564240127801896, + "epoch": 4.43361697167502, + "grad_norm": 2.328125, + "learning_rate": 4.960094794123146e-05, + "loss": 0.1229, + "mean_token_accuracy": 0.970706331729889, + "num_tokens": 43761979.0, + "step": 19020 + }, + { + "entropy": 0.06584495399147272, + "epoch": 4.434782608695652, + "grad_norm": 0.68359375, + "learning_rate": 4.9600565245320215e-05, + "loss": 0.0742, + "mean_token_accuracy": 0.9821219086647034, + "num_tokens": 43793503.0, + "step": 19025 + }, + { + "entropy": 0.049910994991660115, + "epoch": 4.4359482457162835, + "grad_norm": 0.66015625, + "learning_rate": 4.9600182368971344e-05, + "loss": 0.0401, + "mean_token_accuracy": 0.98340545296669, + "num_tokens": 43822739.0, + "step": 19030 + }, + { + "entropy": 0.10595990158617496, + "epoch": 4.437113882736916, + "grad_norm": 2.578125, + "learning_rate": 4.959979931219056e-05, + "loss": 0.0965, + "mean_token_accuracy": 0.9769360482692718, + "num_tokens": 43834491.0, + "step": 19035 + }, + { + "entropy": 0.09281698856502771, + "epoch": 4.438279519757548, + "grad_norm": 3.921875, + "learning_rate": 4.959941607498356e-05, + "loss": 0.1186, + "mean_token_accuracy": 0.9695560753345489, + "num_tokens": 43867073.0, + "step": 19040 + }, + { + "entropy": 0.10635361950844527, + "epoch": 4.439445156778179, + "grad_norm": 5.375, + "learning_rate": 4.959903265735607e-05, + "loss": 0.1358, + "mean_token_accuracy": 0.9646057069301606, + "num_tokens": 43880104.0, + "step": 19045 + }, + { + "entropy": 0.05380313564091921, + "epoch": 4.440610793798811, + "grad_norm": 1.6875, + "learning_rate": 4.959864905931381e-05, + "loss": 0.0628, + "mean_token_accuracy": 0.9821853876113892, + "num_tokens": 43896475.0, + "step": 19050 + }, + { + "entropy": 0.18970977468416095, + "epoch": 4.441776430819443, + "grad_norm": 1.5859375, + "learning_rate": 4.959826528086249e-05, + "loss": 0.2268, + "mean_token_accuracy": 0.9328379809856415, + "num_tokens": 43935237.0, + "step": 19055 + }, + { + "entropy": 0.0632144408300519, + "epoch": 4.442942067840074, + "grad_norm": 0.470703125, + "learning_rate": 4.9597881322007845e-05, + "loss": 0.0726, + "mean_token_accuracy": 0.9753654658794403, + "num_tokens": 43953674.0, + "step": 19060 + }, + { + "entropy": 0.0784110258333385, + "epoch": 4.4441077048607065, + "grad_norm": 1.6484375, + "learning_rate": 4.959749718275559e-05, + "loss": 0.0503, + "mean_token_accuracy": 0.9830631732940673, + "num_tokens": 43973634.0, + "step": 19065 + }, + { + "entropy": 0.1564923981204629, + "epoch": 4.445273341881338, + "grad_norm": 6.34375, + "learning_rate": 4.9597112863111455e-05, + "loss": 0.143, + "mean_token_accuracy": 0.9643102347850799, + "num_tokens": 43984468.0, + "step": 19070 + }, + { + "entropy": 0.051792218908667566, + "epoch": 4.44643897890197, + "grad_norm": 0.92578125, + "learning_rate": 4.9596728363081177e-05, + "loss": 0.0531, + "mean_token_accuracy": 0.9854408144950867, + "num_tokens": 43999903.0, + "step": 19075 + }, + { + "entropy": 0.08109848536550998, + "epoch": 4.447604615922602, + "grad_norm": 3.4375, + "learning_rate": 4.9596343682670475e-05, + "loss": 0.0734, + "mean_token_accuracy": 0.9807873725891113, + "num_tokens": 44015648.0, + "step": 19080 + }, + { + "entropy": 0.05638847313821316, + "epoch": 4.448770252943233, + "grad_norm": 1.53125, + "learning_rate": 4.9595958821885104e-05, + "loss": 0.0578, + "mean_token_accuracy": 0.9812259316444397, + "num_tokens": 44035719.0, + "step": 19085 + }, + { + "entropy": 0.09323077034205199, + "epoch": 4.449935889963865, + "grad_norm": 0.390625, + "learning_rate": 4.959557378073079e-05, + "loss": 0.0712, + "mean_token_accuracy": 0.9786225438117981, + "num_tokens": 44058887.0, + "step": 19090 + }, + { + "entropy": 0.061530550755560395, + "epoch": 4.451101526984497, + "grad_norm": 2.6875, + "learning_rate": 4.9595188559213276e-05, + "loss": 0.0649, + "mean_token_accuracy": 0.9821036100387573, + "num_tokens": 44079240.0, + "step": 19095 + }, + { + "entropy": 0.14186363713815808, + "epoch": 4.452267164005129, + "grad_norm": 0.458984375, + "learning_rate": 4.959480315733831e-05, + "loss": 0.2516, + "mean_token_accuracy": 0.9597993791103363, + "num_tokens": 44110096.0, + "step": 19100 + }, + { + "entropy": 0.05938525218516588, + "epoch": 4.453432801025761, + "grad_norm": 2.46875, + "learning_rate": 4.9594417575111634e-05, + "loss": 0.0548, + "mean_token_accuracy": 0.9803711473941803, + "num_tokens": 44143221.0, + "step": 19105 + }, + { + "entropy": 0.05355137772858143, + "epoch": 4.454598438046393, + "grad_norm": 4.1875, + "learning_rate": 4.9594031812539006e-05, + "loss": 0.0585, + "mean_token_accuracy": 0.9849870860576629, + "num_tokens": 44163401.0, + "step": 19110 + }, + { + "entropy": 0.08820384666323662, + "epoch": 4.455764075067024, + "grad_norm": 1.328125, + "learning_rate": 4.959364586962617e-05, + "loss": 0.0717, + "mean_token_accuracy": 0.9808596074581146, + "num_tokens": 44179516.0, + "step": 19115 + }, + { + "entropy": 0.058700266759842634, + "epoch": 4.456929712087656, + "grad_norm": 2.15625, + "learning_rate": 4.959325974637888e-05, + "loss": 0.0403, + "mean_token_accuracy": 0.9841213166713715, + "num_tokens": 44225512.0, + "step": 19120 + }, + { + "entropy": 0.07730643711984157, + "epoch": 4.458095349108287, + "grad_norm": 2.40625, + "learning_rate": 4.959287344280291e-05, + "loss": 0.0904, + "mean_token_accuracy": 0.9748827397823334, + "num_tokens": 44237791.0, + "step": 19125 + }, + { + "entropy": 0.05126442005857825, + "epoch": 4.459260986128919, + "grad_norm": 4.6875, + "learning_rate": 4.9592486958904e-05, + "loss": 0.0536, + "mean_token_accuracy": 0.9859380066394806, + "num_tokens": 44281173.0, + "step": 19130 + }, + { + "entropy": 0.15797716118395327, + "epoch": 4.4604266231495515, + "grad_norm": 1.109375, + "learning_rate": 4.959210029468793e-05, + "loss": 0.315, + "mean_token_accuracy": 0.9346661984920501, + "num_tokens": 44301965.0, + "step": 19135 + }, + { + "entropy": 0.06716629974544049, + "epoch": 4.461592260170183, + "grad_norm": 2.25, + "learning_rate": 4.959171345016045e-05, + "loss": 0.0949, + "mean_token_accuracy": 0.9775259852409363, + "num_tokens": 44313084.0, + "step": 19140 + }, + { + "entropy": 0.06034324299544096, + "epoch": 4.462757897190815, + "grad_norm": 2.015625, + "learning_rate": 4.9591326425327334e-05, + "loss": 0.0657, + "mean_token_accuracy": 0.9777418315410614, + "num_tokens": 44328344.0, + "step": 19145 + }, + { + "entropy": 0.07359747290611267, + "epoch": 4.463923534211447, + "grad_norm": 2.796875, + "learning_rate": 4.959093922019435e-05, + "loss": 0.0932, + "mean_token_accuracy": 0.9751519203186035, + "num_tokens": 44346989.0, + "step": 19150 + }, + { + "entropy": 0.088407745026052, + "epoch": 4.465089171232078, + "grad_norm": 4.75, + "learning_rate": 4.9590551834767285e-05, + "loss": 0.0916, + "mean_token_accuracy": 0.973981785774231, + "num_tokens": 44359350.0, + "step": 19155 + }, + { + "entropy": 0.05561266434378922, + "epoch": 4.46625480825271, + "grad_norm": 0.53125, + "learning_rate": 4.959016426905191e-05, + "loss": 0.0556, + "mean_token_accuracy": 0.9856752693653107, + "num_tokens": 44384065.0, + "step": 19160 + }, + { + "entropy": 0.07562111020088196, + "epoch": 4.4674204452733415, + "grad_norm": 1.2265625, + "learning_rate": 4.958977652305399e-05, + "loss": 0.0683, + "mean_token_accuracy": 0.9820806801319122, + "num_tokens": 44395092.0, + "step": 19165 + }, + { + "entropy": 0.05588637031614781, + "epoch": 4.468586082293974, + "grad_norm": 5.65625, + "learning_rate": 4.958938859677932e-05, + "loss": 0.066, + "mean_token_accuracy": 0.9826294183731079, + "num_tokens": 44418945.0, + "step": 19170 + }, + { + "entropy": 0.060627684276551005, + "epoch": 4.469751719314606, + "grad_norm": 1.109375, + "learning_rate": 4.95890004902337e-05, + "loss": 0.0648, + "mean_token_accuracy": 0.9832158327102661, + "num_tokens": 44443933.0, + "step": 19175 + }, + { + "entropy": 0.05912665966898203, + "epoch": 4.470917356335237, + "grad_norm": 2.53125, + "learning_rate": 4.958861220342288e-05, + "loss": 0.075, + "mean_token_accuracy": 0.9819551467895508, + "num_tokens": 44458854.0, + "step": 19180 + }, + { + "entropy": 0.08281111363321543, + "epoch": 4.472082993355869, + "grad_norm": 0.51171875, + "learning_rate": 4.9588223736352674e-05, + "loss": 0.0715, + "mean_token_accuracy": 0.9780626833438874, + "num_tokens": 44472980.0, + "step": 19185 + }, + { + "entropy": 0.0640386096201837, + "epoch": 4.473248630376501, + "grad_norm": 1.9921875, + "learning_rate": 4.958783508902887e-05, + "loss": 0.0512, + "mean_token_accuracy": 0.9844207406044007, + "num_tokens": 44500670.0, + "step": 19190 + }, + { + "entropy": 0.06259249579161405, + "epoch": 4.474414267397132, + "grad_norm": 3.078125, + "learning_rate": 4.958744626145727e-05, + "loss": 0.0545, + "mean_token_accuracy": 0.9792797148227692, + "num_tokens": 44525316.0, + "step": 19195 + }, + { + "entropy": 0.11668573003262281, + "epoch": 4.475579904417764, + "grad_norm": 1.6484375, + "learning_rate": 4.958705725364366e-05, + "loss": 0.133, + "mean_token_accuracy": 0.9686145961284638, + "num_tokens": 44536734.0, + "step": 19200 + }, + { + "entropy": 0.059713400527834894, + "epoch": 4.476745541438396, + "grad_norm": 2.296875, + "learning_rate": 4.958666806559385e-05, + "loss": 0.087, + "mean_token_accuracy": 0.9807201862335205, + "num_tokens": 44564065.0, + "step": 19205 + }, + { + "entropy": 0.09271037932485342, + "epoch": 4.477911178459028, + "grad_norm": 0.36328125, + "learning_rate": 4.9586278697313636e-05, + "loss": 0.1193, + "mean_token_accuracy": 0.9705138206481934, + "num_tokens": 44605995.0, + "step": 19210 + }, + { + "entropy": 0.07151696030050517, + "epoch": 4.47907681547966, + "grad_norm": 6.6875, + "learning_rate": 4.958588914880883e-05, + "loss": 0.0852, + "mean_token_accuracy": 0.9804966628551484, + "num_tokens": 44635863.0, + "step": 19215 + }, + { + "entropy": 0.08933782912790775, + "epoch": 4.480242452500291, + "grad_norm": 2.4375, + "learning_rate": 4.958549942008524e-05, + "loss": 0.1262, + "mean_token_accuracy": 0.9740311861038208, + "num_tokens": 44650230.0, + "step": 19220 + }, + { + "entropy": 0.09002497904002667, + "epoch": 4.481408089520923, + "grad_norm": 2.53125, + "learning_rate": 4.958510951114868e-05, + "loss": 0.0666, + "mean_token_accuracy": 0.9774248003959656, + "num_tokens": 44668524.0, + "step": 19225 + }, + { + "entropy": 0.0647845333442092, + "epoch": 4.482573726541555, + "grad_norm": 1.34375, + "learning_rate": 4.9584719422004945e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.9832403361797333, + "num_tokens": 44694794.0, + "step": 19230 + }, + { + "entropy": 0.0654996738769114, + "epoch": 4.4837393635621865, + "grad_norm": 0.62109375, + "learning_rate": 4.958432915265988e-05, + "loss": 0.0382, + "mean_token_accuracy": 0.9838578164577484, + "num_tokens": 44720301.0, + "step": 19235 + }, + { + "entropy": 0.0657532449811697, + "epoch": 4.484905000582819, + "grad_norm": 0.83984375, + "learning_rate": 4.958393870311929e-05, + "loss": 0.0597, + "mean_token_accuracy": 0.980295842885971, + "num_tokens": 44734794.0, + "step": 19240 + }, + { + "entropy": 0.08420066647231579, + "epoch": 4.486070637603451, + "grad_norm": 3.671875, + "learning_rate": 4.958354807338899e-05, + "loss": 0.1104, + "mean_token_accuracy": 0.9734029173851013, + "num_tokens": 44745211.0, + "step": 19245 + }, + { + "entropy": 0.0867409948259592, + "epoch": 4.487236274624082, + "grad_norm": 2.9375, + "learning_rate": 4.958315726347482e-05, + "loss": 0.0558, + "mean_token_accuracy": 0.9785344064235687, + "num_tokens": 44770497.0, + "step": 19250 + }, + { + "entropy": 0.08636245997622609, + "epoch": 4.488401911644714, + "grad_norm": 0.2041015625, + "learning_rate": 4.9582766273382604e-05, + "loss": 0.1164, + "mean_token_accuracy": 0.9738179206848144, + "num_tokens": 44783268.0, + "step": 19255 + }, + { + "entropy": 0.07308413162827491, + "epoch": 4.489567548665345, + "grad_norm": 1.9296875, + "learning_rate": 4.958237510311817e-05, + "loss": 0.0694, + "mean_token_accuracy": 0.9750112354755401, + "num_tokens": 44804127.0, + "step": 19260 + }, + { + "entropy": 0.0728783905506134, + "epoch": 4.490733185685977, + "grad_norm": 0.5390625, + "learning_rate": 4.958198375268734e-05, + "loss": 0.0839, + "mean_token_accuracy": 0.9792826414108277, + "num_tokens": 44823592.0, + "step": 19265 + }, + { + "entropy": 0.0877895756624639, + "epoch": 4.4918988227066095, + "grad_norm": 2.21875, + "learning_rate": 4.9581592222095974e-05, + "loss": 0.1242, + "mean_token_accuracy": 0.9677175223827362, + "num_tokens": 44841262.0, + "step": 19270 + }, + { + "entropy": 0.08363472241908312, + "epoch": 4.493064459727241, + "grad_norm": 2.046875, + "learning_rate": 4.9581200511349886e-05, + "loss": 0.1032, + "mean_token_accuracy": 0.97632537484169, + "num_tokens": 44852453.0, + "step": 19275 + }, + { + "entropy": 0.07496081218123436, + "epoch": 4.494230096747873, + "grad_norm": 0.921875, + "learning_rate": 4.9580808620454934e-05, + "loss": 0.0769, + "mean_token_accuracy": 0.9786527574062347, + "num_tokens": 44865486.0, + "step": 19280 + }, + { + "entropy": 0.04869019603356719, + "epoch": 4.495395733768505, + "grad_norm": 0.4140625, + "learning_rate": 4.9580416549416945e-05, + "loss": 0.0513, + "mean_token_accuracy": 0.9858419060707092, + "num_tokens": 44892406.0, + "step": 19285 + }, + { + "entropy": 0.08506630435585975, + "epoch": 4.496561370789136, + "grad_norm": 1.8359375, + "learning_rate": 4.958002429824179e-05, + "loss": 0.0807, + "mean_token_accuracy": 0.9732596397399902, + "num_tokens": 44904489.0, + "step": 19290 + }, + { + "entropy": 0.05554007384926081, + "epoch": 4.497727007809768, + "grad_norm": 0.80078125, + "learning_rate": 4.957963186693529e-05, + "loss": 0.0315, + "mean_token_accuracy": 0.9892236590385437, + "num_tokens": 44936231.0, + "step": 19295 + }, + { + "entropy": 0.07873046463355422, + "epoch": 4.498892644830399, + "grad_norm": 2.359375, + "learning_rate": 4.9579239255503316e-05, + "loss": 0.068, + "mean_token_accuracy": 0.9715416669845581, + "num_tokens": 44961471.0, + "step": 19300 + }, + { + "entropy": 0.07022126615047455, + "epoch": 4.5000582818510315, + "grad_norm": 0.7421875, + "learning_rate": 4.957884646395171e-05, + "loss": 0.0775, + "mean_token_accuracy": 0.9786169409751893, + "num_tokens": 44975004.0, + "step": 19305 + }, + { + "entropy": 0.07414238564670086, + "epoch": 4.501223918871664, + "grad_norm": 1.3359375, + "learning_rate": 4.9578453492286345e-05, + "loss": 0.1074, + "mean_token_accuracy": 0.9741037786006927, + "num_tokens": 44986316.0, + "step": 19310 + }, + { + "entropy": 0.06832624040544033, + "epoch": 4.502389555892295, + "grad_norm": 0.490234375, + "learning_rate": 4.957806034051307e-05, + "loss": 0.0715, + "mean_token_accuracy": 0.9820472538471222, + "num_tokens": 45025979.0, + "step": 19315 + }, + { + "entropy": 0.06494354400783778, + "epoch": 4.503555192912927, + "grad_norm": 0.54296875, + "learning_rate": 4.957766700863774e-05, + "loss": 0.0804, + "mean_token_accuracy": 0.9823660254478455, + "num_tokens": 45042042.0, + "step": 19320 + }, + { + "entropy": 0.08731096163392067, + "epoch": 4.504720829933559, + "grad_norm": 4.5, + "learning_rate": 4.9577273496666236e-05, + "loss": 0.1369, + "mean_token_accuracy": 0.9645826160907746, + "num_tokens": 45052645.0, + "step": 19325 + }, + { + "entropy": 0.06000461746007204, + "epoch": 4.50588646695419, + "grad_norm": 0.408203125, + "learning_rate": 4.957687980460442e-05, + "loss": 0.0751, + "mean_token_accuracy": 0.9814544498920441, + "num_tokens": 45086655.0, + "step": 19330 + }, + { + "entropy": 0.056810198538005355, + "epoch": 4.507052103974822, + "grad_norm": 0.68359375, + "learning_rate": 4.957648593245816e-05, + "loss": 0.0602, + "mean_token_accuracy": 0.9821222722530365, + "num_tokens": 45103298.0, + "step": 19335 + }, + { + "entropy": 0.10935597391799093, + "epoch": 4.508217740995454, + "grad_norm": 2.609375, + "learning_rate": 4.9576091880233335e-05, + "loss": 0.148, + "mean_token_accuracy": 0.965090674161911, + "num_tokens": 45126200.0, + "step": 19340 + }, + { + "entropy": 0.06502020470798016, + "epoch": 4.509383378016086, + "grad_norm": 0.337890625, + "learning_rate": 4.957569764793581e-05, + "loss": 0.0791, + "mean_token_accuracy": 0.9803296446800231, + "num_tokens": 45149472.0, + "step": 19345 + }, + { + "entropy": 0.07320923134684562, + "epoch": 4.510549015036718, + "grad_norm": 2.375, + "learning_rate": 4.957530323557147e-05, + "loss": 0.0541, + "mean_token_accuracy": 0.9775784730911254, + "num_tokens": 45184300.0, + "step": 19350 + }, + { + "entropy": 0.07345627807080746, + "epoch": 4.511714652057349, + "grad_norm": 3.171875, + "learning_rate": 4.95749086431462e-05, + "loss": 0.1122, + "mean_token_accuracy": 0.972129487991333, + "num_tokens": 45194944.0, + "step": 19355 + }, + { + "entropy": 0.08000601641833782, + "epoch": 4.512880289077981, + "grad_norm": 5.4375, + "learning_rate": 4.957451387066588e-05, + "loss": 0.106, + "mean_token_accuracy": 0.9739298522472382, + "num_tokens": 45204799.0, + "step": 19360 + }, + { + "entropy": 0.08380871191620827, + "epoch": 4.514045926098613, + "grad_norm": 3.28125, + "learning_rate": 4.957411891813639e-05, + "loss": 0.1116, + "mean_token_accuracy": 0.9741986215114593, + "num_tokens": 45222726.0, + "step": 19365 + }, + { + "entropy": 0.06374263260513544, + "epoch": 4.515211563119244, + "grad_norm": 1.9140625, + "learning_rate": 4.9573723785563636e-05, + "loss": 0.0625, + "mean_token_accuracy": 0.9786804854869843, + "num_tokens": 45242007.0, + "step": 19370 + }, + { + "entropy": 0.07915383875370026, + "epoch": 4.5163772001398765, + "grad_norm": 1.2734375, + "learning_rate": 4.9573328472953496e-05, + "loss": 0.0798, + "mean_token_accuracy": 0.9795429646968842, + "num_tokens": 45259231.0, + "step": 19375 + }, + { + "entropy": 0.07513550948351622, + "epoch": 4.517542837160509, + "grad_norm": 2.328125, + "learning_rate": 4.957293298031187e-05, + "loss": 0.0564, + "mean_token_accuracy": 0.9819276630878448, + "num_tokens": 45276816.0, + "step": 19380 + }, + { + "entropy": 0.08630263023078441, + "epoch": 4.51870847418114, + "grad_norm": 2.0, + "learning_rate": 4.957253730764466e-05, + "loss": 0.1117, + "mean_token_accuracy": 0.9731955528259277, + "num_tokens": 45286266.0, + "step": 19385 + }, + { + "entropy": 0.07398798689246178, + "epoch": 4.519874111201772, + "grad_norm": 3.96875, + "learning_rate": 4.957214145495775e-05, + "loss": 0.0862, + "mean_token_accuracy": 0.9772976040840149, + "num_tokens": 45304714.0, + "step": 19390 + }, + { + "entropy": 0.06771981194615365, + "epoch": 4.521039748222403, + "grad_norm": 2.0, + "learning_rate": 4.957174542225706e-05, + "loss": 0.0927, + "mean_token_accuracy": 0.9755495727062226, + "num_tokens": 45320062.0, + "step": 19395 + }, + { + "entropy": 0.08253844156861305, + "epoch": 4.522205385243035, + "grad_norm": 0.62109375, + "learning_rate": 4.957134920954849e-05, + "loss": 0.0593, + "mean_token_accuracy": 0.9802878797054291, + "num_tokens": 45344591.0, + "step": 19400 + }, + { + "entropy": 0.06176201160997152, + "epoch": 4.523371022263667, + "grad_norm": 3.78125, + "learning_rate": 4.957095281683794e-05, + "loss": 0.0679, + "mean_token_accuracy": 0.9821565389633179, + "num_tokens": 45362801.0, + "step": 19405 + }, + { + "entropy": 0.08072059694677591, + "epoch": 4.524536659284299, + "grad_norm": 1.171875, + "learning_rate": 4.957055624413134e-05, + "loss": 0.0863, + "mean_token_accuracy": 0.9759260535240173, + "num_tokens": 45378106.0, + "step": 19410 + }, + { + "entropy": 0.062263531237840654, + "epoch": 4.525702296304931, + "grad_norm": 3.125, + "learning_rate": 4.957015949143458e-05, + "loss": 0.0757, + "mean_token_accuracy": 0.9812383353710175, + "num_tokens": 45403774.0, + "step": 19415 + }, + { + "entropy": 0.09101590849459171, + "epoch": 4.526867933325562, + "grad_norm": 2.09375, + "learning_rate": 4.956976255875359e-05, + "loss": 0.1214, + "mean_token_accuracy": 0.9659562647342682, + "num_tokens": 45412925.0, + "step": 19420 + }, + { + "entropy": 0.062494827434420586, + "epoch": 4.528033570346194, + "grad_norm": 1.6875, + "learning_rate": 4.956936544609429e-05, + "loss": 0.0654, + "mean_token_accuracy": 0.9782341003417969, + "num_tokens": 45430972.0, + "step": 19425 + }, + { + "entropy": 0.06619368139654398, + "epoch": 4.529199207366826, + "grad_norm": 0.5625, + "learning_rate": 4.956896815346259e-05, + "loss": 0.0622, + "mean_token_accuracy": 0.9839639365673065, + "num_tokens": 45459186.0, + "step": 19430 + }, + { + "entropy": 0.06992116756737232, + "epoch": 4.530364844387458, + "grad_norm": 2.515625, + "learning_rate": 4.956857068086443e-05, + "loss": 0.1111, + "mean_token_accuracy": 0.9732448756694794, + "num_tokens": 45477375.0, + "step": 19435 + }, + { + "entropy": 0.08637938443571329, + "epoch": 4.5315304814080895, + "grad_norm": 4.0625, + "learning_rate": 4.9568173028305724e-05, + "loss": 0.0684, + "mean_token_accuracy": 0.9800038695335388, + "num_tokens": 45502943.0, + "step": 19440 + }, + { + "entropy": 0.05814673639833927, + "epoch": 4.5326961184287216, + "grad_norm": 0.8125, + "learning_rate": 4.956777519579241e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.9838175058364869, + "num_tokens": 45518792.0, + "step": 19445 + }, + { + "entropy": 0.07371747437864543, + "epoch": 4.533861755449353, + "grad_norm": 6.6875, + "learning_rate": 4.956737718333042e-05, + "loss": 0.077, + "mean_token_accuracy": 0.98111212849617, + "num_tokens": 45540871.0, + "step": 19450 + }, + { + "entropy": 0.07126566041260958, + "epoch": 4.535027392469985, + "grad_norm": 3.390625, + "learning_rate": 4.9566978990925686e-05, + "loss": 0.0665, + "mean_token_accuracy": 0.9774880886077881, + "num_tokens": 45563615.0, + "step": 19455 + }, + { + "entropy": 0.07104693334549665, + "epoch": 4.536193029490617, + "grad_norm": 2.125, + "learning_rate": 4.9566580618584136e-05, + "loss": 0.0682, + "mean_token_accuracy": 0.9758515536785126, + "num_tokens": 45594215.0, + "step": 19460 + }, + { + "entropy": 0.07539936387911439, + "epoch": 4.537358666511248, + "grad_norm": 0.578125, + "learning_rate": 4.956618206631172e-05, + "loss": 0.0955, + "mean_token_accuracy": 0.9738450884819031, + "num_tokens": 45620724.0, + "step": 19465 + }, + { + "entropy": 0.091930534504354, + "epoch": 4.53852430353188, + "grad_norm": 5.78125, + "learning_rate": 4.956578333411439e-05, + "loss": 0.0978, + "mean_token_accuracy": 0.9745730400085449, + "num_tokens": 45631916.0, + "step": 19470 + }, + { + "entropy": 0.08296403437852859, + "epoch": 4.5396899405525115, + "grad_norm": 1.875, + "learning_rate": 4.956538442199808e-05, + "loss": 0.0521, + "mean_token_accuracy": 0.9824115693569183, + "num_tokens": 45649779.0, + "step": 19475 + }, + { + "entropy": 0.07108269976451993, + "epoch": 4.540855577573144, + "grad_norm": 1.7734375, + "learning_rate": 4.956498532996874e-05, + "loss": 0.0539, + "mean_token_accuracy": 0.9788315117359161, + "num_tokens": 45674396.0, + "step": 19480 + }, + { + "entropy": 0.11804137602448464, + "epoch": 4.542021214593776, + "grad_norm": 1.59375, + "learning_rate": 4.956458605803232e-05, + "loss": 0.138, + "mean_token_accuracy": 0.9672506809234619, + "num_tokens": 45701462.0, + "step": 19485 + }, + { + "entropy": 0.06310545764863491, + "epoch": 4.543186851614407, + "grad_norm": 1.2265625, + "learning_rate": 4.956418660619477e-05, + "loss": 0.0675, + "mean_token_accuracy": 0.9810591280460358, + "num_tokens": 45712067.0, + "step": 19490 + }, + { + "entropy": 0.06863165087997913, + "epoch": 4.544352488635039, + "grad_norm": 0.72265625, + "learning_rate": 4.9563786974462064e-05, + "loss": 0.0905, + "mean_token_accuracy": 0.97963907122612, + "num_tokens": 45727858.0, + "step": 19495 + }, + { + "entropy": 0.0643789792433381, + "epoch": 4.545518125655671, + "grad_norm": 1.4453125, + "learning_rate": 4.9563387162840144e-05, + "loss": 0.0696, + "mean_token_accuracy": 0.9800073564052582, + "num_tokens": 45740208.0, + "step": 19500 + }, + { + "entropy": 0.06495386781170964, + "epoch": 4.546683762676302, + "grad_norm": 3.109375, + "learning_rate": 4.9562987171334976e-05, + "loss": 0.0618, + "mean_token_accuracy": 0.9782070398330689, + "num_tokens": 45764350.0, + "step": 19505 + }, + { + "entropy": 0.0650036720559001, + "epoch": 4.5478493996969345, + "grad_norm": 2.9375, + "learning_rate": 4.956258699995253e-05, + "loss": 0.0644, + "mean_token_accuracy": 0.9787128925323486, + "num_tokens": 45786544.0, + "step": 19510 + }, + { + "entropy": 0.12152950577437878, + "epoch": 4.549015036717567, + "grad_norm": 3.84375, + "learning_rate": 4.956218664869876e-05, + "loss": 0.2017, + "mean_token_accuracy": 0.9524850487709046, + "num_tokens": 45809166.0, + "step": 19515 + }, + { + "entropy": 0.0705368846654892, + "epoch": 4.550180673738198, + "grad_norm": 2.640625, + "learning_rate": 4.956178611757966e-05, + "loss": 0.093, + "mean_token_accuracy": 0.9743614614009857, + "num_tokens": 45820714.0, + "step": 19520 + }, + { + "entropy": 0.04608556115999818, + "epoch": 4.55134631075883, + "grad_norm": 0.78125, + "learning_rate": 4.9561385406601167e-05, + "loss": 0.0247, + "mean_token_accuracy": 0.9896496951580047, + "num_tokens": 45845944.0, + "step": 19525 + }, + { + "entropy": 0.07118664849549532, + "epoch": 4.552511947779461, + "grad_norm": 0.65625, + "learning_rate": 4.956098451576929e-05, + "loss": 0.1066, + "mean_token_accuracy": 0.970186847448349, + "num_tokens": 45860384.0, + "step": 19530 + }, + { + "entropy": 0.035560993710532784, + "epoch": 4.553677584800093, + "grad_norm": 0.59375, + "learning_rate": 4.956058344508999e-05, + "loss": 0.0216, + "mean_token_accuracy": 0.9867688715457916, + "num_tokens": 45896561.0, + "step": 19535 + }, + { + "entropy": 0.07229228690266609, + "epoch": 4.554843221820725, + "grad_norm": 0.8359375, + "learning_rate": 4.9560182194569246e-05, + "loss": 0.0968, + "mean_token_accuracy": 0.9724124073982239, + "num_tokens": 45916494.0, + "step": 19540 + }, + { + "entropy": 0.05498791430145502, + "epoch": 4.5560088588413565, + "grad_norm": 1.0703125, + "learning_rate": 4.955978076421305e-05, + "loss": 0.0327, + "mean_token_accuracy": 0.9827622950077057, + "num_tokens": 45937511.0, + "step": 19545 + }, + { + "entropy": 0.08500229343771934, + "epoch": 4.557174495861989, + "grad_norm": 3.5, + "learning_rate": 4.9559379154027386e-05, + "loss": 0.1174, + "mean_token_accuracy": 0.9725946724414826, + "num_tokens": 45948842.0, + "step": 19550 + }, + { + "entropy": 0.06914058709517121, + "epoch": 4.55834013288262, + "grad_norm": 0.4921875, + "learning_rate": 4.955897736401824e-05, + "loss": 0.0767, + "mean_token_accuracy": 0.9804995000362396, + "num_tokens": 45964465.0, + "step": 19555 + }, + { + "entropy": 0.048480591550469396, + "epoch": 4.559505769903252, + "grad_norm": 1.1171875, + "learning_rate": 4.9558575394191605e-05, + "loss": 0.0468, + "mean_token_accuracy": 0.987060832977295, + "num_tokens": 45998617.0, + "step": 19560 + }, + { + "entropy": 0.067578933108598, + "epoch": 4.560671406923884, + "grad_norm": 3.296875, + "learning_rate": 4.955817324455347e-05, + "loss": 0.0586, + "mean_token_accuracy": 0.9790487349033355, + "num_tokens": 46022443.0, + "step": 19565 + }, + { + "entropy": 0.08472463395446539, + "epoch": 4.561837043944516, + "grad_norm": 4.625, + "learning_rate": 4.955777091510984e-05, + "loss": 0.1074, + "mean_token_accuracy": 0.9704361200332642, + "num_tokens": 46035920.0, + "step": 19570 + }, + { + "entropy": 0.06885777739807963, + "epoch": 4.563002680965147, + "grad_norm": 7.25, + "learning_rate": 4.95573684058667e-05, + "loss": 0.0704, + "mean_token_accuracy": 0.9791424155235291, + "num_tokens": 46058228.0, + "step": 19575 + }, + { + "entropy": 0.09059565905481577, + "epoch": 4.5641683179857795, + "grad_norm": 7.84375, + "learning_rate": 4.955696571683007e-05, + "loss": 0.0842, + "mean_token_accuracy": 0.9713367521762848, + "num_tokens": 46095283.0, + "step": 19580 + }, + { + "entropy": 0.06202486571855843, + "epoch": 4.565333955006411, + "grad_norm": 1.9765625, + "learning_rate": 4.9556562848005946e-05, + "loss": 0.0633, + "mean_token_accuracy": 0.9819358885288239, + "num_tokens": 46127335.0, + "step": 19585 + }, + { + "entropy": 0.047707998938858506, + "epoch": 4.566499592027043, + "grad_norm": 5.3125, + "learning_rate": 4.9556159799400334e-05, + "loss": 0.0512, + "mean_token_accuracy": 0.981537401676178, + "num_tokens": 46157178.0, + "step": 19590 + }, + { + "entropy": 0.06663997173309326, + "epoch": 4.567665229047675, + "grad_norm": 5.875, + "learning_rate": 4.955575657101924e-05, + "loss": 0.1047, + "mean_token_accuracy": 0.9750019073486328, + "num_tokens": 46168435.0, + "step": 19595 + }, + { + "entropy": 0.06492017675191164, + "epoch": 4.568830866068306, + "grad_norm": 1.046875, + "learning_rate": 4.955535316286869e-05, + "loss": 0.0837, + "mean_token_accuracy": 0.9799909651279449, + "num_tokens": 46183144.0, + "step": 19600 + }, + { + "entropy": 0.06846091412007808, + "epoch": 4.569996503088938, + "grad_norm": 1.3125, + "learning_rate": 4.955494957495469e-05, + "loss": 0.0861, + "mean_token_accuracy": 0.9765637576580047, + "num_tokens": 46208042.0, + "step": 19605 + }, + { + "entropy": 0.08362425286322832, + "epoch": 4.5711621401095694, + "grad_norm": 1.2890625, + "learning_rate": 4.955454580728327e-05, + "loss": 0.0351, + "mean_token_accuracy": 0.9863261938095093, + "num_tokens": 46242450.0, + "step": 19610 + }, + { + "entropy": 0.0774501122534275, + "epoch": 4.5723277771302016, + "grad_norm": 1.265625, + "learning_rate": 4.955414185986043e-05, + "loss": 0.0665, + "mean_token_accuracy": 0.9804924190044403, + "num_tokens": 46256250.0, + "step": 19615 + }, + { + "entropy": 0.08892765715718269, + "epoch": 4.573493414150834, + "grad_norm": 6.15625, + "learning_rate": 4.95537377326922e-05, + "loss": 0.0906, + "mean_token_accuracy": 0.9789716601371765, + "num_tokens": 46265601.0, + "step": 19620 + }, + { + "entropy": 0.05659576002508402, + "epoch": 4.574659051171465, + "grad_norm": 0.5, + "learning_rate": 4.955333342578462e-05, + "loss": 0.0825, + "mean_token_accuracy": 0.9786292016506195, + "num_tokens": 46284258.0, + "step": 19625 + }, + { + "entropy": 0.05378645788878202, + "epoch": 4.575824688192097, + "grad_norm": 0.42578125, + "learning_rate": 4.955292893914371e-05, + "loss": 0.0686, + "mean_token_accuracy": 0.9818499684333801, + "num_tokens": 46306611.0, + "step": 19630 + }, + { + "entropy": 0.08624094808474184, + "epoch": 4.576990325212729, + "grad_norm": 4.65625, + "learning_rate": 4.955252427277549e-05, + "loss": 0.0798, + "mean_token_accuracy": 0.9779546380043029, + "num_tokens": 46325031.0, + "step": 19635 + }, + { + "entropy": 0.06809303583577275, + "epoch": 4.57815596223336, + "grad_norm": 0.88671875, + "learning_rate": 4.955211942668602e-05, + "loss": 0.0799, + "mean_token_accuracy": 0.9798089146614075, + "num_tokens": 46343106.0, + "step": 19640 + }, + { + "entropy": 0.05596675332635641, + "epoch": 4.579321599253992, + "grad_norm": 4.0625, + "learning_rate": 4.955171440088131e-05, + "loss": 0.0629, + "mean_token_accuracy": 0.9851307094097137, + "num_tokens": 46369488.0, + "step": 19645 + }, + { + "entropy": 0.05978215290233493, + "epoch": 4.5804872362746245, + "grad_norm": 3.0625, + "learning_rate": 4.955130919536741e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.983077323436737, + "num_tokens": 46398374.0, + "step": 19650 + }, + { + "entropy": 0.07102181990630925, + "epoch": 4.581652873295256, + "grad_norm": 1.3828125, + "learning_rate": 4.955090381015037e-05, + "loss": 0.0563, + "mean_token_accuracy": 0.9782206952571869, + "num_tokens": 46417471.0, + "step": 19655 + }, + { + "entropy": 0.08769596517086028, + "epoch": 4.582818510315888, + "grad_norm": 3.5, + "learning_rate": 4.955049824523623e-05, + "loss": 0.1106, + "mean_token_accuracy": 0.9734553158283233, + "num_tokens": 46428623.0, + "step": 19660 + }, + { + "entropy": 0.07133472822606564, + "epoch": 4.583984147336519, + "grad_norm": 5.75, + "learning_rate": 4.9550092500631034e-05, + "loss": 0.0735, + "mean_token_accuracy": 0.9771177768707275, + "num_tokens": 46451312.0, + "step": 19665 + }, + { + "entropy": 0.05361058982089162, + "epoch": 4.585149784357151, + "grad_norm": 1.7734375, + "learning_rate": 4.9549686576340834e-05, + "loss": 0.0456, + "mean_token_accuracy": 0.9819006502628327, + "num_tokens": 46484205.0, + "step": 19670 + }, + { + "entropy": 0.1158721529878676, + "epoch": 4.586315421377783, + "grad_norm": 1.84375, + "learning_rate": 4.954928047237168e-05, + "loss": 0.1289, + "mean_token_accuracy": 0.9689590871334076, + "num_tokens": 46511834.0, + "step": 19675 + }, + { + "entropy": 0.06347185205668211, + "epoch": 4.5874810583984145, + "grad_norm": 1.03125, + "learning_rate": 4.9548874188729645e-05, + "loss": 0.0572, + "mean_token_accuracy": 0.9769329488277435, + "num_tokens": 46532517.0, + "step": 19680 + }, + { + "entropy": 0.15378685537725686, + "epoch": 4.588646695419047, + "grad_norm": 1.4453125, + "learning_rate": 4.954846772542076e-05, + "loss": 0.1941, + "mean_token_accuracy": 0.9541963636875153, + "num_tokens": 46547581.0, + "step": 19685 + }, + { + "entropy": 0.04796793041750789, + "epoch": 4.589812332439678, + "grad_norm": 1.3671875, + "learning_rate": 4.95480610824511e-05, + "loss": 0.0561, + "mean_token_accuracy": 0.9836558759212494, + "num_tokens": 46571459.0, + "step": 19690 + }, + { + "entropy": 0.08416934944689274, + "epoch": 4.59097796946031, + "grad_norm": 0.78515625, + "learning_rate": 4.9547654259826734e-05, + "loss": 0.0743, + "mean_token_accuracy": 0.9806716084480286, + "num_tokens": 46591900.0, + "step": 19695 + }, + { + "entropy": 0.07612838819622994, + "epoch": 4.592143606480942, + "grad_norm": 3.828125, + "learning_rate": 4.954724725755372e-05, + "loss": 0.0916, + "mean_token_accuracy": 0.9744833171367645, + "num_tokens": 46606849.0, + "step": 19700 + }, + { + "entropy": 0.05052490308880806, + "epoch": 4.593309243501574, + "grad_norm": 1.15625, + "learning_rate": 4.954684007563813e-05, + "loss": 0.039, + "mean_token_accuracy": 0.9903412938117981, + "num_tokens": 46626272.0, + "step": 19705 + }, + { + "entropy": 0.06455773105844856, + "epoch": 4.594474880522205, + "grad_norm": 2.53125, + "learning_rate": 4.9546432714086035e-05, + "loss": 0.0762, + "mean_token_accuracy": 0.9837546706199646, + "num_tokens": 46652485.0, + "step": 19710 + }, + { + "entropy": 0.06334900464862585, + "epoch": 4.595640517542837, + "grad_norm": 0.326171875, + "learning_rate": 4.9546025172903505e-05, + "loss": 0.0846, + "mean_token_accuracy": 0.9812196731567383, + "num_tokens": 46676087.0, + "step": 19715 + }, + { + "entropy": 0.04339192071929574, + "epoch": 4.596806154563469, + "grad_norm": 0.392578125, + "learning_rate": 4.954561745209662e-05, + "loss": 0.0296, + "mean_token_accuracy": 0.986208838224411, + "num_tokens": 46728355.0, + "step": 19720 + }, + { + "entropy": 0.09071781933307647, + "epoch": 4.597971791584101, + "grad_norm": 1.3359375, + "learning_rate": 4.954520955167147e-05, + "loss": 0.1165, + "mean_token_accuracy": 0.9729561984539032, + "num_tokens": 46737441.0, + "step": 19725 + }, + { + "entropy": 0.08092315215617418, + "epoch": 4.599137428604733, + "grad_norm": 6.46875, + "learning_rate": 4.954480147163412e-05, + "loss": 0.0779, + "mean_token_accuracy": 0.9785025775432586, + "num_tokens": 46754296.0, + "step": 19730 + }, + { + "entropy": 0.08478829376399517, + "epoch": 4.600303065625364, + "grad_norm": 0.78515625, + "learning_rate": 4.954439321199067e-05, + "loss": 0.1018, + "mean_token_accuracy": 0.9742565453052521, + "num_tokens": 46775708.0, + "step": 19735 + }, + { + "entropy": 0.07357489094138145, + "epoch": 4.601468702645996, + "grad_norm": 1.609375, + "learning_rate": 4.95439847727472e-05, + "loss": 0.1158, + "mean_token_accuracy": 0.9726702034473419, + "num_tokens": 46794542.0, + "step": 19740 + }, + { + "entropy": 0.07723245853558183, + "epoch": 4.602634339666627, + "grad_norm": 0.32421875, + "learning_rate": 4.95435761539098e-05, + "loss": 0.0923, + "mean_token_accuracy": 0.97293501496315, + "num_tokens": 46811394.0, + "step": 19745 + }, + { + "entropy": 0.06583732040598989, + "epoch": 4.6037999766872595, + "grad_norm": 0.419921875, + "learning_rate": 4.954316735548456e-05, + "loss": 0.0813, + "mean_token_accuracy": 0.979914003610611, + "num_tokens": 46829430.0, + "step": 19750 + }, + { + "entropy": 0.0757858750410378, + "epoch": 4.604965613707892, + "grad_norm": 6.0, + "learning_rate": 4.954275837747759e-05, + "loss": 0.0785, + "mean_token_accuracy": 0.9767545580863952, + "num_tokens": 46860533.0, + "step": 19755 + }, + { + "entropy": 0.05514599541202188, + "epoch": 4.606131250728523, + "grad_norm": 4.96875, + "learning_rate": 4.954234921989498e-05, + "loss": 0.083, + "mean_token_accuracy": 0.9799856185913086, + "num_tokens": 46896620.0, + "step": 19760 + }, + { + "entropy": 0.060221548471599815, + "epoch": 4.607296887749155, + "grad_norm": 1.203125, + "learning_rate": 4.954193988274282e-05, + "loss": 0.0673, + "mean_token_accuracy": 0.9807502627372742, + "num_tokens": 46916848.0, + "step": 19765 + }, + { + "entropy": 0.06733870087191463, + "epoch": 4.608462524769787, + "grad_norm": 4.625, + "learning_rate": 4.954153036602723e-05, + "loss": 0.0848, + "mean_token_accuracy": 0.976883488893509, + "num_tokens": 46933000.0, + "step": 19770 + }, + { + "entropy": 0.07344697508960962, + "epoch": 4.609628161790418, + "grad_norm": 1.875, + "learning_rate": 4.954112066975431e-05, + "loss": 0.0892, + "mean_token_accuracy": 0.9760464787483215, + "num_tokens": 46947494.0, + "step": 19775 + }, + { + "entropy": 0.3117006901651621, + "epoch": 4.61079379881105, + "grad_norm": 1.8203125, + "learning_rate": 4.9540710793930174e-05, + "loss": 0.4114, + "mean_token_accuracy": 0.9219294607639312, + "num_tokens": 46976816.0, + "step": 19780 + }, + { + "entropy": 0.07747607827186584, + "epoch": 4.611959435831682, + "grad_norm": 2.796875, + "learning_rate": 4.954030073856093e-05, + "loss": 0.1049, + "mean_token_accuracy": 0.9715212643146515, + "num_tokens": 46991104.0, + "step": 19785 + }, + { + "entropy": 0.06775341653265059, + "epoch": 4.613125072852314, + "grad_norm": 2.53125, + "learning_rate": 4.9539890503652684e-05, + "loss": 0.0557, + "mean_token_accuracy": 0.977581012248993, + "num_tokens": 47011058.0, + "step": 19790 + }, + { + "entropy": 0.06645539095625282, + "epoch": 4.614290709872946, + "grad_norm": 0.65234375, + "learning_rate": 4.953948008921157e-05, + "loss": 0.0174, + "mean_token_accuracy": 0.9802887737751007, + "num_tokens": 47052454.0, + "step": 19795 + }, + { + "entropy": 0.05235816705971956, + "epoch": 4.615456346893577, + "grad_norm": 2.734375, + "learning_rate": 4.9539069495243694e-05, + "loss": 0.0522, + "mean_token_accuracy": 0.9832578837871552, + "num_tokens": 47074809.0, + "step": 19800 + }, + { + "entropy": 0.04748070854693651, + "epoch": 4.616621983914209, + "grad_norm": 2.09375, + "learning_rate": 4.953865872175519e-05, + "loss": 0.0417, + "mean_token_accuracy": 0.9883626997470856, + "num_tokens": 47107110.0, + "step": 19805 + }, + { + "entropy": 0.08402322083711625, + "epoch": 4.617787620934841, + "grad_norm": 6.53125, + "learning_rate": 4.953824776875217e-05, + "loss": 0.1434, + "mean_token_accuracy": 0.9647410392761231, + "num_tokens": 47116318.0, + "step": 19810 + }, + { + "entropy": 0.08315610075369477, + "epoch": 4.618953257955472, + "grad_norm": 0.390625, + "learning_rate": 4.953783663624077e-05, + "loss": 0.0971, + "mean_token_accuracy": 0.9753001630306244, + "num_tokens": 47156487.0, + "step": 19815 + }, + { + "entropy": 0.06097006350755692, + "epoch": 4.6201188949761045, + "grad_norm": 4.4375, + "learning_rate": 4.953742532422713e-05, + "loss": 0.0812, + "mean_token_accuracy": 0.9757486581802368, + "num_tokens": 47182047.0, + "step": 19820 + }, + { + "entropy": 0.06232939455658197, + "epoch": 4.621284531996736, + "grad_norm": 1.5, + "learning_rate": 4.953701383271736e-05, + "loss": 0.0785, + "mean_token_accuracy": 0.9787400305271149, + "num_tokens": 47206737.0, + "step": 19825 + }, + { + "entropy": 0.07709433417767286, + "epoch": 4.622450169017368, + "grad_norm": 2.234375, + "learning_rate": 4.953660216171762e-05, + "loss": 0.1025, + "mean_token_accuracy": 0.9743568658828735, + "num_tokens": 47221486.0, + "step": 19830 + }, + { + "entropy": 0.07625074442476035, + "epoch": 4.623615806038, + "grad_norm": 2.9375, + "learning_rate": 4.953619031123403e-05, + "loss": 0.0868, + "mean_token_accuracy": 0.975380665063858, + "num_tokens": 47235041.0, + "step": 19835 + }, + { + "entropy": 0.06530127339065075, + "epoch": 4.624781443058632, + "grad_norm": 1.5078125, + "learning_rate": 4.953577828127274e-05, + "loss": 0.0443, + "mean_token_accuracy": 0.982153731584549, + "num_tokens": 47258958.0, + "step": 19840 + }, + { + "entropy": 0.08391881920397282, + "epoch": 4.625947080079263, + "grad_norm": 0.73828125, + "learning_rate": 4.9535366071839894e-05, + "loss": 0.0941, + "mean_token_accuracy": 0.9758387804031372, + "num_tokens": 47269042.0, + "step": 19845 + }, + { + "entropy": 0.08590067960321904, + "epoch": 4.627112717099895, + "grad_norm": 2.5, + "learning_rate": 4.953495368294164e-05, + "loss": 0.117, + "mean_token_accuracy": 0.9718995273113251, + "num_tokens": 47279024.0, + "step": 19850 + }, + { + "entropy": 0.08923376137390733, + "epoch": 4.628278354120527, + "grad_norm": 1.609375, + "learning_rate": 4.953454111458413e-05, + "loss": 0.0982, + "mean_token_accuracy": 0.970050984621048, + "num_tokens": 47294540.0, + "step": 19855 + }, + { + "entropy": 0.05908918278291821, + "epoch": 4.629443991141159, + "grad_norm": 0.396484375, + "learning_rate": 4.95341283667735e-05, + "loss": 0.0755, + "mean_token_accuracy": 0.9823239386081696, + "num_tokens": 47332055.0, + "step": 19860 + }, + { + "entropy": 0.041251880768686536, + "epoch": 4.630609628161791, + "grad_norm": 2.25, + "learning_rate": 4.9533715439515914e-05, + "loss": 0.0329, + "mean_token_accuracy": 0.9873129487037658, + "num_tokens": 47365297.0, + "step": 19865 + }, + { + "entropy": 0.0673784639686346, + "epoch": 4.631775265182422, + "grad_norm": 1.875, + "learning_rate": 4.953330233281754e-05, + "loss": 0.082, + "mean_token_accuracy": 0.9806714713573456, + "num_tokens": 47375480.0, + "step": 19870 + }, + { + "entropy": 0.06148081440478563, + "epoch": 4.632940902203054, + "grad_norm": 0.72265625, + "learning_rate": 4.953288904668453e-05, + "loss": 0.0695, + "mean_token_accuracy": 0.982768303155899, + "num_tokens": 47401613.0, + "step": 19875 + }, + { + "entropy": 0.06011179555207491, + "epoch": 4.634106539223685, + "grad_norm": 2.078125, + "learning_rate": 4.953247558112304e-05, + "loss": 0.051, + "mean_token_accuracy": 0.9858065843582153, + "num_tokens": 47428467.0, + "step": 19880 + }, + { + "entropy": 0.08135449420660734, + "epoch": 4.635272176244317, + "grad_norm": 1.3203125, + "learning_rate": 4.953206193613924e-05, + "loss": 0.1072, + "mean_token_accuracy": 0.9746085166931152, + "num_tokens": 47458428.0, + "step": 19885 + }, + { + "entropy": 0.039868217147886756, + "epoch": 4.6364378132649495, + "grad_norm": 0.96875, + "learning_rate": 4.95316481117393e-05, + "loss": 0.0313, + "mean_token_accuracy": 0.9860462665557861, + "num_tokens": 47484331.0, + "step": 19890 + }, + { + "entropy": 0.09878461733460427, + "epoch": 4.637603450285581, + "grad_norm": 7.78125, + "learning_rate": 4.9531234107929396e-05, + "loss": 0.1179, + "mean_token_accuracy": 0.9705485343933106, + "num_tokens": 47494448.0, + "step": 19895 + }, + { + "entropy": 0.043826198158785704, + "epoch": 4.638769087306213, + "grad_norm": 3.53125, + "learning_rate": 4.9530819924715696e-05, + "loss": 0.0339, + "mean_token_accuracy": 0.9882486462593079, + "num_tokens": 47530441.0, + "step": 19900 + }, + { + "entropy": 0.07388420086354017, + "epoch": 4.639934724326845, + "grad_norm": 2.390625, + "learning_rate": 4.953040556210437e-05, + "loss": 0.081, + "mean_token_accuracy": 0.9756742894649506, + "num_tokens": 47546029.0, + "step": 19905 + }, + { + "entropy": 0.06827450687997043, + "epoch": 4.641100361347476, + "grad_norm": 0.341796875, + "learning_rate": 4.95299910201016e-05, + "loss": 0.0461, + "mean_token_accuracy": 0.97547847032547, + "num_tokens": 47588347.0, + "step": 19910 + }, + { + "entropy": 0.05796924643218517, + "epoch": 4.642265998368108, + "grad_norm": 0.5234375, + "learning_rate": 4.952957629871358e-05, + "loss": 0.0663, + "mean_token_accuracy": 0.9812373161315918, + "num_tokens": 47607383.0, + "step": 19915 + }, + { + "entropy": 0.050525398924946785, + "epoch": 4.64343163538874, + "grad_norm": 1.2265625, + "learning_rate": 4.952916139794648e-05, + "loss": 0.0771, + "mean_token_accuracy": 0.9827237486839294, + "num_tokens": 47622336.0, + "step": 19920 + }, + { + "entropy": 0.07169684544205665, + "epoch": 4.644597272409372, + "grad_norm": 1.0625, + "learning_rate": 4.952874631780648e-05, + "loss": 0.0782, + "mean_token_accuracy": 0.9775870382785797, + "num_tokens": 47645151.0, + "step": 19925 + }, + { + "entropy": 0.06724204597994685, + "epoch": 4.645762909430004, + "grad_norm": 7.375, + "learning_rate": 4.952833105829979e-05, + "loss": 0.0897, + "mean_token_accuracy": 0.9776890814304352, + "num_tokens": 47666927.0, + "step": 19930 + }, + { + "entropy": 0.06391678284853697, + "epoch": 4.646928546450635, + "grad_norm": 0.93359375, + "learning_rate": 4.952791561943259e-05, + "loss": 0.0661, + "mean_token_accuracy": 0.984687602519989, + "num_tokens": 47684001.0, + "step": 19935 + }, + { + "entropy": 0.09616228733211755, + "epoch": 4.648094183471267, + "grad_norm": 1.375, + "learning_rate": 4.952750000121108e-05, + "loss": 0.1775, + "mean_token_accuracy": 0.9602535605430603, + "num_tokens": 47693787.0, + "step": 19940 + }, + { + "entropy": 0.07046541702002287, + "epoch": 4.649259820491899, + "grad_norm": 3.140625, + "learning_rate": 4.952708420364145e-05, + "loss": 0.1064, + "mean_token_accuracy": 0.9723043143749237, + "num_tokens": 47707582.0, + "step": 19945 + }, + { + "entropy": 0.13581852428615093, + "epoch": 4.65042545751253, + "grad_norm": 1.78125, + "learning_rate": 4.952666822672991e-05, + "loss": 0.2336, + "mean_token_accuracy": 0.9517211794853211, + "num_tokens": 47726482.0, + "step": 19950 + }, + { + "entropy": 0.060800166614353654, + "epoch": 4.651591094533162, + "grad_norm": 0.1865234375, + "learning_rate": 4.952625207048265e-05, + "loss": 0.0302, + "mean_token_accuracy": 0.9864785373210907, + "num_tokens": 47768498.0, + "step": 19955 + }, + { + "entropy": 0.08810051530599594, + "epoch": 4.652756731553794, + "grad_norm": 5.375, + "learning_rate": 4.952583573490589e-05, + "loss": 0.135, + "mean_token_accuracy": 0.9718170285224914, + "num_tokens": 47787658.0, + "step": 19960 + }, + { + "entropy": 0.05881752036511898, + "epoch": 4.653922368574426, + "grad_norm": 5.6875, + "learning_rate": 4.952541922000583e-05, + "loss": 0.0759, + "mean_token_accuracy": 0.9819856464862824, + "num_tokens": 47817274.0, + "step": 19965 + }, + { + "entropy": 0.06032843859866262, + "epoch": 4.655088005595058, + "grad_norm": 4.625, + "learning_rate": 4.9525002525788685e-05, + "loss": 0.055, + "mean_token_accuracy": 0.981800502538681, + "num_tokens": 47839225.0, + "step": 19970 + }, + { + "entropy": 0.10100402384996414, + "epoch": 4.65625364261569, + "grad_norm": 1.359375, + "learning_rate": 4.952458565226066e-05, + "loss": 0.1203, + "mean_token_accuracy": 0.9719790756702423, + "num_tokens": 47859646.0, + "step": 19975 + }, + { + "entropy": 0.05944112166762352, + "epoch": 4.657419279636321, + "grad_norm": 3.171875, + "learning_rate": 4.952416859942798e-05, + "loss": 0.0631, + "mean_token_accuracy": 0.9825767993927002, + "num_tokens": 47875250.0, + "step": 19980 + }, + { + "entropy": 0.0668319322168827, + "epoch": 4.658584916656953, + "grad_norm": 0.66796875, + "learning_rate": 4.952375136729686e-05, + "loss": 0.0748, + "mean_token_accuracy": 0.9800704658031464, + "num_tokens": 47897665.0, + "step": 19985 + }, + { + "entropy": 0.06467094738036394, + "epoch": 4.6597505536775845, + "grad_norm": 1.1484375, + "learning_rate": 4.952333395587352e-05, + "loss": 0.0524, + "mean_token_accuracy": 0.9808013200759887, + "num_tokens": 47916914.0, + "step": 19990 + }, + { + "entropy": 0.07919787243008614, + "epoch": 4.660916190698217, + "grad_norm": 3.515625, + "learning_rate": 4.952291636516419e-05, + "loss": 0.0853, + "mean_token_accuracy": 0.9790023684501648, + "num_tokens": 47926765.0, + "step": 19995 + }, + { + "entropy": 0.06067050509154796, + "epoch": 4.662081827718849, + "grad_norm": 2.640625, + "learning_rate": 4.9522498595175093e-05, + "loss": 0.0907, + "mean_token_accuracy": 0.9813133835792541, + "num_tokens": 47939441.0, + "step": 20000 + }, + { + "entropy": 0.04763990985229612, + "epoch": 4.66324746473948, + "grad_norm": 1.546875, + "learning_rate": 4.952208064591246e-05, + "loss": 0.0322, + "mean_token_accuracy": 0.9868841648101807, + "num_tokens": 47968277.0, + "step": 20005 + }, + { + "entropy": 0.07200488224625587, + "epoch": 4.664413101760112, + "grad_norm": 2.28125, + "learning_rate": 4.952166251738252e-05, + "loss": 0.0819, + "mean_token_accuracy": 0.9793427646160126, + "num_tokens": 47981850.0, + "step": 20010 + }, + { + "entropy": 0.07122592218220233, + "epoch": 4.665578738780743, + "grad_norm": 1.953125, + "learning_rate": 4.95212442095915e-05, + "loss": 0.0957, + "mean_token_accuracy": 0.9757452428340911, + "num_tokens": 47994291.0, + "step": 20015 + }, + { + "entropy": 0.07337831128388643, + "epoch": 4.666744375801375, + "grad_norm": 0.7265625, + "learning_rate": 4.9520825722545664e-05, + "loss": 0.0756, + "mean_token_accuracy": 0.9799136698246003, + "num_tokens": 48015925.0, + "step": 20020 + }, + { + "entropy": 0.06564246322959662, + "epoch": 4.6679100128220075, + "grad_norm": 3.703125, + "learning_rate": 4.9520407056251235e-05, + "loss": 0.0842, + "mean_token_accuracy": 0.9770355999469758, + "num_tokens": 48031165.0, + "step": 20025 + }, + { + "entropy": 0.09497220404446124, + "epoch": 4.669075649842639, + "grad_norm": 7.375, + "learning_rate": 4.951998821071445e-05, + "loss": 0.1242, + "mean_token_accuracy": 0.9703881442546844, + "num_tokens": 48047560.0, + "step": 20030 + }, + { + "entropy": 0.07135007679462432, + "epoch": 4.670241286863271, + "grad_norm": 0.5390625, + "learning_rate": 4.951956918594157e-05, + "loss": 0.0942, + "mean_token_accuracy": 0.979320478439331, + "num_tokens": 48060649.0, + "step": 20035 + }, + { + "entropy": 0.0695052114315331, + "epoch": 4.671406923883903, + "grad_norm": 0.212890625, + "learning_rate": 4.951914998193883e-05, + "loss": 0.0675, + "mean_token_accuracy": 0.9792129814624786, + "num_tokens": 48078205.0, + "step": 20040 + }, + { + "entropy": 0.09717717897146941, + "epoch": 4.672572560904534, + "grad_norm": 6.875, + "learning_rate": 4.951873059871248e-05, + "loss": 0.1239, + "mean_token_accuracy": 0.9706105470657349, + "num_tokens": 48093089.0, + "step": 20045 + }, + { + "entropy": 0.05780693581327796, + "epoch": 4.673738197925166, + "grad_norm": 0.99609375, + "learning_rate": 4.9518311036268785e-05, + "loss": 0.0629, + "mean_token_accuracy": 0.9817413330078125, + "num_tokens": 48122792.0, + "step": 20050 + }, + { + "entropy": 0.08879671581089496, + "epoch": 4.674903834945798, + "grad_norm": 3.3125, + "learning_rate": 4.9517891294613995e-05, + "loss": 0.0988, + "mean_token_accuracy": 0.9761231005191803, + "num_tokens": 48145476.0, + "step": 20055 + }, + { + "entropy": 0.054804504942148925, + "epoch": 4.6760694719664295, + "grad_norm": 0.30078125, + "learning_rate": 4.9517471373754374e-05, + "loss": 0.0455, + "mean_token_accuracy": 0.9853194117546081, + "num_tokens": 48179724.0, + "step": 20060 + }, + { + "entropy": 0.06867078095674514, + "epoch": 4.677235108987062, + "grad_norm": 4.59375, + "learning_rate": 4.951705127369617e-05, + "loss": 0.1, + "mean_token_accuracy": 0.9770641207695008, + "num_tokens": 48190098.0, + "step": 20065 + }, + { + "entropy": 0.0939449267461896, + "epoch": 4.678400746007693, + "grad_norm": 2.65625, + "learning_rate": 4.951663099444567e-05, + "loss": 0.102, + "mean_token_accuracy": 0.9720357954502106, + "num_tokens": 48207731.0, + "step": 20070 + }, + { + "entropy": 0.11643363423645496, + "epoch": 4.679566383028325, + "grad_norm": 3.03125, + "learning_rate": 4.951621053600912e-05, + "loss": 0.1497, + "mean_token_accuracy": 0.9660777747631073, + "num_tokens": 48236231.0, + "step": 20075 + }, + { + "entropy": 0.058504576422274114, + "epoch": 4.680732020048957, + "grad_norm": 0.890625, + "learning_rate": 4.95157898983928e-05, + "loss": 0.0385, + "mean_token_accuracy": 0.9740236282348633, + "num_tokens": 48270224.0, + "step": 20080 + }, + { + "entropy": 0.05779395885765552, + "epoch": 4.681897657069588, + "grad_norm": 0.30078125, + "learning_rate": 4.9515369081602984e-05, + "loss": 0.0523, + "mean_token_accuracy": 0.9832865595817566, + "num_tokens": 48292194.0, + "step": 20085 + }, + { + "entropy": 0.07101827822625636, + "epoch": 4.68306329409022, + "grad_norm": 0.60546875, + "learning_rate": 4.951494808564593e-05, + "loss": 0.0612, + "mean_token_accuracy": 0.9771855533123016, + "num_tokens": 48321133.0, + "step": 20090 + }, + { + "entropy": 0.07946205716580153, + "epoch": 4.684228931110852, + "grad_norm": 3.921875, + "learning_rate": 4.951452691052794e-05, + "loss": 0.1363, + "mean_token_accuracy": 0.969723004102707, + "num_tokens": 48342279.0, + "step": 20095 + }, + { + "entropy": 0.06795702120289207, + "epoch": 4.685394568131484, + "grad_norm": 1.78125, + "learning_rate": 4.951410555625527e-05, + "loss": 0.074, + "mean_token_accuracy": 0.9802046597003937, + "num_tokens": 48357912.0, + "step": 20100 + }, + { + "entropy": 0.07630017213523388, + "epoch": 4.686560205152116, + "grad_norm": 1.3046875, + "learning_rate": 4.951368402283423e-05, + "loss": 0.0866, + "mean_token_accuracy": 0.9766553997993469, + "num_tokens": 48371624.0, + "step": 20105 + }, + { + "entropy": 0.06927125807851553, + "epoch": 4.687725842172748, + "grad_norm": 1.8671875, + "learning_rate": 4.9513262310271084e-05, + "loss": 0.0483, + "mean_token_accuracy": 0.9857329249382019, + "num_tokens": 48389135.0, + "step": 20110 + }, + { + "entropy": 0.054950883705168964, + "epoch": 4.688891479193379, + "grad_norm": 3.265625, + "learning_rate": 4.951284041857213e-05, + "loss": 0.0493, + "mean_token_accuracy": 0.982935881614685, + "num_tokens": 48410987.0, + "step": 20115 + }, + { + "entropy": 0.0935236718505621, + "epoch": 4.690057116214011, + "grad_norm": 7.3125, + "learning_rate": 4.9512418347743664e-05, + "loss": 0.1299, + "mean_token_accuracy": 0.9671928882598877, + "num_tokens": 48423657.0, + "step": 20120 + }, + { + "entropy": 0.059754582960158585, + "epoch": 4.691222753234642, + "grad_norm": 2.140625, + "learning_rate": 4.9511996097791965e-05, + "loss": 0.0525, + "mean_token_accuracy": 0.9828765392303467, + "num_tokens": 48445677.0, + "step": 20125 + }, + { + "entropy": 0.06901389230042695, + "epoch": 4.6923883902552745, + "grad_norm": 0.78515625, + "learning_rate": 4.951157366872334e-05, + "loss": 0.0586, + "mean_token_accuracy": 0.9815558969974518, + "num_tokens": 48478911.0, + "step": 20130 + }, + { + "entropy": 0.1916733231395483, + "epoch": 4.693554027275907, + "grad_norm": 2.203125, + "learning_rate": 4.951115106054408e-05, + "loss": 0.3589, + "mean_token_accuracy": 0.9531080842018127, + "num_tokens": 48507415.0, + "step": 20135 + }, + { + "entropy": 0.053913332894444464, + "epoch": 4.694719664296538, + "grad_norm": 1.7734375, + "learning_rate": 4.9510728273260496e-05, + "loss": 0.0681, + "mean_token_accuracy": 0.9818377435207367, + "num_tokens": 48526630.0, + "step": 20140 + }, + { + "entropy": 0.05371881201863289, + "epoch": 4.69588530131717, + "grad_norm": 0.388671875, + "learning_rate": 4.951030530687889e-05, + "loss": 0.0482, + "mean_token_accuracy": 0.9868282377719879, + "num_tokens": 48546033.0, + "step": 20145 + }, + { + "entropy": 0.06295309253036976, + "epoch": 4.697050938337801, + "grad_norm": 5.59375, + "learning_rate": 4.9509882161405566e-05, + "loss": 0.0881, + "mean_token_accuracy": 0.9804576992988586, + "num_tokens": 48558724.0, + "step": 20150 + }, + { + "entropy": 0.061114361975342035, + "epoch": 4.698216575358433, + "grad_norm": 2.140625, + "learning_rate": 4.950945883684683e-05, + "loss": 0.0577, + "mean_token_accuracy": 0.9806105256080627, + "num_tokens": 48586711.0, + "step": 20155 + }, + { + "entropy": 0.0676982618868351, + "epoch": 4.699382212379065, + "grad_norm": 1.015625, + "learning_rate": 4.9509035333209005e-05, + "loss": 0.0669, + "mean_token_accuracy": 0.9803455650806427, + "num_tokens": 48599778.0, + "step": 20160 + }, + { + "entropy": 0.057492456119507554, + "epoch": 4.700547849399697, + "grad_norm": 0.82421875, + "learning_rate": 4.95086116504984e-05, + "loss": 0.0555, + "mean_token_accuracy": 0.9830128908157348, + "num_tokens": 48618248.0, + "step": 20165 + }, + { + "entropy": 0.08579396829009056, + "epoch": 4.701713486420329, + "grad_norm": 4.28125, + "learning_rate": 4.950818778872133e-05, + "loss": 0.0526, + "mean_token_accuracy": 0.9830702006816864, + "num_tokens": 48645960.0, + "step": 20170 + }, + { + "entropy": 0.08069932255893945, + "epoch": 4.702879123440961, + "grad_norm": 6.09375, + "learning_rate": 4.950776374788412e-05, + "loss": 0.0841, + "mean_token_accuracy": 0.9744628429412842, + "num_tokens": 48664011.0, + "step": 20175 + }, + { + "entropy": 0.04855256769806147, + "epoch": 4.704044760461592, + "grad_norm": 0.55078125, + "learning_rate": 4.9507339527993095e-05, + "loss": 0.0426, + "mean_token_accuracy": 0.982448011636734, + "num_tokens": 48684718.0, + "step": 20180 + }, + { + "entropy": 0.09109218874946237, + "epoch": 4.705210397482224, + "grad_norm": 7.34375, + "learning_rate": 4.9506915129054576e-05, + "loss": 0.17, + "mean_token_accuracy": 0.9593676149845123, + "num_tokens": 48702995.0, + "step": 20185 + }, + { + "entropy": 0.07370678829029202, + "epoch": 4.706376034502856, + "grad_norm": 4.09375, + "learning_rate": 4.950649055107489e-05, + "loss": 0.0964, + "mean_token_accuracy": 0.9742493212223053, + "num_tokens": 48719147.0, + "step": 20190 + }, + { + "entropy": 0.08082408849149943, + "epoch": 4.7075416715234875, + "grad_norm": 3.125, + "learning_rate": 4.9506065794060375e-05, + "loss": 0.0533, + "mean_token_accuracy": 0.9826491594314575, + "num_tokens": 48741412.0, + "step": 20195 + }, + { + "entropy": 0.05809831535443664, + "epoch": 4.70870730854412, + "grad_norm": 0.4296875, + "learning_rate": 4.950564085801736e-05, + "loss": 0.0644, + "mean_token_accuracy": 0.978083735704422, + "num_tokens": 48779819.0, + "step": 20200 + }, + { + "entropy": 0.06636287728324533, + "epoch": 4.709872945564751, + "grad_norm": 5.59375, + "learning_rate": 4.9505215742952184e-05, + "loss": 0.0837, + "mean_token_accuracy": 0.9786609828472137, + "num_tokens": 48806468.0, + "step": 20205 + }, + { + "entropy": 0.05410275729373097, + "epoch": 4.711038582585383, + "grad_norm": 1.375, + "learning_rate": 4.950479044887118e-05, + "loss": 0.0475, + "mean_token_accuracy": 0.9868753671646118, + "num_tokens": 48822312.0, + "step": 20210 + }, + { + "entropy": 0.2814826850313693, + "epoch": 4.712204219606015, + "grad_norm": 0.37109375, + "learning_rate": 4.9504364975780696e-05, + "loss": 0.5149, + "mean_token_accuracy": 0.9334210813045501, + "num_tokens": 48866242.0, + "step": 20215 + }, + { + "entropy": 0.059157754946500066, + "epoch": 4.713369856626646, + "grad_norm": 2.4375, + "learning_rate": 4.9503939323687073e-05, + "loss": 0.0591, + "mean_token_accuracy": 0.9809535264968872, + "num_tokens": 48892857.0, + "step": 20220 + }, + { + "entropy": 0.0832524687051773, + "epoch": 4.714535493647278, + "grad_norm": 0.59375, + "learning_rate": 4.9503513492596666e-05, + "loss": 0.1003, + "mean_token_accuracy": 0.9713628351688385, + "num_tokens": 48904319.0, + "step": 20225 + }, + { + "entropy": 0.05422810595482588, + "epoch": 4.7157011306679095, + "grad_norm": 0.44140625, + "learning_rate": 4.9503087482515817e-05, + "loss": 0.0475, + "mean_token_accuracy": 0.9819365739822388, + "num_tokens": 48936321.0, + "step": 20230 + }, + { + "entropy": 0.07771264165639877, + "epoch": 4.716866767688542, + "grad_norm": 4.46875, + "learning_rate": 4.9502661293450874e-05, + "loss": 0.1248, + "mean_token_accuracy": 0.9761758804321289, + "num_tokens": 48945222.0, + "step": 20235 + }, + { + "entropy": 0.07859923299401998, + "epoch": 4.718032404709174, + "grad_norm": 4.375, + "learning_rate": 4.95022349254082e-05, + "loss": 0.0766, + "mean_token_accuracy": 0.9763111054897309, + "num_tokens": 48958764.0, + "step": 20240 + }, + { + "entropy": 0.06803965084254741, + "epoch": 4.719198041729806, + "grad_norm": 4.03125, + "learning_rate": 4.950180837839416e-05, + "loss": 0.0812, + "mean_token_accuracy": 0.9793858110904694, + "num_tokens": 48975984.0, + "step": 20245 + }, + { + "entropy": 0.048299599625170234, + "epoch": 4.720363678750437, + "grad_norm": 0.2314453125, + "learning_rate": 4.950138165241509e-05, + "loss": 0.0246, + "mean_token_accuracy": 0.9865788578987121, + "num_tokens": 49004389.0, + "step": 20250 + }, + { + "entropy": 0.061194864101707935, + "epoch": 4.721529315771069, + "grad_norm": 0.3125, + "learning_rate": 4.950095474747738e-05, + "loss": 0.0622, + "mean_token_accuracy": 0.9849075496196746, + "num_tokens": 49029294.0, + "step": 20255 + }, + { + "entropy": 0.08798045851290226, + "epoch": 4.7226949527917, + "grad_norm": 4.0625, + "learning_rate": 4.9500527663587375e-05, + "loss": 0.1386, + "mean_token_accuracy": 0.9668548405170441, + "num_tokens": 49038976.0, + "step": 20260 + }, + { + "entropy": 0.05687556634657085, + "epoch": 4.7238605898123325, + "grad_norm": 2.25, + "learning_rate": 4.950010040075146e-05, + "loss": 0.0464, + "mean_token_accuracy": 0.9875887632369995, + "num_tokens": 49060951.0, + "step": 20265 + }, + { + "entropy": 0.06489252224564553, + "epoch": 4.725026226832965, + "grad_norm": 3.71875, + "learning_rate": 4.9499672958975995e-05, + "loss": 0.0709, + "mean_token_accuracy": 0.979289311170578, + "num_tokens": 49074482.0, + "step": 20270 + }, + { + "entropy": 0.06801861561834813, + "epoch": 4.726191863853596, + "grad_norm": 5.40625, + "learning_rate": 4.949924533826736e-05, + "loss": 0.1091, + "mean_token_accuracy": 0.9740906596183777, + "num_tokens": 49087207.0, + "step": 20275 + }, + { + "entropy": 0.10587560161948203, + "epoch": 4.727357500874228, + "grad_norm": 1.1015625, + "learning_rate": 4.949881753863193e-05, + "loss": 0.1184, + "mean_token_accuracy": 0.9726275205612183, + "num_tokens": 49112264.0, + "step": 20280 + }, + { + "entropy": 0.10358101055026055, + "epoch": 4.728523137894859, + "grad_norm": 2.0, + "learning_rate": 4.9498389560076084e-05, + "loss": 0.159, + "mean_token_accuracy": 0.9651169836521148, + "num_tokens": 49132458.0, + "step": 20285 + }, + { + "entropy": 0.07152350768446922, + "epoch": 4.729688774915491, + "grad_norm": 2.671875, + "learning_rate": 4.9497961402606204e-05, + "loss": 0.0984, + "mean_token_accuracy": 0.9762173771858216, + "num_tokens": 49144811.0, + "step": 20290 + }, + { + "entropy": 0.10446326434612274, + "epoch": 4.730854411936123, + "grad_norm": 0.6171875, + "learning_rate": 4.949753306622867e-05, + "loss": 0.0961, + "mean_token_accuracy": 0.9760794460773468, + "num_tokens": 49158186.0, + "step": 20295 + }, + { + "entropy": 0.0776900127530098, + "epoch": 4.7320200489567545, + "grad_norm": 2.75, + "learning_rate": 4.949710455094987e-05, + "loss": 0.1157, + "mean_token_accuracy": 0.9754393517971038, + "num_tokens": 49167881.0, + "step": 20300 + }, + { + "entropy": 0.04937164485454559, + "epoch": 4.733185685977387, + "grad_norm": 0.73828125, + "learning_rate": 4.94966758567762e-05, + "loss": 0.0273, + "mean_token_accuracy": 0.9871737420558929, + "num_tokens": 49189512.0, + "step": 20305 + }, + { + "entropy": 0.06925871726125479, + "epoch": 4.734351322998019, + "grad_norm": 1.1328125, + "learning_rate": 4.949624698371405e-05, + "loss": 0.0801, + "mean_token_accuracy": 0.9783924877643585, + "num_tokens": 49204116.0, + "step": 20310 + }, + { + "entropy": 0.09072446711361408, + "epoch": 4.73551696001865, + "grad_norm": 1.1875, + "learning_rate": 4.949581793176981e-05, + "loss": 0.0981, + "mean_token_accuracy": 0.969156926870346, + "num_tokens": 49215356.0, + "step": 20315 + }, + { + "entropy": 0.07261850405484438, + "epoch": 4.736682597039282, + "grad_norm": 3.296875, + "learning_rate": 4.9495388700949885e-05, + "loss": 0.0857, + "mean_token_accuracy": 0.9742332696914673, + "num_tokens": 49232369.0, + "step": 20320 + }, + { + "entropy": 0.16985770147293805, + "epoch": 4.737848234059914, + "grad_norm": 0.76171875, + "learning_rate": 4.9494959291260676e-05, + "loss": 0.2871, + "mean_token_accuracy": 0.932180005311966, + "num_tokens": 49258087.0, + "step": 20325 + }, + { + "entropy": 0.07392840981483459, + "epoch": 4.739013871080545, + "grad_norm": 1.171875, + "learning_rate": 4.949452970270857e-05, + "loss": 0.1103, + "mean_token_accuracy": 0.9722067236900329, + "num_tokens": 49267408.0, + "step": 20330 + }, + { + "entropy": 0.060131664481014015, + "epoch": 4.7401795081011775, + "grad_norm": 2.875, + "learning_rate": 4.9494099935299996e-05, + "loss": 0.0497, + "mean_token_accuracy": 0.9814348340034484, + "num_tokens": 49292748.0, + "step": 20335 + }, + { + "entropy": 0.12636512629687785, + "epoch": 4.741345145121809, + "grad_norm": 4.3125, + "learning_rate": 4.9493669989041353e-05, + "loss": 0.165, + "mean_token_accuracy": 0.9643090963363647, + "num_tokens": 49305455.0, + "step": 20340 + }, + { + "entropy": 0.05997271528467536, + "epoch": 4.742510782142441, + "grad_norm": 1.1953125, + "learning_rate": 4.949323986393905e-05, + "loss": 0.0747, + "mean_token_accuracy": 0.9823045790195465, + "num_tokens": 49321546.0, + "step": 20345 + }, + { + "entropy": 0.05400382243096828, + "epoch": 4.743676419163073, + "grad_norm": 2.28125, + "learning_rate": 4.9492809559999495e-05, + "loss": 0.0464, + "mean_token_accuracy": 0.9806383967399597, + "num_tokens": 49370181.0, + "step": 20350 + }, + { + "entropy": 0.06353923268616199, + "epoch": 4.744842056183704, + "grad_norm": 0.318359375, + "learning_rate": 4.949237907722912e-05, + "loss": 0.0788, + "mean_token_accuracy": 0.977184635400772, + "num_tokens": 49392823.0, + "step": 20355 + }, + { + "entropy": 0.09873983627185226, + "epoch": 4.746007693204336, + "grad_norm": 0.88671875, + "learning_rate": 4.9491948415634335e-05, + "loss": 0.0789, + "mean_token_accuracy": 0.974556165933609, + "num_tokens": 49408279.0, + "step": 20360 + }, + { + "entropy": 0.09930532816797495, + "epoch": 4.7471733302249675, + "grad_norm": 0.42578125, + "learning_rate": 4.949151757522155e-05, + "loss": 0.123, + "mean_token_accuracy": 0.9696231663227082, + "num_tokens": 49429537.0, + "step": 20365 + }, + { + "entropy": 0.0745665643364191, + "epoch": 4.7483389672456, + "grad_norm": 1.296875, + "learning_rate": 4.949108655599721e-05, + "loss": 0.054, + "mean_token_accuracy": 0.9802153050899506, + "num_tokens": 49455342.0, + "step": 20370 + }, + { + "entropy": 0.0746733145788312, + "epoch": 4.749504604266232, + "grad_norm": 0.9765625, + "learning_rate": 4.949065535796774e-05, + "loss": 0.0982, + "mean_token_accuracy": 0.9758549451828002, + "num_tokens": 49476536.0, + "step": 20375 + }, + { + "entropy": 0.05807813517749309, + "epoch": 4.750670241286863, + "grad_norm": 2.828125, + "learning_rate": 4.949022398113955e-05, + "loss": 0.0743, + "mean_token_accuracy": 0.9838265895843505, + "num_tokens": 49494710.0, + "step": 20380 + }, + { + "entropy": 0.07137834914028644, + "epoch": 4.751835878307495, + "grad_norm": 2.46875, + "learning_rate": 4.9489792425519097e-05, + "loss": 0.1034, + "mean_token_accuracy": 0.9739278852939606, + "num_tokens": 49505900.0, + "step": 20385 + }, + { + "entropy": 0.06287243217229843, + "epoch": 4.753001515328127, + "grad_norm": 3.421875, + "learning_rate": 4.948936069111281e-05, + "loss": 0.0759, + "mean_token_accuracy": 0.9815413355827332, + "num_tokens": 49519016.0, + "step": 20390 + }, + { + "entropy": 0.05989686464890838, + "epoch": 4.754167152348758, + "grad_norm": 0.56640625, + "learning_rate": 4.948892877792711e-05, + "loss": 0.056, + "mean_token_accuracy": 0.9855867683887481, + "num_tokens": 49537367.0, + "step": 20395 + }, + { + "entropy": 0.09508889261633158, + "epoch": 4.75533278936939, + "grad_norm": 1.765625, + "learning_rate": 4.9488496685968455e-05, + "loss": 0.1299, + "mean_token_accuracy": 0.9706778466701508, + "num_tokens": 49561307.0, + "step": 20400 + }, + { + "entropy": 0.07447026818990707, + "epoch": 4.7564984263900225, + "grad_norm": 4.53125, + "learning_rate": 4.948806441524328e-05, + "loss": 0.0922, + "mean_token_accuracy": 0.9730834066867828, + "num_tokens": 49578999.0, + "step": 20405 + }, + { + "entropy": 0.08297924473881721, + "epoch": 4.757664063410654, + "grad_norm": 2.0, + "learning_rate": 4.9487631965758034e-05, + "loss": 0.109, + "mean_token_accuracy": 0.9753465235233307, + "num_tokens": 49601086.0, + "step": 20410 + }, + { + "entropy": 0.07900863215327263, + "epoch": 4.758829700431286, + "grad_norm": 4.6875, + "learning_rate": 4.948719933751916e-05, + "loss": 0.0983, + "mean_token_accuracy": 0.974524050951004, + "num_tokens": 49611278.0, + "step": 20415 + }, + { + "entropy": 0.08208957873284817, + "epoch": 4.759995337451917, + "grad_norm": 2.09375, + "learning_rate": 4.9486766530533126e-05, + "loss": 0.0715, + "mean_token_accuracy": 0.9752253651618957, + "num_tokens": 49631030.0, + "step": 20420 + }, + { + "entropy": 0.057911211252212526, + "epoch": 4.761160974472549, + "grad_norm": 0.490234375, + "learning_rate": 4.9486333544806365e-05, + "loss": 0.0377, + "mean_token_accuracy": 0.983147144317627, + "num_tokens": 49649629.0, + "step": 20425 + }, + { + "entropy": 0.08789655100554228, + "epoch": 4.762326611493181, + "grad_norm": 2.328125, + "learning_rate": 4.948590038034535e-05, + "loss": 0.102, + "mean_token_accuracy": 0.9745977938175201, + "num_tokens": 49660399.0, + "step": 20430 + }, + { + "entropy": 0.08837173730134965, + "epoch": 4.7634922485138125, + "grad_norm": 1.5, + "learning_rate": 4.9485467037156525e-05, + "loss": 0.1278, + "mean_token_accuracy": 0.9708876311779022, + "num_tokens": 49669243.0, + "step": 20435 + }, + { + "entropy": 0.05032849675044417, + "epoch": 4.764657885534445, + "grad_norm": 0.546875, + "learning_rate": 4.948503351524636e-05, + "loss": 0.043, + "mean_token_accuracy": 0.9866231560707093, + "num_tokens": 49700542.0, + "step": 20440 + }, + { + "entropy": 0.08655855841934681, + "epoch": 4.765823522555077, + "grad_norm": 1.1015625, + "learning_rate": 4.948459981462132e-05, + "loss": 0.0835, + "mean_token_accuracy": 0.9743392825126648, + "num_tokens": 49718787.0, + "step": 20445 + }, + { + "entropy": 0.06486239023506642, + "epoch": 4.766989159575708, + "grad_norm": 1.3515625, + "learning_rate": 4.948416593528787e-05, + "loss": 0.082, + "mean_token_accuracy": 0.9798474609851837, + "num_tokens": 49734457.0, + "step": 20450 + }, + { + "entropy": 0.0723851252347231, + "epoch": 4.76815479659634, + "grad_norm": 2.078125, + "learning_rate": 4.948373187725249e-05, + "loss": 0.0684, + "mean_token_accuracy": 0.9775489926338196, + "num_tokens": 49753956.0, + "step": 20455 + }, + { + "entropy": 0.08052669223397971, + "epoch": 4.769320433616972, + "grad_norm": 1.28125, + "learning_rate": 4.948329764052163e-05, + "loss": 0.0704, + "mean_token_accuracy": 0.9762837886810303, + "num_tokens": 49773734.0, + "step": 20460 + }, + { + "entropy": 0.08943664506077767, + "epoch": 4.770486070637603, + "grad_norm": 0.5859375, + "learning_rate": 4.948286322510179e-05, + "loss": 0.083, + "mean_token_accuracy": 0.973814857006073, + "num_tokens": 49787380.0, + "step": 20465 + }, + { + "entropy": 0.07294655237346888, + "epoch": 4.771651707658235, + "grad_norm": 2.859375, + "learning_rate": 4.9482428630999426e-05, + "loss": 0.0817, + "mean_token_accuracy": 0.979939204454422, + "num_tokens": 49805226.0, + "step": 20470 + }, + { + "entropy": 0.09040804943069816, + "epoch": 4.772817344678867, + "grad_norm": 3.59375, + "learning_rate": 4.948199385822103e-05, + "loss": 0.1118, + "mean_token_accuracy": 0.9751499712467193, + "num_tokens": 49817884.0, + "step": 20475 + }, + { + "entropy": 0.05256835455074906, + "epoch": 4.773982981699499, + "grad_norm": 0.50390625, + "learning_rate": 4.948155890677309e-05, + "loss": 0.0451, + "mean_token_accuracy": 0.9860180079936981, + "num_tokens": 49850987.0, + "step": 20480 + }, + { + "entropy": 0.10846010688692331, + "epoch": 4.775148618720131, + "grad_norm": 3.1875, + "learning_rate": 4.948112377666208e-05, + "loss": 0.1126, + "mean_token_accuracy": 0.9711306512355804, + "num_tokens": 49867014.0, + "step": 20485 + }, + { + "entropy": 0.05567761724814772, + "epoch": 4.776314255740762, + "grad_norm": 0.53515625, + "learning_rate": 4.94806884678945e-05, + "loss": 0.0604, + "mean_token_accuracy": 0.9824544131755829, + "num_tokens": 49899711.0, + "step": 20490 + }, + { + "entropy": 0.07059495002031327, + "epoch": 4.777479892761394, + "grad_norm": 1.90625, + "learning_rate": 4.9480252980476825e-05, + "loss": 0.0977, + "mean_token_accuracy": 0.9781529903411865, + "num_tokens": 49932999.0, + "step": 20495 + }, + { + "entropy": 0.0721295103430748, + "epoch": 4.778645529782025, + "grad_norm": 2.875, + "learning_rate": 4.947981731441557e-05, + "loss": 0.0914, + "mean_token_accuracy": 0.9754843413829803, + "num_tokens": 49942908.0, + "step": 20500 + }, + { + "entropy": 0.11595875062048436, + "epoch": 4.7798111668026575, + "grad_norm": 6.34375, + "learning_rate": 4.947938146971721e-05, + "loss": 0.2174, + "mean_token_accuracy": 0.9521984398365021, + "num_tokens": 49961131.0, + "step": 20505 + }, + { + "entropy": 0.06587783191353083, + "epoch": 4.78097680382329, + "grad_norm": 4.78125, + "learning_rate": 4.9478945446388255e-05, + "loss": 0.0809, + "mean_token_accuracy": 0.9789382636547088, + "num_tokens": 49974497.0, + "step": 20510 + }, + { + "entropy": 0.059248296450823544, + "epoch": 4.782142440843921, + "grad_norm": 2.875, + "learning_rate": 4.947850924443521e-05, + "loss": 0.0772, + "mean_token_accuracy": 0.9822989463806152, + "num_tokens": 49993498.0, + "step": 20515 + }, + { + "entropy": 0.09005047529935836, + "epoch": 4.783308077864553, + "grad_norm": 2.46875, + "learning_rate": 4.9478072863864576e-05, + "loss": 0.1128, + "mean_token_accuracy": 0.9668475985527039, + "num_tokens": 50002253.0, + "step": 20520 + }, + { + "entropy": 0.09272575750946999, + "epoch": 4.784473714885185, + "grad_norm": 2.859375, + "learning_rate": 4.947763630468286e-05, + "loss": 0.1423, + "mean_token_accuracy": 0.9673610985279083, + "num_tokens": 50010592.0, + "step": 20525 + }, + { + "entropy": 0.07717711478471756, + "epoch": 4.785639351905816, + "grad_norm": 0.96484375, + "learning_rate": 4.947719956689657e-05, + "loss": 0.0943, + "mean_token_accuracy": 0.9727614760398865, + "num_tokens": 50041275.0, + "step": 20530 + }, + { + "entropy": 0.12004315797239543, + "epoch": 4.786804988926448, + "grad_norm": 1.296875, + "learning_rate": 4.947676265051222e-05, + "loss": 0.1233, + "mean_token_accuracy": 0.9705375373363495, + "num_tokens": 50068454.0, + "step": 20535 + }, + { + "entropy": 0.07685479950159788, + "epoch": 4.7879706259470804, + "grad_norm": 5.90625, + "learning_rate": 4.947632555553633e-05, + "loss": 0.0732, + "mean_token_accuracy": 0.9742613315582276, + "num_tokens": 50090905.0, + "step": 20540 + }, + { + "entropy": 0.08232810776680707, + "epoch": 4.789136262967712, + "grad_norm": 12.875, + "learning_rate": 4.9475888281975404e-05, + "loss": 0.1358, + "mean_token_accuracy": 0.9695055544376373, + "num_tokens": 50104591.0, + "step": 20545 + }, + { + "entropy": 0.09904810027219355, + "epoch": 4.790301899988344, + "grad_norm": 0.2236328125, + "learning_rate": 4.947545082983597e-05, + "loss": 0.1956, + "mean_token_accuracy": 0.9490303874015809, + "num_tokens": 50151001.0, + "step": 20550 + }, + { + "entropy": 0.05315098352730274, + "epoch": 4.791467537008975, + "grad_norm": 2.640625, + "learning_rate": 4.9475013199124556e-05, + "loss": 0.0487, + "mean_token_accuracy": 0.9870794475078583, + "num_tokens": 50176426.0, + "step": 20555 + }, + { + "entropy": 0.08232418056577444, + "epoch": 4.792633174029607, + "grad_norm": 3.15625, + "learning_rate": 4.947457538984769e-05, + "loss": 0.0556, + "mean_token_accuracy": 0.9761513948440552, + "num_tokens": 50205312.0, + "step": 20560 + }, + { + "entropy": 0.06682766638696194, + "epoch": 4.793798811050239, + "grad_norm": 4.125, + "learning_rate": 4.947413740201189e-05, + "loss": 0.0881, + "mean_token_accuracy": 0.9708187401294708, + "num_tokens": 50234640.0, + "step": 20565 + }, + { + "entropy": 0.08927183225750923, + "epoch": 4.79496444807087, + "grad_norm": 1.1875, + "learning_rate": 4.9473699235623686e-05, + "loss": 0.1019, + "mean_token_accuracy": 0.9742757558822632, + "num_tokens": 50256381.0, + "step": 20570 + }, + { + "entropy": 0.07163733001798392, + "epoch": 4.7961300850915025, + "grad_norm": 0.74609375, + "learning_rate": 4.947326089068962e-05, + "loss": 0.0766, + "mean_token_accuracy": 0.9792376101016999, + "num_tokens": 50272402.0, + "step": 20575 + }, + { + "entropy": 0.0808179883286357, + "epoch": 4.797295722112135, + "grad_norm": 3.796875, + "learning_rate": 4.9472822367216225e-05, + "loss": 0.0901, + "mean_token_accuracy": 0.9793714106082916, + "num_tokens": 50296722.0, + "step": 20580 + }, + { + "entropy": 0.10730183199048042, + "epoch": 4.798461359132766, + "grad_norm": 4.71875, + "learning_rate": 4.9472383665210045e-05, + "loss": 0.1544, + "mean_token_accuracy": 0.9621846616268158, + "num_tokens": 50315927.0, + "step": 20585 + }, + { + "entropy": 0.056595608685165645, + "epoch": 4.799626996153398, + "grad_norm": 2.265625, + "learning_rate": 4.947194478467761e-05, + "loss": 0.0463, + "mean_token_accuracy": 0.9833418965339661, + "num_tokens": 50339939.0, + "step": 20590 + }, + { + "entropy": 0.07467662245035171, + "epoch": 4.80079263317403, + "grad_norm": 2.96875, + "learning_rate": 4.9471505725625475e-05, + "loss": 0.0945, + "mean_token_accuracy": 0.9767918467521668, + "num_tokens": 50349137.0, + "step": 20595 + }, + { + "entropy": 0.09103550501167774, + "epoch": 4.801958270194661, + "grad_norm": 1.234375, + "learning_rate": 4.947106648806018e-05, + "loss": 0.1074, + "mean_token_accuracy": 0.9768331527709961, + "num_tokens": 50371173.0, + "step": 20600 + }, + { + "entropy": 0.07896652333438396, + "epoch": 4.803123907215293, + "grad_norm": 0.9296875, + "learning_rate": 4.947062707198829e-05, + "loss": 0.1035, + "mean_token_accuracy": 0.9723316550254821, + "num_tokens": 50397114.0, + "step": 20605 + }, + { + "entropy": 0.09035487687215209, + "epoch": 4.804289544235925, + "grad_norm": 2.734375, + "learning_rate": 4.947018747741633e-05, + "loss": 0.1021, + "mean_token_accuracy": 0.9717907607555389, + "num_tokens": 50426955.0, + "step": 20610 + }, + { + "entropy": 0.08067240752279758, + "epoch": 4.805455181256557, + "grad_norm": 1.5078125, + "learning_rate": 4.946974770435088e-05, + "loss": 0.0624, + "mean_token_accuracy": 0.974557638168335, + "num_tokens": 50446613.0, + "step": 20615 + }, + { + "entropy": 0.08309468347579241, + "epoch": 4.806620818277189, + "grad_norm": 1.03125, + "learning_rate": 4.946930775279848e-05, + "loss": 0.1288, + "mean_token_accuracy": 0.9713143050670624, + "num_tokens": 50457957.0, + "step": 20620 + }, + { + "entropy": 0.06522488570772111, + "epoch": 4.80778645529782, + "grad_norm": 1.3984375, + "learning_rate": 4.946886762276571e-05, + "loss": 0.0816, + "mean_token_accuracy": 0.976836746931076, + "num_tokens": 50479228.0, + "step": 20625 + }, + { + "entropy": 0.06677600918337703, + "epoch": 4.808952092318452, + "grad_norm": 2.03125, + "learning_rate": 4.946842731425911e-05, + "loss": 0.0917, + "mean_token_accuracy": 0.9727399408817291, + "num_tokens": 50503005.0, + "step": 20630 + }, + { + "entropy": 0.08386764228343964, + "epoch": 4.810117729339083, + "grad_norm": 5.59375, + "learning_rate": 4.9467986827285265e-05, + "loss": 0.1182, + "mean_token_accuracy": 0.9724303543567657, + "num_tokens": 50514092.0, + "step": 20635 + }, + { + "entropy": 0.05374229941517115, + "epoch": 4.811283366359715, + "grad_norm": 0.44140625, + "learning_rate": 4.946754616185073e-05, + "loss": 0.0518, + "mean_token_accuracy": 0.9885022401809692, + "num_tokens": 50554374.0, + "step": 20640 + }, + { + "entropy": 0.06895175650715828, + "epoch": 4.8124490033803475, + "grad_norm": 1.25, + "learning_rate": 4.946710531796209e-05, + "loss": 0.1335, + "mean_token_accuracy": 0.9721293866634368, + "num_tokens": 50565314.0, + "step": 20645 + }, + { + "entropy": 0.07366420496255159, + "epoch": 4.813614640400979, + "grad_norm": 1.59375, + "learning_rate": 4.94666642956259e-05, + "loss": 0.0786, + "mean_token_accuracy": 0.9837552666664123, + "num_tokens": 50577374.0, + "step": 20650 + }, + { + "entropy": 0.09850245006382466, + "epoch": 4.814780277421611, + "grad_norm": 1.359375, + "learning_rate": 4.946622309484875e-05, + "loss": 0.0968, + "mean_token_accuracy": 0.9730374455451966, + "num_tokens": 50587536.0, + "step": 20655 + }, + { + "entropy": 0.08902466334402562, + "epoch": 4.815945914442243, + "grad_norm": 4.34375, + "learning_rate": 4.9465781715637224e-05, + "loss": 0.1062, + "mean_token_accuracy": 0.9702731013298035, + "num_tokens": 50597004.0, + "step": 20660 + }, + { + "entropy": 0.06709796143695712, + "epoch": 4.817111551462874, + "grad_norm": 1.75, + "learning_rate": 4.946534015799789e-05, + "loss": 0.0396, + "mean_token_accuracy": 0.983069121837616, + "num_tokens": 50624783.0, + "step": 20665 + }, + { + "entropy": 0.06530861468054354, + "epoch": 4.818277188483506, + "grad_norm": 1.625, + "learning_rate": 4.946489842193733e-05, + "loss": 0.0506, + "mean_token_accuracy": 0.9851066172122955, + "num_tokens": 50652000.0, + "step": 20670 + }, + { + "entropy": 0.08375565335154533, + "epoch": 4.819442825504138, + "grad_norm": 2.578125, + "learning_rate": 4.946445650746214e-05, + "loss": 0.1049, + "mean_token_accuracy": 0.9721269071102142, + "num_tokens": 50661199.0, + "step": 20675 + }, + { + "entropy": 0.0636015109717846, + "epoch": 4.82060846252477, + "grad_norm": 1.453125, + "learning_rate": 4.946401441457891e-05, + "loss": 0.1162, + "mean_token_accuracy": 0.9724109470844269, + "num_tokens": 50671843.0, + "step": 20680 + }, + { + "entropy": 0.07089730594307184, + "epoch": 4.821774099545402, + "grad_norm": 2.890625, + "learning_rate": 4.946357214329423e-05, + "loss": 0.0719, + "mean_token_accuracy": 0.9801780402660369, + "num_tokens": 50686035.0, + "step": 20685 + }, + { + "entropy": 0.06434464119374753, + "epoch": 4.822939736566033, + "grad_norm": 5.28125, + "learning_rate": 4.9463129693614705e-05, + "loss": 0.0852, + "mean_token_accuracy": 0.9769482016563416, + "num_tokens": 50707631.0, + "step": 20690 + }, + { + "entropy": 0.07662344705313444, + "epoch": 4.824105373586665, + "grad_norm": 4.15625, + "learning_rate": 4.946268706554691e-05, + "loss": 0.1046, + "mean_token_accuracy": 0.9777352511882782, + "num_tokens": 50724343.0, + "step": 20695 + }, + { + "entropy": 0.06888558939099312, + "epoch": 4.825271010607297, + "grad_norm": 2.34375, + "learning_rate": 4.946224425909746e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.9821949362754822, + "num_tokens": 50745360.0, + "step": 20700 + }, + { + "entropy": 0.06832386320456862, + "epoch": 4.826436647627928, + "grad_norm": 0.349609375, + "learning_rate": 4.946180127427296e-05, + "loss": 0.0733, + "mean_token_accuracy": 0.980415141582489, + "num_tokens": 50771964.0, + "step": 20705 + }, + { + "entropy": 0.0478357121348381, + "epoch": 4.8276022846485604, + "grad_norm": 1.1875, + "learning_rate": 4.9461358111080015e-05, + "loss": 0.0616, + "mean_token_accuracy": 0.9851399242877961, + "num_tokens": 50815419.0, + "step": 20710 + }, + { + "entropy": 0.07840246148407459, + "epoch": 4.8287679216691926, + "grad_norm": 0.255859375, + "learning_rate": 4.946091476952522e-05, + "loss": 0.0939, + "mean_token_accuracy": 0.9758677661418915, + "num_tokens": 50839360.0, + "step": 20715 + }, + { + "entropy": 0.12673660293221473, + "epoch": 4.829933558689824, + "grad_norm": 4.1875, + "learning_rate": 4.94604712496152e-05, + "loss": 0.1517, + "mean_token_accuracy": 0.963898116350174, + "num_tokens": 50855945.0, + "step": 20720 + }, + { + "entropy": 0.04601201200857759, + "epoch": 4.831099195710456, + "grad_norm": 0.41796875, + "learning_rate": 4.9460027551356566e-05, + "loss": 0.0403, + "mean_token_accuracy": 0.9830052733421326, + "num_tokens": 50883110.0, + "step": 20725 + }, + { + "entropy": 0.06605788934975862, + "epoch": 4.832264832731088, + "grad_norm": 0.333984375, + "learning_rate": 4.945958367475593e-05, + "loss": 0.0815, + "mean_token_accuracy": 0.9799921751022339, + "num_tokens": 50905024.0, + "step": 20730 + }, + { + "entropy": 0.06048436416313052, + "epoch": 4.833430469751719, + "grad_norm": 1.875, + "learning_rate": 4.945913961981992e-05, + "loss": 0.0724, + "mean_token_accuracy": 0.9796711564064026, + "num_tokens": 50920962.0, + "step": 20735 + }, + { + "entropy": 0.08121685888618231, + "epoch": 4.834596106772351, + "grad_norm": 1.03125, + "learning_rate": 4.9458695386555135e-05, + "loss": 0.0782, + "mean_token_accuracy": 0.9797780096530915, + "num_tokens": 50937892.0, + "step": 20740 + }, + { + "entropy": 0.0714781729504466, + "epoch": 4.8357617437929825, + "grad_norm": 3.59375, + "learning_rate": 4.945825097496823e-05, + "loss": 0.0972, + "mean_token_accuracy": 0.975777804851532, + "num_tokens": 50950534.0, + "step": 20745 + }, + { + "entropy": 0.050937081407755616, + "epoch": 4.836927380813615, + "grad_norm": 0.5859375, + "learning_rate": 4.945780638506581e-05, + "loss": 0.0491, + "mean_token_accuracy": 0.9856567502021789, + "num_tokens": 50983069.0, + "step": 20750 + }, + { + "entropy": 0.05653784843161702, + "epoch": 4.838093017834247, + "grad_norm": 2.125, + "learning_rate": 4.945736161685451e-05, + "loss": 0.0419, + "mean_token_accuracy": 0.9845134735107421, + "num_tokens": 51009508.0, + "step": 20755 + }, + { + "entropy": 0.06965322587639093, + "epoch": 4.839258654854878, + "grad_norm": 1.203125, + "learning_rate": 4.945691667034096e-05, + "loss": 0.1131, + "mean_token_accuracy": 0.968089246749878, + "num_tokens": 51029970.0, + "step": 20760 + }, + { + "entropy": 0.04931186120957136, + "epoch": 4.84042429187551, + "grad_norm": 0.57421875, + "learning_rate": 4.9456471545531804e-05, + "loss": 0.0373, + "mean_token_accuracy": 0.9879566371440888, + "num_tokens": 51050398.0, + "step": 20765 + }, + { + "entropy": 0.06215945780277252, + "epoch": 4.841589928896141, + "grad_norm": 1.2578125, + "learning_rate": 4.945602624243368e-05, + "loss": 0.0886, + "mean_token_accuracy": 0.9776066720485688, + "num_tokens": 51063243.0, + "step": 20770 + }, + { + "entropy": 0.08728420175611973, + "epoch": 4.842755565916773, + "grad_norm": 4.15625, + "learning_rate": 4.945558076105321e-05, + "loss": 0.0949, + "mean_token_accuracy": 0.9721355080604553, + "num_tokens": 51073906.0, + "step": 20775 + }, + { + "entropy": 0.06952288392931223, + "epoch": 4.8439212029374055, + "grad_norm": 0.333984375, + "learning_rate": 4.9455135101397056e-05, + "loss": 0.0831, + "mean_token_accuracy": 0.9805684685707092, + "num_tokens": 51100643.0, + "step": 20780 + }, + { + "entropy": 0.04499766388908029, + "epoch": 4.845086839958037, + "grad_norm": 0.625, + "learning_rate": 4.945468926347185e-05, + "loss": 0.0246, + "mean_token_accuracy": 0.9909033119678498, + "num_tokens": 51136680.0, + "step": 20785 + }, + { + "entropy": 0.04354067463427782, + "epoch": 4.846252476978669, + "grad_norm": 0.3359375, + "learning_rate": 4.945424324728425e-05, + "loss": 0.0345, + "mean_token_accuracy": 0.9890659034252167, + "num_tokens": 51159069.0, + "step": 20790 + }, + { + "entropy": 0.08422119580209256, + "epoch": 4.847418113999301, + "grad_norm": 1.0625, + "learning_rate": 4.945379705284091e-05, + "loss": 0.0883, + "mean_token_accuracy": 0.9788319587707519, + "num_tokens": 51170620.0, + "step": 20795 + }, + { + "entropy": 0.07503297440707683, + "epoch": 4.848583751019932, + "grad_norm": 4.46875, + "learning_rate": 4.945335068014847e-05, + "loss": 0.1176, + "mean_token_accuracy": 0.9673508882522583, + "num_tokens": 51180690.0, + "step": 20800 + }, + { + "entropy": 0.09586451314389706, + "epoch": 4.849749388040564, + "grad_norm": 1.3828125, + "learning_rate": 4.945290412921359e-05, + "loss": 0.1468, + "mean_token_accuracy": 0.9734158456325531, + "num_tokens": 51213063.0, + "step": 20805 + }, + { + "entropy": 0.06957128308713437, + "epoch": 4.850915025061196, + "grad_norm": 2.09375, + "learning_rate": 4.945245740004293e-05, + "loss": 0.0854, + "mean_token_accuracy": 0.9737328290939331, + "num_tokens": 51233378.0, + "step": 20810 + }, + { + "entropy": 0.04465410923585296, + "epoch": 4.8520806620818275, + "grad_norm": 0.66796875, + "learning_rate": 4.9452010492643165e-05, + "loss": 0.0513, + "mean_token_accuracy": 0.9855859398841857, + "num_tokens": 51257918.0, + "step": 20815 + }, + { + "entropy": 0.06682367827743292, + "epoch": 4.85324629910246, + "grad_norm": 1.671875, + "learning_rate": 4.945156340702093e-05, + "loss": 0.0605, + "mean_token_accuracy": 0.9807843506336212, + "num_tokens": 51278448.0, + "step": 20820 + }, + { + "entropy": 0.09898395230993629, + "epoch": 4.854411936123091, + "grad_norm": 0.32421875, + "learning_rate": 4.945111614318292e-05, + "loss": 0.0838, + "mean_token_accuracy": 0.9787111759185791, + "num_tokens": 51297431.0, + "step": 20825 + }, + { + "entropy": 0.07027254728600382, + "epoch": 4.855577573143723, + "grad_norm": 3.46875, + "learning_rate": 4.945066870113579e-05, + "loss": 0.0683, + "mean_token_accuracy": 0.9835409700870514, + "num_tokens": 51320978.0, + "step": 20830 + }, + { + "entropy": 0.0634617348201573, + "epoch": 4.856743210164355, + "grad_norm": 1.0234375, + "learning_rate": 4.945022108088621e-05, + "loss": 0.0545, + "mean_token_accuracy": 0.9819484114646911, + "num_tokens": 51348363.0, + "step": 20835 + }, + { + "entropy": 0.06744776256382465, + "epoch": 4.857908847184986, + "grad_norm": 4.59375, + "learning_rate": 4.944977328244086e-05, + "loss": 0.0571, + "mean_token_accuracy": 0.9829931437969208, + "num_tokens": 51369185.0, + "step": 20840 + }, + { + "entropy": 0.05282375058159232, + "epoch": 4.859074484205618, + "grad_norm": 0.466796875, + "learning_rate": 4.944932530580643e-05, + "loss": 0.0313, + "mean_token_accuracy": 0.98763307929039, + "num_tokens": 51400901.0, + "step": 20845 + }, + { + "entropy": 0.06535831596702338, + "epoch": 4.8602401212262505, + "grad_norm": 0.81640625, + "learning_rate": 4.9448877150989576e-05, + "loss": 0.0602, + "mean_token_accuracy": 0.9842148721218109, + "num_tokens": 51422228.0, + "step": 20850 + }, + { + "entropy": 0.062378438748419286, + "epoch": 4.861405758246882, + "grad_norm": 0.85546875, + "learning_rate": 4.944842881799699e-05, + "loss": 0.0462, + "mean_token_accuracy": 0.9865955471992492, + "num_tokens": 51439003.0, + "step": 20855 + }, + { + "entropy": 0.07847417313605547, + "epoch": 4.862571395267514, + "grad_norm": 4.875, + "learning_rate": 4.9447980306835364e-05, + "loss": 0.0648, + "mean_token_accuracy": 0.9656040012836457, + "num_tokens": 51466913.0, + "step": 20860 + }, + { + "entropy": 0.07198535539209842, + "epoch": 4.863737032288146, + "grad_norm": 3.609375, + "learning_rate": 4.944753161751138e-05, + "loss": 0.0434, + "mean_token_accuracy": 0.9815111875534057, + "num_tokens": 51496979.0, + "step": 20865 + }, + { + "entropy": 0.16263010874390602, + "epoch": 4.864902669308777, + "grad_norm": 0.55078125, + "learning_rate": 4.9447082750031724e-05, + "loss": 0.3119, + "mean_token_accuracy": 0.9357062309980393, + "num_tokens": 51539514.0, + "step": 20870 + }, + { + "entropy": 0.08168750740587712, + "epoch": 4.866068306329409, + "grad_norm": 3.96875, + "learning_rate": 4.94466337044031e-05, + "loss": 0.1601, + "mean_token_accuracy": 0.9629373013973236, + "num_tokens": 51548829.0, + "step": 20875 + }, + { + "entropy": 0.06368317557498812, + "epoch": 4.8672339433500404, + "grad_norm": 0.9453125, + "learning_rate": 4.9446184480632194e-05, + "loss": 0.0763, + "mean_token_accuracy": 0.9779473185539246, + "num_tokens": 51573706.0, + "step": 20880 + }, + { + "entropy": 0.07042047139257193, + "epoch": 4.8683995803706726, + "grad_norm": 1.7890625, + "learning_rate": 4.944573507872572e-05, + "loss": 0.0907, + "mean_token_accuracy": 0.9742065310478211, + "num_tokens": 51586256.0, + "step": 20885 + }, + { + "entropy": 0.07600197829306125, + "epoch": 4.869565217391305, + "grad_norm": 0.890625, + "learning_rate": 4.944528549869036e-05, + "loss": 0.1134, + "mean_token_accuracy": 0.9720952391624451, + "num_tokens": 51597109.0, + "step": 20890 + }, + { + "entropy": 0.10058935396373272, + "epoch": 4.870730854411936, + "grad_norm": 2.546875, + "learning_rate": 4.9444835740532825e-05, + "loss": 0.1386, + "mean_token_accuracy": 0.968506908416748, + "num_tokens": 51604876.0, + "step": 20895 + }, + { + "entropy": 0.04899873323738575, + "epoch": 4.871896491432568, + "grad_norm": 0.44921875, + "learning_rate": 4.9444385804259826e-05, + "loss": 0.0424, + "mean_token_accuracy": 0.9887868106365204, + "num_tokens": 51637614.0, + "step": 20900 + }, + { + "entropy": 0.09722777623683214, + "epoch": 4.873062128453199, + "grad_norm": 0.30859375, + "learning_rate": 4.944393568987807e-05, + "loss": 0.0429, + "mean_token_accuracy": 0.9840863108634949, + "num_tokens": 51671476.0, + "step": 20905 + }, + { + "entropy": 0.07712397929280997, + "epoch": 4.874227765473831, + "grad_norm": 0.5703125, + "learning_rate": 4.944348539739427e-05, + "loss": 0.0866, + "mean_token_accuracy": 0.9797841608524323, + "num_tokens": 51684155.0, + "step": 20910 + }, + { + "entropy": 0.0837567220441997, + "epoch": 4.875393402494463, + "grad_norm": 4.625, + "learning_rate": 4.944303492681514e-05, + "loss": 0.0708, + "mean_token_accuracy": 0.9777788579463959, + "num_tokens": 51707822.0, + "step": 20915 + }, + { + "entropy": 0.1396466862410307, + "epoch": 4.876559039515095, + "grad_norm": 2.0625, + "learning_rate": 4.944258427814739e-05, + "loss": 0.2607, + "mean_token_accuracy": 0.9586719572544098, + "num_tokens": 51734253.0, + "step": 20920 + }, + { + "entropy": 0.0678241515532136, + "epoch": 4.877724676535727, + "grad_norm": 8.9375, + "learning_rate": 4.9442133451397745e-05, + "loss": 0.0741, + "mean_token_accuracy": 0.9762378096580505, + "num_tokens": 51760598.0, + "step": 20925 + }, + { + "entropy": 0.07093913480639458, + "epoch": 4.878890313556359, + "grad_norm": 1.4375, + "learning_rate": 4.944168244657293e-05, + "loss": 0.0865, + "mean_token_accuracy": 0.9774516761302948, + "num_tokens": 51772195.0, + "step": 20930 + }, + { + "entropy": 0.05839260285720229, + "epoch": 4.88005595057699, + "grad_norm": 2.96875, + "learning_rate": 4.9441231263679664e-05, + "loss": 0.0666, + "mean_token_accuracy": 0.9820129334926605, + "num_tokens": 51795228.0, + "step": 20935 + }, + { + "entropy": 0.07823163829743862, + "epoch": 4.881221587597622, + "grad_norm": 7.625, + "learning_rate": 4.9440779902724684e-05, + "loss": 0.1215, + "mean_token_accuracy": 0.9703441679477691, + "num_tokens": 51815122.0, + "step": 20940 + }, + { + "entropy": 0.05640836558304727, + "epoch": 4.882387224618254, + "grad_norm": 0.953125, + "learning_rate": 4.944032836371472e-05, + "loss": 0.0493, + "mean_token_accuracy": 0.9837121963500977, + "num_tokens": 51840298.0, + "step": 20945 + }, + { + "entropy": 0.06406810432672501, + "epoch": 4.8835528616388855, + "grad_norm": 3.0, + "learning_rate": 4.94398766466565e-05, + "loss": 0.104, + "mean_token_accuracy": 0.9748159289360047, + "num_tokens": 51850125.0, + "step": 20950 + }, + { + "entropy": 0.06545968130230903, + "epoch": 4.884718498659518, + "grad_norm": 2.140625, + "learning_rate": 4.9439424751556754e-05, + "loss": 0.0919, + "mean_token_accuracy": 0.9777773797512055, + "num_tokens": 51861624.0, + "step": 20955 + }, + { + "entropy": 0.059191903471946715, + "epoch": 4.885884135680149, + "grad_norm": 0.40625, + "learning_rate": 4.943897267842223e-05, + "loss": 0.0471, + "mean_token_accuracy": 0.9841900050640107, + "num_tokens": 51885430.0, + "step": 20960 + }, + { + "entropy": 0.08465072922408581, + "epoch": 4.887049772700781, + "grad_norm": 1.1875, + "learning_rate": 4.9438520427259674e-05, + "loss": 0.0909, + "mean_token_accuracy": 0.974427980184555, + "num_tokens": 51920408.0, + "step": 20965 + }, + { + "entropy": 0.07176385279744864, + "epoch": 4.888215409721413, + "grad_norm": 1.5703125, + "learning_rate": 4.943806799807581e-05, + "loss": 0.0573, + "mean_token_accuracy": 0.9810054063796997, + "num_tokens": 51948620.0, + "step": 20970 + }, + { + "entropy": 0.058643194288015364, + "epoch": 4.889381046742044, + "grad_norm": 1.7734375, + "learning_rate": 4.9437615390877404e-05, + "loss": 0.054, + "mean_token_accuracy": 0.9795984923839569, + "num_tokens": 51971871.0, + "step": 20975 + }, + { + "entropy": 0.07647083997726441, + "epoch": 4.890546683762676, + "grad_norm": 0.90625, + "learning_rate": 4.943716260567119e-05, + "loss": 0.0915, + "mean_token_accuracy": 0.9773392856121064, + "num_tokens": 51982972.0, + "step": 20980 + }, + { + "entropy": 0.06268258430063725, + "epoch": 4.891712320783308, + "grad_norm": 1.8984375, + "learning_rate": 4.943670964246394e-05, + "loss": 0.0938, + "mean_token_accuracy": 0.9779832363128662, + "num_tokens": 51994332.0, + "step": 20985 + }, + { + "entropy": 0.05670452257618308, + "epoch": 4.89287795780394, + "grad_norm": 0.45703125, + "learning_rate": 4.943625650126238e-05, + "loss": 0.055, + "mean_token_accuracy": 0.9824697732925415, + "num_tokens": 52012043.0, + "step": 20990 + }, + { + "entropy": 0.09376739151775837, + "epoch": 4.894043594824572, + "grad_norm": 0.6328125, + "learning_rate": 4.943580318207329e-05, + "loss": 0.102, + "mean_token_accuracy": 0.972071361541748, + "num_tokens": 52034314.0, + "step": 20995 + }, + { + "entropy": 0.06972389779984951, + "epoch": 4.895209231845204, + "grad_norm": 1.3203125, + "learning_rate": 4.943534968490342e-05, + "loss": 0.1101, + "mean_token_accuracy": 0.9718354284763336, + "num_tokens": 52045098.0, + "step": 21000 + }, + { + "entropy": 0.06242169812321663, + "epoch": 4.896374868865835, + "grad_norm": 4.09375, + "learning_rate": 4.943489600975953e-05, + "loss": 0.0828, + "mean_token_accuracy": 0.9797560632228851, + "num_tokens": 52068145.0, + "step": 21005 + }, + { + "entropy": 0.06226806389167905, + "epoch": 4.897540505886467, + "grad_norm": 6.9375, + "learning_rate": 4.94344421566484e-05, + "loss": 0.0678, + "mean_token_accuracy": 0.9791005849838257, + "num_tokens": 52097424.0, + "step": 21010 + }, + { + "entropy": 0.08740999130532146, + "epoch": 4.898706142907098, + "grad_norm": 7.84375, + "learning_rate": 4.9433988125576783e-05, + "loss": 0.1157, + "mean_token_accuracy": 0.9717265069484711, + "num_tokens": 52138832.0, + "step": 21015 + }, + { + "entropy": 0.061702551785856485, + "epoch": 4.8998717799277305, + "grad_norm": 3.109375, + "learning_rate": 4.943353391655145e-05, + "loss": 0.0524, + "mean_token_accuracy": 0.9776601493358612, + "num_tokens": 52164329.0, + "step": 21020 + }, + { + "entropy": 0.06630434533581138, + "epoch": 4.901037416948363, + "grad_norm": 0.1982421875, + "learning_rate": 4.943307952957918e-05, + "loss": 0.0794, + "mean_token_accuracy": 0.9799892425537109, + "num_tokens": 52179810.0, + "step": 21025 + }, + { + "entropy": 0.08890583254396915, + "epoch": 4.902203053968994, + "grad_norm": 1.0546875, + "learning_rate": 4.943262496466675e-05, + "loss": 0.1093, + "mean_token_accuracy": 0.9698486030101776, + "num_tokens": 52194861.0, + "step": 21030 + }, + { + "entropy": 0.0646402221173048, + "epoch": 4.903368690989626, + "grad_norm": 3.703125, + "learning_rate": 4.943217022182093e-05, + "loss": 0.0697, + "mean_token_accuracy": 0.9834041774272919, + "num_tokens": 52207180.0, + "step": 21035 + }, + { + "entropy": 0.06280097691342235, + "epoch": 4.904534328010257, + "grad_norm": 2.453125, + "learning_rate": 4.9431715301048504e-05, + "loss": 0.0596, + "mean_token_accuracy": 0.9817287743091583, + "num_tokens": 52232325.0, + "step": 21040 + }, + { + "entropy": 0.07492497358471155, + "epoch": 4.905699965030889, + "grad_norm": 0.97265625, + "learning_rate": 4.943126020235626e-05, + "loss": 0.0795, + "mean_token_accuracy": 0.974094831943512, + "num_tokens": 52244692.0, + "step": 21045 + }, + { + "entropy": 0.07054590750485659, + "epoch": 4.906865602051521, + "grad_norm": 0.75, + "learning_rate": 4.943080492575097e-05, + "loss": 0.1018, + "mean_token_accuracy": 0.9751786947250366, + "num_tokens": 52257785.0, + "step": 21050 + }, + { + "entropy": 0.06916769053786993, + "epoch": 4.9080312390721526, + "grad_norm": 1.625, + "learning_rate": 4.943034947123944e-05, + "loss": 0.0913, + "mean_token_accuracy": 0.9798057615756989, + "num_tokens": 52269539.0, + "step": 21055 + }, + { + "entropy": 0.04995636437088251, + "epoch": 4.909196876092785, + "grad_norm": 0.58203125, + "learning_rate": 4.9429893838828464e-05, + "loss": 0.0399, + "mean_token_accuracy": 0.981740140914917, + "num_tokens": 52319714.0, + "step": 21060 + }, + { + "entropy": 0.08283067289739847, + "epoch": 4.910362513113417, + "grad_norm": 2.59375, + "learning_rate": 4.942943802852482e-05, + "loss": 0.0966, + "mean_token_accuracy": 0.9736198604106903, + "num_tokens": 52332795.0, + "step": 21065 + }, + { + "entropy": 0.14895931966602802, + "epoch": 4.911528150134048, + "grad_norm": 1.953125, + "learning_rate": 4.942898204033532e-05, + "loss": 0.2523, + "mean_token_accuracy": 0.9456245481967926, + "num_tokens": 52351199.0, + "step": 21070 + }, + { + "entropy": 0.06533634569495916, + "epoch": 4.91269378715468, + "grad_norm": 2.59375, + "learning_rate": 4.942852587426674e-05, + "loss": 0.0497, + "mean_token_accuracy": 0.9837863922119141, + "num_tokens": 52376245.0, + "step": 21075 + }, + { + "entropy": 0.10989860650151968, + "epoch": 4.913859424175312, + "grad_norm": 0.984375, + "learning_rate": 4.94280695303259e-05, + "loss": 0.1259, + "mean_token_accuracy": 0.9637296438217163, + "num_tokens": 52436860.0, + "step": 21080 + }, + { + "entropy": 0.08045022562146187, + "epoch": 4.915025061195943, + "grad_norm": 3.03125, + "learning_rate": 4.9427613008519616e-05, + "loss": 0.0736, + "mean_token_accuracy": 0.9755086541175843, + "num_tokens": 52452073.0, + "step": 21085 + }, + { + "entropy": 0.0864185519516468, + "epoch": 4.9161906982165755, + "grad_norm": 2.53125, + "learning_rate": 4.9427156308854674e-05, + "loss": 0.1074, + "mean_token_accuracy": 0.9733963072299957, + "num_tokens": 52461200.0, + "step": 21090 + }, + { + "entropy": 0.09636174971237779, + "epoch": 4.917356335237207, + "grad_norm": 1.265625, + "learning_rate": 4.942669943133789e-05, + "loss": 0.1164, + "mean_token_accuracy": 0.9673805236816406, + "num_tokens": 52477657.0, + "step": 21095 + }, + { + "entropy": 0.08892916599288583, + "epoch": 4.918521972257839, + "grad_norm": 0.88671875, + "learning_rate": 4.9426242375976085e-05, + "loss": 0.1248, + "mean_token_accuracy": 0.9690983772277832, + "num_tokens": 52495784.0, + "step": 21100 + }, + { + "entropy": 0.07809329750016332, + "epoch": 4.919687609278471, + "grad_norm": 0.3125, + "learning_rate": 4.942578514277606e-05, + "loss": 0.1099, + "mean_token_accuracy": 0.9738022804260253, + "num_tokens": 52512573.0, + "step": 21105 + }, + { + "entropy": 0.04925914811901748, + "epoch": 4.920853246299102, + "grad_norm": 4.71875, + "learning_rate": 4.9425327731744644e-05, + "loss": 0.0413, + "mean_token_accuracy": 0.9875961005687713, + "num_tokens": 52550932.0, + "step": 21110 + }, + { + "entropy": 0.08235758668743073, + "epoch": 4.922018883319734, + "grad_norm": 0.41015625, + "learning_rate": 4.942487014288866e-05, + "loss": 0.0886, + "mean_token_accuracy": 0.9814740777015686, + "num_tokens": 52587741.0, + "step": 21115 + }, + { + "entropy": 0.06665782891213894, + "epoch": 4.923184520340366, + "grad_norm": 1.578125, + "learning_rate": 4.9424412376214915e-05, + "loss": 0.0696, + "mean_token_accuracy": 0.977788758277893, + "num_tokens": 52600393.0, + "step": 21120 + }, + { + "entropy": 0.11595227997750043, + "epoch": 4.924350157360998, + "grad_norm": 1.8359375, + "learning_rate": 4.9423954431730254e-05, + "loss": 0.0991, + "mean_token_accuracy": 0.9738423705101014, + "num_tokens": 52610343.0, + "step": 21125 + }, + { + "entropy": 0.08311420790851116, + "epoch": 4.92551579438163, + "grad_norm": 1.3671875, + "learning_rate": 4.94234963094415e-05, + "loss": 0.0999, + "mean_token_accuracy": 0.9729685187339783, + "num_tokens": 52628675.0, + "step": 21130 + }, + { + "entropy": 0.06154662910848856, + "epoch": 4.926681431402262, + "grad_norm": 0.45703125, + "learning_rate": 4.9423038009355474e-05, + "loss": 0.0643, + "mean_token_accuracy": 0.9839459717273712, + "num_tokens": 52648559.0, + "step": 21135 + }, + { + "entropy": 0.06962651396170258, + "epoch": 4.927847068422893, + "grad_norm": 1.0859375, + "learning_rate": 4.942257953147903e-05, + "loss": 0.0914, + "mean_token_accuracy": 0.9786814033985138, + "num_tokens": 52670788.0, + "step": 21140 + }, + { + "entropy": 0.06154291275888681, + "epoch": 4.929012705443525, + "grad_norm": 1.828125, + "learning_rate": 4.9422120875818986e-05, + "loss": 0.0548, + "mean_token_accuracy": 0.9831968486309052, + "num_tokens": 52695532.0, + "step": 21145 + }, + { + "entropy": 0.0829665838740766, + "epoch": 4.930178342464156, + "grad_norm": 1.25, + "learning_rate": 4.9421662042382185e-05, + "loss": 0.1213, + "mean_token_accuracy": 0.9721298694610596, + "num_tokens": 52713871.0, + "step": 21150 + }, + { + "entropy": 0.10626598820090294, + "epoch": 4.931343979484788, + "grad_norm": 6.25, + "learning_rate": 4.942120303117547e-05, + "loss": 0.1169, + "mean_token_accuracy": 0.9673919379711151, + "num_tokens": 52741429.0, + "step": 21155 + }, + { + "entropy": 0.07647773548960686, + "epoch": 4.9325096165054205, + "grad_norm": 1.6875, + "learning_rate": 4.94207438422057e-05, + "loss": 0.0673, + "mean_token_accuracy": 0.9810298800468444, + "num_tokens": 52755321.0, + "step": 21160 + }, + { + "entropy": 0.05966353416442871, + "epoch": 4.933675253526052, + "grad_norm": 2.390625, + "learning_rate": 4.9420284475479697e-05, + "loss": 0.065, + "mean_token_accuracy": 0.9799255132675171, + "num_tokens": 52780757.0, + "step": 21165 + }, + { + "entropy": 0.07079061567783355, + "epoch": 4.934840890546684, + "grad_norm": 3.4375, + "learning_rate": 4.941982493100433e-05, + "loss": 0.0882, + "mean_token_accuracy": 0.9788235485553741, + "num_tokens": 52791577.0, + "step": 21170 + }, + { + "entropy": 0.07353595411404967, + "epoch": 4.936006527567315, + "grad_norm": 1.125, + "learning_rate": 4.941936520878644e-05, + "loss": 0.0684, + "mean_token_accuracy": 0.982946801185608, + "num_tokens": 52821422.0, + "step": 21175 + }, + { + "entropy": 0.05350867630913854, + "epoch": 4.937172164587947, + "grad_norm": 3.0, + "learning_rate": 4.9418905308832884e-05, + "loss": 0.0615, + "mean_token_accuracy": 0.9831634402275086, + "num_tokens": 52841396.0, + "step": 21180 + }, + { + "entropy": 0.07068223357200623, + "epoch": 4.938337801608579, + "grad_norm": 0.37890625, + "learning_rate": 4.941844523115053e-05, + "loss": 0.066, + "mean_token_accuracy": 0.9852615475654602, + "num_tokens": 52853780.0, + "step": 21185 + }, + { + "entropy": 0.14305189084261655, + "epoch": 4.9395034386292105, + "grad_norm": 4.59375, + "learning_rate": 4.941798497574623e-05, + "loss": 0.1897, + "mean_token_accuracy": 0.9685503423213959, + "num_tokens": 52889834.0, + "step": 21190 + }, + { + "entropy": 0.05313537791371346, + "epoch": 4.940669075649843, + "grad_norm": 0.345703125, + "learning_rate": 4.941752454262685e-05, + "loss": 0.0439, + "mean_token_accuracy": 0.985480272769928, + "num_tokens": 52918093.0, + "step": 21195 + }, + { + "entropy": 0.07132979445159435, + "epoch": 4.941834712670475, + "grad_norm": 2.59375, + "learning_rate": 4.9417063931799245e-05, + "loss": 0.1044, + "mean_token_accuracy": 0.9747491538524627, + "num_tokens": 52927212.0, + "step": 21200 + }, + { + "entropy": 0.058989010052755475, + "epoch": 4.943000349691106, + "grad_norm": 0.59765625, + "learning_rate": 4.94166031432703e-05, + "loss": 0.0747, + "mean_token_accuracy": 0.9773088097572327, + "num_tokens": 52952905.0, + "step": 21205 + }, + { + "entropy": 0.06833735201507807, + "epoch": 4.944165986711738, + "grad_norm": 0.6875, + "learning_rate": 4.941614217704688e-05, + "loss": 0.0705, + "mean_token_accuracy": 0.9795933485031127, + "num_tokens": 52976311.0, + "step": 21210 + }, + { + "entropy": 0.07373546492308378, + "epoch": 4.94533162373237, + "grad_norm": 1.5390625, + "learning_rate": 4.941568103313585e-05, + "loss": 0.0805, + "mean_token_accuracy": 0.9760990738868713, + "num_tokens": 52990179.0, + "step": 21215 + }, + { + "entropy": 0.0755191221833229, + "epoch": 4.946497260753001, + "grad_norm": 5.9375, + "learning_rate": 4.9415219711544104e-05, + "loss": 0.0962, + "mean_token_accuracy": 0.9685922086238861, + "num_tokens": 53008541.0, + "step": 21220 + }, + { + "entropy": 0.07566210273653269, + "epoch": 4.947662897773633, + "grad_norm": 2.984375, + "learning_rate": 4.94147582122785e-05, + "loss": 0.1012, + "mean_token_accuracy": 0.973409628868103, + "num_tokens": 53019565.0, + "step": 21225 + }, + { + "entropy": 0.07277205716818572, + "epoch": 4.948828534794265, + "grad_norm": 0.67578125, + "learning_rate": 4.941429653534594e-05, + "loss": 0.0632, + "mean_token_accuracy": 0.9744443714618682, + "num_tokens": 53042532.0, + "step": 21230 + }, + { + "entropy": 0.0649688365869224, + "epoch": 4.949994171814897, + "grad_norm": 2.3125, + "learning_rate": 4.9413834680753296e-05, + "loss": 0.0657, + "mean_token_accuracy": 0.9841285765171051, + "num_tokens": 53063708.0, + "step": 21235 + }, + { + "entropy": 0.06395247289910913, + "epoch": 4.951159808835529, + "grad_norm": 1.265625, + "learning_rate": 4.941337264850745e-05, + "loss": 0.0476, + "mean_token_accuracy": 0.9863583087921143, + "num_tokens": 53104339.0, + "step": 21240 + }, + { + "entropy": 0.07488085981458426, + "epoch": 4.95232544585616, + "grad_norm": 3.25, + "learning_rate": 4.941291043861531e-05, + "loss": 0.1062, + "mean_token_accuracy": 0.9732397139072418, + "num_tokens": 53122662.0, + "step": 21245 + }, + { + "entropy": 0.06674009589478373, + "epoch": 4.953491082876792, + "grad_norm": 3.65625, + "learning_rate": 4.941244805108375e-05, + "loss": 0.059, + "mean_token_accuracy": 0.9823914647102356, + "num_tokens": 53149847.0, + "step": 21250 + }, + { + "entropy": 0.08388477927073837, + "epoch": 4.954656719897424, + "grad_norm": 2.71875, + "learning_rate": 4.9411985485919674e-05, + "loss": 0.0803, + "mean_token_accuracy": 0.9737024128437042, + "num_tokens": 53183327.0, + "step": 21255 + }, + { + "entropy": 0.07561515085399151, + "epoch": 4.9558223569180555, + "grad_norm": 1.859375, + "learning_rate": 4.941152274312998e-05, + "loss": 0.0669, + "mean_token_accuracy": 0.9786112248897553, + "num_tokens": 53208441.0, + "step": 21260 + }, + { + "entropy": 0.05005005598068237, + "epoch": 4.956987993938688, + "grad_norm": 0.45703125, + "learning_rate": 4.941105982272156e-05, + "loss": 0.049, + "mean_token_accuracy": 0.9870032131671905, + "num_tokens": 53225543.0, + "step": 21265 + }, + { + "entropy": 0.07580123171210289, + "epoch": 4.95815363095932, + "grad_norm": 5.90625, + "learning_rate": 4.941059672470133e-05, + "loss": 0.1021, + "mean_token_accuracy": 0.9714035093784332, + "num_tokens": 53246905.0, + "step": 21270 + }, + { + "entropy": 0.05712772505357862, + "epoch": 4.959319267979951, + "grad_norm": 0.31640625, + "learning_rate": 4.941013344907619e-05, + "loss": 0.0367, + "mean_token_accuracy": 0.9858934164047242, + "num_tokens": 53276952.0, + "step": 21275 + }, + { + "entropy": 0.0628195583820343, + "epoch": 4.960484905000583, + "grad_norm": 0.86328125, + "learning_rate": 4.9409669995853035e-05, + "loss": 0.0744, + "mean_token_accuracy": 0.9781266987323761, + "num_tokens": 53292563.0, + "step": 21280 + }, + { + "entropy": 0.07548150643706322, + "epoch": 4.961650542021214, + "grad_norm": 4.6875, + "learning_rate": 4.9409206365038794e-05, + "loss": 0.0886, + "mean_token_accuracy": 0.9792620122432709, + "num_tokens": 53313173.0, + "step": 21285 + }, + { + "entropy": 0.06093268636614084, + "epoch": 4.962816179041846, + "grad_norm": 1.296875, + "learning_rate": 4.940874255664037e-05, + "loss": 0.0688, + "mean_token_accuracy": 0.9816692888736724, + "num_tokens": 53329420.0, + "step": 21290 + }, + { + "entropy": 0.09625336267054081, + "epoch": 4.9639818160624785, + "grad_norm": 3.953125, + "learning_rate": 4.940827857066469e-05, + "loss": 0.1071, + "mean_token_accuracy": 0.9731559336185456, + "num_tokens": 53345471.0, + "step": 21295 + }, + { + "entropy": 0.04683486856520176, + "epoch": 4.96514745308311, + "grad_norm": 2.546875, + "learning_rate": 4.940781440711866e-05, + "loss": 0.0374, + "mean_token_accuracy": 0.9829618215560914, + "num_tokens": 53384019.0, + "step": 21300 + }, + { + "entropy": 0.08568495102226734, + "epoch": 4.966313090103742, + "grad_norm": 0.4921875, + "learning_rate": 4.9407350066009204e-05, + "loss": 0.081, + "mean_token_accuracy": 0.9735833466053009, + "num_tokens": 53398294.0, + "step": 21305 + }, + { + "entropy": 0.05991802159696817, + "epoch": 4.967478727124373, + "grad_norm": 3.453125, + "learning_rate": 4.940688554734326e-05, + "loss": 0.0576, + "mean_token_accuracy": 0.9740480959415436, + "num_tokens": 53430339.0, + "step": 21310 + }, + { + "entropy": 0.1412233560346067, + "epoch": 4.968644364145005, + "grad_norm": 0.6015625, + "learning_rate": 4.940642085112773e-05, + "loss": 0.2138, + "mean_token_accuracy": 0.9502263188362121, + "num_tokens": 53471198.0, + "step": 21315 + }, + { + "entropy": 0.04991894001141191, + "epoch": 4.969810001165637, + "grad_norm": 1.390625, + "learning_rate": 4.9405955977369564e-05, + "loss": 0.0592, + "mean_token_accuracy": 0.9838922679424286, + "num_tokens": 53490441.0, + "step": 21320 + }, + { + "entropy": 0.07745058406144381, + "epoch": 4.970975638186268, + "grad_norm": 0.4140625, + "learning_rate": 4.940549092607569e-05, + "loss": 0.1122, + "mean_token_accuracy": 0.9719259083271027, + "num_tokens": 53506639.0, + "step": 21325 + }, + { + "entropy": 0.09155577477067708, + "epoch": 4.9721412752069005, + "grad_norm": 4.75, + "learning_rate": 4.940502569725303e-05, + "loss": 0.1207, + "mean_token_accuracy": 0.9676483154296875, + "num_tokens": 53520060.0, + "step": 21330 + }, + { + "entropy": 0.07278778329491616, + "epoch": 4.973306912227533, + "grad_norm": 0.76171875, + "learning_rate": 4.940456029090854e-05, + "loss": 0.0631, + "mean_token_accuracy": 0.9748271405696869, + "num_tokens": 53534161.0, + "step": 21335 + }, + { + "entropy": 0.08302396573126317, + "epoch": 4.974472549248164, + "grad_norm": 1.5859375, + "learning_rate": 4.9404094707049145e-05, + "loss": 0.0896, + "mean_token_accuracy": 0.9752457678318024, + "num_tokens": 53544994.0, + "step": 21340 + }, + { + "entropy": 0.09030197001993656, + "epoch": 4.975638186268796, + "grad_norm": 1.5546875, + "learning_rate": 4.94036289456818e-05, + "loss": 0.105, + "mean_token_accuracy": 0.9733418822288513, + "num_tokens": 53555728.0, + "step": 21345 + }, + { + "entropy": 0.09017610475420952, + "epoch": 4.976803823289428, + "grad_norm": 1.3125, + "learning_rate": 4.940316300681344e-05, + "loss": 0.0968, + "mean_token_accuracy": 0.9723630130290986, + "num_tokens": 53567499.0, + "step": 21350 + }, + { + "entropy": 0.08766529373824597, + "epoch": 4.977969460310059, + "grad_norm": 2.875, + "learning_rate": 4.940269689045101e-05, + "loss": 0.0966, + "mean_token_accuracy": 0.9772234261035919, + "num_tokens": 53576899.0, + "step": 21355 + }, + { + "entropy": 0.06158297499641776, + "epoch": 4.979135097330691, + "grad_norm": 6.0, + "learning_rate": 4.940223059660147e-05, + "loss": 0.0737, + "mean_token_accuracy": 0.982252448797226, + "num_tokens": 53599827.0, + "step": 21360 + }, + { + "entropy": 0.06548569137230516, + "epoch": 4.980300734351323, + "grad_norm": 1.25, + "learning_rate": 4.940176412527177e-05, + "loss": 0.0958, + "mean_token_accuracy": 0.9723960101604462, + "num_tokens": 53617066.0, + "step": 21365 + }, + { + "entropy": 0.07234325166791677, + "epoch": 4.981466371371955, + "grad_norm": 3.828125, + "learning_rate": 4.940129747646886e-05, + "loss": 0.0842, + "mean_token_accuracy": 0.9767987549304962, + "num_tokens": 53634950.0, + "step": 21370 + }, + { + "entropy": 0.08200656361877919, + "epoch": 4.982632008392587, + "grad_norm": 2.953125, + "learning_rate": 4.940083065019972e-05, + "loss": 0.1237, + "mean_token_accuracy": 0.9713518559932709, + "num_tokens": 53643613.0, + "step": 21375 + }, + { + "entropy": 0.044780909549444914, + "epoch": 4.983797645413218, + "grad_norm": 0.56640625, + "learning_rate": 4.940036364647129e-05, + "loss": 0.0449, + "mean_token_accuracy": 0.9856958091259003, + "num_tokens": 53669670.0, + "step": 21380 + }, + { + "entropy": 0.0722307562828064, + "epoch": 4.98496328243385, + "grad_norm": 4.59375, + "learning_rate": 4.939989646529053e-05, + "loss": 0.0864, + "mean_token_accuracy": 0.9787289142608643, + "num_tokens": 53680604.0, + "step": 21385 + }, + { + "entropy": 0.20885901637375354, + "epoch": 4.986128919454482, + "grad_norm": 2.953125, + "learning_rate": 4.939942910666442e-05, + "loss": 0.3076, + "mean_token_accuracy": 0.9541841089725495, + "num_tokens": 53702324.0, + "step": 21390 + }, + { + "entropy": 0.0543774152174592, + "epoch": 4.987294556475113, + "grad_norm": 0.83203125, + "learning_rate": 4.939896157059992e-05, + "loss": 0.0448, + "mean_token_accuracy": 0.9863493800163269, + "num_tokens": 53727959.0, + "step": 21395 + }, + { + "entropy": 0.1413085490465164, + "epoch": 4.9884601934957455, + "grad_norm": 3.546875, + "learning_rate": 4.939849385710402e-05, + "loss": 0.2737, + "mean_token_accuracy": 0.9440780580043793, + "num_tokens": 53747497.0, + "step": 21400 + }, + { + "entropy": 0.07893666541203856, + "epoch": 4.989625830516378, + "grad_norm": 1.640625, + "learning_rate": 4.939802596618366e-05, + "loss": 0.0688, + "mean_token_accuracy": 0.9808478355407715, + "num_tokens": 53763449.0, + "step": 21405 + }, + { + "entropy": 0.08251105006784201, + "epoch": 4.990791467537009, + "grad_norm": 4.5625, + "learning_rate": 4.9397557897845856e-05, + "loss": 0.1024, + "mean_token_accuracy": 0.9750867128372193, + "num_tokens": 53776240.0, + "step": 21410 + }, + { + "entropy": 0.06223855372518301, + "epoch": 4.991957104557641, + "grad_norm": 0.6640625, + "learning_rate": 4.9397089652097556e-05, + "loss": 0.0695, + "mean_token_accuracy": 0.9808686673641205, + "num_tokens": 53797279.0, + "step": 21415 + }, + { + "entropy": 0.05682404525578022, + "epoch": 4.993122741578272, + "grad_norm": 1.1015625, + "learning_rate": 4.939662122894576e-05, + "loss": 0.0451, + "mean_token_accuracy": 0.9859870493412017, + "num_tokens": 53812726.0, + "step": 21420 + }, + { + "entropy": 0.08272222261875868, + "epoch": 4.994288378598904, + "grad_norm": 3.046875, + "learning_rate": 4.939615262839745e-05, + "loss": 0.0622, + "mean_token_accuracy": 0.9833892941474914, + "num_tokens": 53827890.0, + "step": 21425 + }, + { + "entropy": 0.05443599112331867, + "epoch": 4.995454015619536, + "grad_norm": 1.0625, + "learning_rate": 4.939568385045961e-05, + "loss": 0.0402, + "mean_token_accuracy": 0.9873492121696472, + "num_tokens": 53844135.0, + "step": 21430 + }, + { + "entropy": 0.054293790087103846, + "epoch": 4.996619652640168, + "grad_norm": 2.765625, + "learning_rate": 4.939521489513923e-05, + "loss": 0.0642, + "mean_token_accuracy": 0.982794600725174, + "num_tokens": 53868623.0, + "step": 21435 + }, + { + "entropy": 0.07399072125554085, + "epoch": 4.9977852896608, + "grad_norm": 5.15625, + "learning_rate": 4.9394745762443304e-05, + "loss": 0.0764, + "mean_token_accuracy": 0.9782315909862518, + "num_tokens": 53881341.0, + "step": 21440 + }, + { + "entropy": 0.06524700932204723, + "epoch": 4.998950926681431, + "grad_norm": 1.65625, + "learning_rate": 4.9394276452378827e-05, + "loss": 0.0789, + "mean_token_accuracy": 0.9781299471855164, + "num_tokens": 53897448.0, + "step": 21445 + }, + { + "entropy": 0.06756970638202296, + "epoch": 5.0, + "grad_norm": 0.97265625, + "learning_rate": 4.9393806964952806e-05, + "loss": 0.0411, + "mean_token_accuracy": 0.9809864560763041, + "num_tokens": 53942250.0, + "step": 21450 + }, + { + "entropy": 0.0722856305539608, + "epoch": 5.001165637020632, + "grad_norm": 3.796875, + "learning_rate": 4.939333730017223e-05, + "loss": 0.1013, + "mean_token_accuracy": 0.9739380717277527, + "num_tokens": 53953317.0, + "step": 21455 + }, + { + "entropy": 0.07048999462276698, + "epoch": 5.002331274041263, + "grad_norm": 0.29296875, + "learning_rate": 4.93928674580441e-05, + "loss": 0.047, + "mean_token_accuracy": 0.9865036189556122, + "num_tokens": 54001504.0, + "step": 21460 + }, + { + "entropy": 0.07579383421689272, + "epoch": 5.003496911061895, + "grad_norm": 0.4921875, + "learning_rate": 4.9392397438575435e-05, + "loss": 0.0639, + "mean_token_accuracy": 0.9817220330238342, + "num_tokens": 54013211.0, + "step": 21465 + }, + { + "entropy": 0.04977593747898936, + "epoch": 5.0046625480825275, + "grad_norm": 2.109375, + "learning_rate": 4.9391927241773226e-05, + "loss": 0.0548, + "mean_token_accuracy": 0.9827068090438843, + "num_tokens": 54036428.0, + "step": 21470 + }, + { + "entropy": 0.07021276131272317, + "epoch": 5.005828185103159, + "grad_norm": 1.328125, + "learning_rate": 4.939145686764451e-05, + "loss": 0.0799, + "mean_token_accuracy": 0.9790279030799866, + "num_tokens": 54050241.0, + "step": 21475 + }, + { + "entropy": 0.054964121477678415, + "epoch": 5.006993822123791, + "grad_norm": 0.06689453125, + "learning_rate": 4.939098631619628e-05, + "loss": 0.059, + "mean_token_accuracy": 0.9842049717903137, + "num_tokens": 54069745.0, + "step": 21480 + }, + { + "entropy": 0.06855918923392892, + "epoch": 5.008159459144422, + "grad_norm": 0.1396484375, + "learning_rate": 4.939051558743556e-05, + "loss": 0.0477, + "mean_token_accuracy": 0.9819868087768555, + "num_tokens": 54088512.0, + "step": 21485 + }, + { + "entropy": 0.08289080634713172, + "epoch": 5.009325096165054, + "grad_norm": 1.3984375, + "learning_rate": 4.939004468136937e-05, + "loss": 0.0555, + "mean_token_accuracy": 0.980892276763916, + "num_tokens": 54106277.0, + "step": 21490 + }, + { + "entropy": 0.06591970575973391, + "epoch": 5.010490733185686, + "grad_norm": 2.34375, + "learning_rate": 4.9389573598004724e-05, + "loss": 0.0699, + "mean_token_accuracy": 0.981263279914856, + "num_tokens": 54123103.0, + "step": 21495 + }, + { + "entropy": 0.056767011620104314, + "epoch": 5.0116563702063175, + "grad_norm": 0.375, + "learning_rate": 4.9389102337348656e-05, + "loss": 0.0564, + "mean_token_accuracy": 0.9821022152900696, + "num_tokens": 54142021.0, + "step": 21500 + }, + { + "entropy": 0.044302819995209575, + "epoch": 5.01282200722695, + "grad_norm": 0.6171875, + "learning_rate": 4.938863089940819e-05, + "loss": 0.0351, + "mean_token_accuracy": 0.9887114048004151, + "num_tokens": 54173043.0, + "step": 21505 + }, + { + "entropy": 0.05516482591629028, + "epoch": 5.013987644247582, + "grad_norm": 2.546875, + "learning_rate": 4.938815928419036e-05, + "loss": 0.066, + "mean_token_accuracy": 0.9820286273956299, + "num_tokens": 54189916.0, + "step": 21510 + }, + { + "entropy": 0.046309492690488695, + "epoch": 5.015153281268213, + "grad_norm": 0.462890625, + "learning_rate": 4.938768749170219e-05, + "loss": 0.0435, + "mean_token_accuracy": 0.988088858127594, + "num_tokens": 54223024.0, + "step": 21515 + }, + { + "entropy": 0.05068000312894583, + "epoch": 5.016318918288845, + "grad_norm": 1.375, + "learning_rate": 4.9387215521950716e-05, + "loss": 0.0439, + "mean_token_accuracy": 0.9892255902290344, + "num_tokens": 54244942.0, + "step": 21520 + }, + { + "entropy": 0.05760006736963987, + "epoch": 5.017484555309476, + "grad_norm": 1.3046875, + "learning_rate": 4.938674337494299e-05, + "loss": 0.0699, + "mean_token_accuracy": 0.9819241404533386, + "num_tokens": 54259058.0, + "step": 21525 + }, + { + "entropy": 0.13578815935179592, + "epoch": 5.018650192330108, + "grad_norm": 4.5, + "learning_rate": 4.938627105068603e-05, + "loss": 0.2186, + "mean_token_accuracy": 0.9652626574039459, + "num_tokens": 54290301.0, + "step": 21530 + }, + { + "entropy": 0.07299588825553656, + "epoch": 5.01981582935074, + "grad_norm": 1.703125, + "learning_rate": 4.9385798549186895e-05, + "loss": 0.078, + "mean_token_accuracy": 0.9776978015899658, + "num_tokens": 54322229.0, + "step": 21535 + }, + { + "entropy": 0.08828615508973599, + "epoch": 5.020981466371372, + "grad_norm": 1.9453125, + "learning_rate": 4.938532587045263e-05, + "loss": 0.0788, + "mean_token_accuracy": 0.9777750134468078, + "num_tokens": 54333646.0, + "step": 21540 + }, + { + "entropy": 0.06954600978642703, + "epoch": 5.022147103392004, + "grad_norm": 3.09375, + "learning_rate": 4.9384853014490274e-05, + "loss": 0.0536, + "mean_token_accuracy": 0.9855652451515198, + "num_tokens": 54347404.0, + "step": 21545 + }, + { + "entropy": 0.05489847809076309, + "epoch": 5.023312740412636, + "grad_norm": 0.1826171875, + "learning_rate": 4.9384379981306884e-05, + "loss": 0.0521, + "mean_token_accuracy": 0.9847877144813537, + "num_tokens": 54371568.0, + "step": 21550 + }, + { + "entropy": 0.06676510954275727, + "epoch": 5.024478377433267, + "grad_norm": 0.26953125, + "learning_rate": 4.9383906770909517e-05, + "loss": 0.0564, + "mean_token_accuracy": 0.9804906964302063, + "num_tokens": 54390510.0, + "step": 21555 + }, + { + "entropy": 0.07030079662799835, + "epoch": 5.025644014453899, + "grad_norm": 0.357421875, + "learning_rate": 4.938343338330522e-05, + "loss": 0.0762, + "mean_token_accuracy": 0.9829931259155273, + "num_tokens": 54401644.0, + "step": 21560 + }, + { + "entropy": 0.0746897492557764, + "epoch": 5.02680965147453, + "grad_norm": 3.5, + "learning_rate": 4.938295981850107e-05, + "loss": 0.0891, + "mean_token_accuracy": 0.9763796985149383, + "num_tokens": 54410346.0, + "step": 21565 + }, + { + "entropy": 0.051309975795447826, + "epoch": 5.0279752884951625, + "grad_norm": 1.25, + "learning_rate": 4.93824860765041e-05, + "loss": 0.0611, + "mean_token_accuracy": 0.9852312088012696, + "num_tokens": 54430243.0, + "step": 21570 + }, + { + "entropy": 0.09667696682736278, + "epoch": 5.029140925515795, + "grad_norm": 4.0, + "learning_rate": 4.93820121573214e-05, + "loss": 0.0751, + "mean_token_accuracy": 0.9779737412929534, + "num_tokens": 54445389.0, + "step": 21575 + }, + { + "entropy": 0.05687328353524208, + "epoch": 5.030306562536426, + "grad_norm": 1.4453125, + "learning_rate": 4.938153806096003e-05, + "loss": 0.0431, + "mean_token_accuracy": 0.9846319437026978, + "num_tokens": 54463785.0, + "step": 21580 + }, + { + "entropy": 0.05415755575522781, + "epoch": 5.031472199557058, + "grad_norm": 2.015625, + "learning_rate": 4.938106378742705e-05, + "loss": 0.0694, + "mean_token_accuracy": 0.9817207515239715, + "num_tokens": 54477292.0, + "step": 21585 + }, + { + "entropy": 0.04446221003308892, + "epoch": 5.03263783657769, + "grad_norm": 0.169921875, + "learning_rate": 4.938058933672954e-05, + "loss": 0.0237, + "mean_token_accuracy": 0.9912239372730255, + "num_tokens": 54506063.0, + "step": 21590 + }, + { + "entropy": 0.07755004474893212, + "epoch": 5.033803473598321, + "grad_norm": 3.375, + "learning_rate": 4.938011470887457e-05, + "loss": 0.0653, + "mean_token_accuracy": 0.9806104242801666, + "num_tokens": 54523112.0, + "step": 21595 + }, + { + "entropy": 0.0687340309843421, + "epoch": 5.034969110618953, + "grad_norm": 0.8671875, + "learning_rate": 4.937963990386923e-05, + "loss": 0.0669, + "mean_token_accuracy": 0.9804644048213959, + "num_tokens": 54548072.0, + "step": 21600 + }, + { + "entropy": 0.05275446167215705, + "epoch": 5.0361347476395855, + "grad_norm": 2.265625, + "learning_rate": 4.937916492172059e-05, + "loss": 0.0324, + "mean_token_accuracy": 0.9847627699375152, + "num_tokens": 54569162.0, + "step": 21605 + }, + { + "entropy": 0.07009538058191538, + "epoch": 5.037300384660217, + "grad_norm": 3.703125, + "learning_rate": 4.937868976243573e-05, + "loss": 0.0723, + "mean_token_accuracy": 0.9828846335411072, + "num_tokens": 54583817.0, + "step": 21610 + }, + { + "entropy": 0.051291110832244156, + "epoch": 5.038466021680849, + "grad_norm": 1.75, + "learning_rate": 4.937821442602174e-05, + "loss": 0.0302, + "mean_token_accuracy": 0.9848531365394593, + "num_tokens": 54604573.0, + "step": 21615 + }, + { + "entropy": 0.061459016799926755, + "epoch": 5.03963165870148, + "grad_norm": 2.375, + "learning_rate": 4.937773891248571e-05, + "loss": 0.0828, + "mean_token_accuracy": 0.9769286692142487, + "num_tokens": 54630544.0, + "step": 21620 + }, + { + "entropy": 0.059759095683693884, + "epoch": 5.040797295722112, + "grad_norm": 1.8515625, + "learning_rate": 4.937726322183472e-05, + "loss": 0.0399, + "mean_token_accuracy": 0.985287070274353, + "num_tokens": 54649093.0, + "step": 21625 + }, + { + "entropy": 0.0626804206520319, + "epoch": 5.041962932742744, + "grad_norm": 0.4453125, + "learning_rate": 4.937678735407587e-05, + "loss": 0.0669, + "mean_token_accuracy": 0.9840811491012573, + "num_tokens": 54662488.0, + "step": 21630 + }, + { + "entropy": 0.048044389486312865, + "epoch": 5.043128569763375, + "grad_norm": 0.330078125, + "learning_rate": 4.937631130921627e-05, + "loss": 0.0298, + "mean_token_accuracy": 0.9894433438777923, + "num_tokens": 54684676.0, + "step": 21635 + }, + { + "entropy": 0.062034656014293435, + "epoch": 5.0442942067840075, + "grad_norm": 3.28125, + "learning_rate": 4.937583508726299e-05, + "loss": 0.0422, + "mean_token_accuracy": 0.9874476134777069, + "num_tokens": 54706977.0, + "step": 21640 + }, + { + "entropy": 0.04591988567262888, + "epoch": 5.04545984380464, + "grad_norm": 1.25, + "learning_rate": 4.937535868822315e-05, + "loss": 0.0414, + "mean_token_accuracy": 0.9855768322944641, + "num_tokens": 54738130.0, + "step": 21645 + }, + { + "entropy": 0.056564598623663184, + "epoch": 5.046625480825271, + "grad_norm": 1.8671875, + "learning_rate": 4.937488211210386e-05, + "loss": 0.0585, + "mean_token_accuracy": 0.9842835962772369, + "num_tokens": 54759483.0, + "step": 21650 + }, + { + "entropy": 0.06345367161557078, + "epoch": 5.047791117845903, + "grad_norm": 4.84375, + "learning_rate": 4.9374405358912213e-05, + "loss": 0.063, + "mean_token_accuracy": 0.9799680531024932, + "num_tokens": 54785815.0, + "step": 21655 + }, + { + "entropy": 0.05606531724333763, + "epoch": 5.048956754866534, + "grad_norm": 3.390625, + "learning_rate": 4.937392842865532e-05, + "loss": 0.0552, + "mean_token_accuracy": 0.9835524022579193, + "num_tokens": 54803493.0, + "step": 21660 + }, + { + "entropy": 0.08263342985883355, + "epoch": 5.050122391887166, + "grad_norm": 2.015625, + "learning_rate": 4.937345132134029e-05, + "loss": 0.0752, + "mean_token_accuracy": 0.9773358285427094, + "num_tokens": 54817570.0, + "step": 21665 + }, + { + "entropy": 0.0704351432621479, + "epoch": 5.051288028907798, + "grad_norm": 1.2734375, + "learning_rate": 4.9372974036974254e-05, + "loss": 0.0853, + "mean_token_accuracy": 0.9777139842510223, + "num_tokens": 54826412.0, + "step": 21670 + }, + { + "entropy": 0.061422979831695555, + "epoch": 5.05245366592843, + "grad_norm": 1.4375, + "learning_rate": 4.9372496575564306e-05, + "loss": 0.0723, + "mean_token_accuracy": 0.98321133852005, + "num_tokens": 54837967.0, + "step": 21675 + }, + { + "entropy": 0.0678645808249712, + "epoch": 5.053619302949062, + "grad_norm": 2.890625, + "learning_rate": 4.937201893711758e-05, + "loss": 0.0739, + "mean_token_accuracy": 0.9775955259799958, + "num_tokens": 54855917.0, + "step": 21680 + }, + { + "entropy": 0.051627715677022935, + "epoch": 5.054784939969694, + "grad_norm": 0.41796875, + "learning_rate": 4.9371541121641194e-05, + "loss": 0.0282, + "mean_token_accuracy": 0.9913087129592896, + "num_tokens": 54883469.0, + "step": 21685 + }, + { + "entropy": 0.04737357590347528, + "epoch": 5.055950576990325, + "grad_norm": 0.40234375, + "learning_rate": 4.937106312914228e-05, + "loss": 0.0206, + "mean_token_accuracy": 0.9885388255119324, + "num_tokens": 54904299.0, + "step": 21690 + }, + { + "entropy": 0.0449544788338244, + "epoch": 5.057116214010957, + "grad_norm": 0.53515625, + "learning_rate": 4.937058495962796e-05, + "loss": 0.0403, + "mean_token_accuracy": 0.9887046158313751, + "num_tokens": 54928834.0, + "step": 21695 + }, + { + "entropy": 0.05512272519990802, + "epoch": 5.058281851031588, + "grad_norm": 1.578125, + "learning_rate": 4.937010661310536e-05, + "loss": 0.0604, + "mean_token_accuracy": 0.9825908660888671, + "num_tokens": 54945161.0, + "step": 21700 + }, + { + "entropy": 0.06513633020222187, + "epoch": 5.05944748805222, + "grad_norm": 0.396484375, + "learning_rate": 4.936962808958161e-05, + "loss": 0.0521, + "mean_token_accuracy": 0.9835683524608612, + "num_tokens": 54964927.0, + "step": 21705 + }, + { + "entropy": 0.10186992576345802, + "epoch": 5.0606131250728525, + "grad_norm": 0.20703125, + "learning_rate": 4.9369149389063865e-05, + "loss": 0.1718, + "mean_token_accuracy": 0.9644091308116913, + "num_tokens": 55003059.0, + "step": 21710 + }, + { + "entropy": 0.06891092918813228, + "epoch": 5.061778762093484, + "grad_norm": 0.52734375, + "learning_rate": 4.936867051155924e-05, + "loss": 0.0591, + "mean_token_accuracy": 0.981193619966507, + "num_tokens": 55023452.0, + "step": 21715 + }, + { + "entropy": 0.0633782428689301, + "epoch": 5.062944399114116, + "grad_norm": 0.35546875, + "learning_rate": 4.936819145707489e-05, + "loss": 0.07, + "mean_token_accuracy": 0.9851223707199097, + "num_tokens": 55044041.0, + "step": 21720 + }, + { + "entropy": 0.06447986625134945, + "epoch": 5.064110036134748, + "grad_norm": 2.671875, + "learning_rate": 4.936771222561796e-05, + "loss": 0.0551, + "mean_token_accuracy": 0.9847223103046417, + "num_tokens": 55057044.0, + "step": 21725 + }, + { + "entropy": 0.06763316094875335, + "epoch": 5.065275673155379, + "grad_norm": 3.53125, + "learning_rate": 4.936723281719558e-05, + "loss": 0.0749, + "mean_token_accuracy": 0.982142984867096, + "num_tokens": 55069209.0, + "step": 21730 + }, + { + "entropy": 0.04672593493014574, + "epoch": 5.066441310176011, + "grad_norm": 0.5625, + "learning_rate": 4.9366753231814914e-05, + "loss": 0.0278, + "mean_token_accuracy": 0.9850932538509369, + "num_tokens": 55111866.0, + "step": 21735 + }, + { + "entropy": 0.06341411881148815, + "epoch": 5.067606947196643, + "grad_norm": 2.75, + "learning_rate": 4.936627346948312e-05, + "loss": 0.0683, + "mean_token_accuracy": 0.9762008249759674, + "num_tokens": 55137517.0, + "step": 21740 + }, + { + "entropy": 0.03943348862230778, + "epoch": 5.068772584217275, + "grad_norm": 0.09619140625, + "learning_rate": 4.936579353020732e-05, + "loss": 0.0318, + "mean_token_accuracy": 0.9900983273983002, + "num_tokens": 55178143.0, + "step": 21745 + }, + { + "entropy": 0.07379267876967788, + "epoch": 5.069938221237907, + "grad_norm": 2.171875, + "learning_rate": 4.936531341399471e-05, + "loss": 0.0689, + "mean_token_accuracy": 0.981987190246582, + "num_tokens": 55194920.0, + "step": 21750 + }, + { + "entropy": 0.06244112215936184, + "epoch": 5.071103858258538, + "grad_norm": 1.765625, + "learning_rate": 4.9364833120852414e-05, + "loss": 0.0667, + "mean_token_accuracy": 0.9802671313285828, + "num_tokens": 55212891.0, + "step": 21755 + }, + { + "entropy": 0.06464984249323606, + "epoch": 5.07226949527917, + "grad_norm": 0.55859375, + "learning_rate": 4.9364352650787624e-05, + "loss": 0.0414, + "mean_token_accuracy": 0.9837022185325622, + "num_tokens": 55233297.0, + "step": 21760 + }, + { + "entropy": 0.05454264315776527, + "epoch": 5.073435132299802, + "grad_norm": 0.2578125, + "learning_rate": 4.936387200380748e-05, + "loss": 0.039, + "mean_token_accuracy": 0.9821477770805359, + "num_tokens": 55269859.0, + "step": 21765 + }, + { + "entropy": 0.05952314343303442, + "epoch": 5.074600769320433, + "grad_norm": 0.2578125, + "learning_rate": 4.9363391179919174e-05, + "loss": 0.0572, + "mean_token_accuracy": 0.9817304372787475, + "num_tokens": 55291450.0, + "step": 21770 + }, + { + "entropy": 0.06436049994081258, + "epoch": 5.0757664063410655, + "grad_norm": 1.5546875, + "learning_rate": 4.936291017912985e-05, + "loss": 0.0667, + "mean_token_accuracy": 0.9814639568328858, + "num_tokens": 55308812.0, + "step": 21775 + }, + { + "entropy": 0.04403821406885981, + "epoch": 5.076932043361698, + "grad_norm": 0.23828125, + "learning_rate": 4.93624290014467e-05, + "loss": 0.0216, + "mean_token_accuracy": 0.9926260411739349, + "num_tokens": 55343260.0, + "step": 21780 + }, + { + "entropy": 0.054392436426132916, + "epoch": 5.078097680382329, + "grad_norm": 2.234375, + "learning_rate": 4.936194764687688e-05, + "loss": 0.0588, + "mean_token_accuracy": 0.9852981567382812, + "num_tokens": 55359488.0, + "step": 21785 + }, + { + "entropy": 0.07421798389405013, + "epoch": 5.079263317402961, + "grad_norm": 3.671875, + "learning_rate": 4.936146611542759e-05, + "loss": 0.0612, + "mean_token_accuracy": 0.9803422451019287, + "num_tokens": 55377919.0, + "step": 21790 + }, + { + "entropy": 0.06554331891238689, + "epoch": 5.080428954423592, + "grad_norm": 1.65625, + "learning_rate": 4.9360984407105996e-05, + "loss": 0.0911, + "mean_token_accuracy": 0.9787883877754211, + "num_tokens": 55388305.0, + "step": 21795 + }, + { + "entropy": 0.055925901234149936, + "epoch": 5.081594591444224, + "grad_norm": 0.59375, + "learning_rate": 4.936050252191928e-05, + "loss": 0.0363, + "mean_token_accuracy": 0.9892447233200073, + "num_tokens": 55421711.0, + "step": 21800 + }, + { + "entropy": 0.098357552010566, + "epoch": 5.082760228464856, + "grad_norm": 2.359375, + "learning_rate": 4.936002045987465e-05, + "loss": 0.0815, + "mean_token_accuracy": 0.978368467092514, + "num_tokens": 55435380.0, + "step": 21805 + }, + { + "entropy": 0.07150723561644554, + "epoch": 5.0839258654854875, + "grad_norm": 2.59375, + "learning_rate": 4.935953822097926e-05, + "loss": 0.0867, + "mean_token_accuracy": 0.9787911772727966, + "num_tokens": 55444640.0, + "step": 21810 + }, + { + "entropy": 0.05865759551525116, + "epoch": 5.08509150250612, + "grad_norm": 2.234375, + "learning_rate": 4.935905580524032e-05, + "loss": 0.0516, + "mean_token_accuracy": 0.9856998682022095, + "num_tokens": 55470590.0, + "step": 21815 + }, + { + "entropy": 0.1110074575059116, + "epoch": 5.086257139526752, + "grad_norm": 0.63671875, + "learning_rate": 4.935857321266502e-05, + "loss": 0.1294, + "mean_token_accuracy": 0.9701458215713501, + "num_tokens": 55503985.0, + "step": 21820 + }, + { + "entropy": 0.048689931537956, + "epoch": 5.087422776547383, + "grad_norm": 1.765625, + "learning_rate": 4.935809044326056e-05, + "loss": 0.0222, + "mean_token_accuracy": 0.9833405792713166, + "num_tokens": 55538140.0, + "step": 21825 + }, + { + "entropy": 0.044951011799275874, + "epoch": 5.088588413568015, + "grad_norm": 2.578125, + "learning_rate": 4.935760749703413e-05, + "loss": 0.0427, + "mean_token_accuracy": 0.9884091496467591, + "num_tokens": 55553734.0, + "step": 21830 + }, + { + "entropy": 0.06277897786349058, + "epoch": 5.089754050588646, + "grad_norm": 1.1875, + "learning_rate": 4.9357124373992945e-05, + "loss": 0.055, + "mean_token_accuracy": 0.9851854026317597, + "num_tokens": 55570993.0, + "step": 21835 + }, + { + "entropy": 0.06481649186462164, + "epoch": 5.090919687609278, + "grad_norm": 1.375, + "learning_rate": 4.93566410741442e-05, + "loss": 0.0596, + "mean_token_accuracy": 0.9827612698078155, + "num_tokens": 55585785.0, + "step": 21840 + }, + { + "entropy": 0.07002595514059066, + "epoch": 5.0920853246299105, + "grad_norm": 2.5, + "learning_rate": 4.93561575974951e-05, + "loss": 0.0665, + "mean_token_accuracy": 0.9836658596992492, + "num_tokens": 55599870.0, + "step": 21845 + }, + { + "entropy": 0.07330641448497772, + "epoch": 5.093250961650542, + "grad_norm": 1.2265625, + "learning_rate": 4.935567394405286e-05, + "loss": 0.0718, + "mean_token_accuracy": 0.9804113328456878, + "num_tokens": 55610084.0, + "step": 21850 + }, + { + "entropy": 0.05826578103005886, + "epoch": 5.094416598671174, + "grad_norm": 0.8125, + "learning_rate": 4.935519011382469e-05, + "loss": 0.0482, + "mean_token_accuracy": 0.9851272463798523, + "num_tokens": 55626799.0, + "step": 21855 + }, + { + "entropy": 0.062410115357488395, + "epoch": 5.095582235691806, + "grad_norm": 0.408203125, + "learning_rate": 4.935470610681781e-05, + "loss": 0.0464, + "mean_token_accuracy": 0.982362687587738, + "num_tokens": 55659544.0, + "step": 21860 + }, + { + "entropy": 0.05943414494395256, + "epoch": 5.096747872712437, + "grad_norm": 1.875, + "learning_rate": 4.9354221923039416e-05, + "loss": 0.055, + "mean_token_accuracy": 0.9825399518013, + "num_tokens": 55678976.0, + "step": 21865 + }, + { + "entropy": 0.04930404406040907, + "epoch": 5.097913509733069, + "grad_norm": 1.6953125, + "learning_rate": 4.9353737562496755e-05, + "loss": 0.041, + "mean_token_accuracy": 0.987514328956604, + "num_tokens": 55698566.0, + "step": 21870 + }, + { + "entropy": 0.08179815951734781, + "epoch": 5.099079146753701, + "grad_norm": 0.97265625, + "learning_rate": 4.935325302519703e-05, + "loss": 0.0625, + "mean_token_accuracy": 0.9812222301959992, + "num_tokens": 55712921.0, + "step": 21875 + }, + { + "entropy": 0.06906782537698745, + "epoch": 5.1002447837743325, + "grad_norm": 3.328125, + "learning_rate": 4.935276831114748e-05, + "loss": 0.0543, + "mean_token_accuracy": 0.9829290509223938, + "num_tokens": 55735759.0, + "step": 21880 + }, + { + "entropy": 0.0615530351176858, + "epoch": 5.101410420794965, + "grad_norm": 0.322265625, + "learning_rate": 4.9352283420355325e-05, + "loss": 0.0769, + "mean_token_accuracy": 0.9852207362651825, + "num_tokens": 55753062.0, + "step": 21885 + }, + { + "entropy": 0.053300064988434315, + "epoch": 5.102576057815596, + "grad_norm": 0.25390625, + "learning_rate": 4.9351798352827805e-05, + "loss": 0.0468, + "mean_token_accuracy": 0.9863043606281281, + "num_tokens": 55770524.0, + "step": 21890 + }, + { + "entropy": 0.052490772865712645, + "epoch": 5.103741694836228, + "grad_norm": 0.287109375, + "learning_rate": 4.935131310857214e-05, + "loss": 0.0378, + "mean_token_accuracy": 0.9889246761798859, + "num_tokens": 55811622.0, + "step": 21895 + }, + { + "entropy": 0.06909960927441716, + "epoch": 5.10490733185686, + "grad_norm": 0.1484375, + "learning_rate": 4.935082768759557e-05, + "loss": 0.0536, + "mean_token_accuracy": 0.98121337890625, + "num_tokens": 55826654.0, + "step": 21900 + }, + { + "entropy": 0.05015625609084964, + "epoch": 5.106072968877491, + "grad_norm": 0.91015625, + "learning_rate": 4.935034208990533e-05, + "loss": 0.0453, + "mean_token_accuracy": 0.9881203591823577, + "num_tokens": 55849840.0, + "step": 21905 + }, + { + "entropy": 0.06175452824681997, + "epoch": 5.107238605898123, + "grad_norm": 2.90625, + "learning_rate": 4.934985631550867e-05, + "loss": 0.0678, + "mean_token_accuracy": 0.9852860569953918, + "num_tokens": 55863086.0, + "step": 21910 + }, + { + "entropy": 0.06861697025597095, + "epoch": 5.1084042429187555, + "grad_norm": 0.322265625, + "learning_rate": 4.9349370364412836e-05, + "loss": 0.0611, + "mean_token_accuracy": 0.9846416294574738, + "num_tokens": 55878134.0, + "step": 21915 + }, + { + "entropy": 0.07329159006476402, + "epoch": 5.109569879939387, + "grad_norm": 3.40625, + "learning_rate": 4.934888423662506e-05, + "loss": 0.0769, + "mean_token_accuracy": 0.9795122087001801, + "num_tokens": 55888333.0, + "step": 21920 + }, + { + "entropy": 0.05855609718710184, + "epoch": 5.110735516960019, + "grad_norm": 0.1689453125, + "learning_rate": 4.93483979321526e-05, + "loss": 0.0544, + "mean_token_accuracy": 0.9840858101844787, + "num_tokens": 55911561.0, + "step": 21925 + }, + { + "entropy": 0.07522995788604021, + "epoch": 5.11190115398065, + "grad_norm": 1.4453125, + "learning_rate": 4.9347911451002705e-05, + "loss": 0.0447, + "mean_token_accuracy": 0.9842344880104065, + "num_tokens": 55933302.0, + "step": 21930 + }, + { + "entropy": 0.0620239726267755, + "epoch": 5.113066791001282, + "grad_norm": 0.373046875, + "learning_rate": 4.934742479318263e-05, + "loss": 0.0538, + "mean_token_accuracy": 0.9858266294002533, + "num_tokens": 55956064.0, + "step": 21935 + }, + { + "entropy": 0.08195541994646191, + "epoch": 5.114232428021914, + "grad_norm": 0.421875, + "learning_rate": 4.9346937958699626e-05, + "loss": 0.0474, + "mean_token_accuracy": 0.9853470683097839, + "num_tokens": 55975040.0, + "step": 21940 + }, + { + "entropy": 0.05026424927636981, + "epoch": 5.1153980650425455, + "grad_norm": 0.3203125, + "learning_rate": 4.934645094756096e-05, + "loss": 0.0451, + "mean_token_accuracy": 0.9877060413360595, + "num_tokens": 56000378.0, + "step": 21945 + }, + { + "entropy": 0.0795700391754508, + "epoch": 5.116563702063178, + "grad_norm": 0.9375, + "learning_rate": 4.93459637597739e-05, + "loss": 0.0946, + "mean_token_accuracy": 0.9759146511554718, + "num_tokens": 56012810.0, + "step": 21950 + }, + { + "entropy": 0.08439991511404514, + "epoch": 5.11772933908381, + "grad_norm": 1.1328125, + "learning_rate": 4.93454763953457e-05, + "loss": 0.1105, + "mean_token_accuracy": 0.9726395010948181, + "num_tokens": 56021050.0, + "step": 21955 + }, + { + "entropy": 0.08431314816698432, + "epoch": 5.118894976104441, + "grad_norm": 1.1796875, + "learning_rate": 4.934498885428363e-05, + "loss": 0.068, + "mean_token_accuracy": 0.9810858368873596, + "num_tokens": 56035566.0, + "step": 21960 + }, + { + "entropy": 0.05840439219027758, + "epoch": 5.120060613125073, + "grad_norm": 1.9375, + "learning_rate": 4.9344501136594963e-05, + "loss": 0.0594, + "mean_token_accuracy": 0.9811022222042084, + "num_tokens": 56050265.0, + "step": 21965 + }, + { + "entropy": 0.05464536147192121, + "epoch": 5.121226250145704, + "grad_norm": 0.49609375, + "learning_rate": 4.934401324228696e-05, + "loss": 0.0233, + "mean_token_accuracy": 0.9924610316753387, + "num_tokens": 56081923.0, + "step": 21970 + }, + { + "entropy": 0.05458259219303727, + "epoch": 5.122391887166336, + "grad_norm": 0.2255859375, + "learning_rate": 4.934352517136691e-05, + "loss": 0.0527, + "mean_token_accuracy": 0.9853946685791015, + "num_tokens": 56106631.0, + "step": 21975 + }, + { + "entropy": 0.04985369872301817, + "epoch": 5.123557524186968, + "grad_norm": 3.203125, + "learning_rate": 4.934303692384209e-05, + "loss": 0.0455, + "mean_token_accuracy": 0.9850191473960876, + "num_tokens": 56135671.0, + "step": 21980 + }, + { + "entropy": 0.09102193363942206, + "epoch": 5.1247231612076, + "grad_norm": 0.38671875, + "learning_rate": 4.9342548499719774e-05, + "loss": 0.0796, + "mean_token_accuracy": 0.9770224332809448, + "num_tokens": 56176870.0, + "step": 21985 + }, + { + "entropy": 0.061427733348682526, + "epoch": 5.125888798228232, + "grad_norm": 0.28515625, + "learning_rate": 4.9342059899007246e-05, + "loss": 0.0186, + "mean_token_accuracy": 0.9816450893878936, + "num_tokens": 56215200.0, + "step": 21990 + }, + { + "entropy": 0.0698608798906207, + "epoch": 5.127054435248864, + "grad_norm": 0.3984375, + "learning_rate": 4.934157112171179e-05, + "loss": 0.0609, + "mean_token_accuracy": 0.9810095489025116, + "num_tokens": 56234551.0, + "step": 21995 + }, + { + "entropy": 0.06343160159885883, + "epoch": 5.128220072269495, + "grad_norm": 2.03125, + "learning_rate": 4.934108216784071e-05, + "loss": 0.0509, + "mean_token_accuracy": 0.9850274443626403, + "num_tokens": 56249797.0, + "step": 22000 + }, + { + "entropy": 0.060403543058782816, + "epoch": 5.129385709290127, + "grad_norm": 1.8671875, + "learning_rate": 4.9340593037401276e-05, + "loss": 0.0612, + "mean_token_accuracy": 0.9820111095905304, + "num_tokens": 56272150.0, + "step": 22005 + }, + { + "entropy": 0.07690092343837023, + "epoch": 5.130551346310758, + "grad_norm": 1.15625, + "learning_rate": 4.9340103730400787e-05, + "loss": 0.0506, + "mean_token_accuracy": 0.98413365483284, + "num_tokens": 56294752.0, + "step": 22010 + }, + { + "entropy": 0.05405779052525759, + "epoch": 5.1317169833313905, + "grad_norm": 0.447265625, + "learning_rate": 4.933961424684655e-05, + "loss": 0.0251, + "mean_token_accuracy": 0.9848962187767029, + "num_tokens": 56325302.0, + "step": 22015 + }, + { + "entropy": 0.07608631141483783, + "epoch": 5.132882620352023, + "grad_norm": 2.28125, + "learning_rate": 4.933912458674586e-05, + "loss": 0.0802, + "mean_token_accuracy": 0.9797267496585846, + "num_tokens": 56334909.0, + "step": 22020 + }, + { + "entropy": 0.05106787588447333, + "epoch": 5.134048257372654, + "grad_norm": 2.828125, + "learning_rate": 4.933863475010601e-05, + "loss": 0.0476, + "mean_token_accuracy": 0.985850727558136, + "num_tokens": 56363979.0, + "step": 22025 + }, + { + "entropy": 0.043447258742526175, + "epoch": 5.135213894393286, + "grad_norm": 0.48046875, + "learning_rate": 4.9338144736934305e-05, + "loss": 0.0217, + "mean_token_accuracy": 0.9914530575275421, + "num_tokens": 56391611.0, + "step": 22030 + }, + { + "entropy": 0.057629916444420816, + "epoch": 5.136379531413918, + "grad_norm": 3.3125, + "learning_rate": 4.9337654547238064e-05, + "loss": 0.0464, + "mean_token_accuracy": 0.9802502512931823, + "num_tokens": 56411985.0, + "step": 22035 + }, + { + "entropy": 0.061352443508803846, + "epoch": 5.137545168434549, + "grad_norm": 3.9375, + "learning_rate": 4.933716418102459e-05, + "loss": 0.0643, + "mean_token_accuracy": 0.98461874127388, + "num_tokens": 56434340.0, + "step": 22040 + }, + { + "entropy": 0.07554442938417197, + "epoch": 5.138710805455181, + "grad_norm": 0.625, + "learning_rate": 4.9336673638301184e-05, + "loss": 0.0761, + "mean_token_accuracy": 0.974240243434906, + "num_tokens": 56452406.0, + "step": 22045 + }, + { + "entropy": 0.049168387427926064, + "epoch": 5.139876442475813, + "grad_norm": 1.09375, + "learning_rate": 4.9336182919075174e-05, + "loss": 0.0452, + "mean_token_accuracy": 0.9868712604045868, + "num_tokens": 56473667.0, + "step": 22050 + }, + { + "entropy": 0.0560465251095593, + "epoch": 5.141042079496445, + "grad_norm": 2.140625, + "learning_rate": 4.933569202335388e-05, + "loss": 0.0573, + "mean_token_accuracy": 0.9840469479560852, + "num_tokens": 56490114.0, + "step": 22055 + }, + { + "entropy": 0.04852314190939069, + "epoch": 5.142207716517077, + "grad_norm": 0.30859375, + "learning_rate": 4.9335200951144614e-05, + "loss": 0.031, + "mean_token_accuracy": 0.9890325367450714, + "num_tokens": 56525717.0, + "step": 22060 + }, + { + "entropy": 0.054032295290380714, + "epoch": 5.143373353537708, + "grad_norm": 3.140625, + "learning_rate": 4.9334709702454694e-05, + "loss": 0.0533, + "mean_token_accuracy": 0.9830582797527313, + "num_tokens": 56546030.0, + "step": 22065 + }, + { + "entropy": 0.05986177334561944, + "epoch": 5.14453899055834, + "grad_norm": 1.84375, + "learning_rate": 4.9334218277291464e-05, + "loss": 0.0437, + "mean_token_accuracy": 0.9865305006504059, + "num_tokens": 56565431.0, + "step": 22070 + }, + { + "entropy": 0.062705637793988, + "epoch": 5.145704627578972, + "grad_norm": 0.31640625, + "learning_rate": 4.933372667566223e-05, + "loss": 0.0362, + "mean_token_accuracy": 0.9859632730484009, + "num_tokens": 56593182.0, + "step": 22075 + }, + { + "entropy": 0.06045402865856886, + "epoch": 5.146870264599603, + "grad_norm": 0.2021484375, + "learning_rate": 4.933323489757433e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.9816952824592591, + "num_tokens": 56611334.0, + "step": 22080 + }, + { + "entropy": 0.06225607329979539, + "epoch": 5.1480359016202355, + "grad_norm": 0.42578125, + "learning_rate": 4.933274294303511e-05, + "loss": 0.0728, + "mean_token_accuracy": 0.982220607995987, + "num_tokens": 56626533.0, + "step": 22085 + }, + { + "entropy": 0.08001007661223411, + "epoch": 5.149201538640868, + "grad_norm": 0.71484375, + "learning_rate": 4.933225081205189e-05, + "loss": 0.0868, + "mean_token_accuracy": 0.9747688889503479, + "num_tokens": 56644744.0, + "step": 22090 + }, + { + "entropy": 0.0839627580717206, + "epoch": 5.150367175661499, + "grad_norm": 3.1875, + "learning_rate": 4.9331758504632014e-05, + "loss": 0.054, + "mean_token_accuracy": 0.9822079181671143, + "num_tokens": 56663768.0, + "step": 22095 + }, + { + "entropy": 0.06215059943497181, + "epoch": 5.151532812682131, + "grad_norm": 2.015625, + "learning_rate": 4.933126602078282e-05, + "loss": 0.0696, + "mean_token_accuracy": 0.9818137347698211, + "num_tokens": 56675332.0, + "step": 22100 + }, + { + "entropy": 0.04475753773003817, + "epoch": 5.152698449702762, + "grad_norm": 2.6875, + "learning_rate": 4.9330773360511654e-05, + "loss": 0.0283, + "mean_token_accuracy": 0.990894901752472, + "num_tokens": 56708402.0, + "step": 22105 + }, + { + "entropy": 0.07335485322400928, + "epoch": 5.153864086723394, + "grad_norm": 2.59375, + "learning_rate": 4.9330280523825866e-05, + "loss": 0.0796, + "mean_token_accuracy": 0.9783159673213959, + "num_tokens": 56725192.0, + "step": 22110 + }, + { + "entropy": 0.06051680203527212, + "epoch": 5.155029723744026, + "grad_norm": 1.3515625, + "learning_rate": 4.932978751073281e-05, + "loss": 0.0596, + "mean_token_accuracy": 0.9841292977333069, + "num_tokens": 56747636.0, + "step": 22115 + }, + { + "entropy": 0.059200151823461056, + "epoch": 5.156195360764658, + "grad_norm": 0.296875, + "learning_rate": 4.932929432123982e-05, + "loss": 0.0475, + "mean_token_accuracy": 0.9841175198554992, + "num_tokens": 56779254.0, + "step": 22120 + }, + { + "entropy": 0.06318260245025158, + "epoch": 5.15736099778529, + "grad_norm": 0.24609375, + "learning_rate": 4.932880095535425e-05, + "loss": 0.0533, + "mean_token_accuracy": 0.9854140460491181, + "num_tokens": 56800037.0, + "step": 22125 + }, + { + "entropy": 0.04333141590468585, + "epoch": 5.158526634805922, + "grad_norm": 0.17578125, + "learning_rate": 4.932830741308348e-05, + "loss": 0.0252, + "mean_token_accuracy": 0.9886651396751404, + "num_tokens": 56831723.0, + "step": 22130 + }, + { + "entropy": 0.07760660853236914, + "epoch": 5.159692271826553, + "grad_norm": 1.2890625, + "learning_rate": 4.9327813694434854e-05, + "loss": 0.0732, + "mean_token_accuracy": 0.9771431624889374, + "num_tokens": 56846308.0, + "step": 22135 + }, + { + "entropy": 0.04463453097268939, + "epoch": 5.160857908847185, + "grad_norm": 0.23046875, + "learning_rate": 4.932731979941573e-05, + "loss": 0.0455, + "mean_token_accuracy": 0.9894105195999146, + "num_tokens": 56874076.0, + "step": 22140 + }, + { + "entropy": 0.05769450534135103, + "epoch": 5.162023545867816, + "grad_norm": 0.51953125, + "learning_rate": 4.9326825728033483e-05, + "loss": 0.0588, + "mean_token_accuracy": 0.980986213684082, + "num_tokens": 56888664.0, + "step": 22145 + }, + { + "entropy": 0.18658456727862358, + "epoch": 5.163189182888448, + "grad_norm": 2.6875, + "learning_rate": 4.932633148029547e-05, + "loss": 0.3163, + "mean_token_accuracy": 0.9311531364917756, + "num_tokens": 56916775.0, + "step": 22150 + }, + { + "entropy": 0.05970644308254123, + "epoch": 5.1643548199090805, + "grad_norm": 1.8203125, + "learning_rate": 4.932583705620908e-05, + "loss": 0.0571, + "mean_token_accuracy": 0.9825261533260345, + "num_tokens": 56933407.0, + "step": 22155 + }, + { + "entropy": 0.05941506642848253, + "epoch": 5.165520456929712, + "grad_norm": 0.68359375, + "learning_rate": 4.932534245578166e-05, + "loss": 0.0497, + "mean_token_accuracy": 0.9831038832664489, + "num_tokens": 56953493.0, + "step": 22160 + }, + { + "entropy": 0.07277057701721787, + "epoch": 5.166686093950344, + "grad_norm": 1.109375, + "learning_rate": 4.93248476790206e-05, + "loss": 0.0711, + "mean_token_accuracy": 0.9815229892730712, + "num_tokens": 56969466.0, + "step": 22165 + }, + { + "entropy": 0.06678186506032943, + "epoch": 5.167851730970976, + "grad_norm": 4.0625, + "learning_rate": 4.9324352725933284e-05, + "loss": 0.069, + "mean_token_accuracy": 0.9849565625190735, + "num_tokens": 56980588.0, + "step": 22170 + }, + { + "entropy": 0.06671747919172048, + "epoch": 5.169017367991607, + "grad_norm": 1.703125, + "learning_rate": 4.932385759652707e-05, + "loss": 0.0443, + "mean_token_accuracy": 0.9861550271511078, + "num_tokens": 57008493.0, + "step": 22175 + }, + { + "entropy": 0.07351988535374403, + "epoch": 5.170183005012239, + "grad_norm": 0.3359375, + "learning_rate": 4.932336229080937e-05, + "loss": 0.0636, + "mean_token_accuracy": 0.9827918767929077, + "num_tokens": 57029159.0, + "step": 22180 + }, + { + "entropy": 0.04658596199005842, + "epoch": 5.171348642032871, + "grad_norm": 2.046875, + "learning_rate": 4.932286680878754e-05, + "loss": 0.0289, + "mean_token_accuracy": 0.9881357610225677, + "num_tokens": 57065409.0, + "step": 22185 + }, + { + "entropy": 0.05278063863515854, + "epoch": 5.172514279053503, + "grad_norm": 0.365234375, + "learning_rate": 4.9322371150468994e-05, + "loss": 0.033, + "mean_token_accuracy": 0.9894371032714844, + "num_tokens": 57094324.0, + "step": 22190 + }, + { + "entropy": 0.06439951695501804, + "epoch": 5.173679916074135, + "grad_norm": 2.125, + "learning_rate": 4.932187531586111e-05, + "loss": 0.0621, + "mean_token_accuracy": 0.980733460187912, + "num_tokens": 57106651.0, + "step": 22195 + }, + { + "entropy": 0.059175492450594905, + "epoch": 5.174845553094766, + "grad_norm": 2.46875, + "learning_rate": 4.932137930497128e-05, + "loss": 0.0418, + "mean_token_accuracy": 0.9881093919277191, + "num_tokens": 57153428.0, + "step": 22200 + }, + { + "entropy": 0.05662951730191708, + "epoch": 5.176011190115398, + "grad_norm": 2.21875, + "learning_rate": 4.932088311780691e-05, + "loss": 0.0491, + "mean_token_accuracy": 0.9842721045017242, + "num_tokens": 57166150.0, + "step": 22205 + }, + { + "entropy": 0.06289763897657394, + "epoch": 5.17717682713603, + "grad_norm": 0.376953125, + "learning_rate": 4.9320386754375394e-05, + "loss": 0.0476, + "mean_token_accuracy": 0.9786098062992096, + "num_tokens": 57189967.0, + "step": 22210 + }, + { + "entropy": 0.07652818039059639, + "epoch": 5.178342464156661, + "grad_norm": 4.78125, + "learning_rate": 4.931989021468413e-05, + "loss": 0.0657, + "mean_token_accuracy": 0.9819384813308716, + "num_tokens": 57200537.0, + "step": 22215 + }, + { + "entropy": 0.06508733816444874, + "epoch": 5.179508101177293, + "grad_norm": 1.5390625, + "learning_rate": 4.9319393498740516e-05, + "loss": 0.053, + "mean_token_accuracy": 0.9842216610908509, + "num_tokens": 57211796.0, + "step": 22220 + }, + { + "entropy": 0.06640651468187571, + "epoch": 5.1806737381979255, + "grad_norm": 0.431640625, + "learning_rate": 4.931889660655198e-05, + "loss": 0.0582, + "mean_token_accuracy": 0.9838066399097443, + "num_tokens": 57231428.0, + "step": 22225 + }, + { + "entropy": 0.06103291492909193, + "epoch": 5.181839375218557, + "grad_norm": 3.78125, + "learning_rate": 4.9318399538125916e-05, + "loss": 0.0691, + "mean_token_accuracy": 0.9828658163547516, + "num_tokens": 57250840.0, + "step": 22230 + }, + { + "entropy": 0.07042492870241404, + "epoch": 5.183005012239189, + "grad_norm": 1.3359375, + "learning_rate": 4.931790229346973e-05, + "loss": 0.053, + "mean_token_accuracy": 0.984697699546814, + "num_tokens": 57273304.0, + "step": 22235 + }, + { + "entropy": 0.044906450994312765, + "epoch": 5.18417064925982, + "grad_norm": 0.341796875, + "learning_rate": 4.931740487259085e-05, + "loss": 0.0268, + "mean_token_accuracy": 0.9909862637519836, + "num_tokens": 57312085.0, + "step": 22240 + }, + { + "entropy": 0.06908139307051897, + "epoch": 5.185336286280452, + "grad_norm": 3.09375, + "learning_rate": 4.93169072754967e-05, + "loss": 0.0437, + "mean_token_accuracy": 0.978874671459198, + "num_tokens": 57345440.0, + "step": 22245 + }, + { + "entropy": 0.056862791441380976, + "epoch": 5.186501923301084, + "grad_norm": 1.578125, + "learning_rate": 4.931640950219468e-05, + "loss": 0.0577, + "mean_token_accuracy": 0.9856437981128693, + "num_tokens": 57372140.0, + "step": 22250 + }, + { + "entropy": 0.05833668913692236, + "epoch": 5.1876675603217155, + "grad_norm": 0.486328125, + "learning_rate": 4.931591155269222e-05, + "loss": 0.0491, + "mean_token_accuracy": 0.9865939795970917, + "num_tokens": 57391472.0, + "step": 22255 + }, + { + "entropy": 0.17140091247856618, + "epoch": 5.188833197342348, + "grad_norm": 1.1328125, + "learning_rate": 4.931541342699675e-05, + "loss": 0.2663, + "mean_token_accuracy": 0.9569355547428131, + "num_tokens": 57425631.0, + "step": 22260 + }, + { + "entropy": 0.2862912990152836, + "epoch": 5.18999883436298, + "grad_norm": 0.7109375, + "learning_rate": 4.931491512511569e-05, + "loss": 0.4858, + "mean_token_accuracy": 0.9360194027423858, + "num_tokens": 57453226.0, + "step": 22265 + }, + { + "entropy": 0.06808519745245575, + "epoch": 5.191164471383611, + "grad_norm": 1.3984375, + "learning_rate": 4.9314416647056485e-05, + "loss": 0.0803, + "mean_token_accuracy": 0.9762260675430298, + "num_tokens": 57468653.0, + "step": 22270 + }, + { + "entropy": 0.06890982948243618, + "epoch": 5.192330108404243, + "grad_norm": 0.93359375, + "learning_rate": 4.9313917992826544e-05, + "loss": 0.0558, + "mean_token_accuracy": 0.9842790246009827, + "num_tokens": 57482450.0, + "step": 22275 + }, + { + "entropy": 0.07730196844786405, + "epoch": 5.193495745424874, + "grad_norm": 4.375, + "learning_rate": 4.9313419162433326e-05, + "loss": 0.0841, + "mean_token_accuracy": 0.9743519246578216, + "num_tokens": 57493336.0, + "step": 22280 + }, + { + "entropy": 0.05996605232357979, + "epoch": 5.194661382445506, + "grad_norm": 1.140625, + "learning_rate": 4.931292015588426e-05, + "loss": 0.049, + "mean_token_accuracy": 0.9876279890537262, + "num_tokens": 57515055.0, + "step": 22285 + }, + { + "entropy": 0.05181253822520375, + "epoch": 5.1958270194661385, + "grad_norm": 0.490234375, + "learning_rate": 4.931242097318678e-05, + "loss": 0.0515, + "mean_token_accuracy": 0.985153716802597, + "num_tokens": 57545326.0, + "step": 22290 + }, + { + "entropy": 0.0514564954675734, + "epoch": 5.19699265648677, + "grad_norm": 2.078125, + "learning_rate": 4.9311921614348343e-05, + "loss": 0.0348, + "mean_token_accuracy": 0.986234825849533, + "num_tokens": 57569516.0, + "step": 22295 + }, + { + "entropy": 0.08702254965901375, + "epoch": 5.198158293507402, + "grad_norm": 5.84375, + "learning_rate": 4.931142207937638e-05, + "loss": 0.1145, + "mean_token_accuracy": 0.9728547394275665, + "num_tokens": 57577612.0, + "step": 22300 + }, + { + "entropy": 0.09497029753401875, + "epoch": 5.199323930528034, + "grad_norm": 0.40625, + "learning_rate": 4.931092236827836e-05, + "loss": 0.0684, + "mean_token_accuracy": 0.9809476554393768, + "num_tokens": 57588909.0, + "step": 22305 + }, + { + "entropy": 0.0746473042294383, + "epoch": 5.200489567548665, + "grad_norm": 2.5625, + "learning_rate": 4.931042248106172e-05, + "loss": 0.0562, + "mean_token_accuracy": 0.9825978338718414, + "num_tokens": 57606043.0, + "step": 22310 + }, + { + "entropy": 0.06316058505326509, + "epoch": 5.201655204569297, + "grad_norm": 2.796875, + "learning_rate": 4.930992241773391e-05, + "loss": 0.046, + "mean_token_accuracy": 0.9862751066684723, + "num_tokens": 57627499.0, + "step": 22315 + }, + { + "entropy": 0.08704807367175818, + "epoch": 5.202820841589929, + "grad_norm": 0.51953125, + "learning_rate": 4.93094221783024e-05, + "loss": 0.0776, + "mean_token_accuracy": 0.9783826053142548, + "num_tokens": 57642818.0, + "step": 22320 + }, + { + "entropy": 0.06668164748698473, + "epoch": 5.2039864786105605, + "grad_norm": 2.625, + "learning_rate": 4.930892176277464e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.9841929137706756, + "num_tokens": 57658432.0, + "step": 22325 + }, + { + "entropy": 0.054752088710665704, + "epoch": 5.205152115631193, + "grad_norm": 0.640625, + "learning_rate": 4.93084211711581e-05, + "loss": 0.0473, + "mean_token_accuracy": 0.9867014527320862, + "num_tokens": 57682037.0, + "step": 22330 + }, + { + "entropy": 0.06804192513227462, + "epoch": 5.206317752651824, + "grad_norm": 2.390625, + "learning_rate": 4.9307920403460235e-05, + "loss": 0.0617, + "mean_token_accuracy": 0.9819234848022461, + "num_tokens": 57693989.0, + "step": 22335 + }, + { + "entropy": 0.06305139996111393, + "epoch": 5.207483389672456, + "grad_norm": 2.8125, + "learning_rate": 4.9307419459688514e-05, + "loss": 0.0603, + "mean_token_accuracy": 0.9847787320613861, + "num_tokens": 57717654.0, + "step": 22340 + }, + { + "entropy": 0.08745529353618622, + "epoch": 5.208649026693088, + "grad_norm": 0.2021484375, + "learning_rate": 4.930691833985042e-05, + "loss": 0.0526, + "mean_token_accuracy": 0.9825003862380981, + "num_tokens": 57738130.0, + "step": 22345 + }, + { + "entropy": 0.0616550050675869, + "epoch": 5.209814663713719, + "grad_norm": 0.2470703125, + "learning_rate": 4.93064170439534e-05, + "loss": 0.0402, + "mean_token_accuracy": 0.9874732315540313, + "num_tokens": 57756033.0, + "step": 22350 + }, + { + "entropy": 0.07887524664402008, + "epoch": 5.210980300734351, + "grad_norm": 1.546875, + "learning_rate": 4.930591557200496e-05, + "loss": 0.0592, + "mean_token_accuracy": 0.9847594499588013, + "num_tokens": 57766455.0, + "step": 22355 + }, + { + "entropy": 0.07828300539404154, + "epoch": 5.2121459377549835, + "grad_norm": 0.890625, + "learning_rate": 4.930541392401255e-05, + "loss": 0.043, + "mean_token_accuracy": 0.9833212852478027, + "num_tokens": 57788545.0, + "step": 22360 + }, + { + "entropy": 0.06497166268527507, + "epoch": 5.213311574775615, + "grad_norm": 0.8515625, + "learning_rate": 4.930491209998366e-05, + "loss": 0.052, + "mean_token_accuracy": 0.9866826653480529, + "num_tokens": 57813550.0, + "step": 22365 + }, + { + "entropy": 0.07942379545420408, + "epoch": 5.214477211796247, + "grad_norm": 3.34375, + "learning_rate": 4.930441009992578e-05, + "loss": 0.0693, + "mean_token_accuracy": 0.9833049654960633, + "num_tokens": 57832481.0, + "step": 22370 + }, + { + "entropy": 0.07472335118800402, + "epoch": 5.215642848816878, + "grad_norm": 1.7265625, + "learning_rate": 4.9303907923846394e-05, + "loss": 0.0856, + "mean_token_accuracy": 0.9751461148262024, + "num_tokens": 57843819.0, + "step": 22375 + }, + { + "entropy": 0.0662139642983675, + "epoch": 5.21680848583751, + "grad_norm": 2.328125, + "learning_rate": 4.930340557175298e-05, + "loss": 0.0667, + "mean_token_accuracy": 0.9782337188720703, + "num_tokens": 57859104.0, + "step": 22380 + }, + { + "entropy": 0.05948888058774173, + "epoch": 5.217974122858142, + "grad_norm": 0.396484375, + "learning_rate": 4.930290304365304e-05, + "loss": 0.0567, + "mean_token_accuracy": 0.9833056330680847, + "num_tokens": 57880523.0, + "step": 22385 + }, + { + "entropy": 0.08422890277579427, + "epoch": 5.219139759878773, + "grad_norm": 3.265625, + "learning_rate": 4.9302400339554066e-05, + "loss": 0.0725, + "mean_token_accuracy": 0.9808120965957642, + "num_tokens": 57898052.0, + "step": 22390 + }, + { + "entropy": 0.06969092637300492, + "epoch": 5.2203053968994055, + "grad_norm": 3.453125, + "learning_rate": 4.930189745946355e-05, + "loss": 0.0783, + "mean_token_accuracy": 0.9814900815486908, + "num_tokens": 57910841.0, + "step": 22395 + }, + { + "entropy": 0.05859537925571203, + "epoch": 5.221471033920038, + "grad_norm": 0.212890625, + "learning_rate": 4.9301394403388984e-05, + "loss": 0.0469, + "mean_token_accuracy": 0.9872027635574341, + "num_tokens": 57933764.0, + "step": 22400 + }, + { + "entropy": 0.050899960659444334, + "epoch": 5.222636670940669, + "grad_norm": 1.25, + "learning_rate": 4.930089117133788e-05, + "loss": 0.0283, + "mean_token_accuracy": 0.9913576006889343, + "num_tokens": 57972262.0, + "step": 22405 + }, + { + "entropy": 0.08815276809036732, + "epoch": 5.223802307961301, + "grad_norm": 2.5, + "learning_rate": 4.9300387763317745e-05, + "loss": 0.0823, + "mean_token_accuracy": 0.9758888900279998, + "num_tokens": 57980445.0, + "step": 22410 + }, + { + "entropy": 0.07962552271783352, + "epoch": 5.224967944981932, + "grad_norm": 0.31640625, + "learning_rate": 4.9299884179336074e-05, + "loss": 0.048, + "mean_token_accuracy": 0.9821713328361511, + "num_tokens": 57995875.0, + "step": 22415 + }, + { + "entropy": 0.06803807709366083, + "epoch": 5.226133582002564, + "grad_norm": 1.5859375, + "learning_rate": 4.9299380419400384e-05, + "loss": 0.0664, + "mean_token_accuracy": 0.9840512633323669, + "num_tokens": 58007268.0, + "step": 22420 + }, + { + "entropy": 0.08102755229920149, + "epoch": 5.227299219023196, + "grad_norm": 0.85546875, + "learning_rate": 4.9298876483518185e-05, + "loss": 0.068, + "mean_token_accuracy": 0.9746978640556335, + "num_tokens": 58026770.0, + "step": 22425 + }, + { + "entropy": 0.058991825021803376, + "epoch": 5.228464856043828, + "grad_norm": 1.1171875, + "learning_rate": 4.9298372371696996e-05, + "loss": 0.0457, + "mean_token_accuracy": 0.9835934102535248, + "num_tokens": 58047531.0, + "step": 22430 + }, + { + "entropy": 0.07415626281872392, + "epoch": 5.22963049306446, + "grad_norm": 3.53125, + "learning_rate": 4.929786808394432e-05, + "loss": 0.0326, + "mean_token_accuracy": 0.9845768630504608, + "num_tokens": 58078331.0, + "step": 22435 + }, + { + "entropy": 0.06825162693858147, + "epoch": 5.230796130085092, + "grad_norm": 2.15625, + "learning_rate": 4.929736362026769e-05, + "loss": 0.0658, + "mean_token_accuracy": 0.9820273876190185, + "num_tokens": 58090377.0, + "step": 22440 + }, + { + "entropy": 0.0566140066832304, + "epoch": 5.231961767105723, + "grad_norm": 0.423828125, + "learning_rate": 4.9296858980674625e-05, + "loss": 0.0568, + "mean_token_accuracy": 0.9854518592357635, + "num_tokens": 58102852.0, + "step": 22445 + }, + { + "entropy": 0.0516730266623199, + "epoch": 5.233127404126355, + "grad_norm": 1.34375, + "learning_rate": 4.929635416517265e-05, + "loss": 0.0535, + "mean_token_accuracy": 0.9852344691753387, + "num_tokens": 58128816.0, + "step": 22450 + }, + { + "entropy": 0.058171987719833854, + "epoch": 5.234293041146987, + "grad_norm": 1.03125, + "learning_rate": 4.9295849173769294e-05, + "loss": 0.0562, + "mean_token_accuracy": 0.9863945484161377, + "num_tokens": 58149787.0, + "step": 22455 + }, + { + "entropy": 0.054046990163624284, + "epoch": 5.2354586781676185, + "grad_norm": 1.1015625, + "learning_rate": 4.929534400647208e-05, + "loss": 0.0463, + "mean_token_accuracy": 0.9873308002948761, + "num_tokens": 58169062.0, + "step": 22460 + }, + { + "entropy": 0.05721415225416422, + "epoch": 5.2366243151882506, + "grad_norm": 3.015625, + "learning_rate": 4.929483866328855e-05, + "loss": 0.0374, + "mean_token_accuracy": 0.9847243249416351, + "num_tokens": 58184895.0, + "step": 22465 + }, + { + "entropy": 0.07032016962766648, + "epoch": 5.237789952208882, + "grad_norm": 4.0, + "learning_rate": 4.929433314422622e-05, + "loss": 0.0711, + "mean_token_accuracy": 0.9818163990974427, + "num_tokens": 58199417.0, + "step": 22470 + }, + { + "entropy": 0.0443870535120368, + "epoch": 5.238955589229514, + "grad_norm": 0.97265625, + "learning_rate": 4.929382744929266e-05, + "loss": 0.0304, + "mean_token_accuracy": 0.9896786749362946, + "num_tokens": 58218118.0, + "step": 22475 + }, + { + "entropy": 0.07258482100442051, + "epoch": 5.240121226250146, + "grad_norm": 1.8515625, + "learning_rate": 4.929332157849539e-05, + "loss": 0.0535, + "mean_token_accuracy": 0.984838330745697, + "num_tokens": 58236418.0, + "step": 22480 + }, + { + "entropy": 0.053557580430060626, + "epoch": 5.241286863270777, + "grad_norm": 0.408203125, + "learning_rate": 4.929281553184195e-05, + "loss": 0.0396, + "mean_token_accuracy": 0.9851312339305878, + "num_tokens": 58263109.0, + "step": 22485 + }, + { + "entropy": 0.04420396983623505, + "epoch": 5.242452500291409, + "grad_norm": 0.1337890625, + "learning_rate": 4.92923093093399e-05, + "loss": 0.0157, + "mean_token_accuracy": 0.9877875030040741, + "num_tokens": 58316098.0, + "step": 22490 + }, + { + "entropy": 0.06833171583712101, + "epoch": 5.243618137312041, + "grad_norm": 4.21875, + "learning_rate": 4.929180291099678e-05, + "loss": 0.0734, + "mean_token_accuracy": 0.9797233819961548, + "num_tokens": 58329411.0, + "step": 22495 + }, + { + "entropy": 0.06508544906973839, + "epoch": 5.244783774332673, + "grad_norm": 0.7578125, + "learning_rate": 4.929129633682015e-05, + "loss": 0.0473, + "mean_token_accuracy": 0.9827897489070893, + "num_tokens": 58361368.0, + "step": 22500 + }, + { + "entropy": 0.05918069491162896, + "epoch": 5.245949411353305, + "grad_norm": 0.296875, + "learning_rate": 4.9290789586817544e-05, + "loss": 0.038, + "mean_token_accuracy": 0.982359778881073, + "num_tokens": 58391277.0, + "step": 22505 + }, + { + "entropy": 0.051390673080459234, + "epoch": 5.247115048373936, + "grad_norm": 0.1689453125, + "learning_rate": 4.9290282660996535e-05, + "loss": 0.0292, + "mean_token_accuracy": 0.987204658985138, + "num_tokens": 58446856.0, + "step": 22510 + }, + { + "entropy": 0.0984171137213707, + "epoch": 5.248280685394568, + "grad_norm": 2.578125, + "learning_rate": 4.9289775559364684e-05, + "loss": 0.0913, + "mean_token_accuracy": 0.9747403621673584, + "num_tokens": 58455680.0, + "step": 22515 + }, + { + "entropy": 0.05602652542293072, + "epoch": 5.2494463224152, + "grad_norm": 1.890625, + "learning_rate": 4.928926828192954e-05, + "loss": 0.0699, + "mean_token_accuracy": 0.9806085467338562, + "num_tokens": 58480807.0, + "step": 22520 + }, + { + "entropy": 0.07297561299055814, + "epoch": 5.250611959435831, + "grad_norm": 0.8046875, + "learning_rate": 4.9288760828698674e-05, + "loss": 0.0684, + "mean_token_accuracy": 0.9783793568611145, + "num_tokens": 58502349.0, + "step": 22525 + }, + { + "entropy": 0.07672806866467, + "epoch": 5.2517775964564635, + "grad_norm": 1.9453125, + "learning_rate": 4.9288253199679654e-05, + "loss": 0.0867, + "mean_token_accuracy": 0.9797773718833923, + "num_tokens": 58522445.0, + "step": 22530 + }, + { + "entropy": 0.06881959196180105, + "epoch": 5.252943233477096, + "grad_norm": 1.125, + "learning_rate": 4.928774539488005e-05, + "loss": 0.053, + "mean_token_accuracy": 0.9879900455474854, + "num_tokens": 58542959.0, + "step": 22535 + }, + { + "entropy": 0.07772445082664489, + "epoch": 5.254108870497727, + "grad_norm": 1.015625, + "learning_rate": 4.928723741430743e-05, + "loss": 0.0797, + "mean_token_accuracy": 0.9801374554634095, + "num_tokens": 58553910.0, + "step": 22540 + }, + { + "entropy": 0.06708589978516102, + "epoch": 5.255274507518359, + "grad_norm": 0.81640625, + "learning_rate": 4.928672925796937e-05, + "loss": 0.0683, + "mean_token_accuracy": 0.981471985578537, + "num_tokens": 58567114.0, + "step": 22545 + }, + { + "entropy": 0.059767977148294446, + "epoch": 5.25644014453899, + "grad_norm": 0.7734375, + "learning_rate": 4.9286220925873457e-05, + "loss": 0.0509, + "mean_token_accuracy": 0.9889122486114502, + "num_tokens": 58583769.0, + "step": 22550 + }, + { + "entropy": 0.06370611824095249, + "epoch": 5.257605781559622, + "grad_norm": 1.8984375, + "learning_rate": 4.9285712418027254e-05, + "loss": 0.0852, + "mean_token_accuracy": 0.9790844082832336, + "num_tokens": 58594274.0, + "step": 22555 + }, + { + "entropy": 0.08938689008355141, + "epoch": 5.258771418580254, + "grad_norm": 3.890625, + "learning_rate": 4.928520373443836e-05, + "loss": 0.0872, + "mean_token_accuracy": 0.9796117067337036, + "num_tokens": 58602989.0, + "step": 22560 + }, + { + "entropy": 0.0646333851851523, + "epoch": 5.2599370556008855, + "grad_norm": 0.73828125, + "learning_rate": 4.928469487511434e-05, + "loss": 0.0503, + "mean_token_accuracy": 0.9790112435817718, + "num_tokens": 58622536.0, + "step": 22565 + }, + { + "entropy": 0.06210710275918245, + "epoch": 5.261102692621518, + "grad_norm": 0.515625, + "learning_rate": 4.9284185840062805e-05, + "loss": 0.0313, + "mean_token_accuracy": 0.9861810505390167, + "num_tokens": 58647178.0, + "step": 22570 + }, + { + "entropy": 0.07191750202327966, + "epoch": 5.26226832964215, + "grad_norm": 4.03125, + "learning_rate": 4.928367662929133e-05, + "loss": 0.0743, + "mean_token_accuracy": 0.9779257357120514, + "num_tokens": 58660942.0, + "step": 22575 + }, + { + "entropy": 0.07680397816002368, + "epoch": 5.263433966662781, + "grad_norm": 2.265625, + "learning_rate": 4.928316724280751e-05, + "loss": 0.0628, + "mean_token_accuracy": 0.9832376003265381, + "num_tokens": 58679318.0, + "step": 22580 + }, + { + "entropy": 0.04947316385805607, + "epoch": 5.264599603683413, + "grad_norm": 0.373046875, + "learning_rate": 4.928265768061895e-05, + "loss": 0.0281, + "mean_token_accuracy": 0.9875920355319977, + "num_tokens": 58706921.0, + "step": 22585 + }, + { + "entropy": 0.06746490080840886, + "epoch": 5.265765240704045, + "grad_norm": 1.9375, + "learning_rate": 4.928214794273324e-05, + "loss": 0.044, + "mean_token_accuracy": 0.9858042418956756, + "num_tokens": 58731640.0, + "step": 22590 + }, + { + "entropy": 0.06307427566498518, + "epoch": 5.266930877724676, + "grad_norm": 0.388671875, + "learning_rate": 4.928163802915798e-05, + "loss": 0.042, + "mean_token_accuracy": 0.9826907038688659, + "num_tokens": 58753369.0, + "step": 22595 + }, + { + "entropy": 0.05262499302625656, + "epoch": 5.2680965147453085, + "grad_norm": 0.609375, + "learning_rate": 4.928112793990078e-05, + "loss": 0.0234, + "mean_token_accuracy": 0.9888963580131531, + "num_tokens": 58793583.0, + "step": 22600 + }, + { + "entropy": 0.06587472511455417, + "epoch": 5.26926215176594, + "grad_norm": 0.62109375, + "learning_rate": 4.928061767496924e-05, + "loss": 0.0519, + "mean_token_accuracy": 0.9828464448451996, + "num_tokens": 58823115.0, + "step": 22605 + }, + { + "entropy": 0.06749656610190868, + "epoch": 5.270427788786572, + "grad_norm": 2.78125, + "learning_rate": 4.928010723437097e-05, + "loss": 0.0627, + "mean_token_accuracy": 0.9848908543586731, + "num_tokens": 58838981.0, + "step": 22610 + }, + { + "entropy": 0.06774584236554801, + "epoch": 5.271593425807204, + "grad_norm": 3.125, + "learning_rate": 4.927959661811359e-05, + "loss": 0.0602, + "mean_token_accuracy": 0.9852911114692688, + "num_tokens": 58862052.0, + "step": 22615 + }, + { + "entropy": 0.06854531690478324, + "epoch": 5.272759062827835, + "grad_norm": 0.94921875, + "learning_rate": 4.92790858262047e-05, + "loss": 0.0544, + "mean_token_accuracy": 0.9801726341247559, + "num_tokens": 58886661.0, + "step": 22620 + }, + { + "entropy": 0.058332338370382784, + "epoch": 5.273924699848467, + "grad_norm": 2.375, + "learning_rate": 4.9278574858651923e-05, + "loss": 0.0617, + "mean_token_accuracy": 0.9846881628036499, + "num_tokens": 58899831.0, + "step": 22625 + }, + { + "entropy": 0.06559796500951051, + "epoch": 5.275090336869099, + "grad_norm": 0.97265625, + "learning_rate": 4.927806371546288e-05, + "loss": 0.0361, + "mean_token_accuracy": 0.9890841841697693, + "num_tokens": 58927345.0, + "step": 22630 + }, + { + "entropy": 0.06151863979175687, + "epoch": 5.2762559738897306, + "grad_norm": 4.15625, + "learning_rate": 4.927755239664519e-05, + "loss": 0.0494, + "mean_token_accuracy": 0.985187166929245, + "num_tokens": 58952386.0, + "step": 22635 + }, + { + "entropy": 0.0975575815886259, + "epoch": 5.277421610910363, + "grad_norm": 0.984375, + "learning_rate": 4.927704090220649e-05, + "loss": 0.1269, + "mean_token_accuracy": 0.9716195166110992, + "num_tokens": 58972365.0, + "step": 22640 + }, + { + "entropy": 0.06953037157654762, + "epoch": 5.278587247930994, + "grad_norm": 1.484375, + "learning_rate": 4.927652923215439e-05, + "loss": 0.0553, + "mean_token_accuracy": 0.9839057862758637, + "num_tokens": 58982805.0, + "step": 22645 + }, + { + "entropy": 0.051419579051434995, + "epoch": 5.279752884951626, + "grad_norm": 2.234375, + "learning_rate": 4.927601738649652e-05, + "loss": 0.0405, + "mean_token_accuracy": 0.9825731217861176, + "num_tokens": 59008791.0, + "step": 22650 + }, + { + "entropy": 0.05781197277829051, + "epoch": 5.280918521972258, + "grad_norm": 0.4921875, + "learning_rate": 4.9275505365240525e-05, + "loss": 0.0301, + "mean_token_accuracy": 0.9908199787139893, + "num_tokens": 59039802.0, + "step": 22655 + }, + { + "entropy": 0.1533332671970129, + "epoch": 5.282084158992889, + "grad_norm": 2.828125, + "learning_rate": 4.927499316839403e-05, + "loss": 0.213, + "mean_token_accuracy": 0.9515707314014434, + "num_tokens": 59063712.0, + "step": 22660 + }, + { + "entropy": 0.156291969679296, + "epoch": 5.283249796013521, + "grad_norm": 0.46484375, + "learning_rate": 4.927448079596468e-05, + "loss": 0.2686, + "mean_token_accuracy": 0.943569415807724, + "num_tokens": 59085381.0, + "step": 22665 + }, + { + "entropy": 0.08054910823702813, + "epoch": 5.2844154330341535, + "grad_norm": 3.484375, + "learning_rate": 4.927396824796011e-05, + "loss": 0.0653, + "mean_token_accuracy": 0.9849894046783447, + "num_tokens": 59096600.0, + "step": 22670 + }, + { + "entropy": 0.04917644914239645, + "epoch": 5.285581070054785, + "grad_norm": 1.0546875, + "learning_rate": 4.9273455524387966e-05, + "loss": 0.0503, + "mean_token_accuracy": 0.9843254506587982, + "num_tokens": 59119465.0, + "step": 22675 + }, + { + "entropy": 0.041685186047106984, + "epoch": 5.286746707075417, + "grad_norm": 0.69140625, + "learning_rate": 4.927294262525589e-05, + "loss": 0.042, + "mean_token_accuracy": 0.984868735074997, + "num_tokens": 59146252.0, + "step": 22680 + }, + { + "entropy": 0.04622435262426734, + "epoch": 5.287912344096048, + "grad_norm": 1.3828125, + "learning_rate": 4.9272429550571536e-05, + "loss": 0.0284, + "mean_token_accuracy": 0.987972092628479, + "num_tokens": 59171522.0, + "step": 22685 + }, + { + "entropy": 0.07690389379858971, + "epoch": 5.28907798111668, + "grad_norm": 1.78125, + "learning_rate": 4.9271916300342544e-05, + "loss": 0.0789, + "mean_token_accuracy": 0.9799573600292206, + "num_tokens": 59185612.0, + "step": 22690 + }, + { + "entropy": 0.07589298877865076, + "epoch": 5.290243618137312, + "grad_norm": 1.515625, + "learning_rate": 4.927140287457658e-05, + "loss": 0.0701, + "mean_token_accuracy": 0.9791806638240814, + "num_tokens": 59195994.0, + "step": 22695 + }, + { + "entropy": 0.06374965868890285, + "epoch": 5.2914092551579435, + "grad_norm": 1.40625, + "learning_rate": 4.927088927328129e-05, + "loss": 0.0695, + "mean_token_accuracy": 0.9842319011688232, + "num_tokens": 59220450.0, + "step": 22700 + }, + { + "entropy": 0.08326494041830301, + "epoch": 5.292574892178576, + "grad_norm": 1.1328125, + "learning_rate": 4.927037549646434e-05, + "loss": 0.0613, + "mean_token_accuracy": 0.9815594732761384, + "num_tokens": 59237551.0, + "step": 22705 + }, + { + "entropy": 0.053751620929688215, + "epoch": 5.293740529199208, + "grad_norm": 0.45703125, + "learning_rate": 4.926986154413338e-05, + "loss": 0.0348, + "mean_token_accuracy": 0.9854931950569152, + "num_tokens": 59267330.0, + "step": 22710 + }, + { + "entropy": 0.076590671017766, + "epoch": 5.294906166219839, + "grad_norm": 3.375, + "learning_rate": 4.926934741629609e-05, + "loss": 0.0948, + "mean_token_accuracy": 0.977790892124176, + "num_tokens": 59279092.0, + "step": 22715 + }, + { + "entropy": 0.0669423419982195, + "epoch": 5.296071803240471, + "grad_norm": 0.455078125, + "learning_rate": 4.9268833112960126e-05, + "loss": 0.0762, + "mean_token_accuracy": 0.9806538939476013, + "num_tokens": 59290726.0, + "step": 22720 + }, + { + "entropy": 0.06741415970027446, + "epoch": 5.297237440261103, + "grad_norm": 1.7421875, + "learning_rate": 4.926831863413316e-05, + "loss": 0.0538, + "mean_token_accuracy": 0.9821211993694305, + "num_tokens": 59311371.0, + "step": 22725 + }, + { + "entropy": 0.0793554861098528, + "epoch": 5.298403077281734, + "grad_norm": 1.515625, + "learning_rate": 4.9267803979822856e-05, + "loss": 0.0552, + "mean_token_accuracy": 0.9833518862724304, + "num_tokens": 59333334.0, + "step": 22730 + }, + { + "entropy": 0.07032034313306212, + "epoch": 5.299568714302366, + "grad_norm": 0.6171875, + "learning_rate": 4.92672891500369e-05, + "loss": 0.0468, + "mean_token_accuracy": 0.9863258957862854, + "num_tokens": 59355525.0, + "step": 22735 + }, + { + "entropy": 0.06814271304756403, + "epoch": 5.300734351322998, + "grad_norm": 1.0625, + "learning_rate": 4.9266774144782965e-05, + "loss": 0.0625, + "mean_token_accuracy": 0.9831117630004883, + "num_tokens": 59368173.0, + "step": 22740 + }, + { + "entropy": 0.08565502576529979, + "epoch": 5.30189998834363, + "grad_norm": 0.65234375, + "learning_rate": 4.926625896406873e-05, + "loss": 0.0617, + "mean_token_accuracy": 0.9830956637859345, + "num_tokens": 59387615.0, + "step": 22745 + }, + { + "entropy": 0.07533656526356936, + "epoch": 5.303065625364262, + "grad_norm": 0.7578125, + "learning_rate": 4.926574360790187e-05, + "loss": 0.0657, + "mean_token_accuracy": 0.9792934238910675, + "num_tokens": 59406251.0, + "step": 22750 + }, + { + "entropy": 0.0985400104895234, + "epoch": 5.304231262384893, + "grad_norm": 0.9921875, + "learning_rate": 4.926522807629008e-05, + "loss": 0.1293, + "mean_token_accuracy": 0.9710362374782562, + "num_tokens": 59442852.0, + "step": 22755 + }, + { + "entropy": 0.05292136138305068, + "epoch": 5.305396899405525, + "grad_norm": 0.703125, + "learning_rate": 4.9264712369241044e-05, + "loss": 0.0253, + "mean_token_accuracy": 0.9833535492420197, + "num_tokens": 59483889.0, + "step": 22760 + }, + { + "entropy": 0.06867659520357847, + "epoch": 5.306562536426157, + "grad_norm": 1.0, + "learning_rate": 4.926419648676245e-05, + "loss": 0.0453, + "mean_token_accuracy": 0.9813343346118927, + "num_tokens": 59503823.0, + "step": 22765 + }, + { + "entropy": 0.05698272874578834, + "epoch": 5.3077281734467885, + "grad_norm": 0.419921875, + "learning_rate": 4.926368042886199e-05, + "loss": 0.0302, + "mean_token_accuracy": 0.9861780762672424, + "num_tokens": 59529127.0, + "step": 22770 + }, + { + "entropy": 0.0628203245345503, + "epoch": 5.308893810467421, + "grad_norm": 0.337890625, + "learning_rate": 4.926316419554737e-05, + "loss": 0.06, + "mean_token_accuracy": 0.9819603025913238, + "num_tokens": 59549039.0, + "step": 22775 + }, + { + "entropy": 0.08705004677176476, + "epoch": 5.310059447488052, + "grad_norm": 2.578125, + "learning_rate": 4.926264778682627e-05, + "loss": 0.105, + "mean_token_accuracy": 0.9772071361541748, + "num_tokens": 59556652.0, + "step": 22780 + }, + { + "entropy": 0.05819199327379465, + "epoch": 5.311225084508684, + "grad_norm": 1.3203125, + "learning_rate": 4.9262131202706404e-05, + "loss": 0.0728, + "mean_token_accuracy": 0.9819917857646943, + "num_tokens": 59567756.0, + "step": 22785 + }, + { + "entropy": 0.07961954735219479, + "epoch": 5.312390721529316, + "grad_norm": 1.671875, + "learning_rate": 4.926161444319547e-05, + "loss": 0.0685, + "mean_token_accuracy": 0.98085697889328, + "num_tokens": 59578826.0, + "step": 22790 + }, + { + "entropy": 0.08680852949619293, + "epoch": 5.313556358549947, + "grad_norm": 2.75, + "learning_rate": 4.926109750830117e-05, + "loss": 0.0905, + "mean_token_accuracy": 0.9772542119026184, + "num_tokens": 59587586.0, + "step": 22795 + }, + { + "entropy": 0.08079705536365508, + "epoch": 5.314721995570579, + "grad_norm": 2.171875, + "learning_rate": 4.9260580398031217e-05, + "loss": 0.054, + "mean_token_accuracy": 0.9830148041248321, + "num_tokens": 59599512.0, + "step": 22800 + }, + { + "entropy": 0.0764385698363185, + "epoch": 5.315887632591211, + "grad_norm": 1.9453125, + "learning_rate": 4.926006311239333e-05, + "loss": 0.0765, + "mean_token_accuracy": 0.9792965352535248, + "num_tokens": 59617321.0, + "step": 22805 + }, + { + "entropy": 0.05908251665532589, + "epoch": 5.317053269611843, + "grad_norm": 1.203125, + "learning_rate": 4.9259545651395206e-05, + "loss": 0.0499, + "mean_token_accuracy": 0.9890253007411957, + "num_tokens": 59637540.0, + "step": 22810 + }, + { + "entropy": 0.10033303014934063, + "epoch": 5.318218906632475, + "grad_norm": 2.296875, + "learning_rate": 4.925902801504457e-05, + "loss": 0.0497, + "mean_token_accuracy": 0.9801102578639984, + "num_tokens": 59649451.0, + "step": 22815 + }, + { + "entropy": 0.07409988492727279, + "epoch": 5.319384543653106, + "grad_norm": 2.203125, + "learning_rate": 4.925851020334914e-05, + "loss": 0.0672, + "mean_token_accuracy": 0.9834694027900696, + "num_tokens": 59671666.0, + "step": 22820 + }, + { + "entropy": 0.05841764649376273, + "epoch": 5.320550180673738, + "grad_norm": 3.078125, + "learning_rate": 4.925799221631664e-05, + "loss": 0.0625, + "mean_token_accuracy": 0.9858494102954865, + "num_tokens": 59689708.0, + "step": 22825 + }, + { + "entropy": 0.07404088694602251, + "epoch": 5.32171581769437, + "grad_norm": 1.3984375, + "learning_rate": 4.925747405395479e-05, + "loss": 0.0631, + "mean_token_accuracy": 0.9827221155166626, + "num_tokens": 59701112.0, + "step": 22830 + }, + { + "entropy": 0.07440090011805296, + "epoch": 5.322881454715001, + "grad_norm": 3.109375, + "learning_rate": 4.925695571627131e-05, + "loss": 0.0681, + "mean_token_accuracy": 0.9816492676734925, + "num_tokens": 59712390.0, + "step": 22835 + }, + { + "entropy": 0.09319182969629765, + "epoch": 5.3240470917356335, + "grad_norm": 6.5, + "learning_rate": 4.925643720327395e-05, + "loss": 0.0705, + "mean_token_accuracy": 0.9809212386608124, + "num_tokens": 59730225.0, + "step": 22840 + }, + { + "entropy": 0.06552286930382252, + "epoch": 5.325212728756266, + "grad_norm": 2.09375, + "learning_rate": 4.9255918514970424e-05, + "loss": 0.0519, + "mean_token_accuracy": 0.9848517298698425, + "num_tokens": 59747533.0, + "step": 22845 + }, + { + "entropy": 0.06686258316040039, + "epoch": 5.326378365776897, + "grad_norm": 3.609375, + "learning_rate": 4.9255399651368465e-05, + "loss": 0.0496, + "mean_token_accuracy": 0.9866720378398895, + "num_tokens": 59772803.0, + "step": 22850 + }, + { + "entropy": 0.08167769797146321, + "epoch": 5.327544002797529, + "grad_norm": 0.62890625, + "learning_rate": 4.9254880612475816e-05, + "loss": 0.0716, + "mean_token_accuracy": 0.9794778764247895, + "num_tokens": 59786475.0, + "step": 22855 + }, + { + "entropy": 0.06763834794983267, + "epoch": 5.328709639818161, + "grad_norm": 0.3203125, + "learning_rate": 4.925436139830022e-05, + "loss": 0.0275, + "mean_token_accuracy": 0.9890903234481812, + "num_tokens": 59815060.0, + "step": 22860 + }, + { + "entropy": 0.04429660914465785, + "epoch": 5.329875276838792, + "grad_norm": 1.328125, + "learning_rate": 4.925384200884942e-05, + "loss": 0.0434, + "mean_token_accuracy": 0.9886953115463257, + "num_tokens": 59855529.0, + "step": 22865 + }, + { + "entropy": 0.06582138538360596, + "epoch": 5.331040913859424, + "grad_norm": 1.5078125, + "learning_rate": 4.925332244413115e-05, + "loss": 0.0621, + "mean_token_accuracy": 0.981554490327835, + "num_tokens": 59869039.0, + "step": 22870 + }, + { + "entropy": 0.07861397005617618, + "epoch": 5.332206550880056, + "grad_norm": 1.765625, + "learning_rate": 4.9252802704153176e-05, + "loss": 0.0727, + "mean_token_accuracy": 0.9805832087993622, + "num_tokens": 59883296.0, + "step": 22875 + }, + { + "entropy": 0.23328736871480943, + "epoch": 5.333372187900688, + "grad_norm": 0.94921875, + "learning_rate": 4.925228278892323e-05, + "loss": 0.3953, + "mean_token_accuracy": 0.9536088466644287, + "num_tokens": 59902762.0, + "step": 22880 + }, + { + "entropy": 0.10579044306650758, + "epoch": 5.33453782492132, + "grad_norm": 0.52734375, + "learning_rate": 4.925176269844907e-05, + "loss": 0.051, + "mean_token_accuracy": 0.9797735571861267, + "num_tokens": 59924881.0, + "step": 22885 + }, + { + "entropy": 0.06493972707539797, + "epoch": 5.335703461941951, + "grad_norm": 1.75, + "learning_rate": 4.925124243273845e-05, + "loss": 0.0758, + "mean_token_accuracy": 0.9782446086406708, + "num_tokens": 59934987.0, + "step": 22890 + }, + { + "entropy": 0.06513304226100444, + "epoch": 5.336869098962583, + "grad_norm": 1.1953125, + "learning_rate": 4.925072199179913e-05, + "loss": 0.0462, + "mean_token_accuracy": 0.9851133108139039, + "num_tokens": 59959324.0, + "step": 22895 + }, + { + "entropy": 0.05224155634641647, + "epoch": 5.338034735983215, + "grad_norm": 0.76953125, + "learning_rate": 4.925020137563887e-05, + "loss": 0.044, + "mean_token_accuracy": 0.9833649158477783, + "num_tokens": 59983184.0, + "step": 22900 + }, + { + "entropy": 0.056037331186234954, + "epoch": 5.339200373003846, + "grad_norm": 0.64453125, + "learning_rate": 4.924968058426545e-05, + "loss": 0.0449, + "mean_token_accuracy": 0.9889161825180054, + "num_tokens": 60001573.0, + "step": 22905 + }, + { + "entropy": 0.04825269635766745, + "epoch": 5.3403660100244785, + "grad_norm": 0.3515625, + "learning_rate": 4.9249159617686604e-05, + "loss": 0.0468, + "mean_token_accuracy": 0.9882710158824921, + "num_tokens": 60022740.0, + "step": 22910 + }, + { + "entropy": 0.1773822302930057, + "epoch": 5.34153164704511, + "grad_norm": 5.46875, + "learning_rate": 4.9248638475910115e-05, + "loss": 0.2673, + "mean_token_accuracy": 0.9561372995376587, + "num_tokens": 60052310.0, + "step": 22915 + }, + { + "entropy": 0.08487709350883961, + "epoch": 5.342697284065742, + "grad_norm": 4.09375, + "learning_rate": 4.924811715894376e-05, + "loss": 0.0975, + "mean_token_accuracy": 0.974593210220337, + "num_tokens": 60070713.0, + "step": 22920 + }, + { + "entropy": 0.056556498538702725, + "epoch": 5.343862921086374, + "grad_norm": 3.625, + "learning_rate": 4.924759566679531e-05, + "loss": 0.0526, + "mean_token_accuracy": 0.984348326921463, + "num_tokens": 60096937.0, + "step": 22925 + }, + { + "entropy": 0.07066576043143868, + "epoch": 5.345028558107005, + "grad_norm": 2.328125, + "learning_rate": 4.9247073999472536e-05, + "loss": 0.0562, + "mean_token_accuracy": 0.9847615897655487, + "num_tokens": 60117778.0, + "step": 22930 + }, + { + "entropy": 0.10020845541730523, + "epoch": 5.346194195127637, + "grad_norm": 1.3515625, + "learning_rate": 4.9246552156983224e-05, + "loss": 0.0264, + "mean_token_accuracy": 0.9856527805328369, + "num_tokens": 60146277.0, + "step": 22935 + }, + { + "entropy": 0.07661400884389877, + "epoch": 5.347359832148269, + "grad_norm": 2.71875, + "learning_rate": 4.924603013933515e-05, + "loss": 0.0775, + "mean_token_accuracy": 0.9822577476501465, + "num_tokens": 60156866.0, + "step": 22940 + }, + { + "entropy": 0.08874692600220442, + "epoch": 5.348525469168901, + "grad_norm": 2.625, + "learning_rate": 4.92455079465361e-05, + "loss": 0.0369, + "mean_token_accuracy": 0.9810423135757447, + "num_tokens": 60176878.0, + "step": 22945 + }, + { + "entropy": 0.07762233018875123, + "epoch": 5.349691106189533, + "grad_norm": 0.921875, + "learning_rate": 4.924498557859386e-05, + "loss": 0.0807, + "mean_token_accuracy": 0.9797823786735534, + "num_tokens": 60194815.0, + "step": 22950 + }, + { + "entropy": 0.05616168519482016, + "epoch": 5.350856743210164, + "grad_norm": 0.96484375, + "learning_rate": 4.924446303551622e-05, + "loss": 0.0522, + "mean_token_accuracy": 0.9837803184986115, + "num_tokens": 60214031.0, + "step": 22955 + }, + { + "entropy": 0.06755325347185134, + "epoch": 5.352022380230796, + "grad_norm": 2.90625, + "learning_rate": 4.9243940317310964e-05, + "loss": 0.0697, + "mean_token_accuracy": 0.9821909844875336, + "num_tokens": 60227009.0, + "step": 22960 + }, + { + "entropy": 0.07493439922109246, + "epoch": 5.353188017251428, + "grad_norm": 0.328125, + "learning_rate": 4.92434174239859e-05, + "loss": 0.0874, + "mean_token_accuracy": 0.9795318305492401, + "num_tokens": 60240587.0, + "step": 22965 + }, + { + "entropy": 0.059706505248323084, + "epoch": 5.354353654272059, + "grad_norm": 0.5703125, + "learning_rate": 4.924289435554882e-05, + "loss": 0.0517, + "mean_token_accuracy": 0.9835958003997802, + "num_tokens": 60265229.0, + "step": 22970 + }, + { + "entropy": 0.0833500050008297, + "epoch": 5.355519291292691, + "grad_norm": 3.140625, + "learning_rate": 4.9242371112007526e-05, + "loss": 0.0761, + "mean_token_accuracy": 0.9778411746025085, + "num_tokens": 60283665.0, + "step": 22975 + }, + { + "entropy": 0.0801816951483488, + "epoch": 5.3566849283133235, + "grad_norm": 3.859375, + "learning_rate": 4.9241847693369816e-05, + "loss": 0.0882, + "mean_token_accuracy": 0.9788227140903473, + "num_tokens": 60292492.0, + "step": 22980 + }, + { + "entropy": 0.055112460954114796, + "epoch": 5.357850565333955, + "grad_norm": 0.734375, + "learning_rate": 4.924132409964349e-05, + "loss": 0.0386, + "mean_token_accuracy": 0.9883932769298553, + "num_tokens": 60319183.0, + "step": 22985 + }, + { + "entropy": 0.06558550810441374, + "epoch": 5.359016202354587, + "grad_norm": 0.3203125, + "learning_rate": 4.924080033083637e-05, + "loss": 0.0699, + "mean_token_accuracy": 0.9772551417350769, + "num_tokens": 60335640.0, + "step": 22990 + }, + { + "entropy": 0.05295390598475933, + "epoch": 5.360181839375219, + "grad_norm": 1.25, + "learning_rate": 4.9240276386956246e-05, + "loss": 0.0433, + "mean_token_accuracy": 0.9863731324672699, + "num_tokens": 60352124.0, + "step": 22995 + }, + { + "entropy": 0.06834200341254473, + "epoch": 5.36134747639585, + "grad_norm": 2.453125, + "learning_rate": 4.923975226801095e-05, + "loss": 0.0382, + "mean_token_accuracy": 0.9834942102432251, + "num_tokens": 60376603.0, + "step": 23000 + }, + { + "entropy": 0.05982331410050392, + "epoch": 5.362513113416482, + "grad_norm": 1.8046875, + "learning_rate": 4.92392279740083e-05, + "loss": 0.0499, + "mean_token_accuracy": 0.9867896795272827, + "num_tokens": 60390601.0, + "step": 23005 + }, + { + "entropy": 0.05300872353836894, + "epoch": 5.3636787504371135, + "grad_norm": 0.310546875, + "learning_rate": 4.9238703504956084e-05, + "loss": 0.052, + "mean_token_accuracy": 0.9840744435787201, + "num_tokens": 60413557.0, + "step": 23010 + }, + { + "entropy": 0.11512393001466989, + "epoch": 5.364844387457746, + "grad_norm": 0.466796875, + "learning_rate": 4.9238178860862155e-05, + "loss": 0.1433, + "mean_token_accuracy": 0.9661681652069092, + "num_tokens": 60447690.0, + "step": 23015 + }, + { + "entropy": 0.08497421033680438, + "epoch": 5.366010024478378, + "grad_norm": 1.75, + "learning_rate": 4.923765404173432e-05, + "loss": 0.0769, + "mean_token_accuracy": 0.9765770733356476, + "num_tokens": 60457956.0, + "step": 23020 + }, + { + "entropy": 0.06545198997482657, + "epoch": 5.367175661499009, + "grad_norm": 3.234375, + "learning_rate": 4.923712904758041e-05, + "loss": 0.0718, + "mean_token_accuracy": 0.9820848286151886, + "num_tokens": 60477779.0, + "step": 23025 + }, + { + "entropy": 0.05556817147880792, + "epoch": 5.368341298519641, + "grad_norm": 0.86328125, + "learning_rate": 4.923660387840826e-05, + "loss": 0.0521, + "mean_token_accuracy": 0.9859776258468628, + "num_tokens": 60496484.0, + "step": 23030 + }, + { + "entropy": 0.06978419441729784, + "epoch": 5.369506935540273, + "grad_norm": 0.10986328125, + "learning_rate": 4.923607853422568e-05, + "loss": 0.0377, + "mean_token_accuracy": 0.983120220899582, + "num_tokens": 60527179.0, + "step": 23035 + }, + { + "entropy": 0.05648170947097242, + "epoch": 5.370672572560904, + "grad_norm": 0.66796875, + "learning_rate": 4.9235553015040525e-05, + "loss": 0.0363, + "mean_token_accuracy": 0.9899722754955291, + "num_tokens": 60554130.0, + "step": 23040 + }, + { + "entropy": 0.06253222562372684, + "epoch": 5.3718382095815365, + "grad_norm": 2.59375, + "learning_rate": 4.923502732086062e-05, + "loss": 0.0654, + "mean_token_accuracy": 0.9840362310409546, + "num_tokens": 60568544.0, + "step": 23045 + }, + { + "entropy": 0.06851723911240697, + "epoch": 5.373003846602168, + "grad_norm": 1.3515625, + "learning_rate": 4.923450145169381e-05, + "loss": 0.0727, + "mean_token_accuracy": 0.9818732261657714, + "num_tokens": 60581662.0, + "step": 23050 + }, + { + "entropy": 0.061725224601104856, + "epoch": 5.3741694836228, + "grad_norm": 1.390625, + "learning_rate": 4.923397540754793e-05, + "loss": 0.0405, + "mean_token_accuracy": 0.9862514436244965, + "num_tokens": 60607397.0, + "step": 23055 + }, + { + "entropy": 0.04291647081263363, + "epoch": 5.375335120643432, + "grad_norm": 0.1533203125, + "learning_rate": 4.923344918843084e-05, + "loss": 0.0226, + "mean_token_accuracy": 0.9882668256759644, + "num_tokens": 60637436.0, + "step": 23060 + }, + { + "entropy": 0.07198273623362184, + "epoch": 5.376500757664063, + "grad_norm": 1.7890625, + "learning_rate": 4.923292279435036e-05, + "loss": 0.0546, + "mean_token_accuracy": 0.9835138738155365, + "num_tokens": 60662709.0, + "step": 23065 + }, + { + "entropy": 0.04424673020839691, + "epoch": 5.377666394684695, + "grad_norm": 3.5625, + "learning_rate": 4.923239622531436e-05, + "loss": 0.0484, + "mean_token_accuracy": 0.9847693383693695, + "num_tokens": 60690611.0, + "step": 23070 + }, + { + "entropy": 0.07281300444155932, + "epoch": 5.378832031705327, + "grad_norm": 0.8359375, + "learning_rate": 4.923186948133068e-05, + "loss": 0.0682, + "mean_token_accuracy": 0.9764643311500549, + "num_tokens": 60714624.0, + "step": 23075 + }, + { + "entropy": 0.0488906929269433, + "epoch": 5.3799976687259585, + "grad_norm": 2.96875, + "learning_rate": 4.923134256240718e-05, + "loss": 0.0612, + "mean_token_accuracy": 0.985430383682251, + "num_tokens": 60747333.0, + "step": 23080 + }, + { + "entropy": 0.07100528012961149, + "epoch": 5.381163305746591, + "grad_norm": 0.498046875, + "learning_rate": 4.923081546855173e-05, + "loss": 0.0735, + "mean_token_accuracy": 0.9796480000019073, + "num_tokens": 60764522.0, + "step": 23085 + }, + { + "entropy": 0.04467966118827462, + "epoch": 5.382328942767222, + "grad_norm": 0.57421875, + "learning_rate": 4.923028819977217e-05, + "loss": 0.0198, + "mean_token_accuracy": 0.9906904339790344, + "num_tokens": 60794466.0, + "step": 23090 + }, + { + "entropy": 0.07430108338594436, + "epoch": 5.383494579787854, + "grad_norm": 0.64453125, + "learning_rate": 4.9229760756076365e-05, + "loss": 0.044, + "mean_token_accuracy": 0.9841860115528107, + "num_tokens": 60823886.0, + "step": 23095 + }, + { + "entropy": 0.06416118433699011, + "epoch": 5.384660216808486, + "grad_norm": 2.375, + "learning_rate": 4.922923313747218e-05, + "loss": 0.0544, + "mean_token_accuracy": 0.9824954152107239, + "num_tokens": 60842778.0, + "step": 23100 + }, + { + "entropy": 0.06129303025081754, + "epoch": 5.385825853829117, + "grad_norm": 2.5, + "learning_rate": 4.92287053439675e-05, + "loss": 0.0441, + "mean_token_accuracy": 0.9878499805927277, + "num_tokens": 60863963.0, + "step": 23105 + }, + { + "entropy": 0.056876167096197604, + "epoch": 5.386991490849749, + "grad_norm": 1.109375, + "learning_rate": 4.922817737557018e-05, + "loss": 0.0511, + "mean_token_accuracy": 0.9832296967506409, + "num_tokens": 60880737.0, + "step": 23110 + }, + { + "entropy": 0.06693640761077405, + "epoch": 5.3881571278703815, + "grad_norm": 3.421875, + "learning_rate": 4.922764923228809e-05, + "loss": 0.0559, + "mean_token_accuracy": 0.9846435010433197, + "num_tokens": 60901158.0, + "step": 23115 + }, + { + "entropy": 0.05447808532044292, + "epoch": 5.389322764891013, + "grad_norm": 1.2890625, + "learning_rate": 4.922712091412912e-05, + "loss": 0.058, + "mean_token_accuracy": 0.984618604183197, + "num_tokens": 60925795.0, + "step": 23120 + }, + { + "entropy": 0.06588453128933906, + "epoch": 5.390488401911645, + "grad_norm": 1.5234375, + "learning_rate": 4.9226592421101134e-05, + "loss": 0.0641, + "mean_token_accuracy": 0.986005574464798, + "num_tokens": 60948082.0, + "step": 23125 + }, + { + "entropy": 0.0593166496604681, + "epoch": 5.391654038932277, + "grad_norm": 0.83984375, + "learning_rate": 4.922606375321201e-05, + "loss": 0.0496, + "mean_token_accuracy": 0.9875987470149994, + "num_tokens": 60959806.0, + "step": 23130 + }, + { + "entropy": 0.04983818177133799, + "epoch": 5.392819675952908, + "grad_norm": 0.36328125, + "learning_rate": 4.922553491046965e-05, + "loss": 0.0235, + "mean_token_accuracy": 0.989845621585846, + "num_tokens": 60990336.0, + "step": 23135 + }, + { + "entropy": 0.043969867564737794, + "epoch": 5.39398531297354, + "grad_norm": 4.375, + "learning_rate": 4.9225005892881917e-05, + "loss": 0.0296, + "mean_token_accuracy": 0.9865171551704407, + "num_tokens": 61029307.0, + "step": 23140 + }, + { + "entropy": 0.06361808869987726, + "epoch": 5.395150949994171, + "grad_norm": 1.71875, + "learning_rate": 4.922447670045672e-05, + "loss": 0.0494, + "mean_token_accuracy": 0.9815893948078156, + "num_tokens": 61059113.0, + "step": 23145 + }, + { + "entropy": 0.05684220269322395, + "epoch": 5.3963165870148035, + "grad_norm": 1.390625, + "learning_rate": 4.922394733320193e-05, + "loss": 0.0478, + "mean_token_accuracy": 0.9857088804244996, + "num_tokens": 61082757.0, + "step": 23150 + }, + { + "entropy": 0.05390456821769476, + "epoch": 5.397482224035436, + "grad_norm": 0.439453125, + "learning_rate": 4.922341779112546e-05, + "loss": 0.0586, + "mean_token_accuracy": 0.9862619757652282, + "num_tokens": 61107803.0, + "step": 23155 + }, + { + "entropy": 0.05104854414239526, + "epoch": 5.398647861056067, + "grad_norm": 1.484375, + "learning_rate": 4.9222888074235194e-05, + "loss": 0.0264, + "mean_token_accuracy": 0.9864234507083893, + "num_tokens": 61134142.0, + "step": 23160 + }, + { + "entropy": 0.05798858366906643, + "epoch": 5.399813498076699, + "grad_norm": 1.3671875, + "learning_rate": 4.922235818253904e-05, + "loss": 0.0315, + "mean_token_accuracy": 0.9854933500289917, + "num_tokens": 61162301.0, + "step": 23165 + }, + { + "entropy": 0.08851273730397224, + "epoch": 5.400979135097331, + "grad_norm": 2.359375, + "learning_rate": 4.922182811604489e-05, + "loss": 0.1097, + "mean_token_accuracy": 0.9737513303756714, + "num_tokens": 61169551.0, + "step": 23170 + }, + { + "entropy": 0.05317346574738622, + "epoch": 5.402144772117962, + "grad_norm": 0.390625, + "learning_rate": 4.922129787476065e-05, + "loss": 0.027, + "mean_token_accuracy": 0.9891198635101318, + "num_tokens": 61194654.0, + "step": 23175 + }, + { + "entropy": 0.05494374092668295, + "epoch": 5.403310409138594, + "grad_norm": 0.2470703125, + "learning_rate": 4.922076745869423e-05, + "loss": 0.0386, + "mean_token_accuracy": 0.9859078824520111, + "num_tokens": 61216063.0, + "step": 23180 + }, + { + "entropy": 0.07559232525527478, + "epoch": 5.404476046159226, + "grad_norm": 2.4375, + "learning_rate": 4.9220236867853544e-05, + "loss": 0.0942, + "mean_token_accuracy": 0.974918645620346, + "num_tokens": 61225374.0, + "step": 23185 + }, + { + "entropy": 0.05903605557978153, + "epoch": 5.405641683179858, + "grad_norm": 2.15625, + "learning_rate": 4.9219706102246494e-05, + "loss": 0.0425, + "mean_token_accuracy": 0.9827019810676575, + "num_tokens": 61248622.0, + "step": 23190 + }, + { + "entropy": 0.07795067802071572, + "epoch": 5.40680732020049, + "grad_norm": 1.6875, + "learning_rate": 4.9219175161880996e-05, + "loss": 0.0693, + "mean_token_accuracy": 0.9779790699481964, + "num_tokens": 61268201.0, + "step": 23195 + }, + { + "entropy": 0.07784415520727635, + "epoch": 5.407972957221121, + "grad_norm": 0.8203125, + "learning_rate": 4.921864404676497e-05, + "loss": 0.0689, + "mean_token_accuracy": 0.9818279802799225, + "num_tokens": 61284818.0, + "step": 23200 + }, + { + "entropy": 0.0624196344986558, + "epoch": 5.409138594241753, + "grad_norm": 2.640625, + "learning_rate": 4.921811275690634e-05, + "loss": 0.0509, + "mean_token_accuracy": 0.985440480709076, + "num_tokens": 61299220.0, + "step": 23205 + }, + { + "entropy": 0.07447193302214146, + "epoch": 5.410304231262385, + "grad_norm": 1.28125, + "learning_rate": 4.921758129231302e-05, + "loss": 0.0516, + "mean_token_accuracy": 0.985534542798996, + "num_tokens": 61328795.0, + "step": 23210 + }, + { + "entropy": 0.07264035008847713, + "epoch": 5.4114698682830165, + "grad_norm": 2.953125, + "learning_rate": 4.921704965299294e-05, + "loss": 0.048, + "mean_token_accuracy": 0.984207808971405, + "num_tokens": 61366583.0, + "step": 23215 + }, + { + "entropy": 0.06596151059493423, + "epoch": 5.412635505303649, + "grad_norm": 0.431640625, + "learning_rate": 4.921651783895403e-05, + "loss": 0.0489, + "mean_token_accuracy": 0.9818928420543671, + "num_tokens": 61395745.0, + "step": 23220 + }, + { + "entropy": 0.0670158120803535, + "epoch": 5.41380114232428, + "grad_norm": 0.212890625, + "learning_rate": 4.921598585020421e-05, + "loss": 0.0558, + "mean_token_accuracy": 0.9837601244449615, + "num_tokens": 61414740.0, + "step": 23225 + }, + { + "entropy": 0.05426953062415123, + "epoch": 5.414966779344912, + "grad_norm": 1.796875, + "learning_rate": 4.9215453686751425e-05, + "loss": 0.027, + "mean_token_accuracy": 0.9845106363296509, + "num_tokens": 61445656.0, + "step": 23230 + }, + { + "entropy": 0.09449506886303424, + "epoch": 5.416132416365544, + "grad_norm": 2.140625, + "learning_rate": 4.92149213486036e-05, + "loss": 0.1055, + "mean_token_accuracy": 0.9766972839832306, + "num_tokens": 61467238.0, + "step": 23235 + }, + { + "entropy": 0.061407574266195294, + "epoch": 5.417298053386175, + "grad_norm": 2.421875, + "learning_rate": 4.921438883576868e-05, + "loss": 0.0448, + "mean_token_accuracy": 0.9856343030929565, + "num_tokens": 61490562.0, + "step": 23240 + }, + { + "entropy": 0.06952263060957194, + "epoch": 5.418463690406807, + "grad_norm": 3.328125, + "learning_rate": 4.92138561482546e-05, + "loss": 0.0752, + "mean_token_accuracy": 0.9781166553497315, + "num_tokens": 61506678.0, + "step": 23245 + }, + { + "entropy": 0.06815708391368389, + "epoch": 5.419629327427439, + "grad_norm": 1.0546875, + "learning_rate": 4.9213323286069306e-05, + "loss": 0.0527, + "mean_token_accuracy": 0.9844055712223053, + "num_tokens": 61528547.0, + "step": 23250 + }, + { + "entropy": 0.07312852032482624, + "epoch": 5.420794964448071, + "grad_norm": 1.8984375, + "learning_rate": 4.9212790249220746e-05, + "loss": 0.0722, + "mean_token_accuracy": 0.9812857508659363, + "num_tokens": 61536967.0, + "step": 23255 + }, + { + "entropy": 0.050287547335028646, + "epoch": 5.421960601468703, + "grad_norm": 2.359375, + "learning_rate": 4.921225703771687e-05, + "loss": 0.0424, + "mean_token_accuracy": 0.9859198093414306, + "num_tokens": 61564543.0, + "step": 23260 + }, + { + "entropy": 0.06520504876971245, + "epoch": 5.423126238489335, + "grad_norm": 1.4375, + "learning_rate": 4.921172365156562e-05, + "loss": 0.0734, + "mean_token_accuracy": 0.9789164006710053, + "num_tokens": 61575211.0, + "step": 23265 + }, + { + "entropy": 0.0722085983492434, + "epoch": 5.424291875509966, + "grad_norm": 6.46875, + "learning_rate": 4.9211190090774956e-05, + "loss": 0.0831, + "mean_token_accuracy": 0.9780640542507172, + "num_tokens": 61590751.0, + "step": 23270 + }, + { + "entropy": 0.09001476243138314, + "epoch": 5.425457512530598, + "grad_norm": 2.4375, + "learning_rate": 4.921065635535284e-05, + "loss": 0.0827, + "mean_token_accuracy": 0.9779356181621551, + "num_tokens": 61598970.0, + "step": 23275 + }, + { + "entropy": 0.07482207007706165, + "epoch": 5.426623149551229, + "grad_norm": 1.7890625, + "learning_rate": 4.921012244530721e-05, + "loss": 0.0803, + "mean_token_accuracy": 0.9787347793579102, + "num_tokens": 61608913.0, + "step": 23280 + }, + { + "entropy": 0.05302851120941341, + "epoch": 5.4277887865718615, + "grad_norm": 2.25, + "learning_rate": 4.920958836064605e-05, + "loss": 0.0378, + "mean_token_accuracy": 0.9879450976848603, + "num_tokens": 61630650.0, + "step": 23285 + }, + { + "entropy": 0.06299825739115476, + "epoch": 5.428954423592494, + "grad_norm": 1.671875, + "learning_rate": 4.920905410137732e-05, + "loss": 0.0444, + "mean_token_accuracy": 0.983812016248703, + "num_tokens": 61653853.0, + "step": 23290 + }, + { + "entropy": 0.07575079947710037, + "epoch": 5.430120060613125, + "grad_norm": 0.474609375, + "learning_rate": 4.920851966750897e-05, + "loss": 0.0713, + "mean_token_accuracy": 0.9819688856601715, + "num_tokens": 61671710.0, + "step": 23295 + }, + { + "entropy": 0.06498388964682818, + "epoch": 5.431285697633757, + "grad_norm": 0.9140625, + "learning_rate": 4.920798505904899e-05, + "loss": 0.0556, + "mean_token_accuracy": 0.9803281247615814, + "num_tokens": 61695854.0, + "step": 23300 + }, + { + "entropy": 0.06389341745525598, + "epoch": 5.432451334654389, + "grad_norm": 0.703125, + "learning_rate": 4.920745027600534e-05, + "loss": 0.0505, + "mean_token_accuracy": 0.9866482019424438, + "num_tokens": 61709957.0, + "step": 23305 + }, + { + "entropy": 0.06346920877695084, + "epoch": 5.43361697167502, + "grad_norm": 1.3671875, + "learning_rate": 4.9206915318385996e-05, + "loss": 0.0555, + "mean_token_accuracy": 0.982583349943161, + "num_tokens": 61733369.0, + "step": 23310 + }, + { + "entropy": 0.08282421119511127, + "epoch": 5.434782608695652, + "grad_norm": 2.828125, + "learning_rate": 4.920638018619894e-05, + "loss": 0.0887, + "mean_token_accuracy": 0.9766203939914704, + "num_tokens": 61750862.0, + "step": 23315 + }, + { + "entropy": 0.07322307983413338, + "epoch": 5.4359482457162835, + "grad_norm": 2.0625, + "learning_rate": 4.920584487945215e-05, + "loss": 0.052, + "mean_token_accuracy": 0.9795838713645935, + "num_tokens": 61775206.0, + "step": 23320 + }, + { + "entropy": 0.06913946568965912, + "epoch": 5.437113882736916, + "grad_norm": 3.59375, + "learning_rate": 4.9205309398153596e-05, + "loss": 0.0503, + "mean_token_accuracy": 0.9846193373203278, + "num_tokens": 61792356.0, + "step": 23325 + }, + { + "entropy": 0.08056791853159666, + "epoch": 5.438279519757548, + "grad_norm": 4.875, + "learning_rate": 4.9204773742311275e-05, + "loss": 0.0916, + "mean_token_accuracy": 0.9753158092498779, + "num_tokens": 61802752.0, + "step": 23330 + }, + { + "entropy": 0.06826584823429585, + "epoch": 5.439445156778179, + "grad_norm": 2.40625, + "learning_rate": 4.920423791193318e-05, + "loss": 0.0527, + "mean_token_accuracy": 0.9859543740749359, + "num_tokens": 61817406.0, + "step": 23335 + }, + { + "entropy": 0.0683171335607767, + "epoch": 5.440610793798811, + "grad_norm": 0.7421875, + "learning_rate": 4.920370190702729e-05, + "loss": 0.0527, + "mean_token_accuracy": 0.9838370501995086, + "num_tokens": 61845861.0, + "step": 23340 + }, + { + "entropy": 0.05109058595262468, + "epoch": 5.441776430819443, + "grad_norm": 0.296875, + "learning_rate": 4.92031657276016e-05, + "loss": 0.0332, + "mean_token_accuracy": 0.987951809167862, + "num_tokens": 61871427.0, + "step": 23345 + }, + { + "entropy": 0.04325137957930565, + "epoch": 5.442942067840074, + "grad_norm": 0.6796875, + "learning_rate": 4.9202629373664114e-05, + "loss": 0.0224, + "mean_token_accuracy": 0.9934803903102875, + "num_tokens": 61905169.0, + "step": 23350 + }, + { + "entropy": 0.059416829235851766, + "epoch": 5.4441077048607065, + "grad_norm": 1.140625, + "learning_rate": 4.920209284522282e-05, + "loss": 0.0422, + "mean_token_accuracy": 0.9885890066623688, + "num_tokens": 61924906.0, + "step": 23355 + }, + { + "entropy": 0.06894025560468435, + "epoch": 5.445273341881338, + "grad_norm": 0.8515625, + "learning_rate": 4.920155614228571e-05, + "loss": 0.0368, + "mean_token_accuracy": 0.9853028953075409, + "num_tokens": 61949361.0, + "step": 23360 + }, + { + "entropy": 0.07303128968924284, + "epoch": 5.44643897890197, + "grad_norm": 5.6875, + "learning_rate": 4.92010192648608e-05, + "loss": 0.069, + "mean_token_accuracy": 0.9811192095279694, + "num_tokens": 61960365.0, + "step": 23365 + }, + { + "entropy": 0.07817558608949185, + "epoch": 5.447604615922602, + "grad_norm": 1.609375, + "learning_rate": 4.92004822129561e-05, + "loss": 0.0836, + "mean_token_accuracy": 0.978845477104187, + "num_tokens": 61970637.0, + "step": 23370 + }, + { + "entropy": 0.06893630996346474, + "epoch": 5.448770252943233, + "grad_norm": 2.453125, + "learning_rate": 4.9199944986579616e-05, + "loss": 0.0599, + "mean_token_accuracy": 0.9823831498622895, + "num_tokens": 61990173.0, + "step": 23375 + }, + { + "entropy": 0.06768325287848712, + "epoch": 5.449935889963865, + "grad_norm": 0.8359375, + "learning_rate": 4.9199407585739344e-05, + "loss": 0.0448, + "mean_token_accuracy": 0.9853338956832886, + "num_tokens": 62007777.0, + "step": 23380 + }, + { + "entropy": 0.04943525260314345, + "epoch": 5.451101526984497, + "grad_norm": 0.89453125, + "learning_rate": 4.919887001044332e-05, + "loss": 0.0362, + "mean_token_accuracy": 0.9880561709403992, + "num_tokens": 62029816.0, + "step": 23385 + }, + { + "entropy": 0.05788094364106655, + "epoch": 5.452267164005129, + "grad_norm": 0.349609375, + "learning_rate": 4.919833226069954e-05, + "loss": 0.0599, + "mean_token_accuracy": 0.9839804172515869, + "num_tokens": 62044288.0, + "step": 23390 + }, + { + "entropy": 0.06028408519923687, + "epoch": 5.453432801025761, + "grad_norm": 3.375, + "learning_rate": 4.919779433651603e-05, + "loss": 0.0457, + "mean_token_accuracy": 0.9855101108551025, + "num_tokens": 62069683.0, + "step": 23395 + }, + { + "entropy": 0.07415076242759824, + "epoch": 5.454598438046393, + "grad_norm": 0.111328125, + "learning_rate": 4.9197256237900815e-05, + "loss": 0.0554, + "mean_token_accuracy": 0.9815491080284119, + "num_tokens": 62098121.0, + "step": 23400 + }, + { + "entropy": 0.055291141476482154, + "epoch": 5.455764075067024, + "grad_norm": 0.33203125, + "learning_rate": 4.919671796486191e-05, + "loss": 0.026, + "mean_token_accuracy": 0.9862238466739655, + "num_tokens": 62136468.0, + "step": 23405 + }, + { + "entropy": 0.046719343215227124, + "epoch": 5.456929712087656, + "grad_norm": 0.54296875, + "learning_rate": 4.919617951740735e-05, + "loss": 0.0252, + "mean_token_accuracy": 0.9928770422935486, + "num_tokens": 62169672.0, + "step": 23410 + }, + { + "entropy": 0.0630274849012494, + "epoch": 5.458095349108287, + "grad_norm": 2.296875, + "learning_rate": 4.919564089554516e-05, + "loss": 0.0426, + "mean_token_accuracy": 0.9808637201786041, + "num_tokens": 62191802.0, + "step": 23415 + }, + { + "entropy": 0.06985501274466514, + "epoch": 5.459260986128919, + "grad_norm": 0.61328125, + "learning_rate": 4.919510209928338e-05, + "loss": 0.0763, + "mean_token_accuracy": 0.9818107604980468, + "num_tokens": 62211482.0, + "step": 23420 + }, + { + "entropy": 0.050887050852179525, + "epoch": 5.4604266231495515, + "grad_norm": 1.7578125, + "learning_rate": 4.9194563128630034e-05, + "loss": 0.0469, + "mean_token_accuracy": 0.9865157008171082, + "num_tokens": 62241942.0, + "step": 23425 + }, + { + "entropy": 0.07516262400895357, + "epoch": 5.461592260170183, + "grad_norm": 4.1875, + "learning_rate": 4.9194023983593164e-05, + "loss": 0.0863, + "mean_token_accuracy": 0.9760660409927369, + "num_tokens": 62252888.0, + "step": 23430 + }, + { + "entropy": 0.0662376806139946, + "epoch": 5.462757897190815, + "grad_norm": 1.8046875, + "learning_rate": 4.91934846641808e-05, + "loss": 0.0362, + "mean_token_accuracy": 0.9875830292701722, + "num_tokens": 62270136.0, + "step": 23435 + }, + { + "entropy": 0.10946454834192991, + "epoch": 5.463923534211447, + "grad_norm": 3.515625, + "learning_rate": 4.9192945170401e-05, + "loss": 0.155, + "mean_token_accuracy": 0.9659136891365051, + "num_tokens": 62300217.0, + "step": 23440 + }, + { + "entropy": 0.0465864603407681, + "epoch": 5.465089171232078, + "grad_norm": 3.328125, + "learning_rate": 4.919240550226179e-05, + "loss": 0.0499, + "mean_token_accuracy": 0.9880475103855133, + "num_tokens": 62325726.0, + "step": 23445 + }, + { + "entropy": 0.058729467820376156, + "epoch": 5.46625480825271, + "grad_norm": 1.2109375, + "learning_rate": 4.919186565977124e-05, + "loss": 0.0248, + "mean_token_accuracy": 0.9913538217544555, + "num_tokens": 62360995.0, + "step": 23450 + }, + { + "entropy": 0.07499024383723736, + "epoch": 5.4674204452733415, + "grad_norm": 1.046875, + "learning_rate": 4.919132564293738e-05, + "loss": 0.0848, + "mean_token_accuracy": 0.9793896615505219, + "num_tokens": 62370698.0, + "step": 23455 + }, + { + "entropy": 0.07226166576147079, + "epoch": 5.468586082293974, + "grad_norm": 2.5, + "learning_rate": 4.919078545176827e-05, + "loss": 0.0683, + "mean_token_accuracy": 0.9799470126628875, + "num_tokens": 62381984.0, + "step": 23460 + }, + { + "entropy": 0.055472942627966405, + "epoch": 5.469751719314606, + "grad_norm": 0.79296875, + "learning_rate": 4.919024508627197e-05, + "loss": 0.0485, + "mean_token_accuracy": 0.9879952669143677, + "num_tokens": 62399610.0, + "step": 23465 + }, + { + "entropy": 0.06007210109382868, + "epoch": 5.470917356335237, + "grad_norm": 0.62109375, + "learning_rate": 4.918970454645653e-05, + "loss": 0.0515, + "mean_token_accuracy": 0.9846825122833252, + "num_tokens": 62419360.0, + "step": 23470 + }, + { + "entropy": 0.055671576596796515, + "epoch": 5.472082993355869, + "grad_norm": 1.140625, + "learning_rate": 4.918916383233001e-05, + "loss": 0.053, + "mean_token_accuracy": 0.9845706820487976, + "num_tokens": 62433719.0, + "step": 23475 + }, + { + "entropy": 0.10006310492753982, + "epoch": 5.473248630376501, + "grad_norm": 3.0625, + "learning_rate": 4.918862294390048e-05, + "loss": 0.1352, + "mean_token_accuracy": 0.9714458584785461, + "num_tokens": 62449051.0, + "step": 23480 + }, + { + "entropy": 0.053108157590031624, + "epoch": 5.474414267397132, + "grad_norm": 0.435546875, + "learning_rate": 4.9188081881176e-05, + "loss": 0.054, + "mean_token_accuracy": 0.98538578748703, + "num_tokens": 62466128.0, + "step": 23485 + }, + { + "entropy": 0.06686123460531235, + "epoch": 5.475579904417764, + "grad_norm": 4.46875, + "learning_rate": 4.918754064416464e-05, + "loss": 0.0633, + "mean_token_accuracy": 0.9822019279003144, + "num_tokens": 62487018.0, + "step": 23490 + }, + { + "entropy": 0.10811963500455021, + "epoch": 5.476745541438396, + "grad_norm": 1.671875, + "learning_rate": 4.918699923287446e-05, + "loss": 0.116, + "mean_token_accuracy": 0.9671446025371552, + "num_tokens": 62514633.0, + "step": 23495 + }, + { + "entropy": 0.04214726220816374, + "epoch": 5.477911178459028, + "grad_norm": 0.703125, + "learning_rate": 4.918645764731355e-05, + "loss": 0.0271, + "mean_token_accuracy": 0.9906268119812012, + "num_tokens": 62545134.0, + "step": 23500 + }, + { + "entropy": 0.0906677044928074, + "epoch": 5.47907681547966, + "grad_norm": 0.9921875, + "learning_rate": 4.9185915887489986e-05, + "loss": 0.0309, + "mean_token_accuracy": 0.9871439218521119, + "num_tokens": 62571647.0, + "step": 23505 + }, + { + "entropy": 0.06742565836757422, + "epoch": 5.480242452500291, + "grad_norm": 1.203125, + "learning_rate": 4.918537395341184e-05, + "loss": 0.0593, + "mean_token_accuracy": 0.9825949847698212, + "num_tokens": 62585472.0, + "step": 23510 + }, + { + "entropy": 0.06784821143373847, + "epoch": 5.481408089520923, + "grad_norm": 4.53125, + "learning_rate": 4.918483184508718e-05, + "loss": 0.0866, + "mean_token_accuracy": 0.9811110019683837, + "num_tokens": 62613113.0, + "step": 23515 + }, + { + "entropy": 0.06738835908472537, + "epoch": 5.482573726541555, + "grad_norm": 2.171875, + "learning_rate": 4.9184289562524114e-05, + "loss": 0.0763, + "mean_token_accuracy": 0.9819285452365876, + "num_tokens": 62624262.0, + "step": 23520 + }, + { + "entropy": 0.06374572729691863, + "epoch": 5.4837393635621865, + "grad_norm": 0.55859375, + "learning_rate": 4.918374710573071e-05, + "loss": 0.0479, + "mean_token_accuracy": 0.984711641073227, + "num_tokens": 62646149.0, + "step": 23525 + }, + { + "entropy": 0.06009006816893816, + "epoch": 5.484905000582819, + "grad_norm": 0.68359375, + "learning_rate": 4.9183204474715066e-05, + "loss": 0.0556, + "mean_token_accuracy": 0.9834051728248596, + "num_tokens": 62672760.0, + "step": 23530 + }, + { + "entropy": 0.07242138041183352, + "epoch": 5.486070637603451, + "grad_norm": 0.8984375, + "learning_rate": 4.918266166948527e-05, + "loss": 0.0755, + "mean_token_accuracy": 0.9801871180534363, + "num_tokens": 62685752.0, + "step": 23535 + }, + { + "entropy": 0.08361536599695682, + "epoch": 5.487236274624082, + "grad_norm": 1.390625, + "learning_rate": 4.918211869004942e-05, + "loss": 0.0631, + "mean_token_accuracy": 0.9825032353401184, + "num_tokens": 62700324.0, + "step": 23540 + }, + { + "entropy": 0.09443598166108132, + "epoch": 5.488401911644714, + "grad_norm": 1.40625, + "learning_rate": 4.91815755364156e-05, + "loss": 0.0763, + "mean_token_accuracy": 0.9776216208934784, + "num_tokens": 62709285.0, + "step": 23545 + }, + { + "entropy": 0.05555082568898797, + "epoch": 5.489567548665345, + "grad_norm": 2.046875, + "learning_rate": 4.918103220859193e-05, + "loss": 0.0319, + "mean_token_accuracy": 0.986697655916214, + "num_tokens": 62729216.0, + "step": 23550 + }, + { + "entropy": 0.06571632707491518, + "epoch": 5.490733185685977, + "grad_norm": 1.4765625, + "learning_rate": 4.918048870658649e-05, + "loss": 0.0546, + "mean_token_accuracy": 0.9831242263317108, + "num_tokens": 62747103.0, + "step": 23555 + }, + { + "entropy": 0.07703730314970017, + "epoch": 5.4918988227066095, + "grad_norm": 0.3671875, + "learning_rate": 4.91799450304074e-05, + "loss": 0.0824, + "mean_token_accuracy": 0.9807705223560333, + "num_tokens": 62760945.0, + "step": 23560 + }, + { + "entropy": 0.06581084001809359, + "epoch": 5.493064459727241, + "grad_norm": 3.078125, + "learning_rate": 4.917940118006276e-05, + "loss": 0.0616, + "mean_token_accuracy": 0.9824943840503693, + "num_tokens": 62778982.0, + "step": 23565 + }, + { + "entropy": 0.0631984619423747, + "epoch": 5.494230096747873, + "grad_norm": 1.296875, + "learning_rate": 4.9178857155560684e-05, + "loss": 0.0596, + "mean_token_accuracy": 0.9853246152400971, + "num_tokens": 62791874.0, + "step": 23570 + }, + { + "entropy": 0.06704922579228878, + "epoch": 5.495395733768505, + "grad_norm": 2.5, + "learning_rate": 4.9178312956909285e-05, + "loss": 0.0628, + "mean_token_accuracy": 0.9803689122200012, + "num_tokens": 62810586.0, + "step": 23575 + }, + { + "entropy": 0.052586893737316134, + "epoch": 5.496561370789136, + "grad_norm": 3.359375, + "learning_rate": 4.9177768584116666e-05, + "loss": 0.0389, + "mean_token_accuracy": 0.9818240642547608, + "num_tokens": 62846529.0, + "step": 23580 + }, + { + "entropy": 0.10002233702689409, + "epoch": 5.497727007809768, + "grad_norm": 0.890625, + "learning_rate": 4.917722403719096e-05, + "loss": 0.0684, + "mean_token_accuracy": 0.9777202427387237, + "num_tokens": 62859684.0, + "step": 23585 + }, + { + "entropy": 0.07765259984880686, + "epoch": 5.498892644830399, + "grad_norm": 0.291015625, + "learning_rate": 4.917667931614028e-05, + "loss": 0.0719, + "mean_token_accuracy": 0.9798839628696442, + "num_tokens": 62874333.0, + "step": 23590 + }, + { + "entropy": 0.06515825875103473, + "epoch": 5.5000582818510315, + "grad_norm": 0.72265625, + "learning_rate": 4.917613442097275e-05, + "loss": 0.0428, + "mean_token_accuracy": 0.9843474686145782, + "num_tokens": 62899424.0, + "step": 23595 + }, + { + "entropy": 0.039471688726916906, + "epoch": 5.501223918871664, + "grad_norm": 0.4609375, + "learning_rate": 4.917558935169649e-05, + "loss": 0.0123, + "mean_token_accuracy": 0.9921641826629639, + "num_tokens": 62941284.0, + "step": 23600 + }, + { + "entropy": 0.060507692955434324, + "epoch": 5.502389555892295, + "grad_norm": 1.8671875, + "learning_rate": 4.917504410831963e-05, + "loss": 0.0476, + "mean_token_accuracy": 0.9880904495716095, + "num_tokens": 62969692.0, + "step": 23605 + }, + { + "entropy": 0.05670437887310982, + "epoch": 5.503555192912927, + "grad_norm": 0.97265625, + "learning_rate": 4.91744986908503e-05, + "loss": 0.0612, + "mean_token_accuracy": 0.9807877004146576, + "num_tokens": 62983795.0, + "step": 23610 + }, + { + "entropy": 0.05615283492952585, + "epoch": 5.504720829933559, + "grad_norm": 2.03125, + "learning_rate": 4.917395309929664e-05, + "loss": 0.0507, + "mean_token_accuracy": 0.9852173507213593, + "num_tokens": 63003615.0, + "step": 23615 + }, + { + "entropy": 0.062404044810682534, + "epoch": 5.50588646695419, + "grad_norm": 0.37109375, + "learning_rate": 4.917340733366678e-05, + "loss": 0.0561, + "mean_token_accuracy": 0.9824782729148864, + "num_tokens": 63042143.0, + "step": 23620 + }, + { + "entropy": 0.061724210530519484, + "epoch": 5.507052103974822, + "grad_norm": 1.1640625, + "learning_rate": 4.917286139396886e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.9842379450798034, + "num_tokens": 63055223.0, + "step": 23625 + }, + { + "entropy": 0.06506040319800377, + "epoch": 5.508217740995454, + "grad_norm": 0.255859375, + "learning_rate": 4.9172315280211026e-05, + "loss": 0.0512, + "mean_token_accuracy": 0.9812526404857635, + "num_tokens": 63079411.0, + "step": 23630 + }, + { + "entropy": 0.08516685860231518, + "epoch": 5.509383378016086, + "grad_norm": 0.55859375, + "learning_rate": 4.91717689924014e-05, + "loss": 0.0768, + "mean_token_accuracy": 0.9786878407001496, + "num_tokens": 63106752.0, + "step": 23635 + }, + { + "entropy": 0.059132724441587924, + "epoch": 5.510549015036718, + "grad_norm": 2.015625, + "learning_rate": 4.9171222530548154e-05, + "loss": 0.0461, + "mean_token_accuracy": 0.9837357699871063, + "num_tokens": 63130321.0, + "step": 23640 + }, + { + "entropy": 0.09355827569961547, + "epoch": 5.511714652057349, + "grad_norm": 3.46875, + "learning_rate": 4.9170675894659426e-05, + "loss": 0.1392, + "mean_token_accuracy": 0.9721016824245453, + "num_tokens": 63149244.0, + "step": 23645 + }, + { + "entropy": 0.07111373730003834, + "epoch": 5.512880289077981, + "grad_norm": 2.546875, + "learning_rate": 4.917012908474336e-05, + "loss": 0.0578, + "mean_token_accuracy": 0.9821716785430908, + "num_tokens": 63164187.0, + "step": 23650 + }, + { + "entropy": 0.062192311882972716, + "epoch": 5.514045926098613, + "grad_norm": 0.51171875, + "learning_rate": 4.9169582100808124e-05, + "loss": 0.032, + "mean_token_accuracy": 0.9870727837085724, + "num_tokens": 63181897.0, + "step": 23655 + }, + { + "entropy": 0.10410164566710592, + "epoch": 5.515211563119244, + "grad_norm": 0.265625, + "learning_rate": 4.916903494286186e-05, + "loss": 0.1312, + "mean_token_accuracy": 0.9689555406570435, + "num_tokens": 63224033.0, + "step": 23660 + }, + { + "entropy": 0.07494805119931698, + "epoch": 5.5163772001398765, + "grad_norm": 1.6328125, + "learning_rate": 4.9168487610912735e-05, + "loss": 0.0874, + "mean_token_accuracy": 0.9813202261924744, + "num_tokens": 63245163.0, + "step": 23665 + }, + { + "entropy": 0.0639274563640356, + "epoch": 5.517542837160509, + "grad_norm": 1.0, + "learning_rate": 4.916794010496891e-05, + "loss": 0.0521, + "mean_token_accuracy": 0.9858429729938507, + "num_tokens": 63267004.0, + "step": 23670 + }, + { + "entropy": 0.08091693036258221, + "epoch": 5.51870847418114, + "grad_norm": 2.359375, + "learning_rate": 4.916739242503855e-05, + "loss": 0.0751, + "mean_token_accuracy": 0.9779129147529602, + "num_tokens": 63284406.0, + "step": 23675 + }, + { + "entropy": 0.055909703485667706, + "epoch": 5.519874111201772, + "grad_norm": 0.51953125, + "learning_rate": 4.916684457112982e-05, + "loss": 0.0489, + "mean_token_accuracy": 0.985709261894226, + "num_tokens": 63305734.0, + "step": 23680 + }, + { + "entropy": 0.06152990758419037, + "epoch": 5.521039748222403, + "grad_norm": 2.796875, + "learning_rate": 4.916629654325088e-05, + "loss": 0.06, + "mean_token_accuracy": 0.9815901160240174, + "num_tokens": 63318392.0, + "step": 23685 + }, + { + "entropy": 0.0552783340215683, + "epoch": 5.522205385243035, + "grad_norm": 1.5078125, + "learning_rate": 4.9165748341409925e-05, + "loss": 0.0527, + "mean_token_accuracy": 0.9854685187339782, + "num_tokens": 63336247.0, + "step": 23690 + }, + { + "entropy": 0.06256055925041437, + "epoch": 5.523371022263667, + "grad_norm": 0.7734375, + "learning_rate": 4.916519996561511e-05, + "loss": 0.0481, + "mean_token_accuracy": 0.9842760384082794, + "num_tokens": 63361742.0, + "step": 23695 + }, + { + "entropy": 0.07285189051181078, + "epoch": 5.524536659284299, + "grad_norm": 2.359375, + "learning_rate": 4.916465141587462e-05, + "loss": 0.0406, + "mean_token_accuracy": 0.9825927436351776, + "num_tokens": 63392024.0, + "step": 23700 + }, + { + "entropy": 0.06600981298834085, + "epoch": 5.525702296304931, + "grad_norm": 0.447265625, + "learning_rate": 4.916410269219664e-05, + "loss": 0.0289, + "mean_token_accuracy": 0.9872182309627533, + "num_tokens": 63425798.0, + "step": 23705 + }, + { + "entropy": 0.06496403273195028, + "epoch": 5.526867933325562, + "grad_norm": 1.8046875, + "learning_rate": 4.916355379458933e-05, + "loss": 0.0451, + "mean_token_accuracy": 0.9795408308506012, + "num_tokens": 63451911.0, + "step": 23710 + }, + { + "entropy": 0.08884159103035927, + "epoch": 5.528033570346194, + "grad_norm": 0.490234375, + "learning_rate": 4.9163004723060894e-05, + "loss": 0.0754, + "mean_token_accuracy": 0.9773557484149933, + "num_tokens": 63462839.0, + "step": 23715 + }, + { + "entropy": 0.07003546878695488, + "epoch": 5.529199207366826, + "grad_norm": 0.32421875, + "learning_rate": 4.9162455477619517e-05, + "loss": 0.0549, + "mean_token_accuracy": 0.9811549305915832, + "num_tokens": 63479583.0, + "step": 23720 + }, + { + "entropy": 0.06235072594136, + "epoch": 5.530364844387458, + "grad_norm": 1.109375, + "learning_rate": 4.916190605827339e-05, + "loss": 0.056, + "mean_token_accuracy": 0.9822233319282532, + "num_tokens": 63491907.0, + "step": 23725 + }, + { + "entropy": 0.06424791738390923, + "epoch": 5.5315304814080895, + "grad_norm": 3.03125, + "learning_rate": 4.91613564650307e-05, + "loss": 0.0396, + "mean_token_accuracy": 0.9856864392757416, + "num_tokens": 63509343.0, + "step": 23730 + }, + { + "entropy": 0.06360748754814267, + "epoch": 5.5326961184287216, + "grad_norm": 1.2109375, + "learning_rate": 4.916080669789965e-05, + "loss": 0.0364, + "mean_token_accuracy": 0.9866265952587128, + "num_tokens": 63530657.0, + "step": 23735 + }, + { + "entropy": 0.07913671238347889, + "epoch": 5.533861755449353, + "grad_norm": 0.486328125, + "learning_rate": 4.916025675688843e-05, + "loss": 0.077, + "mean_token_accuracy": 0.9761590301990509, + "num_tokens": 63549593.0, + "step": 23740 + }, + { + "entropy": 0.06635268032550812, + "epoch": 5.535027392469985, + "grad_norm": 1.3125, + "learning_rate": 4.915970664200524e-05, + "loss": 0.0418, + "mean_token_accuracy": 0.9833239078521728, + "num_tokens": 63565771.0, + "step": 23745 + }, + { + "entropy": 0.07817679923027754, + "epoch": 5.536193029490617, + "grad_norm": 4.125, + "learning_rate": 4.91591563532583e-05, + "loss": 0.0618, + "mean_token_accuracy": 0.9776901960372925, + "num_tokens": 63581604.0, + "step": 23750 + }, + { + "entropy": 0.09487223085016012, + "epoch": 5.537358666511248, + "grad_norm": 6.03125, + "learning_rate": 4.915860589065579e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.9834141552448272, + "num_tokens": 63604443.0, + "step": 23755 + }, + { + "entropy": 0.12501354981213808, + "epoch": 5.53852430353188, + "grad_norm": 1.75, + "learning_rate": 4.9158055254205934e-05, + "loss": 0.1379, + "mean_token_accuracy": 0.9683729887008667, + "num_tokens": 63626224.0, + "step": 23760 + }, + { + "entropy": 0.058947160560637715, + "epoch": 5.5396899405525115, + "grad_norm": 0.5703125, + "learning_rate": 4.915750444391694e-05, + "loss": 0.0521, + "mean_token_accuracy": 0.9822892725467682, + "num_tokens": 63646271.0, + "step": 23765 + }, + { + "entropy": 0.07299381997436286, + "epoch": 5.540855577573144, + "grad_norm": 0.91015625, + "learning_rate": 4.9156953459797024e-05, + "loss": 0.0747, + "mean_token_accuracy": 0.9818307161331177, + "num_tokens": 63658385.0, + "step": 23770 + }, + { + "entropy": 0.06397927738726139, + "epoch": 5.542021214593776, + "grad_norm": 0.5, + "learning_rate": 4.9156402301854395e-05, + "loss": 0.0473, + "mean_token_accuracy": 0.9837787330150605, + "num_tokens": 63679376.0, + "step": 23775 + }, + { + "entropy": 0.06430526990443468, + "epoch": 5.543186851614407, + "grad_norm": 1.453125, + "learning_rate": 4.915585097009727e-05, + "loss": 0.0557, + "mean_token_accuracy": 0.9817375063896179, + "num_tokens": 63695652.0, + "step": 23780 + }, + { + "entropy": 0.08212286848574876, + "epoch": 5.544352488635039, + "grad_norm": 0.423828125, + "learning_rate": 4.9155299464533886e-05, + "loss": 0.0664, + "mean_token_accuracy": 0.9791503429412842, + "num_tokens": 63710844.0, + "step": 23785 + }, + { + "entropy": 0.07861670413985848, + "epoch": 5.545518125655671, + "grad_norm": 1.3671875, + "learning_rate": 4.915474778517245e-05, + "loss": 0.0593, + "mean_token_accuracy": 0.9782687842845916, + "num_tokens": 63729543.0, + "step": 23790 + }, + { + "entropy": 0.0697979001328349, + "epoch": 5.546683762676302, + "grad_norm": 0.61328125, + "learning_rate": 4.9154195932021195e-05, + "loss": 0.0705, + "mean_token_accuracy": 0.9821433663368225, + "num_tokens": 63742595.0, + "step": 23795 + }, + { + "entropy": 0.07056807223707437, + "epoch": 5.5478493996969345, + "grad_norm": 2.1875, + "learning_rate": 4.9153643905088356e-05, + "loss": 0.0828, + "mean_token_accuracy": 0.979724007844925, + "num_tokens": 63754975.0, + "step": 23800 + }, + { + "entropy": 0.06173288859426975, + "epoch": 5.549015036717567, + "grad_norm": 0.30078125, + "learning_rate": 4.9153091704382154e-05, + "loss": 0.0365, + "mean_token_accuracy": 0.9860757470130921, + "num_tokens": 63785355.0, + "step": 23805 + }, + { + "entropy": 0.06818353859707713, + "epoch": 5.550180673738198, + "grad_norm": 1.7265625, + "learning_rate": 4.915253932991083e-05, + "loss": 0.035, + "mean_token_accuracy": 0.9851133286952972, + "num_tokens": 63808617.0, + "step": 23810 + }, + { + "entropy": 0.08297501988708973, + "epoch": 5.55134631075883, + "grad_norm": 1.1875, + "learning_rate": 4.9151986781682615e-05, + "loss": 0.079, + "mean_token_accuracy": 0.9793727993965149, + "num_tokens": 63825558.0, + "step": 23815 + }, + { + "entropy": 0.10554131586104631, + "epoch": 5.552511947779461, + "grad_norm": 0.99609375, + "learning_rate": 4.9151434059705745e-05, + "loss": 0.0688, + "mean_token_accuracy": 0.9800882160663604, + "num_tokens": 63846822.0, + "step": 23820 + }, + { + "entropy": 0.06566274948418141, + "epoch": 5.553677584800093, + "grad_norm": 1.953125, + "learning_rate": 4.9150881163988484e-05, + "loss": 0.0818, + "mean_token_accuracy": 0.9768719375133514, + "num_tokens": 63864897.0, + "step": 23825 + }, + { + "entropy": 0.05825161384418607, + "epoch": 5.554843221820725, + "grad_norm": 2.1875, + "learning_rate": 4.915032809453905e-05, + "loss": 0.0797, + "mean_token_accuracy": 0.9830494523048401, + "num_tokens": 63885269.0, + "step": 23830 + }, + { + "entropy": 0.05645349100232124, + "epoch": 5.5560088588413565, + "grad_norm": 1.8125, + "learning_rate": 4.91497748513657e-05, + "loss": 0.0418, + "mean_token_accuracy": 0.9863303542137146, + "num_tokens": 63901941.0, + "step": 23835 + }, + { + "entropy": 0.06523125125095248, + "epoch": 5.557174495861989, + "grad_norm": 0.47265625, + "learning_rate": 4.914922143447669e-05, + "loss": 0.0544, + "mean_token_accuracy": 0.9860906541347504, + "num_tokens": 63918098.0, + "step": 23840 + }, + { + "entropy": 0.11673079691827297, + "epoch": 5.55834013288262, + "grad_norm": 1.4921875, + "learning_rate": 4.9148667843880266e-05, + "loss": 0.0873, + "mean_token_accuracy": 0.9748226165771484, + "num_tokens": 63927237.0, + "step": 23845 + }, + { + "entropy": 0.06276131253689528, + "epoch": 5.559505769903252, + "grad_norm": 1.0625, + "learning_rate": 4.9148114079584684e-05, + "loss": 0.0505, + "mean_token_accuracy": 0.9863556146621704, + "num_tokens": 63945205.0, + "step": 23850 + }, + { + "entropy": 0.06383510492742062, + "epoch": 5.560671406923884, + "grad_norm": 1.5546875, + "learning_rate": 4.91475601415982e-05, + "loss": 0.0725, + "mean_token_accuracy": 0.9803566753864288, + "num_tokens": 63956982.0, + "step": 23855 + }, + { + "entropy": 0.06630421318113804, + "epoch": 5.561837043944516, + "grad_norm": 3.546875, + "learning_rate": 4.9147006029929074e-05, + "loss": 0.0679, + "mean_token_accuracy": 0.9794648051261902, + "num_tokens": 63974757.0, + "step": 23860 + }, + { + "entropy": 0.07054558731615543, + "epoch": 5.563002680965147, + "grad_norm": 1.7890625, + "learning_rate": 4.914645174458557e-05, + "loss": 0.0624, + "mean_token_accuracy": 0.9827561438083648, + "num_tokens": 63986243.0, + "step": 23865 + }, + { + "entropy": 0.05838492456823587, + "epoch": 5.5641683179857795, + "grad_norm": 1.4140625, + "learning_rate": 4.914589728557595e-05, + "loss": 0.042, + "mean_token_accuracy": 0.9874827086925506, + "num_tokens": 64005214.0, + "step": 23870 + }, + { + "entropy": 0.07171590086072684, + "epoch": 5.565333955006411, + "grad_norm": 0.88671875, + "learning_rate": 4.914534265290849e-05, + "loss": 0.0735, + "mean_token_accuracy": 0.9795938551425933, + "num_tokens": 64041250.0, + "step": 23875 + }, + { + "entropy": 0.10566460490226745, + "epoch": 5.566499592027043, + "grad_norm": 1.4140625, + "learning_rate": 4.914478784659146e-05, + "loss": 0.0637, + "mean_token_accuracy": 0.9845397114753723, + "num_tokens": 64066255.0, + "step": 23880 + }, + { + "entropy": 0.0610381668433547, + "epoch": 5.567665229047675, + "grad_norm": 1.7734375, + "learning_rate": 4.9144232866633124e-05, + "loss": 0.0487, + "mean_token_accuracy": 0.9841365456581116, + "num_tokens": 64082043.0, + "step": 23885 + }, + { + "entropy": 0.074870290979743, + "epoch": 5.568830866068306, + "grad_norm": 0.236328125, + "learning_rate": 4.9143677713041766e-05, + "loss": 0.0691, + "mean_token_accuracy": 0.9827987253665924, + "num_tokens": 64103120.0, + "step": 23890 + }, + { + "entropy": 0.057516278512775895, + "epoch": 5.569996503088938, + "grad_norm": 0.490234375, + "learning_rate": 4.914312238582565e-05, + "loss": 0.0496, + "mean_token_accuracy": 0.9857203364372253, + "num_tokens": 64119929.0, + "step": 23895 + }, + { + "entropy": 0.0727980025112629, + "epoch": 5.5711621401095694, + "grad_norm": 2.953125, + "learning_rate": 4.9142566884993074e-05, + "loss": 0.0572, + "mean_token_accuracy": 0.980837094783783, + "num_tokens": 64138984.0, + "step": 23900 + }, + { + "entropy": 0.05668035298585892, + "epoch": 5.5723277771302016, + "grad_norm": 0.357421875, + "learning_rate": 4.9142011210552314e-05, + "loss": 0.0548, + "mean_token_accuracy": 0.9852408468723297, + "num_tokens": 64161740.0, + "step": 23905 + }, + { + "entropy": 0.10799715518951417, + "epoch": 5.573493414150834, + "grad_norm": 4.09375, + "learning_rate": 4.914145536251166e-05, + "loss": 0.1483, + "mean_token_accuracy": 0.9697591483592987, + "num_tokens": 64184455.0, + "step": 23910 + }, + { + "entropy": 0.08060030173510313, + "epoch": 5.574659051171465, + "grad_norm": 2.03125, + "learning_rate": 4.914089934087939e-05, + "loss": 0.0585, + "mean_token_accuracy": 0.9793602645397186, + "num_tokens": 64199983.0, + "step": 23915 + }, + { + "entropy": 0.0592457982711494, + "epoch": 5.575824688192097, + "grad_norm": 0.74609375, + "learning_rate": 4.91403431456638e-05, + "loss": 0.039, + "mean_token_accuracy": 0.987294489145279, + "num_tokens": 64223559.0, + "step": 23920 + }, + { + "entropy": 0.07702452093362808, + "epoch": 5.576990325212729, + "grad_norm": 1.359375, + "learning_rate": 4.913978677687319e-05, + "loss": 0.0661, + "mean_token_accuracy": 0.9822618186473846, + "num_tokens": 64236810.0, + "step": 23925 + }, + { + "entropy": 0.08921125689521432, + "epoch": 5.57815596223336, + "grad_norm": 1.8984375, + "learning_rate": 4.913923023451585e-05, + "loss": 0.0902, + "mean_token_accuracy": 0.9755136132240295, + "num_tokens": 64259739.0, + "step": 23930 + }, + { + "entropy": 0.0405919854529202, + "epoch": 5.579321599253992, + "grad_norm": 0.98046875, + "learning_rate": 4.9138673518600086e-05, + "loss": 0.0294, + "mean_token_accuracy": 0.9920165717601777, + "num_tokens": 64298571.0, + "step": 23935 + }, + { + "entropy": 0.06870214790105819, + "epoch": 5.5804872362746245, + "grad_norm": 2.34375, + "learning_rate": 4.9138116629134196e-05, + "loss": 0.0721, + "mean_token_accuracy": 0.9808136463165283, + "num_tokens": 64312476.0, + "step": 23940 + }, + { + "entropy": 0.070391511823982, + "epoch": 5.581652873295256, + "grad_norm": 1.203125, + "learning_rate": 4.913755956612648e-05, + "loss": 0.0527, + "mean_token_accuracy": 0.984083354473114, + "num_tokens": 64331356.0, + "step": 23945 + }, + { + "entropy": 0.08463108614087105, + "epoch": 5.582818510315888, + "grad_norm": 0.8515625, + "learning_rate": 4.913700232958524e-05, + "loss": 0.1035, + "mean_token_accuracy": 0.9746655583381653, + "num_tokens": 64340876.0, + "step": 23950 + }, + { + "entropy": 0.06346419043838977, + "epoch": 5.583984147336519, + "grad_norm": 0.921875, + "learning_rate": 4.91364449195188e-05, + "loss": 0.036, + "mean_token_accuracy": 0.9880404353141785, + "num_tokens": 64370087.0, + "step": 23955 + }, + { + "entropy": 0.09586155414581299, + "epoch": 5.585149784357151, + "grad_norm": 2.96875, + "learning_rate": 4.913588733593546e-05, + "loss": 0.0695, + "mean_token_accuracy": 0.9790465831756592, + "num_tokens": 64382026.0, + "step": 23960 + }, + { + "entropy": 0.06308048740029334, + "epoch": 5.586315421377783, + "grad_norm": 0.361328125, + "learning_rate": 4.9135329578843535e-05, + "loss": 0.0527, + "mean_token_accuracy": 0.9852202475070954, + "num_tokens": 64397827.0, + "step": 23965 + }, + { + "entropy": 0.07038025036454201, + "epoch": 5.5874810583984145, + "grad_norm": 0.98828125, + "learning_rate": 4.913477164825135e-05, + "loss": 0.0717, + "mean_token_accuracy": 0.9811891615390778, + "num_tokens": 64418130.0, + "step": 23970 + }, + { + "entropy": 0.08210363052785397, + "epoch": 5.588646695419047, + "grad_norm": 2.125, + "learning_rate": 4.913421354416722e-05, + "loss": 0.068, + "mean_token_accuracy": 0.9791256606578826, + "num_tokens": 64430360.0, + "step": 23975 + }, + { + "entropy": 0.05975691732019186, + "epoch": 5.589812332439678, + "grad_norm": 2.0, + "learning_rate": 4.913365526659946e-05, + "loss": 0.0587, + "mean_token_accuracy": 0.9852451801300048, + "num_tokens": 64454270.0, + "step": 23980 + }, + { + "entropy": 0.0696345467120409, + "epoch": 5.59097796946031, + "grad_norm": 0.46484375, + "learning_rate": 4.913309681555642e-05, + "loss": 0.039, + "mean_token_accuracy": 0.9870121002197265, + "num_tokens": 64476946.0, + "step": 23985 + }, + { + "entropy": 0.07908033691346646, + "epoch": 5.592143606480942, + "grad_norm": 0.609375, + "learning_rate": 4.913253819104639e-05, + "loss": 0.0653, + "mean_token_accuracy": 0.9817940413951873, + "num_tokens": 64499294.0, + "step": 23990 + }, + { + "entropy": 0.06197888310998678, + "epoch": 5.593309243501574, + "grad_norm": 0.796875, + "learning_rate": 4.9131979393077734e-05, + "loss": 0.0425, + "mean_token_accuracy": 0.9863790690898895, + "num_tokens": 64517333.0, + "step": 23995 + }, + { + "entropy": 0.07623457312583923, + "epoch": 5.594474880522205, + "grad_norm": 2.421875, + "learning_rate": 4.9131420421658764e-05, + "loss": 0.0801, + "mean_token_accuracy": 0.9793197691440583, + "num_tokens": 64550934.0, + "step": 24000 + }, + { + "entropy": 0.05599022675305605, + "epoch": 5.595640517542837, + "grad_norm": 0.408203125, + "learning_rate": 4.913086127679782e-05, + "loss": 0.0324, + "mean_token_accuracy": 0.9851845264434814, + "num_tokens": 64580196.0, + "step": 24005 + }, + { + "entropy": 0.07400658950209618, + "epoch": 5.596806154563469, + "grad_norm": 3.375, + "learning_rate": 4.913030195850324e-05, + "loss": 0.0727, + "mean_token_accuracy": 0.9823880016803741, + "num_tokens": 64591487.0, + "step": 24010 + }, + { + "entropy": 0.06357642374932766, + "epoch": 5.597971791584101, + "grad_norm": 0.142578125, + "learning_rate": 4.9129742466783364e-05, + "loss": 0.037, + "mean_token_accuracy": 0.9890712201595306, + "num_tokens": 64619890.0, + "step": 24015 + }, + { + "entropy": 0.08721558898687362, + "epoch": 5.599137428604733, + "grad_norm": 0.9453125, + "learning_rate": 4.912918280164654e-05, + "loss": 0.0571, + "mean_token_accuracy": 0.9823896586894989, + "num_tokens": 64634109.0, + "step": 24020 + }, + { + "entropy": 0.07920245192945004, + "epoch": 5.600303065625364, + "grad_norm": 0.3203125, + "learning_rate": 4.91286229631011e-05, + "loss": 0.0529, + "mean_token_accuracy": 0.9839257597923279, + "num_tokens": 64656244.0, + "step": 24025 + }, + { + "entropy": 0.07264061979949474, + "epoch": 5.601468702645996, + "grad_norm": 0.7578125, + "learning_rate": 4.912806295115541e-05, + "loss": 0.0694, + "mean_token_accuracy": 0.9849851489067077, + "num_tokens": 64667128.0, + "step": 24030 + }, + { + "entropy": 0.06038454240188003, + "epoch": 5.602634339666627, + "grad_norm": 0.1552734375, + "learning_rate": 4.9127502765817814e-05, + "loss": 0.0374, + "mean_token_accuracy": 0.9803289711475373, + "num_tokens": 64690511.0, + "step": 24035 + }, + { + "entropy": 0.07674487382173538, + "epoch": 5.6037999766872595, + "grad_norm": 2.328125, + "learning_rate": 4.912694240709665e-05, + "loss": 0.0843, + "mean_token_accuracy": 0.978994345664978, + "num_tokens": 64710216.0, + "step": 24040 + }, + { + "entropy": 0.06889344537630678, + "epoch": 5.604965613707892, + "grad_norm": 0.8203125, + "learning_rate": 4.91263818750003e-05, + "loss": 0.0371, + "mean_token_accuracy": 0.9858274102210999, + "num_tokens": 64732135.0, + "step": 24045 + }, + { + "entropy": 0.06218187240883708, + "epoch": 5.606131250728523, + "grad_norm": 1.359375, + "learning_rate": 4.912582116953711e-05, + "loss": 0.0547, + "mean_token_accuracy": 0.9839303433895111, + "num_tokens": 64760928.0, + "step": 24050 + }, + { + "entropy": 0.06635083290748298, + "epoch": 5.607296887749155, + "grad_norm": 1.25, + "learning_rate": 4.912526029071543e-05, + "loss": 0.0483, + "mean_token_accuracy": 0.983081066608429, + "num_tokens": 64780021.0, + "step": 24055 + }, + { + "entropy": 0.07519938629120589, + "epoch": 5.608462524769787, + "grad_norm": 3.34375, + "learning_rate": 4.912469923854364e-05, + "loss": 0.0681, + "mean_token_accuracy": 0.981386798620224, + "num_tokens": 64794223.0, + "step": 24060 + }, + { + "entropy": 0.08381996154785157, + "epoch": 5.609628161790418, + "grad_norm": 2.15625, + "learning_rate": 4.9124138013030094e-05, + "loss": 0.084, + "mean_token_accuracy": 0.9787656486034393, + "num_tokens": 64804725.0, + "step": 24065 + }, + { + "entropy": 0.04471158161759377, + "epoch": 5.61079379881105, + "grad_norm": 0.1953125, + "learning_rate": 4.912357661418317e-05, + "loss": 0.027, + "mean_token_accuracy": 0.9886382281780243, + "num_tokens": 64834084.0, + "step": 24070 + }, + { + "entropy": 0.08546677436679602, + "epoch": 5.611959435831682, + "grad_norm": 0.8046875, + "learning_rate": 4.912301504201124e-05, + "loss": 0.0574, + "mean_token_accuracy": 0.9757747769355773, + "num_tokens": 64854352.0, + "step": 24075 + }, + { + "entropy": 0.07056930642575025, + "epoch": 5.613125072852314, + "grad_norm": 3.671875, + "learning_rate": 4.912245329652267e-05, + "loss": 0.0639, + "mean_token_accuracy": 0.9803074657917022, + "num_tokens": 64873354.0, + "step": 24080 + }, + { + "entropy": 0.07950994074344635, + "epoch": 5.614290709872946, + "grad_norm": 0.80078125, + "learning_rate": 4.9121891377725835e-05, + "loss": 0.0858, + "mean_token_accuracy": 0.9795065879821777, + "num_tokens": 64883819.0, + "step": 24085 + }, + { + "entropy": 0.05531720239669084, + "epoch": 5.615456346893577, + "grad_norm": 1.078125, + "learning_rate": 4.9121329285629125e-05, + "loss": 0.041, + "mean_token_accuracy": 0.988337516784668, + "num_tokens": 64898970.0, + "step": 24090 + }, + { + "entropy": 0.07124143727123737, + "epoch": 5.616621983914209, + "grad_norm": 4.03125, + "learning_rate": 4.912076702024092e-05, + "loss": 0.0741, + "mean_token_accuracy": 0.9810739636421204, + "num_tokens": 64914653.0, + "step": 24095 + }, + { + "entropy": 0.09241018267348408, + "epoch": 5.617787620934841, + "grad_norm": 2.328125, + "learning_rate": 4.91202045815696e-05, + "loss": 0.095, + "mean_token_accuracy": 0.9772006750106812, + "num_tokens": 64927604.0, + "step": 24100 + }, + { + "entropy": 0.05182990748435259, + "epoch": 5.618953257955472, + "grad_norm": 2.21875, + "learning_rate": 4.911964196962354e-05, + "loss": 0.0383, + "mean_token_accuracy": 0.9875986397266387, + "num_tokens": 64949566.0, + "step": 24105 + }, + { + "entropy": 0.08643919341266156, + "epoch": 5.6201188949761045, + "grad_norm": 3.390625, + "learning_rate": 4.9119079184411146e-05, + "loss": 0.087, + "mean_token_accuracy": 0.9766907870769501, + "num_tokens": 64958339.0, + "step": 24110 + }, + { + "entropy": 0.11621445324271917, + "epoch": 5.621284531996736, + "grad_norm": 0.41015625, + "learning_rate": 4.911851622594081e-05, + "loss": 0.1477, + "mean_token_accuracy": 0.9691729426383973, + "num_tokens": 64991088.0, + "step": 24115 + }, + { + "entropy": 0.06573995789512992, + "epoch": 5.622450169017368, + "grad_norm": 0.2451171875, + "learning_rate": 4.911795309422092e-05, + "loss": 0.0704, + "mean_token_accuracy": 0.9805086791515351, + "num_tokens": 65008055.0, + "step": 24120 + }, + { + "entropy": 0.04539413256570697, + "epoch": 5.623615806038, + "grad_norm": 0.71484375, + "learning_rate": 4.9117389789259874e-05, + "loss": 0.0336, + "mean_token_accuracy": 0.9910531222820282, + "num_tokens": 65048328.0, + "step": 24125 + }, + { + "entropy": 0.07581941662356258, + "epoch": 5.624781443058632, + "grad_norm": 1.5390625, + "learning_rate": 4.9116826311066075e-05, + "loss": 0.1022, + "mean_token_accuracy": 0.9789832353591919, + "num_tokens": 65072881.0, + "step": 24130 + }, + { + "entropy": 0.0626650300808251, + "epoch": 5.625947080079263, + "grad_norm": 2.046875, + "learning_rate": 4.911626265964792e-05, + "loss": 0.0612, + "mean_token_accuracy": 0.9832917273044586, + "num_tokens": 65095466.0, + "step": 24135 + }, + { + "entropy": 0.06437070518732071, + "epoch": 5.627112717099895, + "grad_norm": 1.078125, + "learning_rate": 4.9115698835013823e-05, + "loss": 0.0539, + "mean_token_accuracy": 0.9849261939525604, + "num_tokens": 65110121.0, + "step": 24140 + }, + { + "entropy": 0.06489365249872207, + "epoch": 5.628278354120527, + "grad_norm": 0.515625, + "learning_rate": 4.9115134837172176e-05, + "loss": 0.0466, + "mean_token_accuracy": 0.98541299700737, + "num_tokens": 65141686.0, + "step": 24145 + }, + { + "entropy": 0.0634211104363203, + "epoch": 5.629443991141159, + "grad_norm": 0.77734375, + "learning_rate": 4.9114570666131406e-05, + "loss": 0.0446, + "mean_token_accuracy": 0.9839524507522583, + "num_tokens": 65159098.0, + "step": 24150 + }, + { + "entropy": 0.04414686663076282, + "epoch": 5.630609628161791, + "grad_norm": 0.19921875, + "learning_rate": 4.911400632189991e-05, + "loss": 0.0129, + "mean_token_accuracy": 0.9903716087341309, + "num_tokens": 65193723.0, + "step": 24155 + }, + { + "entropy": 0.06207293402403593, + "epoch": 5.631775265182422, + "grad_norm": 1.4765625, + "learning_rate": 4.911344180448612e-05, + "loss": 0.0396, + "mean_token_accuracy": 0.9857115924358368, + "num_tokens": 65213189.0, + "step": 24160 + }, + { + "entropy": 0.06852437127381564, + "epoch": 5.632940902203054, + "grad_norm": 0.48828125, + "learning_rate": 4.911287711389844e-05, + "loss": 0.0622, + "mean_token_accuracy": 0.9851773381233215, + "num_tokens": 65225625.0, + "step": 24165 + }, + { + "entropy": 0.07459627948701382, + "epoch": 5.634106539223685, + "grad_norm": 1.3828125, + "learning_rate": 4.9112312250145296e-05, + "loss": 0.0729, + "mean_token_accuracy": 0.9799397945404053, + "num_tokens": 65253579.0, + "step": 24170 + }, + { + "entropy": 0.09025459066033363, + "epoch": 5.635272176244317, + "grad_norm": 3.734375, + "learning_rate": 4.911174721323512e-05, + "loss": 0.1323, + "mean_token_accuracy": 0.969058758020401, + "num_tokens": 65260612.0, + "step": 24175 + }, + { + "entropy": 0.060559450881555676, + "epoch": 5.6364378132649495, + "grad_norm": 0.4296875, + "learning_rate": 4.9111182003176315e-05, + "loss": 0.0331, + "mean_token_accuracy": 0.9849679410457611, + "num_tokens": 65290280.0, + "step": 24180 + }, + { + "entropy": 0.07913712412118912, + "epoch": 5.637603450285581, + "grad_norm": 0.51953125, + "learning_rate": 4.911061661997733e-05, + "loss": 0.0844, + "mean_token_accuracy": 0.9759842038154602, + "num_tokens": 65305469.0, + "step": 24185 + }, + { + "entropy": 0.06736504193395376, + "epoch": 5.638769087306213, + "grad_norm": 0.75390625, + "learning_rate": 4.9110051063646586e-05, + "loss": 0.0443, + "mean_token_accuracy": 0.9866178870201111, + "num_tokens": 65330611.0, + "step": 24190 + }, + { + "entropy": 0.08924924843013286, + "epoch": 5.639934724326845, + "grad_norm": 3.59375, + "learning_rate": 4.910948533419251e-05, + "loss": 0.092, + "mean_token_accuracy": 0.9765866577625275, + "num_tokens": 65339161.0, + "step": 24195 + }, + { + "entropy": 0.09531253166496753, + "epoch": 5.641100361347476, + "grad_norm": 2.171875, + "learning_rate": 4.910891943162356e-05, + "loss": 0.0851, + "mean_token_accuracy": 0.9777402102947235, + "num_tokens": 65360121.0, + "step": 24200 + }, + { + "entropy": 0.07371218502521515, + "epoch": 5.642265998368108, + "grad_norm": 0.47265625, + "learning_rate": 4.910835335594815e-05, + "loss": 0.067, + "mean_token_accuracy": 0.9795070946216583, + "num_tokens": 65374365.0, + "step": 24205 + }, + { + "entropy": 0.06099022421985865, + "epoch": 5.64343163538874, + "grad_norm": 1.140625, + "learning_rate": 4.9107787107174735e-05, + "loss": 0.0241, + "mean_token_accuracy": 0.9872618436813354, + "num_tokens": 65398143.0, + "step": 24210 + }, + { + "entropy": 0.06053253598511219, + "epoch": 5.644597272409372, + "grad_norm": 4.53125, + "learning_rate": 4.9107220685311756e-05, + "loss": 0.0545, + "mean_token_accuracy": 0.985034990310669, + "num_tokens": 65413147.0, + "step": 24215 + }, + { + "entropy": 0.05846654055640101, + "epoch": 5.645762909430004, + "grad_norm": 2.515625, + "learning_rate": 4.910665409036765e-05, + "loss": 0.0443, + "mean_token_accuracy": 0.983678150177002, + "num_tokens": 65438930.0, + "step": 24220 + }, + { + "entropy": 0.056470193434506655, + "epoch": 5.646928546450635, + "grad_norm": 0.1728515625, + "learning_rate": 4.910608732235089e-05, + "loss": 0.0507, + "mean_token_accuracy": 0.9855823874473572, + "num_tokens": 65467306.0, + "step": 24225 + }, + { + "entropy": 0.0693120218347758, + "epoch": 5.648094183471267, + "grad_norm": 2.609375, + "learning_rate": 4.91055203812699e-05, + "loss": 0.0758, + "mean_token_accuracy": 0.9780898749828338, + "num_tokens": 65485922.0, + "step": 24230 + }, + { + "entropy": 0.08311540931463242, + "epoch": 5.649259820491899, + "grad_norm": 1.3828125, + "learning_rate": 4.910495326713315e-05, + "loss": 0.0886, + "mean_token_accuracy": 0.978338223695755, + "num_tokens": 65509037.0, + "step": 24235 + }, + { + "entropy": 0.06125993989408016, + "epoch": 5.65042545751253, + "grad_norm": 2.5, + "learning_rate": 4.910438597994909e-05, + "loss": 0.0538, + "mean_token_accuracy": 0.9835680603981019, + "num_tokens": 65524153.0, + "step": 24240 + }, + { + "entropy": 0.06852566041052341, + "epoch": 5.651591094533162, + "grad_norm": 0.56640625, + "learning_rate": 4.910381851972618e-05, + "loss": 0.0601, + "mean_token_accuracy": 0.985653680562973, + "num_tokens": 65536747.0, + "step": 24245 + }, + { + "entropy": 0.06987055232748389, + "epoch": 5.652756731553794, + "grad_norm": 0.392578125, + "learning_rate": 4.9103250886472886e-05, + "loss": 0.0751, + "mean_token_accuracy": 0.9799535036087036, + "num_tokens": 65556692.0, + "step": 24250 + }, + { + "entropy": 0.05222968608140945, + "epoch": 5.653922368574426, + "grad_norm": 2.328125, + "learning_rate": 4.910268308019766e-05, + "loss": 0.039, + "mean_token_accuracy": 0.9869933426380157, + "num_tokens": 65583964.0, + "step": 24255 + }, + { + "entropy": 0.06949933720752596, + "epoch": 5.655088005595058, + "grad_norm": 3.59375, + "learning_rate": 4.910211510090899e-05, + "loss": 0.077, + "mean_token_accuracy": 0.9771763205528259, + "num_tokens": 65599854.0, + "step": 24260 + }, + { + "entropy": 0.0793121935799718, + "epoch": 5.65625364261569, + "grad_norm": 1.875, + "learning_rate": 4.910154694861533e-05, + "loss": 0.0734, + "mean_token_accuracy": 0.978898000717163, + "num_tokens": 65613891.0, + "step": 24265 + }, + { + "entropy": 0.05992090366780758, + "epoch": 5.657419279636321, + "grad_norm": 2.28125, + "learning_rate": 4.910097862332515e-05, + "loss": 0.0604, + "mean_token_accuracy": 0.9835952401161194, + "num_tokens": 65642481.0, + "step": 24270 + }, + { + "entropy": 0.056962525472044945, + "epoch": 5.658584916656953, + "grad_norm": 2.609375, + "learning_rate": 4.9100410125046934e-05, + "loss": 0.0559, + "mean_token_accuracy": 0.9861100137233734, + "num_tokens": 65665910.0, + "step": 24275 + }, + { + "entropy": 0.0507665459997952, + "epoch": 5.6597505536775845, + "grad_norm": 0.8125, + "learning_rate": 4.909984145378915e-05, + "loss": 0.0314, + "mean_token_accuracy": 0.986362773180008, + "num_tokens": 65711085.0, + "step": 24280 + }, + { + "entropy": 0.08935830974951386, + "epoch": 5.660916190698217, + "grad_norm": 0.23828125, + "learning_rate": 4.909927260956029e-05, + "loss": 0.1072, + "mean_token_accuracy": 0.9721687018871308, + "num_tokens": 65735010.0, + "step": 24285 + }, + { + "entropy": 0.07759722573682666, + "epoch": 5.662081827718849, + "grad_norm": 0.251953125, + "learning_rate": 4.9098703592368825e-05, + "loss": 0.0358, + "mean_token_accuracy": 0.9860342264175415, + "num_tokens": 65757801.0, + "step": 24290 + }, + { + "entropy": 0.06456798985600472, + "epoch": 5.66324746473948, + "grad_norm": 0.2314453125, + "learning_rate": 4.909813440222325e-05, + "loss": 0.0502, + "mean_token_accuracy": 0.9860153675079346, + "num_tokens": 65790889.0, + "step": 24295 + }, + { + "entropy": 0.06786645632237195, + "epoch": 5.664413101760112, + "grad_norm": 0.37890625, + "learning_rate": 4.9097565039132034e-05, + "loss": 0.0504, + "mean_token_accuracy": 0.9867601275444031, + "num_tokens": 65810793.0, + "step": 24300 + }, + { + "entropy": 0.17485631480813027, + "epoch": 5.665578738780743, + "grad_norm": 1.7578125, + "learning_rate": 4.909699550310369e-05, + "loss": 0.34, + "mean_token_accuracy": 0.95076944231987, + "num_tokens": 65826864.0, + "step": 24305 + }, + { + "entropy": 0.07269075997173786, + "epoch": 5.666744375801375, + "grad_norm": 0.33203125, + "learning_rate": 4.90964257941467e-05, + "loss": 0.0508, + "mean_token_accuracy": 0.9866540789604187, + "num_tokens": 65845292.0, + "step": 24310 + }, + { + "entropy": 0.07192633729428052, + "epoch": 5.6679100128220075, + "grad_norm": 3.703125, + "learning_rate": 4.909585591226956e-05, + "loss": 0.0837, + "mean_token_accuracy": 0.9803862571716309, + "num_tokens": 65861238.0, + "step": 24315 + }, + { + "entropy": 0.05109905377030373, + "epoch": 5.669075649842639, + "grad_norm": 1.4921875, + "learning_rate": 4.909528585748076e-05, + "loss": 0.0379, + "mean_token_accuracy": 0.9892950415611267, + "num_tokens": 65890623.0, + "step": 24320 + }, + { + "entropy": 0.06906082537025213, + "epoch": 5.670241286863271, + "grad_norm": 4.40625, + "learning_rate": 4.9094715629788814e-05, + "loss": 0.0731, + "mean_token_accuracy": 0.9789024889469147, + "num_tokens": 65914480.0, + "step": 24325 + }, + { + "entropy": 0.057262393087148665, + "epoch": 5.671406923883903, + "grad_norm": 1.03125, + "learning_rate": 4.9094145229202214e-05, + "loss": 0.0688, + "mean_token_accuracy": 0.9827243983745575, + "num_tokens": 65941588.0, + "step": 24330 + }, + { + "entropy": 0.0782860929146409, + "epoch": 5.672572560904534, + "grad_norm": 0.90625, + "learning_rate": 4.9093574655729475e-05, + "loss": 0.0694, + "mean_token_accuracy": 0.9805294811725617, + "num_tokens": 65953294.0, + "step": 24335 + }, + { + "entropy": 0.05869090519845486, + "epoch": 5.673738197925166, + "grad_norm": 0.1943359375, + "learning_rate": 4.90930039093791e-05, + "loss": 0.0613, + "mean_token_accuracy": 0.9824559628963471, + "num_tokens": 65981631.0, + "step": 24340 + }, + { + "entropy": 0.08221575478091836, + "epoch": 5.674903834945798, + "grad_norm": 4.46875, + "learning_rate": 4.909243299015959e-05, + "loss": 0.0797, + "mean_token_accuracy": 0.9766084969043731, + "num_tokens": 66002579.0, + "step": 24345 + }, + { + "entropy": 0.056899439869448544, + "epoch": 5.6760694719664295, + "grad_norm": 0.400390625, + "learning_rate": 4.909186189807948e-05, + "loss": 0.0522, + "mean_token_accuracy": 0.9842858016490936, + "num_tokens": 66040407.0, + "step": 24350 + }, + { + "entropy": 0.06606799438595772, + "epoch": 5.677235108987062, + "grad_norm": 0.9765625, + "learning_rate": 4.909129063314727e-05, + "loss": 0.0535, + "mean_token_accuracy": 0.9835278391838074, + "num_tokens": 66053760.0, + "step": 24355 + }, + { + "entropy": 0.07696728855371475, + "epoch": 5.678400746007693, + "grad_norm": 1.3203125, + "learning_rate": 4.909071919537148e-05, + "loss": 0.0787, + "mean_token_accuracy": 0.976880544424057, + "num_tokens": 66064281.0, + "step": 24360 + }, + { + "entropy": 0.06062213182449341, + "epoch": 5.679566383028325, + "grad_norm": 0.7421875, + "learning_rate": 4.9090147584760635e-05, + "loss": 0.0449, + "mean_token_accuracy": 0.988343334197998, + "num_tokens": 66078714.0, + "step": 24365 + }, + { + "entropy": 0.07671241629868746, + "epoch": 5.680732020048957, + "grad_norm": 2.59375, + "learning_rate": 4.908957580132326e-05, + "loss": 0.0796, + "mean_token_accuracy": 0.9792423665523529, + "num_tokens": 66112610.0, + "step": 24370 + }, + { + "entropy": 0.05066694635897875, + "epoch": 5.681897657069588, + "grad_norm": 0.6171875, + "learning_rate": 4.908900384506787e-05, + "loss": 0.0239, + "mean_token_accuracy": 0.9861576557159424, + "num_tokens": 66142200.0, + "step": 24375 + }, + { + "entropy": 0.05256279185414314, + "epoch": 5.68306329409022, + "grad_norm": 0.25390625, + "learning_rate": 4.908843171600301e-05, + "loss": 0.0493, + "mean_token_accuracy": 0.9895726382732392, + "num_tokens": 66170098.0, + "step": 24380 + }, + { + "entropy": 0.08458961248397827, + "epoch": 5.684228931110852, + "grad_norm": 0.5546875, + "learning_rate": 4.90878594141372e-05, + "loss": 0.0625, + "mean_token_accuracy": 0.9803022623062134, + "num_tokens": 66189130.0, + "step": 24385 + }, + { + "entropy": 0.07308434527367354, + "epoch": 5.685394568131484, + "grad_norm": 1.7421875, + "learning_rate": 4.908728693947898e-05, + "loss": 0.0579, + "mean_token_accuracy": 0.9815151989459991, + "num_tokens": 66202993.0, + "step": 24390 + }, + { + "entropy": 0.051412354689091444, + "epoch": 5.686560205152116, + "grad_norm": 0.76953125, + "learning_rate": 4.908671429203687e-05, + "loss": 0.0537, + "mean_token_accuracy": 0.9856079936027526, + "num_tokens": 66232433.0, + "step": 24395 + }, + { + "entropy": 0.054090891033411026, + "epoch": 5.687725842172748, + "grad_norm": 0.9453125, + "learning_rate": 4.908614147181944e-05, + "loss": 0.0492, + "mean_token_accuracy": 0.9889610230922699, + "num_tokens": 66261764.0, + "step": 24400 + }, + { + "entropy": 0.06562711736187339, + "epoch": 5.688891479193379, + "grad_norm": 0.353515625, + "learning_rate": 4.908556847883521e-05, + "loss": 0.0472, + "mean_token_accuracy": 0.9882732093334198, + "num_tokens": 66288006.0, + "step": 24405 + }, + { + "entropy": 0.18064768142066895, + "epoch": 5.690057116214011, + "grad_norm": 0.306640625, + "learning_rate": 4.908499531309272e-05, + "loss": 0.3302, + "mean_token_accuracy": 0.9339430838823318, + "num_tokens": 66316631.0, + "step": 24410 + }, + { + "entropy": 0.10602510198950768, + "epoch": 5.691222753234642, + "grad_norm": 0.74609375, + "learning_rate": 4.908442197460053e-05, + "loss": 0.1215, + "mean_token_accuracy": 0.9769471049308777, + "num_tokens": 66336224.0, + "step": 24415 + }, + { + "entropy": 0.06400628378614784, + "epoch": 5.6923883902552745, + "grad_norm": 0.291015625, + "learning_rate": 4.908384846336719e-05, + "loss": 0.0554, + "mean_token_accuracy": 0.9797611176967621, + "num_tokens": 66353209.0, + "step": 24420 + }, + { + "entropy": 0.07065552687272429, + "epoch": 5.693554027275907, + "grad_norm": 1.5625, + "learning_rate": 4.9083274779401236e-05, + "loss": 0.0472, + "mean_token_accuracy": 0.9867741584777832, + "num_tokens": 66369700.0, + "step": 24425 + }, + { + "entropy": 0.07249236945062876, + "epoch": 5.694719664296538, + "grad_norm": 1.109375, + "learning_rate": 4.908270092271124e-05, + "loss": 0.0446, + "mean_token_accuracy": 0.9849403977394104, + "num_tokens": 66385062.0, + "step": 24430 + }, + { + "entropy": 0.07194965444505215, + "epoch": 5.69588530131717, + "grad_norm": 0.53125, + "learning_rate": 4.908212689330575e-05, + "loss": 0.0627, + "mean_token_accuracy": 0.9791293621063233, + "num_tokens": 66402553.0, + "step": 24435 + }, + { + "entropy": 0.06428283378481865, + "epoch": 5.697050938337801, + "grad_norm": 0.283203125, + "learning_rate": 4.908155269119333e-05, + "loss": 0.0495, + "mean_token_accuracy": 0.9872363924980163, + "num_tokens": 66428555.0, + "step": 24440 + }, + { + "entropy": 0.053512285463511944, + "epoch": 5.698216575358433, + "grad_norm": 1.09375, + "learning_rate": 4.908097831638253e-05, + "loss": 0.0335, + "mean_token_accuracy": 0.9889600455760956, + "num_tokens": 66450496.0, + "step": 24445 + }, + { + "entropy": 0.03742195102386177, + "epoch": 5.699382212379065, + "grad_norm": 0.376953125, + "learning_rate": 4.9080403768881934e-05, + "loss": 0.0243, + "mean_token_accuracy": 0.9910203576087951, + "num_tokens": 66497427.0, + "step": 24450 + }, + { + "entropy": 0.08320451527833939, + "epoch": 5.700547849399697, + "grad_norm": 3.390625, + "learning_rate": 4.90798290487001e-05, + "loss": 0.132, + "mean_token_accuracy": 0.9729970693588257, + "num_tokens": 66529137.0, + "step": 24455 + }, + { + "entropy": 0.07671122914180159, + "epoch": 5.701713486420329, + "grad_norm": 1.8125, + "learning_rate": 4.9079254155845596e-05, + "loss": 0.0636, + "mean_token_accuracy": 0.976071572303772, + "num_tokens": 66545793.0, + "step": 24460 + }, + { + "entropy": 0.0630271015688777, + "epoch": 5.702879123440961, + "grad_norm": 0.54296875, + "learning_rate": 4.9078679090326995e-05, + "loss": 0.0664, + "mean_token_accuracy": 0.9832329630851746, + "num_tokens": 66565804.0, + "step": 24465 + }, + { + "entropy": 0.04792722817510366, + "epoch": 5.704044760461592, + "grad_norm": 0.8828125, + "learning_rate": 4.907810385215287e-05, + "loss": 0.0242, + "mean_token_accuracy": 0.989013385772705, + "num_tokens": 66598404.0, + "step": 24470 + }, + { + "entropy": 0.08894434310495854, + "epoch": 5.705210397482224, + "grad_norm": 4.53125, + "learning_rate": 4.907752844133181e-05, + "loss": 0.0824, + "mean_token_accuracy": 0.9780226051807404, + "num_tokens": 66608057.0, + "step": 24475 + }, + { + "entropy": 0.08381523173302412, + "epoch": 5.706376034502856, + "grad_norm": 1.4375, + "learning_rate": 4.907695285787238e-05, + "loss": 0.0709, + "mean_token_accuracy": 0.979823499917984, + "num_tokens": 66635470.0, + "step": 24480 + }, + { + "entropy": 0.05908362194895744, + "epoch": 5.7075416715234875, + "grad_norm": 0.6640625, + "learning_rate": 4.907637710178318e-05, + "loss": 0.0199, + "mean_token_accuracy": 0.9885694861412049, + "num_tokens": 66669770.0, + "step": 24485 + }, + { + "entropy": 0.07350371684879065, + "epoch": 5.70870730854412, + "grad_norm": 0.2001953125, + "learning_rate": 4.9075801173072776e-05, + "loss": 0.0414, + "mean_token_accuracy": 0.9812188148498535, + "num_tokens": 66702048.0, + "step": 24490 + }, + { + "entropy": 0.04958838382735849, + "epoch": 5.709872945564751, + "grad_norm": 0.412109375, + "learning_rate": 4.907522507174977e-05, + "loss": 0.0265, + "mean_token_accuracy": 0.9889841973781586, + "num_tokens": 66732023.0, + "step": 24495 + }, + { + "entropy": 0.049441782478243115, + "epoch": 5.711038582585383, + "grad_norm": 0.890625, + "learning_rate": 4.907464879782275e-05, + "loss": 0.0297, + "mean_token_accuracy": 0.9903727948665619, + "num_tokens": 66763793.0, + "step": 24500 + }, + { + "entropy": 0.07775519993156195, + "epoch": 5.712204219606015, + "grad_norm": 1.0546875, + "learning_rate": 4.90740723513003e-05, + "loss": 0.0823, + "mean_token_accuracy": 0.9773679137229919, + "num_tokens": 66774721.0, + "step": 24505 + }, + { + "entropy": 0.06073737666010857, + "epoch": 5.713369856626646, + "grad_norm": 2.921875, + "learning_rate": 4.9073495732191024e-05, + "loss": 0.0572, + "mean_token_accuracy": 0.9840463936328888, + "num_tokens": 66792015.0, + "step": 24510 + }, + { + "entropy": 0.06353724235668778, + "epoch": 5.714535493647278, + "grad_norm": 0.41796875, + "learning_rate": 4.9072918940503526e-05, + "loss": 0.0465, + "mean_token_accuracy": 0.9877960324287415, + "num_tokens": 66814488.0, + "step": 24515 + }, + { + "entropy": 0.07438728669658304, + "epoch": 5.7157011306679095, + "grad_norm": 1.1171875, + "learning_rate": 4.9072341976246393e-05, + "loss": 0.0356, + "mean_token_accuracy": 0.985117620229721, + "num_tokens": 66840584.0, + "step": 24520 + }, + { + "entropy": 0.07487269369885326, + "epoch": 5.716866767688542, + "grad_norm": 0.2138671875, + "learning_rate": 4.907176483942824e-05, + "loss": 0.0563, + "mean_token_accuracy": 0.9825186014175415, + "num_tokens": 66862103.0, + "step": 24525 + }, + { + "entropy": 0.11075683934614063, + "epoch": 5.718032404709174, + "grad_norm": 1.9921875, + "learning_rate": 4.907118753005767e-05, + "loss": 0.0695, + "mean_token_accuracy": 0.9796536266803741, + "num_tokens": 66880019.0, + "step": 24530 + }, + { + "entropy": 0.07921370901167393, + "epoch": 5.719198041729806, + "grad_norm": 0.953125, + "learning_rate": 4.9070610048143284e-05, + "loss": 0.0577, + "mean_token_accuracy": 0.9792716860771179, + "num_tokens": 66890624.0, + "step": 24535 + }, + { + "entropy": 0.062124581448733805, + "epoch": 5.720363678750437, + "grad_norm": 1.4609375, + "learning_rate": 4.907003239369371e-05, + "loss": 0.0425, + "mean_token_accuracy": 0.9866943776607513, + "num_tokens": 66910862.0, + "step": 24540 + }, + { + "entropy": 0.14402158558368683, + "epoch": 5.721529315771069, + "grad_norm": 0.3046875, + "learning_rate": 4.906945456671754e-05, + "loss": 0.2548, + "mean_token_accuracy": 0.9451413333415986, + "num_tokens": 66931638.0, + "step": 24545 + }, + { + "entropy": 0.04773062439635396, + "epoch": 5.7226949527917, + "grad_norm": 0.3515625, + "learning_rate": 4.90688765672234e-05, + "loss": 0.0223, + "mean_token_accuracy": 0.9901167631149292, + "num_tokens": 66957123.0, + "step": 24550 + }, + { + "entropy": 0.07032151008024812, + "epoch": 5.7238605898123325, + "grad_norm": 0.7265625, + "learning_rate": 4.9068298395219915e-05, + "loss": 0.0258, + "mean_token_accuracy": 0.9859899163246155, + "num_tokens": 66983294.0, + "step": 24555 + }, + { + "entropy": 0.07757879067212343, + "epoch": 5.725026226832965, + "grad_norm": 1.28125, + "learning_rate": 4.90677200507157e-05, + "loss": 0.056, + "mean_token_accuracy": 0.9782130300998688, + "num_tokens": 67002262.0, + "step": 24560 + }, + { + "entropy": 0.080276258289814, + "epoch": 5.726191863853596, + "grad_norm": 1.2734375, + "learning_rate": 4.906714153371937e-05, + "loss": 0.069, + "mean_token_accuracy": 0.9813676834106445, + "num_tokens": 67021324.0, + "step": 24565 + }, + { + "entropy": 0.07185540087521076, + "epoch": 5.727357500874228, + "grad_norm": 1.03125, + "learning_rate": 4.906656284423958e-05, + "loss": 0.0714, + "mean_token_accuracy": 0.9795701384544373, + "num_tokens": 67031846.0, + "step": 24570 + }, + { + "entropy": 0.08011389188468457, + "epoch": 5.728523137894859, + "grad_norm": 1.15625, + "learning_rate": 4.906598398228493e-05, + "loss": 0.0604, + "mean_token_accuracy": 0.982747620344162, + "num_tokens": 67044837.0, + "step": 24575 + }, + { + "entropy": 0.06071140300482512, + "epoch": 5.729688774915491, + "grad_norm": 2.046875, + "learning_rate": 4.9065404947864065e-05, + "loss": 0.0501, + "mean_token_accuracy": 0.9861461579799652, + "num_tokens": 67063797.0, + "step": 24580 + }, + { + "entropy": 0.0752846309915185, + "epoch": 5.730854411936123, + "grad_norm": 0.88671875, + "learning_rate": 4.9064825740985615e-05, + "loss": 0.0759, + "mean_token_accuracy": 0.980017501115799, + "num_tokens": 67075399.0, + "step": 24585 + }, + { + "entropy": 0.05309299118816853, + "epoch": 5.7320200489567545, + "grad_norm": 1.234375, + "learning_rate": 4.906424636165822e-05, + "loss": 0.0353, + "mean_token_accuracy": 0.9873521506786347, + "num_tokens": 67104347.0, + "step": 24590 + }, + { + "entropy": 0.08549241051077842, + "epoch": 5.733185685977387, + "grad_norm": 1.8828125, + "learning_rate": 4.906366680989052e-05, + "loss": 0.0628, + "mean_token_accuracy": 0.9774488568305969, + "num_tokens": 67113934.0, + "step": 24595 + }, + { + "entropy": 0.058443698287010196, + "epoch": 5.734351322998019, + "grad_norm": 2.09375, + "learning_rate": 4.906308708569115e-05, + "loss": 0.0424, + "mean_token_accuracy": 0.9856965959072113, + "num_tokens": 67138145.0, + "step": 24600 + }, + { + "entropy": 0.05035996092483401, + "epoch": 5.73551696001865, + "grad_norm": 1.1875, + "learning_rate": 4.906250718906877e-05, + "loss": 0.0212, + "mean_token_accuracy": 0.9902314424514771, + "num_tokens": 67168380.0, + "step": 24605 + }, + { + "entropy": 0.08484232518821955, + "epoch": 5.736682597039282, + "grad_norm": 2.625, + "learning_rate": 4.906192712003201e-05, + "loss": 0.0933, + "mean_token_accuracy": 0.9787089943885803, + "num_tokens": 67178203.0, + "step": 24610 + }, + { + "entropy": 0.06751094851642847, + "epoch": 5.737848234059914, + "grad_norm": 0.76953125, + "learning_rate": 4.906134687858953e-05, + "loss": 0.0493, + "mean_token_accuracy": 0.978972727060318, + "num_tokens": 67192998.0, + "step": 24615 + }, + { + "entropy": 0.06727732773870229, + "epoch": 5.739013871080545, + "grad_norm": 1.1171875, + "learning_rate": 4.9060766464749966e-05, + "loss": 0.0449, + "mean_token_accuracy": 0.9797036170959472, + "num_tokens": 67211708.0, + "step": 24620 + }, + { + "entropy": 0.07089311145246029, + "epoch": 5.7401795081011775, + "grad_norm": 1.1171875, + "learning_rate": 4.9060185878522e-05, + "loss": 0.0451, + "mean_token_accuracy": 0.987681120634079, + "num_tokens": 67240395.0, + "step": 24625 + }, + { + "entropy": 0.07671239618211985, + "epoch": 5.741345145121809, + "grad_norm": 2.4375, + "learning_rate": 4.905960511991427e-05, + "loss": 0.0693, + "mean_token_accuracy": 0.9838557839393616, + "num_tokens": 67260047.0, + "step": 24630 + }, + { + "entropy": 0.09863593205809593, + "epoch": 5.742510782142441, + "grad_norm": 1.59375, + "learning_rate": 4.905902418893544e-05, + "loss": 0.0601, + "mean_token_accuracy": 0.9812259912490845, + "num_tokens": 67280789.0, + "step": 24635 + }, + { + "entropy": 0.07931637912988662, + "epoch": 5.743676419163073, + "grad_norm": 2.703125, + "learning_rate": 4.905844308559417e-05, + "loss": 0.0545, + "mean_token_accuracy": 0.9830240905284882, + "num_tokens": 67293564.0, + "step": 24640 + }, + { + "entropy": 0.07518193898722529, + "epoch": 5.744842056183704, + "grad_norm": 1.4453125, + "learning_rate": 4.905786180989914e-05, + "loss": 0.0534, + "mean_token_accuracy": 0.9804893136024475, + "num_tokens": 67311845.0, + "step": 24645 + }, + { + "entropy": 0.08153504896908999, + "epoch": 5.746007693204336, + "grad_norm": 1.9453125, + "learning_rate": 4.9057280361859e-05, + "loss": 0.0774, + "mean_token_accuracy": 0.9775378167629242, + "num_tokens": 67322321.0, + "step": 24650 + }, + { + "entropy": 0.11738669704645872, + "epoch": 5.7471733302249675, + "grad_norm": 2.25, + "learning_rate": 4.9056698741482425e-05, + "loss": 0.1217, + "mean_token_accuracy": 0.9739999532699585, + "num_tokens": 67344001.0, + "step": 24655 + }, + { + "entropy": 0.06384739736095071, + "epoch": 5.7483389672456, + "grad_norm": 0.396484375, + "learning_rate": 4.9056116948778094e-05, + "loss": 0.0589, + "mean_token_accuracy": 0.9812793254852294, + "num_tokens": 67360094.0, + "step": 24660 + }, + { + "entropy": 0.05202016243711114, + "epoch": 5.749504604266232, + "grad_norm": 0.2177734375, + "learning_rate": 4.9055534983754674e-05, + "loss": 0.0673, + "mean_token_accuracy": 0.984355503320694, + "num_tokens": 67388936.0, + "step": 24665 + }, + { + "entropy": 0.054006512835621835, + "epoch": 5.750670241286863, + "grad_norm": 0.44140625, + "learning_rate": 4.9054952846420846e-05, + "loss": 0.0334, + "mean_token_accuracy": 0.9864718377590179, + "num_tokens": 67415275.0, + "step": 24670 + }, + { + "entropy": 0.057743340730667114, + "epoch": 5.751835878307495, + "grad_norm": 1.640625, + "learning_rate": 4.905437053678529e-05, + "loss": 0.0464, + "mean_token_accuracy": 0.9884009540081025, + "num_tokens": 67431476.0, + "step": 24675 + }, + { + "entropy": 0.08277987511828541, + "epoch": 5.753001515328127, + "grad_norm": 1.40625, + "learning_rate": 4.9053788054856695e-05, + "loss": 0.0371, + "mean_token_accuracy": 0.9903243362903595, + "num_tokens": 67459871.0, + "step": 24680 + }, + { + "entropy": 0.06267971489578486, + "epoch": 5.754167152348758, + "grad_norm": 1.71875, + "learning_rate": 4.9053205400643745e-05, + "loss": 0.0359, + "mean_token_accuracy": 0.9884569525718689, + "num_tokens": 67489158.0, + "step": 24685 + }, + { + "entropy": 0.06875952733680606, + "epoch": 5.75533278936939, + "grad_norm": 0.890625, + "learning_rate": 4.905262257415512e-05, + "loss": 0.0641, + "mean_token_accuracy": 0.9833064436912536, + "num_tokens": 67508215.0, + "step": 24690 + }, + { + "entropy": 0.06311565637588501, + "epoch": 5.7564984263900225, + "grad_norm": 2.734375, + "learning_rate": 4.905203957539952e-05, + "loss": 0.0724, + "mean_token_accuracy": 0.9840085566043854, + "num_tokens": 67528107.0, + "step": 24695 + }, + { + "entropy": 0.05413348004221916, + "epoch": 5.757664063410654, + "grad_norm": 0.81640625, + "learning_rate": 4.9051456404385634e-05, + "loss": 0.0527, + "mean_token_accuracy": 0.9849828660488129, + "num_tokens": 67554365.0, + "step": 24700 + }, + { + "entropy": 0.06338634807616472, + "epoch": 5.758829700431286, + "grad_norm": 1.4453125, + "learning_rate": 4.9050873061122156e-05, + "loss": 0.0479, + "mean_token_accuracy": 0.984088945388794, + "num_tokens": 67574710.0, + "step": 24705 + }, + { + "entropy": 0.07404083982110024, + "epoch": 5.759995337451917, + "grad_norm": 0.85546875, + "learning_rate": 4.905028954561779e-05, + "loss": 0.0557, + "mean_token_accuracy": 0.9842141985893249, + "num_tokens": 67603496.0, + "step": 24710 + }, + { + "entropy": 0.07604032196104527, + "epoch": 5.761160974472549, + "grad_norm": 0.7734375, + "learning_rate": 4.9049705857881236e-05, + "loss": 0.0817, + "mean_token_accuracy": 0.9802056968212127, + "num_tokens": 67613424.0, + "step": 24715 + }, + { + "entropy": 0.05118763484060764, + "epoch": 5.762326611493181, + "grad_norm": 0.2216796875, + "learning_rate": 4.904912199792119e-05, + "loss": 0.0312, + "mean_token_accuracy": 0.9855857014656066, + "num_tokens": 67640651.0, + "step": 24720 + }, + { + "entropy": 0.08875577114522457, + "epoch": 5.7634922485138125, + "grad_norm": 2.765625, + "learning_rate": 4.904853796574637e-05, + "loss": 0.088, + "mean_token_accuracy": 0.980398166179657, + "num_tokens": 67649753.0, + "step": 24725 + }, + { + "entropy": 0.0819103766232729, + "epoch": 5.764657885534445, + "grad_norm": 1.3984375, + "learning_rate": 4.904795376136547e-05, + "loss": 0.0784, + "mean_token_accuracy": 0.9783179640769959, + "num_tokens": 67667271.0, + "step": 24730 + }, + { + "entropy": 0.07176627768203617, + "epoch": 5.765823522555077, + "grad_norm": 0.435546875, + "learning_rate": 4.9047369384787216e-05, + "loss": 0.0478, + "mean_token_accuracy": 0.984830129146576, + "num_tokens": 67695058.0, + "step": 24735 + }, + { + "entropy": 0.06707020187750459, + "epoch": 5.766989159575708, + "grad_norm": 2.609375, + "learning_rate": 4.9046784836020315e-05, + "loss": 0.0616, + "mean_token_accuracy": 0.9823902189731598, + "num_tokens": 67711561.0, + "step": 24740 + }, + { + "entropy": 0.07110977135598659, + "epoch": 5.76815479659634, + "grad_norm": 0.8203125, + "learning_rate": 4.904620011507349e-05, + "loss": 0.0887, + "mean_token_accuracy": 0.9792767465114594, + "num_tokens": 67721138.0, + "step": 24745 + }, + { + "entropy": 0.07955138608813286, + "epoch": 5.769320433616972, + "grad_norm": 0.5703125, + "learning_rate": 4.904561522195545e-05, + "loss": 0.0561, + "mean_token_accuracy": 0.9796516954898834, + "num_tokens": 67737594.0, + "step": 24750 + }, + { + "entropy": 0.07870314586907626, + "epoch": 5.770486070637603, + "grad_norm": 1.421875, + "learning_rate": 4.904503015667492e-05, + "loss": 0.0583, + "mean_token_accuracy": 0.9815060377120972, + "num_tokens": 67761885.0, + "step": 24755 + }, + { + "entropy": 0.07288762480020523, + "epoch": 5.771651707658235, + "grad_norm": 2.015625, + "learning_rate": 4.904444491924063e-05, + "loss": 0.064, + "mean_token_accuracy": 0.9826235294342041, + "num_tokens": 67779670.0, + "step": 24760 + }, + { + "entropy": 0.06011434905230999, + "epoch": 5.772817344678867, + "grad_norm": 1.1875, + "learning_rate": 4.90438595096613e-05, + "loss": 0.0502, + "mean_token_accuracy": 0.9867224514484405, + "num_tokens": 67792226.0, + "step": 24765 + }, + { + "entropy": 0.07408210225403308, + "epoch": 5.773982981699499, + "grad_norm": 4.8125, + "learning_rate": 4.904327392794566e-05, + "loss": 0.0899, + "mean_token_accuracy": 0.9811750411987304, + "num_tokens": 67802060.0, + "step": 24770 + }, + { + "entropy": 0.09298908114433288, + "epoch": 5.775148618720131, + "grad_norm": 0.54296875, + "learning_rate": 4.904268817410245e-05, + "loss": 0.0886, + "mean_token_accuracy": 0.9772188723087311, + "num_tokens": 67822059.0, + "step": 24775 + }, + { + "entropy": 0.06530466936528682, + "epoch": 5.776314255740762, + "grad_norm": 0.46484375, + "learning_rate": 4.904210224814039e-05, + "loss": 0.0591, + "mean_token_accuracy": 0.983297997713089, + "num_tokens": 67845758.0, + "step": 24780 + }, + { + "entropy": 0.07186365425586701, + "epoch": 5.777479892761394, + "grad_norm": 0.77734375, + "learning_rate": 4.904151615006823e-05, + "loss": 0.0581, + "mean_token_accuracy": 0.983722984790802, + "num_tokens": 67867822.0, + "step": 24785 + }, + { + "entropy": 0.07986092139035464, + "epoch": 5.778645529782025, + "grad_norm": 2.90625, + "learning_rate": 4.90409298798947e-05, + "loss": 0.064, + "mean_token_accuracy": 0.9811047136783599, + "num_tokens": 67879967.0, + "step": 24790 + }, + { + "entropy": 0.07390990536659955, + "epoch": 5.7798111668026575, + "grad_norm": 1.9375, + "learning_rate": 4.9040343437628554e-05, + "loss": 0.0457, + "mean_token_accuracy": 0.9846989333629608, + "num_tokens": 67901376.0, + "step": 24795 + }, + { + "entropy": 0.061554950382560494, + "epoch": 5.78097680382329, + "grad_norm": 0.326171875, + "learning_rate": 4.903975682327853e-05, + "loss": 0.023, + "mean_token_accuracy": 0.986127781867981, + "num_tokens": 67925039.0, + "step": 24800 + }, + { + "entropy": 0.06639809599146247, + "epoch": 5.782142440843921, + "grad_norm": 2.421875, + "learning_rate": 4.903917003685337e-05, + "loss": 0.0544, + "mean_token_accuracy": 0.983638447523117, + "num_tokens": 67951929.0, + "step": 24805 + }, + { + "entropy": 0.044928194023668766, + "epoch": 5.783308077864553, + "grad_norm": 1.4765625, + "learning_rate": 4.903858307836183e-05, + "loss": 0.0189, + "mean_token_accuracy": 0.9886196672916412, + "num_tokens": 67986379.0, + "step": 24810 + }, + { + "entropy": 0.06619902048259974, + "epoch": 5.784473714885185, + "grad_norm": 3.390625, + "learning_rate": 4.9037995947812656e-05, + "loss": 0.0578, + "mean_token_accuracy": 0.9844651639461517, + "num_tokens": 68004294.0, + "step": 24815 + }, + { + "entropy": 0.052559500001370905, + "epoch": 5.785639351905816, + "grad_norm": 0.640625, + "learning_rate": 4.903740864521462e-05, + "loss": 0.039, + "mean_token_accuracy": 0.9900806844234467, + "num_tokens": 68025024.0, + "step": 24820 + }, + { + "entropy": 0.07270161435008049, + "epoch": 5.786804988926448, + "grad_norm": 1.78125, + "learning_rate": 4.9036821170576466e-05, + "loss": 0.08, + "mean_token_accuracy": 0.9809543669223786, + "num_tokens": 68035351.0, + "step": 24825 + }, + { + "entropy": 0.09084619544446468, + "epoch": 5.7879706259470804, + "grad_norm": 1.296875, + "learning_rate": 4.903623352390695e-05, + "loss": 0.0839, + "mean_token_accuracy": 0.9778399229049682, + "num_tokens": 68043865.0, + "step": 24830 + }, + { + "entropy": 0.05886190002784133, + "epoch": 5.789136262967712, + "grad_norm": 1.3125, + "learning_rate": 4.9035645705214836e-05, + "loss": 0.0394, + "mean_token_accuracy": 0.9871821939945221, + "num_tokens": 68067274.0, + "step": 24835 + }, + { + "entropy": 0.05819834126159549, + "epoch": 5.790301899988344, + "grad_norm": 0.53515625, + "learning_rate": 4.90350577145089e-05, + "loss": 0.032, + "mean_token_accuracy": 0.9864734590053559, + "num_tokens": 68096912.0, + "step": 24840 + }, + { + "entropy": 0.0587355166207999, + "epoch": 5.791467537008975, + "grad_norm": 1.4453125, + "learning_rate": 4.903446955179791e-05, + "loss": 0.0359, + "mean_token_accuracy": 0.9889220237731934, + "num_tokens": 68122295.0, + "step": 24845 + }, + { + "entropy": 0.06773356515914201, + "epoch": 5.792633174029607, + "grad_norm": 1.59375, + "learning_rate": 4.903388121709062e-05, + "loss": 0.045, + "mean_token_accuracy": 0.9855106472969055, + "num_tokens": 68150932.0, + "step": 24850 + }, + { + "entropy": 0.1423711057752371, + "epoch": 5.793798811050239, + "grad_norm": 1.328125, + "learning_rate": 4.9033292710395815e-05, + "loss": 0.2357, + "mean_token_accuracy": 0.9351215898990631, + "num_tokens": 68191463.0, + "step": 24855 + }, + { + "entropy": 0.07398759815841913, + "epoch": 5.79496444807087, + "grad_norm": 1.6328125, + "learning_rate": 4.903270403172228e-05, + "loss": 0.0748, + "mean_token_accuracy": 0.9809467673301697, + "num_tokens": 68206785.0, + "step": 24860 + }, + { + "entropy": 0.04514298690482974, + "epoch": 5.7961300850915025, + "grad_norm": 1.3515625, + "learning_rate": 4.9032115181078767e-05, + "loss": 0.0361, + "mean_token_accuracy": 0.9884274780750275, + "num_tokens": 68227908.0, + "step": 24865 + }, + { + "entropy": 0.07980956807732582, + "epoch": 5.797295722112135, + "grad_norm": 3.546875, + "learning_rate": 4.9031526158474075e-05, + "loss": 0.0986, + "mean_token_accuracy": 0.975833123922348, + "num_tokens": 68237536.0, + "step": 24870 + }, + { + "entropy": 0.0604316022247076, + "epoch": 5.798461359132766, + "grad_norm": 3.765625, + "learning_rate": 4.903093696391699e-05, + "loss": 0.0573, + "mean_token_accuracy": 0.9862626194953918, + "num_tokens": 68251155.0, + "step": 24875 + }, + { + "entropy": 0.0642173401080072, + "epoch": 5.799626996153398, + "grad_norm": 0.921875, + "learning_rate": 4.903034759741629e-05, + "loss": 0.0497, + "mean_token_accuracy": 0.9833175539970398, + "num_tokens": 68272527.0, + "step": 24880 + }, + { + "entropy": 0.07729770168662072, + "epoch": 5.80079263317403, + "grad_norm": 4.5625, + "learning_rate": 4.9029758058980755e-05, + "loss": 0.0728, + "mean_token_accuracy": 0.9799280226230621, + "num_tokens": 68287262.0, + "step": 24885 + }, + { + "entropy": 0.09029734618961811, + "epoch": 5.801958270194661, + "grad_norm": 1.3046875, + "learning_rate": 4.90291683486192e-05, + "loss": 0.0831, + "mean_token_accuracy": 0.9771609783172608, + "num_tokens": 68295984.0, + "step": 24890 + }, + { + "entropy": 0.052832887321710584, + "epoch": 5.803123907215293, + "grad_norm": 0.87109375, + "learning_rate": 4.902857846634039e-05, + "loss": 0.0481, + "mean_token_accuracy": 0.9858577847480774, + "num_tokens": 68310873.0, + "step": 24895 + }, + { + "entropy": 0.07836331203579902, + "epoch": 5.804289544235925, + "grad_norm": 1.6953125, + "learning_rate": 4.9027988412153147e-05, + "loss": 0.0851, + "mean_token_accuracy": 0.9786667168140412, + "num_tokens": 68320263.0, + "step": 24900 + }, + { + "entropy": 0.08196291290223598, + "epoch": 5.805455181256557, + "grad_norm": 0.8515625, + "learning_rate": 4.9027398186066256e-05, + "loss": 0.0584, + "mean_token_accuracy": 0.9852247297763824, + "num_tokens": 68350270.0, + "step": 24905 + }, + { + "entropy": 0.13181650806218387, + "epoch": 5.806620818277189, + "grad_norm": 3.265625, + "learning_rate": 4.9026807788088516e-05, + "loss": 0.1793, + "mean_token_accuracy": 0.9595660865306854, + "num_tokens": 68374624.0, + "step": 24910 + }, + { + "entropy": 0.0995724380016327, + "epoch": 5.80778645529782, + "grad_norm": 1.5703125, + "learning_rate": 4.902621721822873e-05, + "loss": 0.0736, + "mean_token_accuracy": 0.9790677905082703, + "num_tokens": 68385069.0, + "step": 24915 + }, + { + "entropy": 0.0691030714660883, + "epoch": 5.808952092318452, + "grad_norm": 0.50390625, + "learning_rate": 4.902562647649571e-05, + "loss": 0.066, + "mean_token_accuracy": 0.9839153230190277, + "num_tokens": 68400909.0, + "step": 24920 + }, + { + "entropy": 0.06300949761644006, + "epoch": 5.810117729339083, + "grad_norm": 1.640625, + "learning_rate": 4.902503556289827e-05, + "loss": 0.0507, + "mean_token_accuracy": 0.9794705331325531, + "num_tokens": 68433503.0, + "step": 24925 + }, + { + "entropy": 0.06272582067176699, + "epoch": 5.811283366359715, + "grad_norm": 2.078125, + "learning_rate": 4.9024444477445216e-05, + "loss": 0.043, + "mean_token_accuracy": 0.9844630897045136, + "num_tokens": 68451031.0, + "step": 24930 + }, + { + "entropy": 0.056623499002307655, + "epoch": 5.8124490033803475, + "grad_norm": 1.515625, + "learning_rate": 4.9023853220145355e-05, + "loss": 0.0447, + "mean_token_accuracy": 0.9855444729328156, + "num_tokens": 68474316.0, + "step": 24935 + }, + { + "entropy": 0.05010606348514557, + "epoch": 5.813614640400979, + "grad_norm": 1.2734375, + "learning_rate": 4.9023261791007514e-05, + "loss": 0.0405, + "mean_token_accuracy": 0.9896503448486328, + "num_tokens": 68495309.0, + "step": 24940 + }, + { + "entropy": 0.07280549835413694, + "epoch": 5.814780277421611, + "grad_norm": 0.91015625, + "learning_rate": 4.902267019004051e-05, + "loss": 0.0646, + "mean_token_accuracy": 0.9830634236335755, + "num_tokens": 68507603.0, + "step": 24945 + }, + { + "entropy": 0.05676093138754368, + "epoch": 5.815945914442243, + "grad_norm": 1.6953125, + "learning_rate": 4.902207841725315e-05, + "loss": 0.057, + "mean_token_accuracy": 0.9855646133422852, + "num_tokens": 68523419.0, + "step": 24950 + }, + { + "entropy": 0.06060009114444256, + "epoch": 5.817111551462874, + "grad_norm": 0.8125, + "learning_rate": 4.9021486472654285e-05, + "loss": 0.0292, + "mean_token_accuracy": 0.9882301330566406, + "num_tokens": 68545958.0, + "step": 24955 + }, + { + "entropy": 0.07530084438621998, + "epoch": 5.818277188483506, + "grad_norm": 0.79296875, + "learning_rate": 4.902089435625272e-05, + "loss": 0.0792, + "mean_token_accuracy": 0.9814654409885406, + "num_tokens": 68557288.0, + "step": 24960 + }, + { + "entropy": 0.05552833992987871, + "epoch": 5.819442825504138, + "grad_norm": 0.46484375, + "learning_rate": 4.9020302068057296e-05, + "loss": 0.0355, + "mean_token_accuracy": 0.9849902153015136, + "num_tokens": 68591863.0, + "step": 24965 + }, + { + "entropy": 0.06605686107650399, + "epoch": 5.82060846252477, + "grad_norm": 0.5859375, + "learning_rate": 4.9019709608076834e-05, + "loss": 0.0331, + "mean_token_accuracy": 0.9879513800144195, + "num_tokens": 68623535.0, + "step": 24970 + }, + { + "entropy": 0.06880107838660479, + "epoch": 5.821774099545402, + "grad_norm": 0.58203125, + "learning_rate": 4.901911697632018e-05, + "loss": 0.08, + "mean_token_accuracy": 0.9790106236934661, + "num_tokens": 68636782.0, + "step": 24975 + }, + { + "entropy": 0.07890882007777691, + "epoch": 5.822939736566033, + "grad_norm": 1.6328125, + "learning_rate": 4.901852417279617e-05, + "loss": 0.063, + "mean_token_accuracy": 0.9829611778259277, + "num_tokens": 68646651.0, + "step": 24980 + }, + { + "entropy": 0.047220236714929344, + "epoch": 5.824105373586665, + "grad_norm": 0.1455078125, + "learning_rate": 4.9017931197513625e-05, + "loss": 0.046, + "mean_token_accuracy": 0.9886944651603699, + "num_tokens": 68672593.0, + "step": 24985 + }, + { + "entropy": 0.05217403545975685, + "epoch": 5.825271010607297, + "grad_norm": 0.51953125, + "learning_rate": 4.9017338050481415e-05, + "loss": 0.0391, + "mean_token_accuracy": 0.9889337956905365, + "num_tokens": 68695953.0, + "step": 24990 + }, + { + "entropy": 0.05944271394982934, + "epoch": 5.826436647627928, + "grad_norm": 0.8359375, + "learning_rate": 4.901674473170837e-05, + "loss": 0.0412, + "mean_token_accuracy": 0.9885189294815063, + "num_tokens": 68725653.0, + "step": 24995 + }, + { + "entropy": 0.06879550032317638, + "epoch": 5.8276022846485604, + "grad_norm": 1.0703125, + "learning_rate": 4.901615124120333e-05, + "loss": 0.0664, + "mean_token_accuracy": 0.9819593906402588, + "num_tokens": 68738463.0, + "step": 25000 + }, + { + "entropy": 0.0660898657515645, + "epoch": 5.8287679216691926, + "grad_norm": 2.234375, + "learning_rate": 4.901555757897517e-05, + "loss": 0.0783, + "mean_token_accuracy": 0.9784412801265716, + "num_tokens": 68758793.0, + "step": 25005 + }, + { + "entropy": 0.059831819776445624, + "epoch": 5.829933558689824, + "grad_norm": 3.34375, + "learning_rate": 4.9014963745032714e-05, + "loss": 0.0446, + "mean_token_accuracy": 0.9873811423778533, + "num_tokens": 68789321.0, + "step": 25010 + }, + { + "entropy": 0.04351752800866961, + "epoch": 5.831099195710456, + "grad_norm": 0.2353515625, + "learning_rate": 4.9014369739384836e-05, + "loss": 0.0233, + "mean_token_accuracy": 0.9915945589542389, + "num_tokens": 68827648.0, + "step": 25015 + }, + { + "entropy": 0.04593317694962025, + "epoch": 5.832264832731088, + "grad_norm": 0.7109375, + "learning_rate": 4.9013775562040384e-05, + "loss": 0.045, + "mean_token_accuracy": 0.9876484513282776, + "num_tokens": 68848301.0, + "step": 25020 + }, + { + "entropy": 0.08194901645183564, + "epoch": 5.833430469751719, + "grad_norm": 4.8125, + "learning_rate": 4.901318121300822e-05, + "loss": 0.0842, + "mean_token_accuracy": 0.9750150680541992, + "num_tokens": 68857732.0, + "step": 25025 + }, + { + "entropy": 0.05717827407643199, + "epoch": 5.834596106772351, + "grad_norm": 0.388671875, + "learning_rate": 4.90125866922972e-05, + "loss": 0.053, + "mean_token_accuracy": 0.9848073601722718, + "num_tokens": 68877921.0, + "step": 25030 + }, + { + "entropy": 0.06308312909677624, + "epoch": 5.8357617437929825, + "grad_norm": 0.333984375, + "learning_rate": 4.901199199991621e-05, + "loss": 0.05, + "mean_token_accuracy": 0.9814503490924835, + "num_tokens": 68894260.0, + "step": 25035 + }, + { + "entropy": 0.0596246593631804, + "epoch": 5.836927380813615, + "grad_norm": 2.375, + "learning_rate": 4.9011397135874095e-05, + "loss": 0.0486, + "mean_token_accuracy": 0.985901540517807, + "num_tokens": 68914302.0, + "step": 25040 + }, + { + "entropy": 0.0679067311808467, + "epoch": 5.838093017834247, + "grad_norm": 1.0703125, + "learning_rate": 4.901080210017974e-05, + "loss": 0.06, + "mean_token_accuracy": 0.9837885200977325, + "num_tokens": 68929950.0, + "step": 25045 + }, + { + "entropy": 0.052906651981174946, + "epoch": 5.839258654854878, + "grad_norm": 0.294921875, + "learning_rate": 4.9010206892842004e-05, + "loss": 0.0358, + "mean_token_accuracy": 0.9889191925525666, + "num_tokens": 68958878.0, + "step": 25050 + }, + { + "entropy": 0.07227160930633544, + "epoch": 5.84042429187551, + "grad_norm": 1.7421875, + "learning_rate": 4.900961151386978e-05, + "loss": 0.0636, + "mean_token_accuracy": 0.9838977217674255, + "num_tokens": 68970844.0, + "step": 25055 + }, + { + "entropy": 0.04979464411735535, + "epoch": 5.841589928896141, + "grad_norm": 0.203125, + "learning_rate": 4.9009015963271935e-05, + "loss": 0.0324, + "mean_token_accuracy": 0.9865056037902832, + "num_tokens": 68996822.0, + "step": 25060 + }, + { + "entropy": 0.0801444560289383, + "epoch": 5.842755565916773, + "grad_norm": 0.953125, + "learning_rate": 4.900842024105735e-05, + "loss": 0.0591, + "mean_token_accuracy": 0.9850879967212677, + "num_tokens": 69008715.0, + "step": 25065 + }, + { + "entropy": 0.06933778412640094, + "epoch": 5.8439212029374055, + "grad_norm": 1.4140625, + "learning_rate": 4.90078243472349e-05, + "loss": 0.0671, + "mean_token_accuracy": 0.9800980925559998, + "num_tokens": 69027853.0, + "step": 25070 + }, + { + "entropy": 0.0758416060358286, + "epoch": 5.845086839958037, + "grad_norm": 0.51953125, + "learning_rate": 4.9007228281813497e-05, + "loss": 0.0634, + "mean_token_accuracy": 0.9792212009429931, + "num_tokens": 69052064.0, + "step": 25075 + }, + { + "entropy": 0.04220882719382644, + "epoch": 5.846252476978669, + "grad_norm": 0.1767578125, + "learning_rate": 4.9006632044802005e-05, + "loss": 0.019, + "mean_token_accuracy": 0.9897603571414948, + "num_tokens": 69093950.0, + "step": 25080 + }, + { + "entropy": 0.0862182735465467, + "epoch": 5.847418113999301, + "grad_norm": 1.0859375, + "learning_rate": 4.900603563620933e-05, + "loss": 0.0696, + "mean_token_accuracy": 0.977711945772171, + "num_tokens": 69111718.0, + "step": 25085 + }, + { + "entropy": 0.05932206539437175, + "epoch": 5.848583751019932, + "grad_norm": 1.8984375, + "learning_rate": 4.9005439056044345e-05, + "loss": 0.0557, + "mean_token_accuracy": 0.984957355260849, + "num_tokens": 69130818.0, + "step": 25090 + }, + { + "entropy": 0.0707325934432447, + "epoch": 5.849749388040564, + "grad_norm": 3.09375, + "learning_rate": 4.900484230431596e-05, + "loss": 0.0727, + "mean_token_accuracy": 0.9831813514232636, + "num_tokens": 69150168.0, + "step": 25095 + }, + { + "entropy": 0.07086216192692518, + "epoch": 5.850915025061196, + "grad_norm": 2.515625, + "learning_rate": 4.900424538103307e-05, + "loss": 0.0651, + "mean_token_accuracy": 0.9826680719852448, + "num_tokens": 69169786.0, + "step": 25100 + }, + { + "entropy": 0.1979743585921824, + "epoch": 5.8520806620818275, + "grad_norm": 0.5546875, + "learning_rate": 4.900364828620459e-05, + "loss": 0.2013, + "mean_token_accuracy": 0.9713845193386078, + "num_tokens": 69203927.0, + "step": 25105 + }, + { + "entropy": 0.08615749217569828, + "epoch": 5.85324629910246, + "grad_norm": 4.3125, + "learning_rate": 4.900305101983941e-05, + "loss": 0.0884, + "mean_token_accuracy": 0.9804681181907654, + "num_tokens": 69212942.0, + "step": 25110 + }, + { + "entropy": 0.06537931114435196, + "epoch": 5.854411936123091, + "grad_norm": 2.359375, + "learning_rate": 4.9002453581946426e-05, + "loss": 0.0269, + "mean_token_accuracy": 0.9828318059444427, + "num_tokens": 69259238.0, + "step": 25115 + }, + { + "entropy": 0.06119229989126325, + "epoch": 5.855577573143723, + "grad_norm": 0.31640625, + "learning_rate": 4.9001855972534566e-05, + "loss": 0.0665, + "mean_token_accuracy": 0.982484656572342, + "num_tokens": 69278421.0, + "step": 25120 + }, + { + "entropy": 0.08115847948938608, + "epoch": 5.856743210164355, + "grad_norm": 0.51953125, + "learning_rate": 4.900125819161273e-05, + "loss": 0.0287, + "mean_token_accuracy": 0.9896356463432312, + "num_tokens": 69306027.0, + "step": 25125 + }, + { + "entropy": 0.05680079516023397, + "epoch": 5.857908847184986, + "grad_norm": 3.90625, + "learning_rate": 4.900066023918984e-05, + "loss": 0.0574, + "mean_token_accuracy": 0.9836540341377258, + "num_tokens": 69333581.0, + "step": 25130 + }, + { + "entropy": 0.06749532804824412, + "epoch": 5.859074484205618, + "grad_norm": 2.203125, + "learning_rate": 4.900006211527481e-05, + "loss": 0.0542, + "mean_token_accuracy": 0.9833632111549377, + "num_tokens": 69353480.0, + "step": 25135 + }, + { + "entropy": 0.06454726718366147, + "epoch": 5.8602401212262505, + "grad_norm": 0.453125, + "learning_rate": 4.899946381987655e-05, + "loss": 0.0358, + "mean_token_accuracy": 0.9881761074066162, + "num_tokens": 69387451.0, + "step": 25140 + }, + { + "entropy": 0.0817156407982111, + "epoch": 5.861405758246882, + "grad_norm": 1.96875, + "learning_rate": 4.899886535300399e-05, + "loss": 0.0876, + "mean_token_accuracy": 0.9789597153663635, + "num_tokens": 69406319.0, + "step": 25145 + }, + { + "entropy": 0.059768567234277724, + "epoch": 5.862571395267514, + "grad_norm": 0.859375, + "learning_rate": 4.899826671466606e-05, + "loss": 0.0551, + "mean_token_accuracy": 0.9846877813339233, + "num_tokens": 69422523.0, + "step": 25150 + }, + { + "entropy": 0.057445686869323255, + "epoch": 5.863737032288146, + "grad_norm": 0.265625, + "learning_rate": 4.899766790487167e-05, + "loss": 0.0507, + "mean_token_accuracy": 0.9862017810344696, + "num_tokens": 69456953.0, + "step": 25155 + }, + { + "entropy": 0.07467758394777775, + "epoch": 5.864902669308777, + "grad_norm": 0.4296875, + "learning_rate": 4.899706892362976e-05, + "loss": 0.0775, + "mean_token_accuracy": 0.9791662037372589, + "num_tokens": 69468183.0, + "step": 25160 + }, + { + "entropy": 0.07420080313459039, + "epoch": 5.866068306329409, + "grad_norm": 1.9453125, + "learning_rate": 4.899646977094926e-05, + "loss": 0.0563, + "mean_token_accuracy": 0.9782842993736267, + "num_tokens": 69483658.0, + "step": 25165 + }, + { + "entropy": 0.0779596921056509, + "epoch": 5.8672339433500404, + "grad_norm": 1.359375, + "learning_rate": 4.8995870446839107e-05, + "loss": 0.0725, + "mean_token_accuracy": 0.9799931585788727, + "num_tokens": 69492891.0, + "step": 25170 + }, + { + "entropy": 0.058550332672894, + "epoch": 5.8683995803706726, + "grad_norm": 0.50390625, + "learning_rate": 4.899527095130823e-05, + "loss": 0.0458, + "mean_token_accuracy": 0.9878552556037903, + "num_tokens": 69511105.0, + "step": 25175 + }, + { + "entropy": 0.11132394783198833, + "epoch": 5.869565217391305, + "grad_norm": 4.1875, + "learning_rate": 4.899467128436558e-05, + "loss": 0.0796, + "mean_token_accuracy": 0.9794100940227508, + "num_tokens": 69525774.0, + "step": 25180 + }, + { + "entropy": 0.057683018036186695, + "epoch": 5.870730854411936, + "grad_norm": 1.71875, + "learning_rate": 4.8994071446020086e-05, + "loss": 0.0572, + "mean_token_accuracy": 0.9849148869514466, + "num_tokens": 69539887.0, + "step": 25185 + }, + { + "entropy": 0.04408614346757531, + "epoch": 5.871896491432568, + "grad_norm": 0.578125, + "learning_rate": 4.8993471436280696e-05, + "loss": 0.0371, + "mean_token_accuracy": 0.9894815742969513, + "num_tokens": 69574395.0, + "step": 25190 + }, + { + "entropy": 0.08741942159831524, + "epoch": 5.873062128453199, + "grad_norm": 1.7109375, + "learning_rate": 4.899287125515637e-05, + "loss": 0.0734, + "mean_token_accuracy": 0.980907928943634, + "num_tokens": 69586689.0, + "step": 25195 + }, + { + "entropy": 0.05762271042913199, + "epoch": 5.874227765473831, + "grad_norm": 0.546875, + "learning_rate": 4.899227090265604e-05, + "loss": 0.0275, + "mean_token_accuracy": 0.9872725248336792, + "num_tokens": 69611654.0, + "step": 25200 + }, + { + "entropy": 0.08529213555157185, + "epoch": 5.875393402494463, + "grad_norm": 2.90625, + "learning_rate": 4.899167037878867e-05, + "loss": 0.0796, + "mean_token_accuracy": 0.978586596250534, + "num_tokens": 69620488.0, + "step": 25205 + }, + { + "entropy": 0.07133572660386563, + "epoch": 5.876559039515095, + "grad_norm": 1.4296875, + "learning_rate": 4.89910696835632e-05, + "loss": 0.0635, + "mean_token_accuracy": 0.9817762792110443, + "num_tokens": 69640920.0, + "step": 25210 + }, + { + "entropy": 0.05597841432318092, + "epoch": 5.877724676535727, + "grad_norm": 0.52734375, + "learning_rate": 4.899046881698861e-05, + "loss": 0.0475, + "mean_token_accuracy": 0.9782732903957367, + "num_tokens": 69663848.0, + "step": 25215 + }, + { + "entropy": 0.06624952824786305, + "epoch": 5.878890313556359, + "grad_norm": 2.65625, + "learning_rate": 4.898986777907384e-05, + "loss": 0.0535, + "mean_token_accuracy": 0.9852926313877106, + "num_tokens": 69692834.0, + "step": 25220 + }, + { + "entropy": 0.06026315214112401, + "epoch": 5.88005595057699, + "grad_norm": 0.6171875, + "learning_rate": 4.8989266569827865e-05, + "loss": 0.0363, + "mean_token_accuracy": 0.9877964973449707, + "num_tokens": 69715431.0, + "step": 25225 + }, + { + "entropy": 0.05216656900011003, + "epoch": 5.881221587597622, + "grad_norm": 0.5625, + "learning_rate": 4.898866518925964e-05, + "loss": 0.04, + "mean_token_accuracy": 0.9874097645282746, + "num_tokens": 69741887.0, + "step": 25230 + }, + { + "entropy": 0.08897986803203821, + "epoch": 5.882387224618254, + "grad_norm": 1.359375, + "learning_rate": 4.8988063637378135e-05, + "loss": 0.0874, + "mean_token_accuracy": 0.9695273995399475, + "num_tokens": 69751481.0, + "step": 25235 + }, + { + "entropy": 0.049010620545595886, + "epoch": 5.8835528616388855, + "grad_norm": 0.185546875, + "learning_rate": 4.898746191419233e-05, + "loss": 0.0305, + "mean_token_accuracy": 0.9865678668022155, + "num_tokens": 69786531.0, + "step": 25240 + }, + { + "entropy": 0.057374946866184474, + "epoch": 5.884718498659518, + "grad_norm": 1.5390625, + "learning_rate": 4.898686001971118e-05, + "loss": 0.0372, + "mean_token_accuracy": 0.9866810023784638, + "num_tokens": 69812725.0, + "step": 25245 + }, + { + "entropy": 0.06412132494151593, + "epoch": 5.885884135680149, + "grad_norm": 1.4140625, + "learning_rate": 4.8986257953943675e-05, + "loss": 0.0747, + "mean_token_accuracy": 0.9813116431236267, + "num_tokens": 69830628.0, + "step": 25250 + }, + { + "entropy": 0.06648487970232964, + "epoch": 5.887049772700781, + "grad_norm": 0.8828125, + "learning_rate": 4.8985655716898794e-05, + "loss": 0.0621, + "mean_token_accuracy": 0.984495198726654, + "num_tokens": 69843119.0, + "step": 25255 + }, + { + "entropy": 0.05889953942969441, + "epoch": 5.888215409721413, + "grad_norm": 1.1640625, + "learning_rate": 4.89850533085855e-05, + "loss": 0.0558, + "mean_token_accuracy": 0.9845393478870392, + "num_tokens": 69859043.0, + "step": 25260 + }, + { + "entropy": 0.06169169787317515, + "epoch": 5.889381046742044, + "grad_norm": 1.15625, + "learning_rate": 4.898445072901279e-05, + "loss": 0.0377, + "mean_token_accuracy": 0.9878328561782836, + "num_tokens": 69884035.0, + "step": 25265 + }, + { + "entropy": 0.06493881866335868, + "epoch": 5.890546683762676, + "grad_norm": 0.66796875, + "learning_rate": 4.898384797818965e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.9817538022994995, + "num_tokens": 69903093.0, + "step": 25270 + }, + { + "entropy": 0.07026399262249469, + "epoch": 5.891712320783308, + "grad_norm": 0.6953125, + "learning_rate": 4.898324505612506e-05, + "loss": 0.0477, + "mean_token_accuracy": 0.9830464065074921, + "num_tokens": 69924802.0, + "step": 25275 + }, + { + "entropy": 0.07336622122675181, + "epoch": 5.89287795780394, + "grad_norm": 1.25, + "learning_rate": 4.8982641962828016e-05, + "loss": 0.0701, + "mean_token_accuracy": 0.9820722460746765, + "num_tokens": 69940112.0, + "step": 25280 + }, + { + "entropy": 0.07239239010959864, + "epoch": 5.894043594824572, + "grad_norm": 0.4296875, + "learning_rate": 4.898203869830751e-05, + "loss": 0.0312, + "mean_token_accuracy": 0.986901706457138, + "num_tokens": 69976722.0, + "step": 25285 + }, + { + "entropy": 0.07194672748446465, + "epoch": 5.895209231845204, + "grad_norm": 3.609375, + "learning_rate": 4.898143526257254e-05, + "loss": 0.0517, + "mean_token_accuracy": 0.985188215970993, + "num_tokens": 69992244.0, + "step": 25290 + }, + { + "entropy": 0.0679439775645733, + "epoch": 5.896374868865835, + "grad_norm": 2.234375, + "learning_rate": 4.89808316556321e-05, + "loss": 0.0652, + "mean_token_accuracy": 0.983660078048706, + "num_tokens": 70002583.0, + "step": 25295 + }, + { + "entropy": 0.045691716391593216, + "epoch": 5.897540505886467, + "grad_norm": 1.328125, + "learning_rate": 4.898022787749518e-05, + "loss": 0.0417, + "mean_token_accuracy": 0.9899048924446106, + "num_tokens": 70036696.0, + "step": 25300 + }, + { + "entropy": 0.0655904158949852, + "epoch": 5.898706142907098, + "grad_norm": 3.34375, + "learning_rate": 4.8979623928170807e-05, + "loss": 0.0667, + "mean_token_accuracy": 0.9835781037807465, + "num_tokens": 70057959.0, + "step": 25305 + }, + { + "entropy": 0.0502521482296288, + "epoch": 5.8998717799277305, + "grad_norm": 0.427734375, + "learning_rate": 4.897901980766798e-05, + "loss": 0.0388, + "mean_token_accuracy": 0.98756023645401, + "num_tokens": 70097961.0, + "step": 25310 + }, + { + "entropy": 0.06395963728427886, + "epoch": 5.901037416948363, + "grad_norm": 2.078125, + "learning_rate": 4.897841551599569e-05, + "loss": 0.069, + "mean_token_accuracy": 0.9781058013439179, + "num_tokens": 70118450.0, + "step": 25315 + }, + { + "entropy": 0.07023835629224777, + "epoch": 5.902203053968994, + "grad_norm": 0.83984375, + "learning_rate": 4.8977811053162966e-05, + "loss": 0.0714, + "mean_token_accuracy": 0.9810861945152283, + "num_tokens": 70137787.0, + "step": 25320 + }, + { + "entropy": 0.06961941700428724, + "epoch": 5.903368690989626, + "grad_norm": 1.9296875, + "learning_rate": 4.897720641917881e-05, + "loss": 0.0599, + "mean_token_accuracy": 0.981745857000351, + "num_tokens": 70165019.0, + "step": 25325 + }, + { + "entropy": 0.05609036097303033, + "epoch": 5.904534328010257, + "grad_norm": 0.310546875, + "learning_rate": 4.897660161405225e-05, + "loss": 0.0447, + "mean_token_accuracy": 0.9861968576908111, + "num_tokens": 70193670.0, + "step": 25330 + }, + { + "entropy": 0.07962943669408559, + "epoch": 5.905699965030889, + "grad_norm": 3.015625, + "learning_rate": 4.89759966377923e-05, + "loss": 0.075, + "mean_token_accuracy": 0.978878664970398, + "num_tokens": 70207721.0, + "step": 25335 + }, + { + "entropy": 0.06705031860619784, + "epoch": 5.906865602051521, + "grad_norm": 1.5, + "learning_rate": 4.8975391490407974e-05, + "loss": 0.0494, + "mean_token_accuracy": 0.9814827382564545, + "num_tokens": 70227807.0, + "step": 25340 + }, + { + "entropy": 0.06056810254231095, + "epoch": 5.9080312390721526, + "grad_norm": 0.34765625, + "learning_rate": 4.89747861719083e-05, + "loss": 0.0579, + "mean_token_accuracy": 0.9835603713989258, + "num_tokens": 70252966.0, + "step": 25345 + }, + { + "entropy": 0.07554044676944613, + "epoch": 5.909196876092785, + "grad_norm": 1.140625, + "learning_rate": 4.8974180682302306e-05, + "loss": 0.0442, + "mean_token_accuracy": 0.9850922226905823, + "num_tokens": 70274362.0, + "step": 25350 + }, + { + "entropy": 0.0693486931733787, + "epoch": 5.910362513113417, + "grad_norm": 2.265625, + "learning_rate": 4.897357502159902e-05, + "loss": 0.0732, + "mean_token_accuracy": 0.9794533431529999, + "num_tokens": 70309391.0, + "step": 25355 + }, + { + "entropy": 0.069387202616781, + "epoch": 5.911528150134048, + "grad_norm": 0.5, + "learning_rate": 4.8972969189807475e-05, + "loss": 0.0533, + "mean_token_accuracy": 0.9858072459697723, + "num_tokens": 70326010.0, + "step": 25360 + }, + { + "entropy": 0.03837292743846774, + "epoch": 5.91269378715468, + "grad_norm": 0.490234375, + "learning_rate": 4.8972363186936706e-05, + "loss": 0.0166, + "mean_token_accuracy": 0.9922250151634217, + "num_tokens": 70359986.0, + "step": 25365 + }, + { + "entropy": 0.08435314483940601, + "epoch": 5.913859424175312, + "grad_norm": 1.65625, + "learning_rate": 4.897175701299574e-05, + "loss": 0.0892, + "mean_token_accuracy": 0.977598226070404, + "num_tokens": 70380482.0, + "step": 25370 + }, + { + "entropy": 0.06352117350324989, + "epoch": 5.915025061195943, + "grad_norm": 3.328125, + "learning_rate": 4.897115066799363e-05, + "loss": 0.0618, + "mean_token_accuracy": 0.9802210390567779, + "num_tokens": 70406011.0, + "step": 25375 + }, + { + "entropy": 0.08407215159386397, + "epoch": 5.9161906982165755, + "grad_norm": 1.0234375, + "learning_rate": 4.89705441519394e-05, + "loss": 0.0715, + "mean_token_accuracy": 0.9791640937328339, + "num_tokens": 70416918.0, + "step": 25380 + }, + { + "entropy": 0.07379502542316914, + "epoch": 5.917356335237207, + "grad_norm": 4.375, + "learning_rate": 4.8969937464842115e-05, + "loss": 0.0648, + "mean_token_accuracy": 0.9773381769657135, + "num_tokens": 70435729.0, + "step": 25385 + }, + { + "entropy": 0.05902850423008203, + "epoch": 5.918521972257839, + "grad_norm": 0.7890625, + "learning_rate": 4.89693306067108e-05, + "loss": 0.0354, + "mean_token_accuracy": 0.9863932430744171, + "num_tokens": 70462047.0, + "step": 25390 + }, + { + "entropy": 0.06693649515509606, + "epoch": 5.919687609278471, + "grad_norm": 1.4453125, + "learning_rate": 4.896872357755452e-05, + "loss": 0.0455, + "mean_token_accuracy": 0.9824758410453797, + "num_tokens": 70481681.0, + "step": 25395 + }, + { + "entropy": 0.04593549780547619, + "epoch": 5.920853246299102, + "grad_norm": 0.427734375, + "learning_rate": 4.896811637738232e-05, + "loss": 0.027, + "mean_token_accuracy": 0.9869483411312103, + "num_tokens": 70504472.0, + "step": 25400 + }, + { + "entropy": 0.07661759108304977, + "epoch": 5.922018883319734, + "grad_norm": 1.203125, + "learning_rate": 4.896750900620326e-05, + "loss": 0.0319, + "mean_token_accuracy": 0.9825215280056, + "num_tokens": 70530182.0, + "step": 25405 + }, + { + "entropy": 0.09024886526167393, + "epoch": 5.923184520340366, + "grad_norm": 1.5625, + "learning_rate": 4.89669014640264e-05, + "loss": 0.0898, + "mean_token_accuracy": 0.9773705959320068, + "num_tokens": 70541455.0, + "step": 25410 + }, + { + "entropy": 0.08583233803510666, + "epoch": 5.924350157360998, + "grad_norm": 3.1875, + "learning_rate": 4.896629375086077e-05, + "loss": 0.0797, + "mean_token_accuracy": 0.9771179258823395, + "num_tokens": 70550552.0, + "step": 25415 + }, + { + "entropy": 0.06757635474205018, + "epoch": 5.92551579438163, + "grad_norm": 1.7265625, + "learning_rate": 4.896568586671547e-05, + "loss": 0.0633, + "mean_token_accuracy": 0.9803970336914063, + "num_tokens": 70573952.0, + "step": 25420 + }, + { + "entropy": 0.059069217182695866, + "epoch": 5.926681431402262, + "grad_norm": 0.94921875, + "learning_rate": 4.896507781159954e-05, + "loss": 0.0554, + "mean_token_accuracy": 0.9838501870632171, + "num_tokens": 70591204.0, + "step": 25425 + }, + { + "entropy": 0.07336567882448435, + "epoch": 5.927847068422893, + "grad_norm": 1.4765625, + "learning_rate": 4.8964469585522066e-05, + "loss": 0.0657, + "mean_token_accuracy": 0.9818897664546966, + "num_tokens": 70603793.0, + "step": 25430 + }, + { + "entropy": 0.08957900255918502, + "epoch": 5.929012705443525, + "grad_norm": 2.46875, + "learning_rate": 4.89638611884921e-05, + "loss": 0.0868, + "mean_token_accuracy": 0.9778998076915741, + "num_tokens": 70612772.0, + "step": 25435 + }, + { + "entropy": 0.05743040251545608, + "epoch": 5.930178342464156, + "grad_norm": 0.78125, + "learning_rate": 4.896325262051872e-05, + "loss": 0.0192, + "mean_token_accuracy": 0.9897181570529938, + "num_tokens": 70655512.0, + "step": 25440 + }, + { + "entropy": 0.06604121178388596, + "epoch": 5.931343979484788, + "grad_norm": 1.40625, + "learning_rate": 4.8962643881611e-05, + "loss": 0.0513, + "mean_token_accuracy": 0.9855192124843597, + "num_tokens": 70673994.0, + "step": 25445 + }, + { + "entropy": 0.08524401690810919, + "epoch": 5.9325096165054205, + "grad_norm": 4.5, + "learning_rate": 4.8962034971778027e-05, + "loss": 0.0937, + "mean_token_accuracy": 0.9765569984912872, + "num_tokens": 70699151.0, + "step": 25450 + }, + { + "entropy": 0.0642532754689455, + "epoch": 5.933675253526052, + "grad_norm": 1.8125, + "learning_rate": 4.8961425891028865e-05, + "loss": 0.0416, + "mean_token_accuracy": 0.9848453521728515, + "num_tokens": 70728641.0, + "step": 25455 + }, + { + "entropy": 0.08150929920375347, + "epoch": 5.934840890546684, + "grad_norm": 1.0859375, + "learning_rate": 4.8960816639372606e-05, + "loss": 0.0724, + "mean_token_accuracy": 0.9757475137710572, + "num_tokens": 70755529.0, + "step": 25460 + }, + { + "entropy": 0.06778738740831614, + "epoch": 5.936006527567315, + "grad_norm": 1.328125, + "learning_rate": 4.8960207216818335e-05, + "loss": 0.0593, + "mean_token_accuracy": 0.9825225472450256, + "num_tokens": 70767883.0, + "step": 25465 + }, + { + "entropy": 0.05971136726438999, + "epoch": 5.937172164587947, + "grad_norm": 0.71875, + "learning_rate": 4.8959597623375134e-05, + "loss": 0.0462, + "mean_token_accuracy": 0.9880515694618225, + "num_tokens": 70810408.0, + "step": 25470 + }, + { + "entropy": 0.06987338345497847, + "epoch": 5.938337801608579, + "grad_norm": 1.078125, + "learning_rate": 4.8958987859052095e-05, + "loss": 0.0533, + "mean_token_accuracy": 0.9834602892398834, + "num_tokens": 70826934.0, + "step": 25475 + }, + { + "entropy": 0.08081718422472477, + "epoch": 5.9395034386292105, + "grad_norm": 2.078125, + "learning_rate": 4.895837792385832e-05, + "loss": 0.0893, + "mean_token_accuracy": 0.9789685130119323, + "num_tokens": 70835743.0, + "step": 25480 + }, + { + "entropy": 0.05064443135634065, + "epoch": 5.940669075649843, + "grad_norm": 0.8828125, + "learning_rate": 4.895776781780289e-05, + "loss": 0.0274, + "mean_token_accuracy": 0.9879032492637634, + "num_tokens": 70864498.0, + "step": 25485 + }, + { + "entropy": 0.0775157648138702, + "epoch": 5.941834712670475, + "grad_norm": 0.353515625, + "learning_rate": 4.895715754089491e-05, + "loss": 0.0509, + "mean_token_accuracy": 0.9845824480056763, + "num_tokens": 70889378.0, + "step": 25490 + }, + { + "entropy": 0.0937342531979084, + "epoch": 5.943000349691106, + "grad_norm": 2.03125, + "learning_rate": 4.8956547093143476e-05, + "loss": 0.087, + "mean_token_accuracy": 0.9744981229305267, + "num_tokens": 70899630.0, + "step": 25495 + }, + { + "entropy": 0.05566701851785183, + "epoch": 5.944165986711738, + "grad_norm": 2.671875, + "learning_rate": 4.89559364745577e-05, + "loss": 0.0472, + "mean_token_accuracy": 0.9846188008785248, + "num_tokens": 70923647.0, + "step": 25500 + }, + { + "entropy": 0.06326587796211243, + "epoch": 5.94533162373237, + "grad_norm": 0.77734375, + "learning_rate": 4.8955325685146666e-05, + "loss": 0.0496, + "mean_token_accuracy": 0.9842358469963074, + "num_tokens": 70948453.0, + "step": 25505 + }, + { + "entropy": 0.060926478728652, + "epoch": 5.946497260753001, + "grad_norm": 3.96875, + "learning_rate": 4.8954714724919506e-05, + "loss": 0.055, + "mean_token_accuracy": 0.984597235918045, + "num_tokens": 70972669.0, + "step": 25510 + }, + { + "entropy": 0.06100227106362581, + "epoch": 5.947662897773633, + "grad_norm": 1.3203125, + "learning_rate": 4.895410359388533e-05, + "loss": 0.0426, + "mean_token_accuracy": 0.9850693285465241, + "num_tokens": 71001021.0, + "step": 25515 + }, + { + "entropy": 0.06199376685544848, + "epoch": 5.948828534794265, + "grad_norm": 1.7734375, + "learning_rate": 4.895349229205323e-05, + "loss": 0.0545, + "mean_token_accuracy": 0.984418374300003, + "num_tokens": 71021573.0, + "step": 25520 + }, + { + "entropy": 0.07707611806690692, + "epoch": 5.949994171814897, + "grad_norm": 2.53125, + "learning_rate": 4.8952880819432333e-05, + "loss": 0.0506, + "mean_token_accuracy": 0.9833807289600373, + "num_tokens": 71038416.0, + "step": 25525 + }, + { + "entropy": 0.08036872111260891, + "epoch": 5.951159808835529, + "grad_norm": 2.4375, + "learning_rate": 4.895226917603175e-05, + "loss": 0.0776, + "mean_token_accuracy": 0.9776204705238343, + "num_tokens": 71053137.0, + "step": 25530 + }, + { + "entropy": 0.07707045953720808, + "epoch": 5.95232544585616, + "grad_norm": 1.03125, + "learning_rate": 4.8951657361860623e-05, + "loss": 0.0602, + "mean_token_accuracy": 0.9830064177513123, + "num_tokens": 71065875.0, + "step": 25535 + }, + { + "entropy": 0.06487810891121626, + "epoch": 5.953491082876792, + "grad_norm": 0.75390625, + "learning_rate": 4.895104537692806e-05, + "loss": 0.0282, + "mean_token_accuracy": 0.9820800602436066, + "num_tokens": 71085386.0, + "step": 25540 + }, + { + "entropy": 0.1173665277659893, + "epoch": 5.954656719897424, + "grad_norm": 2.5625, + "learning_rate": 4.895043322124318e-05, + "loss": 0.1445, + "mean_token_accuracy": 0.9714154601097107, + "num_tokens": 71104444.0, + "step": 25545 + }, + { + "entropy": 0.0770050410181284, + "epoch": 5.9558223569180555, + "grad_norm": 0.2021484375, + "learning_rate": 4.894982089481511e-05, + "loss": 0.0646, + "mean_token_accuracy": 0.9791462659835816, + "num_tokens": 71122244.0, + "step": 25550 + }, + { + "entropy": 0.0616754699498415, + "epoch": 5.956987993938688, + "grad_norm": 1.09375, + "learning_rate": 4.8949208397653004e-05, + "loss": 0.0594, + "mean_token_accuracy": 0.9835680782794952, + "num_tokens": 71147606.0, + "step": 25555 + }, + { + "entropy": 0.061316430754959586, + "epoch": 5.95815363095932, + "grad_norm": 0.79296875, + "learning_rate": 4.894859572976598e-05, + "loss": 0.0545, + "mean_token_accuracy": 0.9843459963798523, + "num_tokens": 71162452.0, + "step": 25560 + }, + { + "entropy": 0.05889552496373653, + "epoch": 5.959319267979951, + "grad_norm": 1.140625, + "learning_rate": 4.8947982891163164e-05, + "loss": 0.045, + "mean_token_accuracy": 0.982684051990509, + "num_tokens": 71177001.0, + "step": 25565 + }, + { + "entropy": 0.09865431338548661, + "epoch": 5.960484905000583, + "grad_norm": 0.88671875, + "learning_rate": 4.894736988185371e-05, + "loss": 0.0539, + "mean_token_accuracy": 0.9783219814300537, + "num_tokens": 71206647.0, + "step": 25570 + }, + { + "entropy": 0.046166717913001774, + "epoch": 5.961650542021214, + "grad_norm": 0.419921875, + "learning_rate": 4.894675670184675e-05, + "loss": 0.0324, + "mean_token_accuracy": 0.9870345771312714, + "num_tokens": 71233006.0, + "step": 25575 + }, + { + "entropy": 0.06371776573359966, + "epoch": 5.962816179041846, + "grad_norm": 3.078125, + "learning_rate": 4.894614335115143e-05, + "loss": 0.0517, + "mean_token_accuracy": 0.9826191842556, + "num_tokens": 71250289.0, + "step": 25580 + }, + { + "entropy": 0.062395652197301386, + "epoch": 5.9639818160624785, + "grad_norm": 0.60546875, + "learning_rate": 4.89455298297769e-05, + "loss": 0.0548, + "mean_token_accuracy": 0.9834360301494598, + "num_tokens": 71263976.0, + "step": 25585 + }, + { + "entropy": 0.06196978203952312, + "epoch": 5.96514745308311, + "grad_norm": 2.4375, + "learning_rate": 4.894491613773231e-05, + "loss": 0.0487, + "mean_token_accuracy": 0.984034126996994, + "num_tokens": 71290800.0, + "step": 25590 + }, + { + "entropy": 0.07303077168762684, + "epoch": 5.966313090103742, + "grad_norm": 2.296875, + "learning_rate": 4.89443022750268e-05, + "loss": 0.0615, + "mean_token_accuracy": 0.9787440538406372, + "num_tokens": 71303915.0, + "step": 25595 + }, + { + "entropy": 0.07619455568492413, + "epoch": 5.967478727124373, + "grad_norm": 0.69921875, + "learning_rate": 4.8943688241669536e-05, + "loss": 0.0525, + "mean_token_accuracy": 0.9836428880691528, + "num_tokens": 71318505.0, + "step": 25600 + }, + { + "entropy": 0.06338299652561545, + "epoch": 5.968644364145005, + "grad_norm": 0.6015625, + "learning_rate": 4.8943074037669654e-05, + "loss": 0.0555, + "mean_token_accuracy": 0.9859287023544312, + "num_tokens": 71341853.0, + "step": 25605 + }, + { + "entropy": 0.0515817703679204, + "epoch": 5.969810001165637, + "grad_norm": 0.41015625, + "learning_rate": 4.8942459663036346e-05, + "loss": 0.0267, + "mean_token_accuracy": 0.9896455585956574, + "num_tokens": 71377062.0, + "step": 25610 + }, + { + "entropy": 0.06094341482967138, + "epoch": 5.970975638186268, + "grad_norm": 0.35546875, + "learning_rate": 4.894184511777874e-05, + "loss": 0.0573, + "mean_token_accuracy": 0.9822888076305389, + "num_tokens": 71390415.0, + "step": 25615 + }, + { + "entropy": 0.055656261183321475, + "epoch": 5.9721412752069005, + "grad_norm": 0.98046875, + "learning_rate": 4.894123040190602e-05, + "loss": 0.0389, + "mean_token_accuracy": 0.987447464466095, + "num_tokens": 71427479.0, + "step": 25620 + }, + { + "entropy": 0.05644221818074584, + "epoch": 5.973306912227533, + "grad_norm": 0.478515625, + "learning_rate": 4.894061551542734e-05, + "loss": 0.0153, + "mean_token_accuracy": 0.986136132478714, + "num_tokens": 71470655.0, + "step": 25625 + }, + { + "entropy": 0.045355524122715, + "epoch": 5.974472549248164, + "grad_norm": 1.46875, + "learning_rate": 4.894000045835188e-05, + "loss": 0.0374, + "mean_token_accuracy": 0.9896352529525757, + "num_tokens": 71499833.0, + "step": 25630 + }, + { + "entropy": 0.06681758724153042, + "epoch": 5.975638186268796, + "grad_norm": 0.17578125, + "learning_rate": 4.8939385230688807e-05, + "loss": 0.0523, + "mean_token_accuracy": 0.983164769411087, + "num_tokens": 71524600.0, + "step": 25635 + }, + { + "entropy": 0.0596080549992621, + "epoch": 5.976803823289428, + "grad_norm": 2.296875, + "learning_rate": 4.893876983244729e-05, + "loss": 0.0522, + "mean_token_accuracy": 0.9858713269233703, + "num_tokens": 71546467.0, + "step": 25640 + }, + { + "entropy": 0.08861620575189591, + "epoch": 5.977969460310059, + "grad_norm": 8.3125, + "learning_rate": 4.893815426363652e-05, + "loss": 0.0821, + "mean_token_accuracy": 0.9688912212848664, + "num_tokens": 71579206.0, + "step": 25645 + }, + { + "entropy": 0.08461652826517821, + "epoch": 5.979135097330691, + "grad_norm": 0.92578125, + "learning_rate": 4.893753852426565e-05, + "loss": 0.0789, + "mean_token_accuracy": 0.9783106327056885, + "num_tokens": 71589261.0, + "step": 25650 + }, + { + "entropy": 0.060508431307971476, + "epoch": 5.980300734351323, + "grad_norm": 0.7265625, + "learning_rate": 4.893692261434389e-05, + "loss": 0.057, + "mean_token_accuracy": 0.9857955038547516, + "num_tokens": 71603908.0, + "step": 25655 + }, + { + "entropy": 0.050046111829578875, + "epoch": 5.981466371371955, + "grad_norm": 0.83203125, + "learning_rate": 4.8936306533880405e-05, + "loss": 0.0253, + "mean_token_accuracy": 0.986841905117035, + "num_tokens": 71626686.0, + "step": 25660 + }, + { + "entropy": 0.07740709893405437, + "epoch": 5.982632008392587, + "grad_norm": 0.96484375, + "learning_rate": 4.893569028288439e-05, + "loss": 0.0644, + "mean_token_accuracy": 0.984851849079132, + "num_tokens": 71638219.0, + "step": 25665 + }, + { + "entropy": 0.07101895548403263, + "epoch": 5.983797645413218, + "grad_norm": 2.015625, + "learning_rate": 4.8935073861365034e-05, + "loss": 0.0591, + "mean_token_accuracy": 0.97741579413414, + "num_tokens": 71651552.0, + "step": 25670 + }, + { + "entropy": 0.06982565447688102, + "epoch": 5.98496328243385, + "grad_norm": 2.078125, + "learning_rate": 4.8934457269331527e-05, + "loss": 0.0715, + "mean_token_accuracy": 0.9800402879714966, + "num_tokens": 71662332.0, + "step": 25675 + }, + { + "entropy": 0.088308035582304, + "epoch": 5.986128919454482, + "grad_norm": 0.90625, + "learning_rate": 4.8933840506793065e-05, + "loss": 0.0857, + "mean_token_accuracy": 0.9786054491996765, + "num_tokens": 71671220.0, + "step": 25680 + }, + { + "entropy": 0.08883246891200543, + "epoch": 5.987294556475113, + "grad_norm": 0.3515625, + "learning_rate": 4.8933223573758845e-05, + "loss": 0.0496, + "mean_token_accuracy": 0.9835711300373078, + "num_tokens": 71696436.0, + "step": 25685 + }, + { + "entropy": 0.07261432018131017, + "epoch": 5.9884601934957455, + "grad_norm": 2.625, + "learning_rate": 4.893260647023806e-05, + "loss": 0.0756, + "mean_token_accuracy": 0.9792898654937744, + "num_tokens": 71711295.0, + "step": 25690 + }, + { + "entropy": 0.08523041075095535, + "epoch": 5.989625830516378, + "grad_norm": 1.7890625, + "learning_rate": 4.893198919623992e-05, + "loss": 0.0682, + "mean_token_accuracy": 0.976850026845932, + "num_tokens": 71727227.0, + "step": 25695 + }, + { + "entropy": 0.06633602287620306, + "epoch": 5.990791467537009, + "grad_norm": 1.9375, + "learning_rate": 4.893137175177363e-05, + "loss": 0.0585, + "mean_token_accuracy": 0.9812263011932373, + "num_tokens": 71743408.0, + "step": 25700 + }, + { + "entropy": 0.05985606387257576, + "epoch": 5.991957104557641, + "grad_norm": 3.203125, + "learning_rate": 4.893075413684839e-05, + "loss": 0.0669, + "mean_token_accuracy": 0.9821926891803742, + "num_tokens": 71766920.0, + "step": 25705 + }, + { + "entropy": 0.06824294216930867, + "epoch": 5.993122741578272, + "grad_norm": 3.171875, + "learning_rate": 4.893013635147341e-05, + "loss": 0.0909, + "mean_token_accuracy": 0.9769490718841553, + "num_tokens": 71777572.0, + "step": 25710 + }, + { + "entropy": 0.08433944657444954, + "epoch": 5.994288378598904, + "grad_norm": 1.203125, + "learning_rate": 4.89295183956579e-05, + "loss": 0.072, + "mean_token_accuracy": 0.9772077202796936, + "num_tokens": 71788992.0, + "step": 25715 + }, + { + "entropy": 0.06912859678268432, + "epoch": 5.995454015619536, + "grad_norm": 3.078125, + "learning_rate": 4.892890026941109e-05, + "loss": 0.0489, + "mean_token_accuracy": 0.9828975081443787, + "num_tokens": 71811320.0, + "step": 25720 + }, + { + "entropy": 0.0763224471360445, + "epoch": 5.996619652640168, + "grad_norm": 1.1171875, + "learning_rate": 4.892828197274218e-05, + "loss": 0.0814, + "mean_token_accuracy": 0.9780821740627289, + "num_tokens": 71829482.0, + "step": 25725 + }, + { + "entropy": 0.03815199537202716, + "epoch": 5.9977852896608, + "grad_norm": 0.93359375, + "learning_rate": 4.892766350566041e-05, + "loss": 0.0193, + "mean_token_accuracy": 0.9918882131576539, + "num_tokens": 71883754.0, + "step": 25730 + }, + { + "entropy": 0.06713078990578651, + "epoch": 5.998950926681431, + "grad_norm": 1.9765625, + "learning_rate": 4.892704486817498e-05, + "loss": 0.0522, + "mean_token_accuracy": 0.9826019644737244, + "num_tokens": 71895540.0, + "step": 25735 + }, + { + "entropy": 0.06625933138032754, + "epoch": 6.0, + "grad_norm": 2.75, + "learning_rate": 4.892642606029512e-05, + "loss": 0.044, + "mean_token_accuracy": 0.985924243927002, + "num_tokens": 71923000.0, + "step": 25740 + }, + { + "entropy": 0.05685313232243061, + "epoch": 6.001165637020632, + "grad_norm": 0.62109375, + "learning_rate": 4.892580708203007e-05, + "loss": 0.0352, + "mean_token_accuracy": 0.9892299056053162, + "num_tokens": 71937131.0, + "step": 25745 + }, + { + "entropy": 0.05876139011234045, + "epoch": 6.002331274041263, + "grad_norm": 0.57421875, + "learning_rate": 4.8925187933389035e-05, + "loss": 0.0284, + "mean_token_accuracy": 0.9893492102622986, + "num_tokens": 71959498.0, + "step": 25750 + }, + { + "entropy": 0.05102113718166947, + "epoch": 6.003496911061895, + "grad_norm": 1.0234375, + "learning_rate": 4.892456861438128e-05, + "loss": 0.0223, + "mean_token_accuracy": 0.9925749123096466, + "num_tokens": 71982142.0, + "step": 25755 + }, + { + "entropy": 0.05887993331998587, + "epoch": 6.0046625480825275, + "grad_norm": 1.1484375, + "learning_rate": 4.8923949125016013e-05, + "loss": 0.0386, + "mean_token_accuracy": 0.9897330641746521, + "num_tokens": 71994131.0, + "step": 25760 + }, + { + "entropy": 0.05944917807355523, + "epoch": 6.005828185103159, + "grad_norm": 0.94140625, + "learning_rate": 4.892332946530249e-05, + "loss": 0.0271, + "mean_token_accuracy": 0.9928303897380829, + "num_tokens": 72013539.0, + "step": 25765 + }, + { + "entropy": 0.0366467990912497, + "epoch": 6.006993822123791, + "grad_norm": 0.5625, + "learning_rate": 4.892270963524994e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9950882434844971, + "num_tokens": 72038175.0, + "step": 25770 + }, + { + "entropy": 0.05455034887418151, + "epoch": 6.008159459144422, + "grad_norm": 3.96875, + "learning_rate": 4.8922089634867606e-05, + "loss": 0.0328, + "mean_token_accuracy": 0.9901614844799042, + "num_tokens": 72055936.0, + "step": 25775 + }, + { + "entropy": 0.05095590045675635, + "epoch": 6.009325096165054, + "grad_norm": 2.234375, + "learning_rate": 4.892146946416474e-05, + "loss": 0.0312, + "mean_token_accuracy": 0.9916975438594818, + "num_tokens": 72073682.0, + "step": 25780 + }, + { + "entropy": 0.060593480616807936, + "epoch": 6.010490733185686, + "grad_norm": 2.03125, + "learning_rate": 4.892084912315059e-05, + "loss": 0.0325, + "mean_token_accuracy": 0.9904182434082032, + "num_tokens": 72085726.0, + "step": 25785 + }, + { + "entropy": 0.05342343971133232, + "epoch": 6.0116563702063175, + "grad_norm": 1.015625, + "learning_rate": 4.892022861183439e-05, + "loss": 0.0268, + "mean_token_accuracy": 0.9912295579910279, + "num_tokens": 72107893.0, + "step": 25790 + }, + { + "entropy": 0.06415905207395553, + "epoch": 6.01282200722695, + "grad_norm": 1.9921875, + "learning_rate": 4.891960793022541e-05, + "loss": 0.0397, + "mean_token_accuracy": 0.9880828559398651, + "num_tokens": 72126518.0, + "step": 25795 + }, + { + "entropy": 0.05017009107396007, + "epoch": 6.013987644247582, + "grad_norm": 1.84375, + "learning_rate": 4.8918987078332904e-05, + "loss": 0.0242, + "mean_token_accuracy": 0.9938202440738678, + "num_tokens": 72154112.0, + "step": 25800 + }, + { + "entropy": 0.07650583721697331, + "epoch": 6.015153281268213, + "grad_norm": 3.953125, + "learning_rate": 4.8918366056166114e-05, + "loss": 0.0514, + "mean_token_accuracy": 0.9844214498996735, + "num_tokens": 72176069.0, + "step": 25805 + }, + { + "entropy": 0.05705469730310142, + "epoch": 6.016318918288845, + "grad_norm": 0.1572265625, + "learning_rate": 4.891774486373432e-05, + "loss": 0.0212, + "mean_token_accuracy": 0.9931414604187012, + "num_tokens": 72199150.0, + "step": 25810 + }, + { + "entropy": 0.06287220679223537, + "epoch": 6.017484555309476, + "grad_norm": 1.7890625, + "learning_rate": 4.891712350104678e-05, + "loss": 0.0337, + "mean_token_accuracy": 0.9904451191425323, + "num_tokens": 72218038.0, + "step": 25815 + }, + { + "entropy": 0.05191007032990456, + "epoch": 6.018650192330108, + "grad_norm": 0.490234375, + "learning_rate": 4.891650196811275e-05, + "loss": 0.0135, + "mean_token_accuracy": 0.9937026917934417, + "num_tokens": 72264266.0, + "step": 25820 + }, + { + "entropy": 0.05475984732620418, + "epoch": 6.01981582935074, + "grad_norm": 2.75, + "learning_rate": 4.891588026494151e-05, + "loss": 0.0107, + "mean_token_accuracy": 0.9927169740200043, + "num_tokens": 72305044.0, + "step": 25825 + }, + { + "entropy": 0.05737621607258916, + "epoch": 6.020981466371372, + "grad_norm": 0.55859375, + "learning_rate": 4.8915258391542316e-05, + "loss": 0.0269, + "mean_token_accuracy": 0.991907000541687, + "num_tokens": 72323624.0, + "step": 25830 + }, + { + "entropy": 0.06937633194029331, + "epoch": 6.022147103392004, + "grad_norm": 2.265625, + "learning_rate": 4.8914636347924454e-05, + "loss": 0.0611, + "mean_token_accuracy": 0.9854352653026581, + "num_tokens": 72337594.0, + "step": 25835 + }, + { + "entropy": 0.06716333478689193, + "epoch": 6.023312740412636, + "grad_norm": 1.75, + "learning_rate": 4.89140141340972e-05, + "loss": 0.0324, + "mean_token_accuracy": 0.990732753276825, + "num_tokens": 72347443.0, + "step": 25840 + }, + { + "entropy": 0.05915342541411519, + "epoch": 6.024478377433267, + "grad_norm": 0.453125, + "learning_rate": 4.891339175006981e-05, + "loss": 0.0364, + "mean_token_accuracy": 0.9867867410182953, + "num_tokens": 72384961.0, + "step": 25845 + }, + { + "entropy": 0.09259512033313513, + "epoch": 6.025644014453899, + "grad_norm": 1.3984375, + "learning_rate": 4.891276919585161e-05, + "loss": 0.0653, + "mean_token_accuracy": 0.9823358118534088, + "num_tokens": 72399191.0, + "step": 25850 + }, + { + "entropy": 0.08896644115447998, + "epoch": 6.02680965147453, + "grad_norm": 0.296875, + "learning_rate": 4.891214647145184e-05, + "loss": 0.0761, + "mean_token_accuracy": 0.985335499048233, + "num_tokens": 72434806.0, + "step": 25855 + }, + { + "entropy": 0.05383868329226971, + "epoch": 6.0279752884951625, + "grad_norm": 0.447265625, + "learning_rate": 4.8911523576879795e-05, + "loss": 0.0238, + "mean_token_accuracy": 0.9926303625106812, + "num_tokens": 72452382.0, + "step": 25860 + }, + { + "entropy": 0.06760429283604026, + "epoch": 6.029140925515795, + "grad_norm": 1.625, + "learning_rate": 4.891090051214478e-05, + "loss": 0.0319, + "mean_token_accuracy": 0.989197313785553, + "num_tokens": 72473686.0, + "step": 25865 + }, + { + "entropy": 0.06802155338227749, + "epoch": 6.030306562536426, + "grad_norm": 2.3125, + "learning_rate": 4.891027727725607e-05, + "loss": 0.0272, + "mean_token_accuracy": 0.9918610870838165, + "num_tokens": 72485853.0, + "step": 25870 + }, + { + "entropy": 0.0644018879160285, + "epoch": 6.031472199557058, + "grad_norm": 2.203125, + "learning_rate": 4.890965387222297e-05, + "loss": 0.0432, + "mean_token_accuracy": 0.9895402014255523, + "num_tokens": 72505760.0, + "step": 25875 + }, + { + "entropy": 0.06654907325282693, + "epoch": 6.03263783657769, + "grad_norm": 0.337890625, + "learning_rate": 4.8909030297054764e-05, + "loss": 0.0427, + "mean_token_accuracy": 0.992247325181961, + "num_tokens": 72532642.0, + "step": 25880 + }, + { + "entropy": 0.05838645258918405, + "epoch": 6.033803473598321, + "grad_norm": 2.9375, + "learning_rate": 4.890840655176076e-05, + "loss": 0.0355, + "mean_token_accuracy": 0.9868348956108093, + "num_tokens": 72556266.0, + "step": 25885 + }, + { + "entropy": 0.06037556882947683, + "epoch": 6.034969110618953, + "grad_norm": 2.234375, + "learning_rate": 4.890778263635025e-05, + "loss": 0.0318, + "mean_token_accuracy": 0.9884869992733002, + "num_tokens": 72569724.0, + "step": 25890 + }, + { + "entropy": 0.058738169819116594, + "epoch": 6.0361347476395855, + "grad_norm": 2.609375, + "learning_rate": 4.890715855083255e-05, + "loss": 0.0247, + "mean_token_accuracy": 0.993329894542694, + "num_tokens": 72582251.0, + "step": 25895 + }, + { + "entropy": 0.06071986351162195, + "epoch": 6.037300384660217, + "grad_norm": 0.859375, + "learning_rate": 4.890653429521695e-05, + "loss": 0.0309, + "mean_token_accuracy": 0.988233745098114, + "num_tokens": 72595487.0, + "step": 25900 + }, + { + "entropy": 0.05742992917075753, + "epoch": 6.038466021680849, + "grad_norm": 1.7421875, + "learning_rate": 4.8905909869512775e-05, + "loss": 0.0181, + "mean_token_accuracy": 0.9917668163776397, + "num_tokens": 72627186.0, + "step": 25905 + }, + { + "entropy": 0.06477688588201999, + "epoch": 6.03963165870148, + "grad_norm": 1.1484375, + "learning_rate": 4.890528527372933e-05, + "loss": 0.0485, + "mean_token_accuracy": 0.9858067691326141, + "num_tokens": 72637192.0, + "step": 25910 + }, + { + "entropy": 0.05110297799110412, + "epoch": 6.040797295722112, + "grad_norm": 0.98046875, + "learning_rate": 4.8904660507875924e-05, + "loss": 0.0218, + "mean_token_accuracy": 0.9923175871372223, + "num_tokens": 72665602.0, + "step": 25915 + }, + { + "entropy": 0.12344857305288315, + "epoch": 6.041962932742744, + "grad_norm": 2.546875, + "learning_rate": 4.890403557196187e-05, + "loss": 0.056, + "mean_token_accuracy": 0.9839066147804261, + "num_tokens": 72675252.0, + "step": 25920 + }, + { + "entropy": 0.06564762499183416, + "epoch": 6.043128569763375, + "grad_norm": 4.9375, + "learning_rate": 4.8903410465996495e-05, + "loss": 0.0278, + "mean_token_accuracy": 0.9908899009227753, + "num_tokens": 72698132.0, + "step": 25925 + }, + { + "entropy": 0.07632733806967736, + "epoch": 6.0442942067840075, + "grad_norm": 2.890625, + "learning_rate": 4.890278518998912e-05, + "loss": 0.0393, + "mean_token_accuracy": 0.9884995639324188, + "num_tokens": 72713793.0, + "step": 25930 + }, + { + "entropy": 0.0680435385555029, + "epoch": 6.04545984380464, + "grad_norm": 0.9609375, + "learning_rate": 4.8902159743949073e-05, + "loss": 0.0506, + "mean_token_accuracy": 0.9874895095825196, + "num_tokens": 72723522.0, + "step": 25935 + }, + { + "entropy": 0.07830537669360638, + "epoch": 6.046625480825271, + "grad_norm": 1.671875, + "learning_rate": 4.890153412788567e-05, + "loss": 0.0334, + "mean_token_accuracy": 0.986879688501358, + "num_tokens": 72737014.0, + "step": 25940 + }, + { + "entropy": 0.07722733989357948, + "epoch": 6.047791117845903, + "grad_norm": 2.671875, + "learning_rate": 4.890090834180824e-05, + "loss": 0.0561, + "mean_token_accuracy": 0.9857710182666779, + "num_tokens": 72745948.0, + "step": 25945 + }, + { + "entropy": 0.0765731481835246, + "epoch": 6.048956754866534, + "grad_norm": 0.55078125, + "learning_rate": 4.8900282385726127e-05, + "loss": 0.0434, + "mean_token_accuracy": 0.9872212827205658, + "num_tokens": 72763772.0, + "step": 25950 + }, + { + "entropy": 0.06433370187878609, + "epoch": 6.050122391887166, + "grad_norm": 0.98046875, + "learning_rate": 4.8899656259648655e-05, + "loss": 0.0257, + "mean_token_accuracy": 0.988516104221344, + "num_tokens": 72784836.0, + "step": 25955 + }, + { + "entropy": 0.04958404209464788, + "epoch": 6.051288028907798, + "grad_norm": 0.306640625, + "learning_rate": 4.8899029963585155e-05, + "loss": 0.0303, + "mean_token_accuracy": 0.9914787411689758, + "num_tokens": 72800542.0, + "step": 25960 + }, + { + "entropy": 0.05592640060931444, + "epoch": 6.05245366592843, + "grad_norm": 1.046875, + "learning_rate": 4.889840349754498e-05, + "loss": 0.0216, + "mean_token_accuracy": 0.9904370605945587, + "num_tokens": 72823728.0, + "step": 25965 + }, + { + "entropy": 0.06810164107009768, + "epoch": 6.053619302949062, + "grad_norm": 0.4609375, + "learning_rate": 4.889777686153746e-05, + "loss": 0.0343, + "mean_token_accuracy": 0.9900489509105682, + "num_tokens": 72845251.0, + "step": 25970 + }, + { + "entropy": 0.07986075691878795, + "epoch": 6.054784939969694, + "grad_norm": 5.34375, + "learning_rate": 4.889715005557194e-05, + "loss": 0.0563, + "mean_token_accuracy": 0.98703653216362, + "num_tokens": 72867014.0, + "step": 25975 + }, + { + "entropy": 0.058453739061951636, + "epoch": 6.055950576990325, + "grad_norm": 0.380859375, + "learning_rate": 4.889652307965778e-05, + "loss": 0.0185, + "mean_token_accuracy": 0.99262655377388, + "num_tokens": 72890050.0, + "step": 25980 + }, + { + "entropy": 0.06057945545762777, + "epoch": 6.057116214010957, + "grad_norm": 1.1796875, + "learning_rate": 4.889589593380432e-05, + "loss": 0.0237, + "mean_token_accuracy": 0.9909035623073578, + "num_tokens": 72914583.0, + "step": 25985 + }, + { + "entropy": 0.1058798679150641, + "epoch": 6.058281851031588, + "grad_norm": 0.2431640625, + "learning_rate": 4.88952686180209e-05, + "loss": 0.0489, + "mean_token_accuracy": 0.9859530746936798, + "num_tokens": 72948565.0, + "step": 25990 + }, + { + "entropy": 0.07051466554403304, + "epoch": 6.05944748805222, + "grad_norm": 2.546875, + "learning_rate": 4.889464113231689e-05, + "loss": 0.036, + "mean_token_accuracy": 0.9863975763320922, + "num_tokens": 72960750.0, + "step": 25995 + }, + { + "entropy": 0.0715387485921383, + "epoch": 6.0606131250728525, + "grad_norm": 1.15625, + "learning_rate": 4.889401347670165e-05, + "loss": 0.047, + "mean_token_accuracy": 0.9889802992343902, + "num_tokens": 72981087.0, + "step": 26000 + }, + { + "entropy": 0.1161620058119297, + "epoch": 6.061778762093484, + "grad_norm": 0.33984375, + "learning_rate": 4.8893385651184524e-05, + "loss": 0.0313, + "mean_token_accuracy": 0.9857727110385894, + "num_tokens": 73000120.0, + "step": 26005 + }, + { + "entropy": 0.06805797554552555, + "epoch": 6.062944399114116, + "grad_norm": 3.21875, + "learning_rate": 4.889275765577488e-05, + "loss": 0.0362, + "mean_token_accuracy": 0.9873246729373932, + "num_tokens": 73028760.0, + "step": 26010 + }, + { + "entropy": 0.0626982631161809, + "epoch": 6.064110036134748, + "grad_norm": 1.2890625, + "learning_rate": 4.889212949048209e-05, + "loss": 0.0254, + "mean_token_accuracy": 0.988321989774704, + "num_tokens": 73058804.0, + "step": 26015 + }, + { + "entropy": 0.053884850721806286, + "epoch": 6.065275673155379, + "grad_norm": 0.40625, + "learning_rate": 4.889150115531551e-05, + "loss": 0.0154, + "mean_token_accuracy": 0.9942508041858673, + "num_tokens": 73080961.0, + "step": 26020 + }, + { + "entropy": 0.05890355911105871, + "epoch": 6.066441310176011, + "grad_norm": 1.03125, + "learning_rate": 4.889087265028452e-05, + "loss": 0.0467, + "mean_token_accuracy": 0.9890576004981995, + "num_tokens": 73096398.0, + "step": 26025 + }, + { + "entropy": 0.0513009587302804, + "epoch": 6.067606947196643, + "grad_norm": 0.490234375, + "learning_rate": 4.889024397539848e-05, + "loss": 0.0192, + "mean_token_accuracy": 0.9914946913719177, + "num_tokens": 73122113.0, + "step": 26030 + }, + { + "entropy": 0.06160991545766592, + "epoch": 6.068772584217275, + "grad_norm": 0.765625, + "learning_rate": 4.888961513066677e-05, + "loss": 0.0293, + "mean_token_accuracy": 0.9876994311809539, + "num_tokens": 73150370.0, + "step": 26035 + }, + { + "entropy": 0.06331438571214676, + "epoch": 6.069938221237907, + "grad_norm": 0.49609375, + "learning_rate": 4.888898611609877e-05, + "loss": 0.0264, + "mean_token_accuracy": 0.9909036993980408, + "num_tokens": 73175420.0, + "step": 26040 + }, + { + "entropy": 0.13340820157900452, + "epoch": 6.071103858258538, + "grad_norm": 3.765625, + "learning_rate": 4.888835693170386e-05, + "loss": 0.1831, + "mean_token_accuracy": 0.9607505977153779, + "num_tokens": 73217582.0, + "step": 26045 + }, + { + "entropy": 0.05735396733507514, + "epoch": 6.07226949527917, + "grad_norm": 1.203125, + "learning_rate": 4.8887727577491414e-05, + "loss": 0.0231, + "mean_token_accuracy": 0.9909033179283142, + "num_tokens": 73246641.0, + "step": 26050 + }, + { + "entropy": 0.08030778989195823, + "epoch": 6.073435132299802, + "grad_norm": 2.40625, + "learning_rate": 4.888709805347082e-05, + "loss": 0.0286, + "mean_token_accuracy": 0.9901704609394073, + "num_tokens": 73258761.0, + "step": 26055 + }, + { + "entropy": 0.07896085307002068, + "epoch": 6.074600769320433, + "grad_norm": 4.125, + "learning_rate": 4.888646835965147e-05, + "loss": 0.0595, + "mean_token_accuracy": 0.9851579368114471, + "num_tokens": 73268333.0, + "step": 26060 + }, + { + "entropy": 0.06064739301800728, + "epoch": 6.0757664063410655, + "grad_norm": 0.453125, + "learning_rate": 4.888583849604275e-05, + "loss": 0.0227, + "mean_token_accuracy": 0.9928803324699402, + "num_tokens": 73287812.0, + "step": 26065 + }, + { + "entropy": 0.04188825218006968, + "epoch": 6.076932043361698, + "grad_norm": 0.51171875, + "learning_rate": 4.888520846265405e-05, + "loss": 0.0127, + "mean_token_accuracy": 0.9933510839939117, + "num_tokens": 73325445.0, + "step": 26070 + }, + { + "entropy": 0.05650821551680565, + "epoch": 6.078097680382329, + "grad_norm": 3.15625, + "learning_rate": 4.888457825949478e-05, + "loss": 0.0233, + "mean_token_accuracy": 0.9924087584018707, + "num_tokens": 73349736.0, + "step": 26075 + }, + { + "entropy": 0.0596388409845531, + "epoch": 6.079263317402961, + "grad_norm": 0.10205078125, + "learning_rate": 4.888394788657431e-05, + "loss": 0.0391, + "mean_token_accuracy": 0.9876825332641601, + "num_tokens": 73384001.0, + "step": 26080 + }, + { + "entropy": 0.06745455488562584, + "epoch": 6.080428954423592, + "grad_norm": 2.515625, + "learning_rate": 4.8883317343902065e-05, + "loss": 0.0347, + "mean_token_accuracy": 0.9912264704704284, + "num_tokens": 73405821.0, + "step": 26085 + }, + { + "entropy": 0.07572416644543409, + "epoch": 6.081594591444224, + "grad_norm": 0.6875, + "learning_rate": 4.888268663148743e-05, + "loss": 0.0218, + "mean_token_accuracy": 0.9936153054237366, + "num_tokens": 73425459.0, + "step": 26090 + }, + { + "entropy": 0.05982452109456062, + "epoch": 6.082760228464856, + "grad_norm": 1.6953125, + "learning_rate": 4.8882055749339814e-05, + "loss": 0.0166, + "mean_token_accuracy": 0.9946097016334534, + "num_tokens": 73451504.0, + "step": 26095 + }, + { + "entropy": 0.05443444773554802, + "epoch": 6.0839258654854875, + "grad_norm": 0.91015625, + "learning_rate": 4.8881424697468635e-05, + "loss": 0.0124, + "mean_token_accuracy": 0.992647510766983, + "num_tokens": 73491398.0, + "step": 26100 + }, + { + "entropy": 0.06279923100955784, + "epoch": 6.08509150250612, + "grad_norm": 1.4453125, + "learning_rate": 4.88807934758833e-05, + "loss": 0.0299, + "mean_token_accuracy": 0.9905305862426758, + "num_tokens": 73521552.0, + "step": 26105 + }, + { + "entropy": 0.07592700524255633, + "epoch": 6.086257139526752, + "grad_norm": 2.484375, + "learning_rate": 4.8880162084593204e-05, + "loss": 0.0568, + "mean_token_accuracy": 0.9855045139789581, + "num_tokens": 73546810.0, + "step": 26110 + }, + { + "entropy": 0.060017453879117964, + "epoch": 6.087422776547383, + "grad_norm": 0.1787109375, + "learning_rate": 4.887953052360778e-05, + "loss": 0.0246, + "mean_token_accuracy": 0.9929731905460357, + "num_tokens": 73570295.0, + "step": 26115 + }, + { + "entropy": 0.07486429456621409, + "epoch": 6.088588413568015, + "grad_norm": 2.1875, + "learning_rate": 4.887889879293644e-05, + "loss": 0.0276, + "mean_token_accuracy": 0.9905243337154388, + "num_tokens": 73582178.0, + "step": 26120 + }, + { + "entropy": 0.09151984080672264, + "epoch": 6.089754050588646, + "grad_norm": 3.53125, + "learning_rate": 4.8878266892588605e-05, + "loss": 0.034, + "mean_token_accuracy": 0.9858936965465546, + "num_tokens": 73597195.0, + "step": 26125 + }, + { + "entropy": 0.05330041013658047, + "epoch": 6.090919687609278, + "grad_norm": 0.73828125, + "learning_rate": 4.887763482257369e-05, + "loss": 0.0359, + "mean_token_accuracy": 0.9910796642303467, + "num_tokens": 73622884.0, + "step": 26130 + }, + { + "entropy": 0.05964062893763185, + "epoch": 6.0920853246299105, + "grad_norm": 1.9453125, + "learning_rate": 4.887700258290113e-05, + "loss": 0.0266, + "mean_token_accuracy": 0.9897635579109192, + "num_tokens": 73649204.0, + "step": 26135 + }, + { + "entropy": 0.06973530426621437, + "epoch": 6.093250961650542, + "grad_norm": 1.8671875, + "learning_rate": 4.887637017358036e-05, + "loss": 0.0227, + "mean_token_accuracy": 0.9914168953895569, + "num_tokens": 73661502.0, + "step": 26140 + }, + { + "entropy": 0.07038565017282963, + "epoch": 6.094416598671174, + "grad_norm": 0.216796875, + "learning_rate": 4.887573759462079e-05, + "loss": 0.0263, + "mean_token_accuracy": 0.9904610097408295, + "num_tokens": 73685578.0, + "step": 26145 + }, + { + "entropy": 0.04337377091869712, + "epoch": 6.095582235691806, + "grad_norm": 0.310546875, + "learning_rate": 4.887510484603186e-05, + "loss": 0.0156, + "mean_token_accuracy": 0.9947210431098938, + "num_tokens": 73715768.0, + "step": 26150 + }, + { + "entropy": 0.07746588047593832, + "epoch": 6.096747872712437, + "grad_norm": 2.34375, + "learning_rate": 4.887447192782302e-05, + "loss": 0.0371, + "mean_token_accuracy": 0.9873392581939697, + "num_tokens": 73728063.0, + "step": 26155 + }, + { + "entropy": 0.053429150208830836, + "epoch": 6.097913509733069, + "grad_norm": 2.421875, + "learning_rate": 4.887383884000368e-05, + "loss": 0.0274, + "mean_token_accuracy": 0.9937780797481537, + "num_tokens": 73757134.0, + "step": 26160 + }, + { + "entropy": 0.0564187285490334, + "epoch": 6.099079146753701, + "grad_norm": 3.15625, + "learning_rate": 4.88732055825833e-05, + "loss": 0.0269, + "mean_token_accuracy": 0.9911048114299774, + "num_tokens": 73781153.0, + "step": 26165 + }, + { + "entropy": 0.07645439747720957, + "epoch": 6.1002447837743325, + "grad_norm": 0.291015625, + "learning_rate": 4.887257215557133e-05, + "loss": 0.0337, + "mean_token_accuracy": 0.9878023087978363, + "num_tokens": 73796312.0, + "step": 26170 + }, + { + "entropy": 0.07418763572350144, + "epoch": 6.101410420794965, + "grad_norm": 0.1796875, + "learning_rate": 4.8871938558977194e-05, + "loss": 0.0304, + "mean_token_accuracy": 0.9894303560256958, + "num_tokens": 73814524.0, + "step": 26175 + }, + { + "entropy": 0.0833287613466382, + "epoch": 6.102576057815596, + "grad_norm": 3.328125, + "learning_rate": 4.887130479281035e-05, + "loss": 0.0294, + "mean_token_accuracy": 0.9921909213066101, + "num_tokens": 73831240.0, + "step": 26180 + }, + { + "entropy": 0.08284763377159834, + "epoch": 6.103741694836228, + "grad_norm": 0.208984375, + "learning_rate": 4.8870670857080246e-05, + "loss": 0.0206, + "mean_token_accuracy": 0.9901773989200592, + "num_tokens": 73853877.0, + "step": 26185 + }, + { + "entropy": 0.05893292501568794, + "epoch": 6.10490733185686, + "grad_norm": 1.3984375, + "learning_rate": 4.887003675179634e-05, + "loss": 0.034, + "mean_token_accuracy": 0.9904161393642426, + "num_tokens": 73883298.0, + "step": 26190 + }, + { + "entropy": 0.07303530490025878, + "epoch": 6.106072968877491, + "grad_norm": 0.291015625, + "learning_rate": 4.8869402476968083e-05, + "loss": 0.0285, + "mean_token_accuracy": 0.9904853463172912, + "num_tokens": 73903304.0, + "step": 26195 + }, + { + "entropy": 0.08997571468353271, + "epoch": 6.107238605898123, + "grad_norm": 0.96875, + "learning_rate": 4.886876803260494e-05, + "loss": 0.0275, + "mean_token_accuracy": 0.9917174279689789, + "num_tokens": 73914506.0, + "step": 26200 + }, + { + "entropy": 0.06205089651048183, + "epoch": 6.1084042429187555, + "grad_norm": 2.671875, + "learning_rate": 4.886813341871636e-05, + "loss": 0.0318, + "mean_token_accuracy": 0.99107666015625, + "num_tokens": 73938577.0, + "step": 26205 + }, + { + "entropy": 0.053936326131224635, + "epoch": 6.109569879939387, + "grad_norm": 3.140625, + "learning_rate": 4.886749863531182e-05, + "loss": 0.038, + "mean_token_accuracy": 0.9904813170433044, + "num_tokens": 73955143.0, + "step": 26210 + }, + { + "entropy": 0.06608153507113457, + "epoch": 6.110735516960019, + "grad_norm": 1.453125, + "learning_rate": 4.886686368240077e-05, + "loss": 0.0412, + "mean_token_accuracy": 0.9894643902778626, + "num_tokens": 73965817.0, + "step": 26215 + }, + { + "entropy": 0.04572048811241984, + "epoch": 6.11190115398065, + "grad_norm": 0.458984375, + "learning_rate": 4.8866228559992685e-05, + "loss": 0.0144, + "mean_token_accuracy": 0.9954964637756347, + "num_tokens": 73997786.0, + "step": 26220 + }, + { + "entropy": 0.07351892106235028, + "epoch": 6.113066791001282, + "grad_norm": 0.8984375, + "learning_rate": 4.886559326809704e-05, + "loss": 0.0449, + "mean_token_accuracy": 0.9858495235443115, + "num_tokens": 74016499.0, + "step": 26225 + }, + { + "entropy": 0.06393312495201826, + "epoch": 6.114232428021914, + "grad_norm": 1.171875, + "learning_rate": 4.8864957806723296e-05, + "loss": 0.0212, + "mean_token_accuracy": 0.9930548965930939, + "num_tokens": 74048662.0, + "step": 26230 + }, + { + "entropy": 0.0660844799131155, + "epoch": 6.1153980650425455, + "grad_norm": 1.046875, + "learning_rate": 4.886432217588095e-05, + "loss": 0.0265, + "mean_token_accuracy": 0.989313280582428, + "num_tokens": 74068187.0, + "step": 26235 + }, + { + "entropy": 0.055278994515538214, + "epoch": 6.116563702063178, + "grad_norm": 1.078125, + "learning_rate": 4.886368637557946e-05, + "loss": 0.0243, + "mean_token_accuracy": 0.9904797196388244, + "num_tokens": 74091261.0, + "step": 26240 + }, + { + "entropy": 0.07485796064138413, + "epoch": 6.11772933908381, + "grad_norm": 0.8671875, + "learning_rate": 4.886305040582832e-05, + "loss": 0.0494, + "mean_token_accuracy": 0.9882545590400695, + "num_tokens": 74101483.0, + "step": 26245 + }, + { + "entropy": 0.07578368950635195, + "epoch": 6.118894976104441, + "grad_norm": 4.09375, + "learning_rate": 4.8862414266637e-05, + "loss": 0.0357, + "mean_token_accuracy": 0.990442156791687, + "num_tokens": 74112359.0, + "step": 26250 + }, + { + "entropy": 0.05760523192584514, + "epoch": 6.120060613125073, + "grad_norm": 0.314453125, + "learning_rate": 4.8861777958014996e-05, + "loss": 0.0118, + "mean_token_accuracy": 0.9920580506324768, + "num_tokens": 74152099.0, + "step": 26255 + }, + { + "entropy": 0.060783774219453336, + "epoch": 6.121226250145704, + "grad_norm": 0.31640625, + "learning_rate": 4.886114147997179e-05, + "loss": 0.0185, + "mean_token_accuracy": 0.9896743893623352, + "num_tokens": 74183403.0, + "step": 26260 + }, + { + "entropy": 0.073437774553895, + "epoch": 6.122391887166336, + "grad_norm": 1.1796875, + "learning_rate": 4.886050483251689e-05, + "loss": 0.0379, + "mean_token_accuracy": 0.9888258576393127, + "num_tokens": 74195017.0, + "step": 26265 + }, + { + "entropy": 0.061823181249201296, + "epoch": 6.123557524186968, + "grad_norm": 3.71875, + "learning_rate": 4.8859868015659764e-05, + "loss": 0.0227, + "mean_token_accuracy": 0.9911990523338318, + "num_tokens": 74214212.0, + "step": 26270 + }, + { + "entropy": 0.044901407044380906, + "epoch": 6.1247231612076, + "grad_norm": 0.271484375, + "learning_rate": 4.8859231029409925e-05, + "loss": 0.0157, + "mean_token_accuracy": 0.9923342585563659, + "num_tokens": 74245056.0, + "step": 26275 + }, + { + "entropy": 0.07991915084421634, + "epoch": 6.125888798228232, + "grad_norm": 2.515625, + "learning_rate": 4.885859387377686e-05, + "loss": 0.0354, + "mean_token_accuracy": 0.9882398664951324, + "num_tokens": 74270399.0, + "step": 26280 + }, + { + "entropy": 0.06915805991739035, + "epoch": 6.127054435248864, + "grad_norm": 0.8515625, + "learning_rate": 4.885795654877009e-05, + "loss": 0.033, + "mean_token_accuracy": 0.9874129176139832, + "num_tokens": 74286158.0, + "step": 26285 + }, + { + "entropy": 0.07844992205500603, + "epoch": 6.128220072269495, + "grad_norm": 1.171875, + "learning_rate": 4.885731905439909e-05, + "loss": 0.0574, + "mean_token_accuracy": 0.9869462549686432, + "num_tokens": 74295775.0, + "step": 26290 + }, + { + "entropy": 0.0868721805512905, + "epoch": 6.129385709290127, + "grad_norm": 2.53125, + "learning_rate": 4.885668139067338e-05, + "loss": 0.0638, + "mean_token_accuracy": 0.9849142253398895, + "num_tokens": 74315238.0, + "step": 26295 + }, + { + "entropy": 0.0664596289396286, + "epoch": 6.130551346310758, + "grad_norm": 0.44140625, + "learning_rate": 4.885604355760248e-05, + "loss": 0.0162, + "mean_token_accuracy": 0.9912093698978424, + "num_tokens": 74345863.0, + "step": 26300 + }, + { + "entropy": 0.09681078474968671, + "epoch": 6.1317169833313905, + "grad_norm": 0.2890625, + "learning_rate": 4.885540555519588e-05, + "loss": 0.0525, + "mean_token_accuracy": 0.9850651860237122, + "num_tokens": 74361453.0, + "step": 26305 + }, + { + "entropy": 0.04459414193406701, + "epoch": 6.132882620352023, + "grad_norm": 2.171875, + "learning_rate": 4.8854767383463106e-05, + "loss": 0.0168, + "mean_token_accuracy": 0.9929663062095642, + "num_tokens": 74400020.0, + "step": 26310 + }, + { + "entropy": 0.05299360081553459, + "epoch": 6.134048257372654, + "grad_norm": 0.4140625, + "learning_rate": 4.8854129042413674e-05, + "loss": 0.0118, + "mean_token_accuracy": 0.9963202357292176, + "num_tokens": 74443995.0, + "step": 26315 + }, + { + "entropy": 0.08458766452968121, + "epoch": 6.135213894393286, + "grad_norm": 1.3203125, + "learning_rate": 4.88534905320571e-05, + "loss": 0.0486, + "mean_token_accuracy": 0.9887842237949371, + "num_tokens": 74454639.0, + "step": 26320 + }, + { + "entropy": 0.06806373670697212, + "epoch": 6.136379531413918, + "grad_norm": 3.265625, + "learning_rate": 4.8852851852402906e-05, + "loss": 0.0471, + "mean_token_accuracy": 0.9882710754871369, + "num_tokens": 74485326.0, + "step": 26325 + }, + { + "entropy": 0.051118244975805284, + "epoch": 6.137545168434549, + "grad_norm": 0.154296875, + "learning_rate": 4.885221300346061e-05, + "loss": 0.0257, + "mean_token_accuracy": 0.9931609928607941, + "num_tokens": 74511678.0, + "step": 26330 + }, + { + "entropy": 0.07300595417618752, + "epoch": 6.138710805455181, + "grad_norm": 1.6875, + "learning_rate": 4.8851573985239753e-05, + "loss": 0.0305, + "mean_token_accuracy": 0.9920416593551635, + "num_tokens": 74540779.0, + "step": 26335 + }, + { + "entropy": 0.050485126581043004, + "epoch": 6.139876442475813, + "grad_norm": 0.62890625, + "learning_rate": 4.885093479774985e-05, + "loss": 0.0184, + "mean_token_accuracy": 0.9928144335746765, + "num_tokens": 74575792.0, + "step": 26340 + }, + { + "entropy": 0.06935878098011017, + "epoch": 6.141042079496445, + "grad_norm": 1.6171875, + "learning_rate": 4.8850295441000435e-05, + "loss": 0.0382, + "mean_token_accuracy": 0.9901534140110015, + "num_tokens": 74594585.0, + "step": 26345 + }, + { + "entropy": 0.06734537519514561, + "epoch": 6.142207716517077, + "grad_norm": 3.328125, + "learning_rate": 4.8849655915001044e-05, + "loss": 0.0294, + "mean_token_accuracy": 0.9917540729045868, + "num_tokens": 74605422.0, + "step": 26350 + }, + { + "entropy": 0.060905332677066326, + "epoch": 6.143373353537708, + "grad_norm": 2.03125, + "learning_rate": 4.884901621976121e-05, + "loss": 0.0328, + "mean_token_accuracy": 0.9897407293319702, + "num_tokens": 74620466.0, + "step": 26355 + }, + { + "entropy": 0.074892489425838, + "epoch": 6.14453899055834, + "grad_norm": 0.94921875, + "learning_rate": 4.884837635529048e-05, + "loss": 0.0652, + "mean_token_accuracy": 0.985385262966156, + "num_tokens": 74633990.0, + "step": 26360 + }, + { + "entropy": 0.05228735357522964, + "epoch": 6.145704627578972, + "grad_norm": 0.40625, + "learning_rate": 4.884773632159839e-05, + "loss": 0.0195, + "mean_token_accuracy": 0.9903470396995544, + "num_tokens": 74665082.0, + "step": 26365 + }, + { + "entropy": 0.06522688157856464, + "epoch": 6.146870264599603, + "grad_norm": 0.44140625, + "learning_rate": 4.8847096118694474e-05, + "loss": 0.0306, + "mean_token_accuracy": 0.9882582664489746, + "num_tokens": 74698895.0, + "step": 26370 + }, + { + "entropy": 0.07812208170071244, + "epoch": 6.1480359016202355, + "grad_norm": 2.015625, + "learning_rate": 4.8846455746588295e-05, + "loss": 0.0563, + "mean_token_accuracy": 0.9852927267551422, + "num_tokens": 74715568.0, + "step": 26375 + }, + { + "entropy": 0.07707515489310027, + "epoch": 6.149201538640868, + "grad_norm": 1.546875, + "learning_rate": 4.884581520528939e-05, + "loss": 0.0423, + "mean_token_accuracy": 0.9858931481838227, + "num_tokens": 74728990.0, + "step": 26380 + }, + { + "entropy": 0.07694179080426693, + "epoch": 6.150367175661499, + "grad_norm": 2.046875, + "learning_rate": 4.884517449480732e-05, + "loss": 0.0536, + "mean_token_accuracy": 0.9824213206768035, + "num_tokens": 74739815.0, + "step": 26385 + }, + { + "entropy": 0.06638536080718041, + "epoch": 6.151532812682131, + "grad_norm": 2.078125, + "learning_rate": 4.8844533615151633e-05, + "loss": 0.0364, + "mean_token_accuracy": 0.9892715454101563, + "num_tokens": 74759843.0, + "step": 26390 + }, + { + "entropy": 0.07396925017237663, + "epoch": 6.152698449702762, + "grad_norm": 0.33984375, + "learning_rate": 4.8843892566331887e-05, + "loss": 0.0348, + "mean_token_accuracy": 0.9897041201591492, + "num_tokens": 74782611.0, + "step": 26395 + }, + { + "entropy": 0.08042656015604735, + "epoch": 6.153864086723394, + "grad_norm": 0.58984375, + "learning_rate": 4.884325134835764e-05, + "loss": 0.0454, + "mean_token_accuracy": 0.9905684471130372, + "num_tokens": 74809583.0, + "step": 26400 + }, + { + "entropy": 0.07436943799257278, + "epoch": 6.155029723744026, + "grad_norm": 0.58984375, + "learning_rate": 4.884260996123845e-05, + "loss": 0.0239, + "mean_token_accuracy": 0.9928574979305267, + "num_tokens": 74829584.0, + "step": 26405 + }, + { + "entropy": 0.04903424307703972, + "epoch": 6.156195360764658, + "grad_norm": 0.53125, + "learning_rate": 4.88419684049839e-05, + "loss": 0.0154, + "mean_token_accuracy": 0.9906674683094024, + "num_tokens": 74866848.0, + "step": 26410 + }, + { + "entropy": 0.08828915823251009, + "epoch": 6.15736099778529, + "grad_norm": 1.21875, + "learning_rate": 4.884132667960352e-05, + "loss": 0.0524, + "mean_token_accuracy": 0.9868596613407135, + "num_tokens": 74878559.0, + "step": 26415 + }, + { + "entropy": 0.05355936009436846, + "epoch": 6.158526634805922, + "grad_norm": 2.203125, + "learning_rate": 4.8840684785106915e-05, + "loss": 0.0199, + "mean_token_accuracy": 0.9939637005329132, + "num_tokens": 74900968.0, + "step": 26420 + }, + { + "entropy": 0.0665479020215571, + "epoch": 6.159692271826553, + "grad_norm": 0.6328125, + "learning_rate": 4.884004272150364e-05, + "loss": 0.0273, + "mean_token_accuracy": 0.992696750164032, + "num_tokens": 74925194.0, + "step": 26425 + }, + { + "entropy": 0.09455383159220218, + "epoch": 6.160857908847185, + "grad_norm": 0.79296875, + "learning_rate": 4.883940048880327e-05, + "loss": 0.061, + "mean_token_accuracy": 0.9805425465106964, + "num_tokens": 74944109.0, + "step": 26430 + }, + { + "entropy": 0.10350356921553612, + "epoch": 6.162023545867816, + "grad_norm": 0.26171875, + "learning_rate": 4.8838758087015385e-05, + "loss": 0.0501, + "mean_token_accuracy": 0.9844925940036774, + "num_tokens": 74962982.0, + "step": 26435 + }, + { + "entropy": 0.07926721423864365, + "epoch": 6.163189182888448, + "grad_norm": 2.59375, + "learning_rate": 4.8838115516149566e-05, + "loss": 0.0503, + "mean_token_accuracy": 0.9858163237571717, + "num_tokens": 74972064.0, + "step": 26440 + }, + { + "entropy": 0.05242730937898159, + "epoch": 6.1643548199090805, + "grad_norm": 1.8984375, + "learning_rate": 4.883747277621539e-05, + "loss": 0.0224, + "mean_token_accuracy": 0.9898411691188812, + "num_tokens": 74998421.0, + "step": 26445 + }, + { + "entropy": 0.0582451980561018, + "epoch": 6.165520456929712, + "grad_norm": 1.4140625, + "learning_rate": 4.883682986722243e-05, + "loss": 0.0268, + "mean_token_accuracy": 0.9894807755947113, + "num_tokens": 75022066.0, + "step": 26450 + }, + { + "entropy": 0.048820036463439465, + "epoch": 6.166686093950344, + "grad_norm": 1.5703125, + "learning_rate": 4.88361867891803e-05, + "loss": 0.0201, + "mean_token_accuracy": 0.9917604267597199, + "num_tokens": 75046513.0, + "step": 26455 + }, + { + "entropy": 0.04594447817653417, + "epoch": 6.167851730970976, + "grad_norm": 1.03125, + "learning_rate": 4.883554354209857e-05, + "loss": 0.0138, + "mean_token_accuracy": 0.9945909082889557, + "num_tokens": 75074076.0, + "step": 26460 + }, + { + "entropy": 0.06487497426569462, + "epoch": 6.169017367991607, + "grad_norm": 0.9296875, + "learning_rate": 4.883490012598683e-05, + "loss": 0.0284, + "mean_token_accuracy": 0.9908344566822052, + "num_tokens": 75097190.0, + "step": 26465 + }, + { + "entropy": 0.10159607119858265, + "epoch": 6.170183005012239, + "grad_norm": 0.97265625, + "learning_rate": 4.883425654085469e-05, + "loss": 0.1187, + "mean_token_accuracy": 0.9753309786319733, + "num_tokens": 75117363.0, + "step": 26470 + }, + { + "entropy": 0.06636620126664639, + "epoch": 6.171348642032871, + "grad_norm": 0.8671875, + "learning_rate": 4.883361278671173e-05, + "loss": 0.0328, + "mean_token_accuracy": 0.9904363811016083, + "num_tokens": 75129335.0, + "step": 26475 + }, + { + "entropy": 0.04916867651045322, + "epoch": 6.172514279053503, + "grad_norm": 1.6875, + "learning_rate": 4.883296886356756e-05, + "loss": 0.0244, + "mean_token_accuracy": 0.9905480206012726, + "num_tokens": 75156142.0, + "step": 26480 + }, + { + "entropy": 0.06334216399118305, + "epoch": 6.173679916074135, + "grad_norm": 0.6796875, + "learning_rate": 4.883232477143178e-05, + "loss": 0.0171, + "mean_token_accuracy": 0.9912271916866302, + "num_tokens": 75181069.0, + "step": 26485 + }, + { + "entropy": 0.05516471900045872, + "epoch": 6.174845553094766, + "grad_norm": 2.140625, + "learning_rate": 4.8831680510313994e-05, + "loss": 0.0234, + "mean_token_accuracy": 0.9919645011425018, + "num_tokens": 75197946.0, + "step": 26490 + }, + { + "entropy": 0.054226364567875864, + "epoch": 6.176011190115398, + "grad_norm": 1.890625, + "learning_rate": 4.8831036080223804e-05, + "loss": 0.0315, + "mean_token_accuracy": 0.991038691997528, + "num_tokens": 75216585.0, + "step": 26495 + }, + { + "entropy": 0.09042618926614523, + "epoch": 6.17717682713603, + "grad_norm": 5.09375, + "learning_rate": 4.883039148117082e-05, + "loss": 0.0395, + "mean_token_accuracy": 0.9868484675884247, + "num_tokens": 75230630.0, + "step": 26500 + }, + { + "entropy": 0.06445145141333342, + "epoch": 6.178342464156661, + "grad_norm": 2.234375, + "learning_rate": 4.882974671316466e-05, + "loss": 0.0411, + "mean_token_accuracy": 0.9895972013473511, + "num_tokens": 75245396.0, + "step": 26505 + }, + { + "entropy": 0.06774566173553467, + "epoch": 6.179508101177293, + "grad_norm": 0.91015625, + "learning_rate": 4.8829101776214934e-05, + "loss": 0.0328, + "mean_token_accuracy": 0.9879733324050903, + "num_tokens": 75255982.0, + "step": 26510 + }, + { + "entropy": 0.05292671788483858, + "epoch": 6.1806737381979255, + "grad_norm": 1.109375, + "learning_rate": 4.882845667033127e-05, + "loss": 0.027, + "mean_token_accuracy": 0.9933640778064727, + "num_tokens": 75283127.0, + "step": 26515 + }, + { + "entropy": 0.0652140263468027, + "epoch": 6.181839375218557, + "grad_norm": 1.0625, + "learning_rate": 4.882781139552327e-05, + "loss": 0.0242, + "mean_token_accuracy": 0.9877884387969971, + "num_tokens": 75313041.0, + "step": 26520 + }, + { + "entropy": 0.07414305829443038, + "epoch": 6.183005012239189, + "grad_norm": 1.734375, + "learning_rate": 4.8827165951800565e-05, + "loss": 0.0511, + "mean_token_accuracy": 0.9878878772258759, + "num_tokens": 75330555.0, + "step": 26525 + }, + { + "entropy": 0.09437569361180068, + "epoch": 6.18417064925982, + "grad_norm": 1.4921875, + "learning_rate": 4.8826520339172774e-05, + "loss": 0.0328, + "mean_token_accuracy": 0.98801589012146, + "num_tokens": 75343418.0, + "step": 26530 + }, + { + "entropy": 0.07906931964680552, + "epoch": 6.185336286280452, + "grad_norm": 0.3359375, + "learning_rate": 4.882587455764954e-05, + "loss": 0.042, + "mean_token_accuracy": 0.9886034429073334, + "num_tokens": 75357739.0, + "step": 26535 + }, + { + "entropy": 0.09388845460489392, + "epoch": 6.186501923301084, + "grad_norm": 0.142578125, + "learning_rate": 4.882522860724047e-05, + "loss": 0.0644, + "mean_token_accuracy": 0.9776318430900574, + "num_tokens": 75408177.0, + "step": 26540 + }, + { + "entropy": 0.06323309913277626, + "epoch": 6.1876675603217155, + "grad_norm": 1.7265625, + "learning_rate": 4.8824582487955214e-05, + "loss": 0.0297, + "mean_token_accuracy": 0.9861397266387939, + "num_tokens": 75445410.0, + "step": 26545 + }, + { + "entropy": 0.06687562046572566, + "epoch": 6.188833197342348, + "grad_norm": 1.703125, + "learning_rate": 4.88239361998034e-05, + "loss": 0.0224, + "mean_token_accuracy": 0.9914937198162079, + "num_tokens": 75467771.0, + "step": 26550 + }, + { + "entropy": 0.05728769712150097, + "epoch": 6.18999883436298, + "grad_norm": 1.1015625, + "learning_rate": 4.882328974279467e-05, + "loss": 0.0159, + "mean_token_accuracy": 0.9917247593402863, + "num_tokens": 75491896.0, + "step": 26555 + }, + { + "entropy": 0.08520476371049882, + "epoch": 6.191164471383611, + "grad_norm": 0.546875, + "learning_rate": 4.882264311693865e-05, + "loss": 0.0318, + "mean_token_accuracy": 0.9897422969341279, + "num_tokens": 75509972.0, + "step": 26560 + }, + { + "entropy": 0.05651366077363491, + "epoch": 6.192330108404243, + "grad_norm": 0.2265625, + "learning_rate": 4.882199632224499e-05, + "loss": 0.0128, + "mean_token_accuracy": 0.9885481774806977, + "num_tokens": 75542748.0, + "step": 26565 + }, + { + "entropy": 0.07149199210107327, + "epoch": 6.193495745424874, + "grad_norm": 0.4375, + "learning_rate": 4.882134935872334e-05, + "loss": 0.027, + "mean_token_accuracy": 0.9882903575897217, + "num_tokens": 75563403.0, + "step": 26570 + }, + { + "entropy": 0.07008785083889961, + "epoch": 6.194661382445506, + "grad_norm": 2.96875, + "learning_rate": 4.882070222638334e-05, + "loss": 0.0349, + "mean_token_accuracy": 0.9898769140243531, + "num_tokens": 75578117.0, + "step": 26575 + }, + { + "entropy": 0.079210801795125, + "epoch": 6.1958270194661385, + "grad_norm": 2.484375, + "learning_rate": 4.8820054925234645e-05, + "loss": 0.0386, + "mean_token_accuracy": 0.986040997505188, + "num_tokens": 75597784.0, + "step": 26580 + }, + { + "entropy": 0.07456495780497789, + "epoch": 6.19699265648677, + "grad_norm": 1.828125, + "learning_rate": 4.8819407455286905e-05, + "loss": 0.026, + "mean_token_accuracy": 0.9922024965286255, + "num_tokens": 75613004.0, + "step": 26585 + }, + { + "entropy": 0.07746983375400304, + "epoch": 6.198158293507402, + "grad_norm": 2.40625, + "learning_rate": 4.881875981654977e-05, + "loss": 0.0449, + "mean_token_accuracy": 0.9894679367542267, + "num_tokens": 75627893.0, + "step": 26590 + }, + { + "entropy": 0.05641837287694216, + "epoch": 6.199323930528034, + "grad_norm": 1.9921875, + "learning_rate": 4.881811200903291e-05, + "loss": 0.0275, + "mean_token_accuracy": 0.9910026013851165, + "num_tokens": 75651925.0, + "step": 26595 + }, + { + "entropy": 0.07631746679544449, + "epoch": 6.200489567548665, + "grad_norm": 1.9921875, + "learning_rate": 4.881746403274597e-05, + "loss": 0.0479, + "mean_token_accuracy": 0.990505713224411, + "num_tokens": 75661578.0, + "step": 26600 + }, + { + "entropy": 0.09214288219809533, + "epoch": 6.201655204569297, + "grad_norm": 2.75, + "learning_rate": 4.881681588769862e-05, + "loss": 0.0464, + "mean_token_accuracy": 0.9885360062122345, + "num_tokens": 75671242.0, + "step": 26605 + }, + { + "entropy": 0.05341388094238937, + "epoch": 6.202820841589929, + "grad_norm": 0.365234375, + "learning_rate": 4.8816167573900524e-05, + "loss": 0.0176, + "mean_token_accuracy": 0.9929543256759643, + "num_tokens": 75697552.0, + "step": 26610 + }, + { + "entropy": 0.06103728674352169, + "epoch": 6.2039864786105605, + "grad_norm": 0.96875, + "learning_rate": 4.881551909136135e-05, + "loss": 0.0253, + "mean_token_accuracy": 0.9898181974887847, + "num_tokens": 75715801.0, + "step": 26615 + }, + { + "entropy": 0.054025298729538915, + "epoch": 6.205152115631193, + "grad_norm": 1.0703125, + "learning_rate": 4.8814870440090764e-05, + "loss": 0.0293, + "mean_token_accuracy": 0.9889390408992768, + "num_tokens": 75733829.0, + "step": 26620 + }, + { + "entropy": 0.06002594884485006, + "epoch": 6.206317752651824, + "grad_norm": 2.890625, + "learning_rate": 4.881422162009844e-05, + "loss": 0.0377, + "mean_token_accuracy": 0.9891004621982574, + "num_tokens": 75752108.0, + "step": 26625 + }, + { + "entropy": 0.07102447785437108, + "epoch": 6.207483389672456, + "grad_norm": 7.03125, + "learning_rate": 4.881357263139406e-05, + "loss": 0.0342, + "mean_token_accuracy": 0.9882679045200348, + "num_tokens": 75766733.0, + "step": 26630 + }, + { + "entropy": 0.06536264475435019, + "epoch": 6.208649026693088, + "grad_norm": 2.8125, + "learning_rate": 4.8812923473987295e-05, + "loss": 0.0337, + "mean_token_accuracy": 0.9899225354194641, + "num_tokens": 75781265.0, + "step": 26635 + }, + { + "entropy": 0.05781321842223406, + "epoch": 6.209814663713719, + "grad_norm": 0.490234375, + "learning_rate": 4.881227414788782e-05, + "loss": 0.0171, + "mean_token_accuracy": 0.9932263731956482, + "num_tokens": 75800349.0, + "step": 26640 + }, + { + "entropy": 0.06824675220996142, + "epoch": 6.210980300734351, + "grad_norm": 5.125, + "learning_rate": 4.881162465310533e-05, + "loss": 0.0343, + "mean_token_accuracy": 0.9887260556221008, + "num_tokens": 75818446.0, + "step": 26645 + }, + { + "entropy": 0.0608520383015275, + "epoch": 6.2121459377549835, + "grad_norm": 0.40234375, + "learning_rate": 4.88109749896495e-05, + "loss": 0.0272, + "mean_token_accuracy": 0.9919718444347382, + "num_tokens": 75839774.0, + "step": 26650 + }, + { + "entropy": 0.0646435147151351, + "epoch": 6.213311574775615, + "grad_norm": 0.26171875, + "learning_rate": 4.881032515753002e-05, + "loss": 0.016, + "mean_token_accuracy": 0.991169399023056, + "num_tokens": 75867534.0, + "step": 26655 + }, + { + "entropy": 0.07422850346192718, + "epoch": 6.214477211796247, + "grad_norm": 0.455078125, + "learning_rate": 4.880967515675657e-05, + "loss": 0.0444, + "mean_token_accuracy": 0.9853957951068878, + "num_tokens": 75883164.0, + "step": 26660 + }, + { + "entropy": 0.05428111115470528, + "epoch": 6.215642848816878, + "grad_norm": 1.578125, + "learning_rate": 4.880902498733887e-05, + "loss": 0.0224, + "mean_token_accuracy": 0.9905717432498932, + "num_tokens": 75909765.0, + "step": 26665 + }, + { + "entropy": 0.06294290795922279, + "epoch": 6.21680848583751, + "grad_norm": 3.203125, + "learning_rate": 4.880837464928659e-05, + "loss": 0.0475, + "mean_token_accuracy": 0.9853145003318786, + "num_tokens": 75920338.0, + "step": 26670 + }, + { + "entropy": 0.05609114458784461, + "epoch": 6.217974122858142, + "grad_norm": 0.322265625, + "learning_rate": 4.880772414260944e-05, + "loss": 0.0273, + "mean_token_accuracy": 0.9908358454704285, + "num_tokens": 75951772.0, + "step": 26675 + }, + { + "entropy": 0.06486607976257801, + "epoch": 6.219139759878773, + "grad_norm": 2.28125, + "learning_rate": 4.88070734673171e-05, + "loss": 0.0405, + "mean_token_accuracy": 0.9895318448543549, + "num_tokens": 75963086.0, + "step": 26680 + }, + { + "entropy": 0.06027032807469368, + "epoch": 6.2203053968994055, + "grad_norm": 0.498046875, + "learning_rate": 4.88064226234193e-05, + "loss": 0.0284, + "mean_token_accuracy": 0.9899581730365753, + "num_tokens": 75987378.0, + "step": 26685 + }, + { + "entropy": 0.0686921939253807, + "epoch": 6.221471033920038, + "grad_norm": 0.984375, + "learning_rate": 4.880577161092573e-05, + "loss": 0.0314, + "mean_token_accuracy": 0.9877607941627502, + "num_tokens": 76005481.0, + "step": 26690 + }, + { + "entropy": 0.07105220509693026, + "epoch": 6.222636670940669, + "grad_norm": 0.1826171875, + "learning_rate": 4.88051204298461e-05, + "loss": 0.0438, + "mean_token_accuracy": 0.9898497760295868, + "num_tokens": 76023271.0, + "step": 26695 + }, + { + "entropy": 0.07426328733563423, + "epoch": 6.223802307961301, + "grad_norm": 2.453125, + "learning_rate": 4.8804469080190126e-05, + "loss": 0.0321, + "mean_token_accuracy": 0.9896513879299164, + "num_tokens": 76038726.0, + "step": 26700 + }, + { + "entropy": 0.07415295001119375, + "epoch": 6.224967944981932, + "grad_norm": 1.84375, + "learning_rate": 4.880381756196751e-05, + "loss": 0.0229, + "mean_token_accuracy": 0.9894894480705261, + "num_tokens": 76069366.0, + "step": 26705 + }, + { + "entropy": 0.0542880711145699, + "epoch": 6.226133582002564, + "grad_norm": 1.2421875, + "learning_rate": 4.8803165875187975e-05, + "loss": 0.0165, + "mean_token_accuracy": 0.9950318813323975, + "num_tokens": 76104221.0, + "step": 26710 + }, + { + "entropy": 0.05485262274742127, + "epoch": 6.227299219023196, + "grad_norm": 0.859375, + "learning_rate": 4.880251401986123e-05, + "loss": 0.0197, + "mean_token_accuracy": 0.9925736427307129, + "num_tokens": 76136927.0, + "step": 26715 + }, + { + "entropy": 0.0561104491353035, + "epoch": 6.228464856043828, + "grad_norm": 0.404296875, + "learning_rate": 4.8801861995997004e-05, + "loss": 0.0225, + "mean_token_accuracy": 0.9895463585853577, + "num_tokens": 76172523.0, + "step": 26720 + }, + { + "entropy": 0.05799694359302521, + "epoch": 6.22963049306446, + "grad_norm": 0.296875, + "learning_rate": 4.880120980360502e-05, + "loss": 0.0219, + "mean_token_accuracy": 0.9899403989315033, + "num_tokens": 76202120.0, + "step": 26725 + }, + { + "entropy": 0.054118511453270915, + "epoch": 6.230796130085092, + "grad_norm": 0.97265625, + "learning_rate": 4.880055744269499e-05, + "loss": 0.0231, + "mean_token_accuracy": 0.9917508125305176, + "num_tokens": 76232161.0, + "step": 26730 + }, + { + "entropy": 0.07646337822079659, + "epoch": 6.231961767105723, + "grad_norm": 2.109375, + "learning_rate": 4.879990491327667e-05, + "loss": 0.0555, + "mean_token_accuracy": 0.9848457276821136, + "num_tokens": 76241597.0, + "step": 26735 + }, + { + "entropy": 0.06108809132128954, + "epoch": 6.233127404126355, + "grad_norm": 3.875, + "learning_rate": 4.879925221535976e-05, + "loss": 0.0234, + "mean_token_accuracy": 0.9888475894927978, + "num_tokens": 76263968.0, + "step": 26740 + }, + { + "entropy": 0.04935785736888647, + "epoch": 6.234293041146987, + "grad_norm": 0.28125, + "learning_rate": 4.8798599348954e-05, + "loss": 0.029, + "mean_token_accuracy": 0.9913921117782593, + "num_tokens": 76293126.0, + "step": 26745 + }, + { + "entropy": 0.07111759670078754, + "epoch": 6.2354586781676185, + "grad_norm": 0.68359375, + "learning_rate": 4.879794631406914e-05, + "loss": 0.0454, + "mean_token_accuracy": 0.9879423141479492, + "num_tokens": 76314977.0, + "step": 26750 + }, + { + "entropy": 0.0636790843680501, + "epoch": 6.2366243151882506, + "grad_norm": 0.63671875, + "learning_rate": 4.8797293110714906e-05, + "loss": 0.0249, + "mean_token_accuracy": 0.9907831728458405, + "num_tokens": 76329840.0, + "step": 26755 + }, + { + "entropy": 0.05912710763514042, + "epoch": 6.237789952208882, + "grad_norm": 0.318359375, + "learning_rate": 4.8796639738901026e-05, + "loss": 0.0203, + "mean_token_accuracy": 0.9898356914520263, + "num_tokens": 76354041.0, + "step": 26760 + }, + { + "entropy": 0.07571594156324864, + "epoch": 6.238955589229514, + "grad_norm": 0.35546875, + "learning_rate": 4.879598619863727e-05, + "loss": 0.0382, + "mean_token_accuracy": 0.9869071960449218, + "num_tokens": 76372519.0, + "step": 26765 + }, + { + "entropy": 0.06836293041706085, + "epoch": 6.240121226250146, + "grad_norm": 1.0078125, + "learning_rate": 4.879533248993337e-05, + "loss": 0.0386, + "mean_token_accuracy": 0.9874460101127625, + "num_tokens": 76386239.0, + "step": 26770 + }, + { + "entropy": 0.08493883013725281, + "epoch": 6.241286863270777, + "grad_norm": 1.9453125, + "learning_rate": 4.8794678612799066e-05, + "loss": 0.0388, + "mean_token_accuracy": 0.9879676699638367, + "num_tokens": 76401255.0, + "step": 26775 + }, + { + "entropy": 0.07453780174255371, + "epoch": 6.242452500291409, + "grad_norm": 1.65625, + "learning_rate": 4.879402456724412e-05, + "loss": 0.0333, + "mean_token_accuracy": 0.9884099781513214, + "num_tokens": 76420058.0, + "step": 26780 + }, + { + "entropy": 0.04887163182720542, + "epoch": 6.243618137312041, + "grad_norm": 0.103515625, + "learning_rate": 4.8793370353278276e-05, + "loss": 0.0144, + "mean_token_accuracy": 0.9924459099769593, + "num_tokens": 76449079.0, + "step": 26785 + }, + { + "entropy": 0.058128141332417727, + "epoch": 6.244783774332673, + "grad_norm": 1.828125, + "learning_rate": 4.8792715970911305e-05, + "loss": 0.0267, + "mean_token_accuracy": 0.9893514454364777, + "num_tokens": 76476260.0, + "step": 26790 + }, + { + "entropy": 0.07593741156160831, + "epoch": 6.245949411353305, + "grad_norm": 0.87890625, + "learning_rate": 4.879206142015294e-05, + "loss": 0.0404, + "mean_token_accuracy": 0.9897358655929566, + "num_tokens": 76502102.0, + "step": 26795 + }, + { + "entropy": 0.09316719993948937, + "epoch": 6.247115048373936, + "grad_norm": 2.5, + "learning_rate": 4.879140670101296e-05, + "loss": 0.0473, + "mean_token_accuracy": 0.9866836547851563, + "num_tokens": 76517793.0, + "step": 26800 + }, + { + "entropy": 0.07820145031437278, + "epoch": 6.248280685394568, + "grad_norm": 3.65625, + "learning_rate": 4.879075181350113e-05, + "loss": 0.0515, + "mean_token_accuracy": 0.9878412783145905, + "num_tokens": 76545744.0, + "step": 26805 + }, + { + "entropy": 0.06826126556843519, + "epoch": 6.2494463224152, + "grad_norm": 1.2109375, + "learning_rate": 4.8790096757627205e-05, + "loss": 0.041, + "mean_token_accuracy": 0.9902231276035309, + "num_tokens": 76562994.0, + "step": 26810 + }, + { + "entropy": 0.05862971879541874, + "epoch": 6.250611959435831, + "grad_norm": 0.197265625, + "learning_rate": 4.878944153340095e-05, + "loss": 0.0188, + "mean_token_accuracy": 0.9923254668712616, + "num_tokens": 76592668.0, + "step": 26815 + }, + { + "entropy": 0.041546163335442546, + "epoch": 6.2517775964564635, + "grad_norm": 0.81640625, + "learning_rate": 4.878878614083214e-05, + "loss": 0.0154, + "mean_token_accuracy": 0.9943924307823181, + "num_tokens": 76618811.0, + "step": 26820 + }, + { + "entropy": 0.0818638127297163, + "epoch": 6.252943233477096, + "grad_norm": 0.6953125, + "learning_rate": 4.878813057993056e-05, + "loss": 0.0442, + "mean_token_accuracy": 0.9860123097896576, + "num_tokens": 76630583.0, + "step": 26825 + }, + { + "entropy": 0.07344717662781478, + "epoch": 6.254108870497727, + "grad_norm": 0.2060546875, + "learning_rate": 4.878747485070597e-05, + "loss": 0.0177, + "mean_token_accuracy": 0.9916049182415009, + "num_tokens": 76654703.0, + "step": 26830 + }, + { + "entropy": 0.06005438230931759, + "epoch": 6.255274507518359, + "grad_norm": 3.59375, + "learning_rate": 4.878681895316815e-05, + "loss": 0.0361, + "mean_token_accuracy": 0.9874625086784363, + "num_tokens": 76667234.0, + "step": 26835 + }, + { + "entropy": 0.05835657585412264, + "epoch": 6.25644014453899, + "grad_norm": 0.875, + "learning_rate": 4.878616288732688e-05, + "loss": 0.0314, + "mean_token_accuracy": 0.9923226952552795, + "num_tokens": 76680110.0, + "step": 26840 + }, + { + "entropy": 0.061682638619095086, + "epoch": 6.257605781559622, + "grad_norm": 1.890625, + "learning_rate": 4.8785506653191956e-05, + "loss": 0.024, + "mean_token_accuracy": 0.9914807081222534, + "num_tokens": 76701644.0, + "step": 26845 + }, + { + "entropy": 0.052669542096555236, + "epoch": 6.258771418580254, + "grad_norm": 0.7265625, + "learning_rate": 4.878485025077315e-05, + "loss": 0.0218, + "mean_token_accuracy": 0.9928339958190918, + "num_tokens": 76722251.0, + "step": 26850 + }, + { + "entropy": 0.05744164055213332, + "epoch": 6.2599370556008855, + "grad_norm": 1.0703125, + "learning_rate": 4.8784193680080247e-05, + "loss": 0.0268, + "mean_token_accuracy": 0.9903415560722351, + "num_tokens": 76741829.0, + "step": 26855 + }, + { + "entropy": 0.056614281982183455, + "epoch": 6.261102692621518, + "grad_norm": 1.265625, + "learning_rate": 4.878353694112305e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9917404353618622, + "num_tokens": 76766115.0, + "step": 26860 + }, + { + "entropy": 0.060635652393102646, + "epoch": 6.26226832964215, + "grad_norm": 1.8203125, + "learning_rate": 4.8782880033911346e-05, + "loss": 0.0314, + "mean_token_accuracy": 0.9926228582859039, + "num_tokens": 76777892.0, + "step": 26865 + }, + { + "entropy": 0.08467749282717704, + "epoch": 6.263433966662781, + "grad_norm": 1.34375, + "learning_rate": 4.878222295845493e-05, + "loss": 0.053, + "mean_token_accuracy": 0.984963458776474, + "num_tokens": 76786498.0, + "step": 26870 + }, + { + "entropy": 0.07869560457766056, + "epoch": 6.264599603683413, + "grad_norm": 2.0625, + "learning_rate": 4.8781565714763594e-05, + "loss": 0.0531, + "mean_token_accuracy": 0.9836140871047974, + "num_tokens": 76806702.0, + "step": 26875 + }, + { + "entropy": 0.08117300998419523, + "epoch": 6.265765240704045, + "grad_norm": 3.984375, + "learning_rate": 4.878090830284715e-05, + "loss": 0.0588, + "mean_token_accuracy": 0.9826340734958648, + "num_tokens": 76818610.0, + "step": 26880 + }, + { + "entropy": 0.05971610611304641, + "epoch": 6.266930877724676, + "grad_norm": 0.66796875, + "learning_rate": 4.878025072271539e-05, + "loss": 0.0255, + "mean_token_accuracy": 0.991646808385849, + "num_tokens": 76845946.0, + "step": 26885 + }, + { + "entropy": 0.06977970357984305, + "epoch": 6.2680965147453085, + "grad_norm": 0.359375, + "learning_rate": 4.877959297437814e-05, + "loss": 0.0348, + "mean_token_accuracy": 0.9882163524627685, + "num_tokens": 76864978.0, + "step": 26890 + }, + { + "entropy": 0.07290565092116594, + "epoch": 6.26926215176594, + "grad_norm": 2.09375, + "learning_rate": 4.877893505784518e-05, + "loss": 0.0369, + "mean_token_accuracy": 0.989104425907135, + "num_tokens": 76892345.0, + "step": 26895 + }, + { + "entropy": 0.06487658582627773, + "epoch": 6.270427788786572, + "grad_norm": 2.375, + "learning_rate": 4.877827697312634e-05, + "loss": 0.0324, + "mean_token_accuracy": 0.9904053211212158, + "num_tokens": 76910852.0, + "step": 26900 + }, + { + "entropy": 0.06390552939847112, + "epoch": 6.271593425807204, + "grad_norm": 2.84375, + "learning_rate": 4.877761872023142e-05, + "loss": 0.0255, + "mean_token_accuracy": 0.9912206172943115, + "num_tokens": 76941311.0, + "step": 26905 + }, + { + "entropy": 0.096971770003438, + "epoch": 6.272759062827835, + "grad_norm": 5.3125, + "learning_rate": 4.877696029917025e-05, + "loss": 0.1081, + "mean_token_accuracy": 0.9758206665515899, + "num_tokens": 76960875.0, + "step": 26910 + }, + { + "entropy": 0.08263939693570137, + "epoch": 6.273924699848467, + "grad_norm": 3.78125, + "learning_rate": 4.877630170995264e-05, + "loss": 0.0536, + "mean_token_accuracy": 0.9840788066387176, + "num_tokens": 76978127.0, + "step": 26915 + }, + { + "entropy": 0.05669274376705289, + "epoch": 6.275090336869099, + "grad_norm": 0.8046875, + "learning_rate": 4.877564295258841e-05, + "loss": 0.0296, + "mean_token_accuracy": 0.9915313005447388, + "num_tokens": 77000427.0, + "step": 26920 + }, + { + "entropy": 0.08088835161179304, + "epoch": 6.2762559738897306, + "grad_norm": 2.859375, + "learning_rate": 4.877498402708738e-05, + "loss": 0.0379, + "mean_token_accuracy": 0.9871138870716095, + "num_tokens": 77021781.0, + "step": 26925 + }, + { + "entropy": 0.06899372283369302, + "epoch": 6.277421610910363, + "grad_norm": 0.44921875, + "learning_rate": 4.877432493345938e-05, + "loss": 0.0245, + "mean_token_accuracy": 0.9892875671386718, + "num_tokens": 77043096.0, + "step": 26930 + }, + { + "entropy": 0.07236437350511551, + "epoch": 6.278587247930994, + "grad_norm": 2.140625, + "learning_rate": 4.877366567171424e-05, + "loss": 0.0261, + "mean_token_accuracy": 0.9930740773677826, + "num_tokens": 77057439.0, + "step": 26935 + }, + { + "entropy": 0.06578505616635084, + "epoch": 6.279752884951626, + "grad_norm": 0.56640625, + "learning_rate": 4.8773006241861786e-05, + "loss": 0.0416, + "mean_token_accuracy": 0.9892131865024567, + "num_tokens": 77073294.0, + "step": 26940 + }, + { + "entropy": 0.0702615974470973, + "epoch": 6.280918521972258, + "grad_norm": 4.5625, + "learning_rate": 4.877234664391185e-05, + "loss": 0.05, + "mean_token_accuracy": 0.9869410037994385, + "num_tokens": 77087501.0, + "step": 26945 + }, + { + "entropy": 0.045784792955964805, + "epoch": 6.282084158992889, + "grad_norm": 0.91796875, + "learning_rate": 4.877168687787428e-05, + "loss": 0.0191, + "mean_token_accuracy": 0.9897647440433502, + "num_tokens": 77120136.0, + "step": 26950 + }, + { + "entropy": 0.06734316907823086, + "epoch": 6.283249796013521, + "grad_norm": 3.921875, + "learning_rate": 4.87710269437589e-05, + "loss": 0.0418, + "mean_token_accuracy": 0.9890086054801941, + "num_tokens": 77131782.0, + "step": 26955 + }, + { + "entropy": 0.1180669752880931, + "epoch": 6.2844154330341535, + "grad_norm": 0.34765625, + "learning_rate": 4.877036684157556e-05, + "loss": 0.1248, + "mean_token_accuracy": 0.9731480836868286, + "num_tokens": 77158773.0, + "step": 26960 + }, + { + "entropy": 0.07690473701804876, + "epoch": 6.285581070054785, + "grad_norm": 0.271484375, + "learning_rate": 4.8769706571334095e-05, + "loss": 0.0209, + "mean_token_accuracy": 0.9888293027877808, + "num_tokens": 77183757.0, + "step": 26965 + }, + { + "entropy": 0.0930105771869421, + "epoch": 6.286746707075417, + "grad_norm": 2.203125, + "learning_rate": 4.876904613304435e-05, + "loss": 0.051, + "mean_token_accuracy": 0.9848386824131012, + "num_tokens": 77193681.0, + "step": 26970 + }, + { + "entropy": 0.060835771076381204, + "epoch": 6.287912344096048, + "grad_norm": 3.15625, + "learning_rate": 4.876838552671619e-05, + "loss": 0.0264, + "mean_token_accuracy": 0.9885562717914581, + "num_tokens": 77219243.0, + "step": 26975 + }, + { + "entropy": 0.06320591568946839, + "epoch": 6.28907798111668, + "grad_norm": 1.1171875, + "learning_rate": 4.876772475235945e-05, + "loss": 0.0349, + "mean_token_accuracy": 0.9873491525650024, + "num_tokens": 77235210.0, + "step": 26980 + }, + { + "entropy": 0.06562241949141026, + "epoch": 6.290243618137312, + "grad_norm": 3.671875, + "learning_rate": 4.876706380998398e-05, + "loss": 0.0388, + "mean_token_accuracy": 0.9866467297077179, + "num_tokens": 77249292.0, + "step": 26985 + }, + { + "entropy": 0.07758224606513978, + "epoch": 6.2914092551579435, + "grad_norm": 2.390625, + "learning_rate": 4.8766402699599646e-05, + "loss": 0.0579, + "mean_token_accuracy": 0.981865918636322, + "num_tokens": 77260195.0, + "step": 26990 + }, + { + "entropy": 0.06526398342102765, + "epoch": 6.292574892178576, + "grad_norm": 0.4140625, + "learning_rate": 4.8765741421216297e-05, + "loss": 0.0236, + "mean_token_accuracy": 0.9926089167594909, + "num_tokens": 77278321.0, + "step": 26995 + }, + { + "entropy": 0.047286936268210414, + "epoch": 6.293740529199208, + "grad_norm": 1.6953125, + "learning_rate": 4.87650799748438e-05, + "loss": 0.0166, + "mean_token_accuracy": 0.9924996316432952, + "num_tokens": 77310297.0, + "step": 27000 + }, + { + "entropy": 0.08713566735386849, + "epoch": 6.294906166219839, + "grad_norm": 1.3046875, + "learning_rate": 4.876441836049202e-05, + "loss": 0.0707, + "mean_token_accuracy": 0.9823097467422486, + "num_tokens": 77319165.0, + "step": 27005 + }, + { + "entropy": 0.0667908507399261, + "epoch": 6.296071803240471, + "grad_norm": 0.84375, + "learning_rate": 4.8763756578170814e-05, + "loss": 0.0329, + "mean_token_accuracy": 0.9904216408729554, + "num_tokens": 77335995.0, + "step": 27010 + }, + { + "entropy": 0.07371202344074845, + "epoch": 6.297237440261103, + "grad_norm": 2.484375, + "learning_rate": 4.8763094627890065e-05, + "loss": 0.0305, + "mean_token_accuracy": 0.9872647106647492, + "num_tokens": 77361297.0, + "step": 27015 + }, + { + "entropy": 0.07954277824610471, + "epoch": 6.298403077281734, + "grad_norm": 0.5078125, + "learning_rate": 4.876243250965963e-05, + "loss": 0.043, + "mean_token_accuracy": 0.9880691766738892, + "num_tokens": 77376597.0, + "step": 27020 + }, + { + "entropy": 0.07487954199314117, + "epoch": 6.299568714302366, + "grad_norm": 2.90625, + "learning_rate": 4.876177022348939e-05, + "loss": 0.0461, + "mean_token_accuracy": 0.9878607869148255, + "num_tokens": 77409350.0, + "step": 27025 + }, + { + "entropy": 0.08372844085097313, + "epoch": 6.300734351322998, + "grad_norm": 2.03125, + "learning_rate": 4.8761107769389214e-05, + "loss": 0.04, + "mean_token_accuracy": 0.988189697265625, + "num_tokens": 77427888.0, + "step": 27030 + }, + { + "entropy": 0.07974276877939701, + "epoch": 6.30189998834363, + "grad_norm": 0.9609375, + "learning_rate": 4.8760445147368986e-05, + "loss": 0.038, + "mean_token_accuracy": 0.991102111339569, + "num_tokens": 77439750.0, + "step": 27035 + }, + { + "entropy": 0.06711346944794058, + "epoch": 6.303065625364262, + "grad_norm": 0.150390625, + "learning_rate": 4.875978235743858e-05, + "loss": 0.0259, + "mean_token_accuracy": 0.9904236793518066, + "num_tokens": 77465080.0, + "step": 27040 + }, + { + "entropy": 0.0663998268544674, + "epoch": 6.304231262384893, + "grad_norm": 1.765625, + "learning_rate": 4.875911939960788e-05, + "loss": 0.033, + "mean_token_accuracy": 0.9886034607887269, + "num_tokens": 77477493.0, + "step": 27045 + }, + { + "entropy": 0.06868320833891631, + "epoch": 6.305396899405525, + "grad_norm": 0.158203125, + "learning_rate": 4.875845627388678e-05, + "loss": 0.0151, + "mean_token_accuracy": 0.991125100851059, + "num_tokens": 77518272.0, + "step": 27050 + }, + { + "entropy": 0.05745923724025488, + "epoch": 6.306562536426157, + "grad_norm": 0.57421875, + "learning_rate": 4.875779298028517e-05, + "loss": 0.0158, + "mean_token_accuracy": 0.9914752006530761, + "num_tokens": 77551818.0, + "step": 27055 + }, + { + "entropy": 0.06080328058451414, + "epoch": 6.3077281734467885, + "grad_norm": 1.046875, + "learning_rate": 4.875712951881292e-05, + "loss": 0.0336, + "mean_token_accuracy": 0.9897435069084167, + "num_tokens": 77570567.0, + "step": 27060 + }, + { + "entropy": 0.0706357978284359, + "epoch": 6.308893810467421, + "grad_norm": 0.234375, + "learning_rate": 4.875646588947995e-05, + "loss": 0.0514, + "mean_token_accuracy": 0.9898265242576599, + "num_tokens": 77592122.0, + "step": 27065 + }, + { + "entropy": 0.05524565796367824, + "epoch": 6.310059447488052, + "grad_norm": 0.2490234375, + "learning_rate": 4.875580209229613e-05, + "loss": 0.0172, + "mean_token_accuracy": 0.9909223794937134, + "num_tokens": 77626893.0, + "step": 27070 + }, + { + "entropy": 0.1092427797615528, + "epoch": 6.311225084508684, + "grad_norm": 0.68359375, + "learning_rate": 4.8755138127271383e-05, + "loss": 0.0909, + "mean_token_accuracy": 0.9715740263462067, + "num_tokens": 77657199.0, + "step": 27075 + }, + { + "entropy": 0.07431203294545412, + "epoch": 6.312390721529316, + "grad_norm": 2.890625, + "learning_rate": 4.87544739944156e-05, + "loss": 0.0384, + "mean_token_accuracy": 0.9846733629703521, + "num_tokens": 77678532.0, + "step": 27080 + }, + { + "entropy": 0.06422578003257513, + "epoch": 6.313556358549947, + "grad_norm": 1.046875, + "learning_rate": 4.875380969373867e-05, + "loss": 0.0313, + "mean_token_accuracy": 0.9897050440311432, + "num_tokens": 77695040.0, + "step": 27085 + }, + { + "entropy": 0.05756510868668556, + "epoch": 6.314721995570579, + "grad_norm": 0.859375, + "learning_rate": 4.875314522525052e-05, + "loss": 0.0217, + "mean_token_accuracy": 0.9928114414215088, + "num_tokens": 77716217.0, + "step": 27090 + }, + { + "entropy": 0.06490825079381465, + "epoch": 6.315887632591211, + "grad_norm": 0.447265625, + "learning_rate": 4.875248058896104e-05, + "loss": 0.0166, + "mean_token_accuracy": 0.9889268100261688, + "num_tokens": 77744133.0, + "step": 27095 + }, + { + "entropy": 0.06178513718768954, + "epoch": 6.317053269611843, + "grad_norm": 0.263671875, + "learning_rate": 4.8751815784880154e-05, + "loss": 0.0288, + "mean_token_accuracy": 0.9889122724533081, + "num_tokens": 77772314.0, + "step": 27100 + }, + { + "entropy": 0.05994966216385365, + "epoch": 6.318218906632475, + "grad_norm": 1.484375, + "learning_rate": 4.8751150813017776e-05, + "loss": 0.0423, + "mean_token_accuracy": 0.989544004201889, + "num_tokens": 77784114.0, + "step": 27105 + }, + { + "entropy": 0.0725404830649495, + "epoch": 6.319384543653106, + "grad_norm": 0.3359375, + "learning_rate": 4.875048567338381e-05, + "loss": 0.0426, + "mean_token_accuracy": 0.9853807687759399, + "num_tokens": 77805973.0, + "step": 27110 + }, + { + "entropy": 0.07376164561137558, + "epoch": 6.320550180673738, + "grad_norm": 3.109375, + "learning_rate": 4.8749820365988185e-05, + "loss": 0.0517, + "mean_token_accuracy": 0.9803310930728912, + "num_tokens": 77829286.0, + "step": 27115 + }, + { + "entropy": 0.08759814314544201, + "epoch": 6.32171581769437, + "grad_norm": 0.79296875, + "learning_rate": 4.874915489084081e-05, + "loss": 0.0421, + "mean_token_accuracy": 0.9889974951744079, + "num_tokens": 77840547.0, + "step": 27120 + }, + { + "entropy": 0.08322993703186513, + "epoch": 6.322881454715001, + "grad_norm": 1.578125, + "learning_rate": 4.874848924795163e-05, + "loss": 0.0499, + "mean_token_accuracy": 0.9860299587249756, + "num_tokens": 77848476.0, + "step": 27125 + }, + { + "entropy": 0.06156183313578367, + "epoch": 6.3240470917356335, + "grad_norm": 0.361328125, + "learning_rate": 4.874782343733055e-05, + "loss": 0.021, + "mean_token_accuracy": 0.992520546913147, + "num_tokens": 77874137.0, + "step": 27130 + }, + { + "entropy": 0.04788096770644188, + "epoch": 6.325212728756266, + "grad_norm": 0.62890625, + "learning_rate": 4.8747157458987505e-05, + "loss": 0.025, + "mean_token_accuracy": 0.9898177683353424, + "num_tokens": 77917307.0, + "step": 27135 + }, + { + "entropy": 0.0891963217407465, + "epoch": 6.326378365776897, + "grad_norm": 1.4296875, + "learning_rate": 4.874649131293242e-05, + "loss": 0.0495, + "mean_token_accuracy": 0.9910438537597657, + "num_tokens": 77937080.0, + "step": 27140 + }, + { + "entropy": 0.07902921987697482, + "epoch": 6.327544002797529, + "grad_norm": 1.875, + "learning_rate": 4.874582499917524e-05, + "loss": 0.0296, + "mean_token_accuracy": 0.99190593957901, + "num_tokens": 77955404.0, + "step": 27145 + }, + { + "entropy": 0.06934650149196386, + "epoch": 6.328709639818161, + "grad_norm": 4.53125, + "learning_rate": 4.874515851772589e-05, + "loss": 0.0415, + "mean_token_accuracy": 0.9878355026245117, + "num_tokens": 77977887.0, + "step": 27150 + }, + { + "entropy": 0.042633866891264915, + "epoch": 6.329875276838792, + "grad_norm": 1.3046875, + "learning_rate": 4.8744491868594316e-05, + "loss": 0.0197, + "mean_token_accuracy": 0.995466285943985, + "num_tokens": 78004193.0, + "step": 27155 + }, + { + "entropy": 0.0678960201330483, + "epoch": 6.331040913859424, + "grad_norm": 0.5, + "learning_rate": 4.8743825051790455e-05, + "loss": 0.0328, + "mean_token_accuracy": 0.9873064517974853, + "num_tokens": 78021840.0, + "step": 27160 + }, + { + "entropy": 0.07100022733211517, + "epoch": 6.332206550880056, + "grad_norm": 0.3046875, + "learning_rate": 4.874315806732425e-05, + "loss": 0.0395, + "mean_token_accuracy": 0.9875366270542145, + "num_tokens": 78046750.0, + "step": 27165 + }, + { + "entropy": 0.07311735656112432, + "epoch": 6.333372187900688, + "grad_norm": 1.5703125, + "learning_rate": 4.874249091520565e-05, + "loss": 0.0356, + "mean_token_accuracy": 0.9898627936840058, + "num_tokens": 78062381.0, + "step": 27170 + }, + { + "entropy": 0.056971798092126845, + "epoch": 6.33453782492132, + "grad_norm": 0.357421875, + "learning_rate": 4.87418235954446e-05, + "loss": 0.0097, + "mean_token_accuracy": 0.9902470290660859, + "num_tokens": 78105924.0, + "step": 27175 + }, + { + "entropy": 0.06141122579574585, + "epoch": 6.335703461941951, + "grad_norm": 0.478515625, + "learning_rate": 4.874115610805105e-05, + "loss": 0.0343, + "mean_token_accuracy": 0.9900152325630188, + "num_tokens": 78128152.0, + "step": 27180 + }, + { + "entropy": 0.07991018062457442, + "epoch": 6.336869098962583, + "grad_norm": 0.265625, + "learning_rate": 4.8740488453034954e-05, + "loss": 0.0248, + "mean_token_accuracy": 0.9912156224250793, + "num_tokens": 78146706.0, + "step": 27185 + }, + { + "entropy": 0.07130510099232197, + "epoch": 6.338034735983215, + "grad_norm": 3.75, + "learning_rate": 4.8739820630406275e-05, + "loss": 0.0474, + "mean_token_accuracy": 0.9853857576847076, + "num_tokens": 78161648.0, + "step": 27190 + }, + { + "entropy": 0.04623548369854689, + "epoch": 6.339200373003846, + "grad_norm": 0.1650390625, + "learning_rate": 4.8739152640174956e-05, + "loss": 0.0167, + "mean_token_accuracy": 0.9954453229904174, + "num_tokens": 78200518.0, + "step": 27195 + }, + { + "entropy": 0.06505171973258257, + "epoch": 6.3403660100244785, + "grad_norm": 0.96875, + "learning_rate": 4.873848448235097e-05, + "loss": 0.0324, + "mean_token_accuracy": 0.989247715473175, + "num_tokens": 78220373.0, + "step": 27200 + }, + { + "entropy": 0.08946949765086173, + "epoch": 6.34153164704511, + "grad_norm": 3.375, + "learning_rate": 4.873781615694428e-05, + "loss": 0.0588, + "mean_token_accuracy": 0.985131961107254, + "num_tokens": 78228640.0, + "step": 27205 + }, + { + "entropy": 0.05788529254496098, + "epoch": 6.342697284065742, + "grad_norm": 0.85546875, + "learning_rate": 4.873714766396484e-05, + "loss": 0.0231, + "mean_token_accuracy": 0.9910314798355102, + "num_tokens": 78246948.0, + "step": 27210 + }, + { + "entropy": 0.05744266202673316, + "epoch": 6.343862921086374, + "grad_norm": 1.078125, + "learning_rate": 4.8736479003422636e-05, + "loss": 0.0316, + "mean_token_accuracy": 0.9888568997383118, + "num_tokens": 78265111.0, + "step": 27215 + }, + { + "entropy": 0.055955256521701816, + "epoch": 6.345028558107005, + "grad_norm": 2.359375, + "learning_rate": 4.873581017532762e-05, + "loss": 0.0275, + "mean_token_accuracy": 0.9928835809230805, + "num_tokens": 78288714.0, + "step": 27220 + }, + { + "entropy": 0.08183978088200092, + "epoch": 6.346194195127637, + "grad_norm": 0.404296875, + "learning_rate": 4.8735141179689785e-05, + "loss": 0.0569, + "mean_token_accuracy": 0.9858071386814118, + "num_tokens": 78320371.0, + "step": 27225 + }, + { + "entropy": 0.055092441756278275, + "epoch": 6.347359832148269, + "grad_norm": 0.55078125, + "learning_rate": 4.8734472016519097e-05, + "loss": 0.0185, + "mean_token_accuracy": 0.9905835092067719, + "num_tokens": 78343924.0, + "step": 27230 + }, + { + "entropy": 0.0675237711519003, + "epoch": 6.348525469168901, + "grad_norm": 0.59375, + "learning_rate": 4.8733802685825525e-05, + "loss": 0.0197, + "mean_token_accuracy": 0.993051677942276, + "num_tokens": 78367520.0, + "step": 27235 + }, + { + "entropy": 0.06913798209279776, + "epoch": 6.349691106189533, + "grad_norm": 0.8984375, + "learning_rate": 4.8733133187619065e-05, + "loss": 0.0627, + "mean_token_accuracy": 0.988431054353714, + "num_tokens": 78382339.0, + "step": 27240 + }, + { + "entropy": 0.08050851384177804, + "epoch": 6.350856743210164, + "grad_norm": 1.75, + "learning_rate": 4.873246352190969e-05, + "loss": 0.0468, + "mean_token_accuracy": 0.9877587676048278, + "num_tokens": 78400191.0, + "step": 27245 + }, + { + "entropy": 0.12997290436178446, + "epoch": 6.352022380230796, + "grad_norm": 2.921875, + "learning_rate": 4.8731793688707386e-05, + "loss": 0.1291, + "mean_token_accuracy": 0.9744395613670349, + "num_tokens": 78423703.0, + "step": 27250 + }, + { + "entropy": 0.05893737701699138, + "epoch": 6.353188017251428, + "grad_norm": 1.9453125, + "learning_rate": 4.873112368802215e-05, + "loss": 0.0417, + "mean_token_accuracy": 0.9888142168521881, + "num_tokens": 78438932.0, + "step": 27255 + }, + { + "entropy": 0.08109636697918177, + "epoch": 6.354353654272059, + "grad_norm": 0.5234375, + "learning_rate": 4.873045351986396e-05, + "loss": 0.0488, + "mean_token_accuracy": 0.9866352796554565, + "num_tokens": 78450813.0, + "step": 27260 + }, + { + "entropy": 0.09451097249984741, + "epoch": 6.355519291292691, + "grad_norm": 1.0546875, + "learning_rate": 4.872978318424283e-05, + "loss": 0.0295, + "mean_token_accuracy": 0.9913021385669708, + "num_tokens": 78465366.0, + "step": 27265 + }, + { + "entropy": 0.05986616881564259, + "epoch": 6.3566849283133235, + "grad_norm": 0.1806640625, + "learning_rate": 4.872911268116873e-05, + "loss": 0.0381, + "mean_token_accuracy": 0.9888629734516143, + "num_tokens": 78491298.0, + "step": 27270 + }, + { + "entropy": 0.06749337911605835, + "epoch": 6.357850565333955, + "grad_norm": 1.3359375, + "learning_rate": 4.872844201065168e-05, + "loss": 0.0364, + "mean_token_accuracy": 0.9906283020973206, + "num_tokens": 78502915.0, + "step": 27275 + }, + { + "entropy": 0.08259217841550708, + "epoch": 6.359016202354587, + "grad_norm": 2.21875, + "learning_rate": 4.872777117270166e-05, + "loss": 0.0456, + "mean_token_accuracy": 0.984370744228363, + "num_tokens": 78519778.0, + "step": 27280 + }, + { + "entropy": 0.07694516181945801, + "epoch": 6.360181839375219, + "grad_norm": 2.375, + "learning_rate": 4.8727100167328685e-05, + "loss": 0.0425, + "mean_token_accuracy": 0.9869538724422455, + "num_tokens": 78528363.0, + "step": 27285 + }, + { + "entropy": 0.0720290282741189, + "epoch": 6.36134747639585, + "grad_norm": 1.3203125, + "learning_rate": 4.872642899454277e-05, + "loss": 0.035, + "mean_token_accuracy": 0.9898767948150635, + "num_tokens": 78541818.0, + "step": 27290 + }, + { + "entropy": 0.07878648396581411, + "epoch": 6.362513113416482, + "grad_norm": 0.38671875, + "learning_rate": 4.872575765435391e-05, + "loss": 0.0379, + "mean_token_accuracy": 0.9881724655628205, + "num_tokens": 78555497.0, + "step": 27295 + }, + { + "entropy": 0.0814511626958847, + "epoch": 6.3636787504371135, + "grad_norm": 0.62109375, + "learning_rate": 4.8725086146772115e-05, + "loss": 0.0375, + "mean_token_accuracy": 0.9854908883571625, + "num_tokens": 78567574.0, + "step": 27300 + }, + { + "entropy": 0.08116347342729568, + "epoch": 6.364844387457746, + "grad_norm": 0.8515625, + "learning_rate": 4.87244144718074e-05, + "loss": 0.0599, + "mean_token_accuracy": 0.9835920453071594, + "num_tokens": 78577065.0, + "step": 27305 + }, + { + "entropy": 0.08684050869196654, + "epoch": 6.366010024478378, + "grad_norm": 0.6640625, + "learning_rate": 4.8723742629469794e-05, + "loss": 0.0309, + "mean_token_accuracy": 0.9864567041397094, + "num_tokens": 78596233.0, + "step": 27310 + }, + { + "entropy": 0.07369791343808174, + "epoch": 6.367175661499009, + "grad_norm": 1.765625, + "learning_rate": 4.87230706197693e-05, + "loss": 0.0591, + "mean_token_accuracy": 0.986175411939621, + "num_tokens": 78607212.0, + "step": 27315 + }, + { + "entropy": 0.06452398095279932, + "epoch": 6.368341298519641, + "grad_norm": 3.203125, + "learning_rate": 4.872239844271593e-05, + "loss": 0.032, + "mean_token_accuracy": 0.988612276315689, + "num_tokens": 78631724.0, + "step": 27320 + }, + { + "entropy": 0.05261941840872168, + "epoch": 6.369506935540273, + "grad_norm": 0.75390625, + "learning_rate": 4.872172609831973e-05, + "loss": 0.0289, + "mean_token_accuracy": 0.9906743228435516, + "num_tokens": 78660307.0, + "step": 27325 + }, + { + "entropy": 0.07566324677318334, + "epoch": 6.370672572560904, + "grad_norm": 0.921875, + "learning_rate": 4.8721053586590714e-05, + "loss": 0.0241, + "mean_token_accuracy": 0.9894809603691102, + "num_tokens": 78681145.0, + "step": 27330 + }, + { + "entropy": 0.06389159299433231, + "epoch": 6.3718382095815365, + "grad_norm": 1.09375, + "learning_rate": 4.872038090753892e-05, + "loss": 0.0185, + "mean_token_accuracy": 0.9917346715927124, + "num_tokens": 78703037.0, + "step": 27335 + }, + { + "entropy": 0.07629008330404759, + "epoch": 6.373003846602168, + "grad_norm": 0.953125, + "learning_rate": 4.8719708061174355e-05, + "loss": 0.0275, + "mean_token_accuracy": 0.9885548293590546, + "num_tokens": 78715880.0, + "step": 27340 + }, + { + "entropy": 0.08970846701413393, + "epoch": 6.3741694836228, + "grad_norm": 3.90625, + "learning_rate": 4.871903504750708e-05, + "loss": 0.052, + "mean_token_accuracy": 0.9867014706134796, + "num_tokens": 78730263.0, + "step": 27345 + }, + { + "entropy": 0.07629141733050346, + "epoch": 6.375335120643432, + "grad_norm": 1.6640625, + "learning_rate": 4.8718361866547113e-05, + "loss": 0.0521, + "mean_token_accuracy": 0.9869326591491699, + "num_tokens": 78743333.0, + "step": 27350 + }, + { + "entropy": 0.07764709144830703, + "epoch": 6.376500757664063, + "grad_norm": 2.671875, + "learning_rate": 4.871768851830449e-05, + "loss": 0.0502, + "mean_token_accuracy": 0.9871854364871979, + "num_tokens": 78754458.0, + "step": 27355 + }, + { + "entropy": 0.08753283023834228, + "epoch": 6.377666394684695, + "grad_norm": 0.8125, + "learning_rate": 4.871701500278927e-05, + "loss": 0.056, + "mean_token_accuracy": 0.9862245202064515, + "num_tokens": 78763480.0, + "step": 27360 + }, + { + "entropy": 0.062451109476387504, + "epoch": 6.378832031705327, + "grad_norm": 0.58203125, + "learning_rate": 4.8716341320011485e-05, + "loss": 0.0352, + "mean_token_accuracy": 0.9882869064807892, + "num_tokens": 78776564.0, + "step": 27365 + }, + { + "entropy": 0.07153270859271288, + "epoch": 6.3799976687259585, + "grad_norm": 2.0, + "learning_rate": 4.871566746998117e-05, + "loss": 0.0467, + "mean_token_accuracy": 0.987527585029602, + "num_tokens": 78790359.0, + "step": 27370 + }, + { + "entropy": 0.06521230619400739, + "epoch": 6.381163305746591, + "grad_norm": 1.5078125, + "learning_rate": 4.871499345270839e-05, + "loss": 0.0223, + "mean_token_accuracy": 0.9910327970981598, + "num_tokens": 78815759.0, + "step": 27375 + }, + { + "entropy": 0.0744168077595532, + "epoch": 6.382328942767222, + "grad_norm": 0.92578125, + "learning_rate": 4.871431926820319e-05, + "loss": 0.0368, + "mean_token_accuracy": 0.9912038207054138, + "num_tokens": 78833711.0, + "step": 27380 + }, + { + "entropy": 0.05522217508405447, + "epoch": 6.383494579787854, + "grad_norm": 1.4296875, + "learning_rate": 4.871364491647562e-05, + "loss": 0.0297, + "mean_token_accuracy": 0.9883031487464905, + "num_tokens": 78858418.0, + "step": 27385 + }, + { + "entropy": 0.08326612692326307, + "epoch": 6.384660216808486, + "grad_norm": 0.33203125, + "learning_rate": 4.871297039753575e-05, + "loss": 0.0475, + "mean_token_accuracy": 0.9871909439563751, + "num_tokens": 78875926.0, + "step": 27390 + }, + { + "entropy": 0.06716305706650019, + "epoch": 6.385825853829117, + "grad_norm": 2.078125, + "learning_rate": 4.871229571139361e-05, + "loss": 0.0264, + "mean_token_accuracy": 0.9889111697673798, + "num_tokens": 78889549.0, + "step": 27395 + }, + { + "entropy": 0.06953895352780819, + "epoch": 6.386991490849749, + "grad_norm": 4.78125, + "learning_rate": 4.871162085805928e-05, + "loss": 0.0367, + "mean_token_accuracy": 0.9867134690284729, + "num_tokens": 78932439.0, + "step": 27400 + }, + { + "entropy": 0.05694300038740039, + "epoch": 6.3881571278703815, + "grad_norm": 2.0, + "learning_rate": 4.871094583754283e-05, + "loss": 0.021, + "mean_token_accuracy": 0.9930303812026977, + "num_tokens": 78948125.0, + "step": 27405 + }, + { + "entropy": 0.050066758040338756, + "epoch": 6.389322764891013, + "grad_norm": 0.2060546875, + "learning_rate": 4.871027064985431e-05, + "loss": 0.0175, + "mean_token_accuracy": 0.9904357731342316, + "num_tokens": 78982256.0, + "step": 27410 + }, + { + "entropy": 0.06765003893524409, + "epoch": 6.390488401911645, + "grad_norm": 1.828125, + "learning_rate": 4.8709595295003786e-05, + "loss": 0.0428, + "mean_token_accuracy": 0.9883161067962647, + "num_tokens": 79007275.0, + "step": 27415 + }, + { + "entropy": 0.05757339298725128, + "epoch": 6.391654038932277, + "grad_norm": 0.416015625, + "learning_rate": 4.8708919773001335e-05, + "loss": 0.0299, + "mean_token_accuracy": 0.9906012296676636, + "num_tokens": 79025587.0, + "step": 27420 + }, + { + "entropy": 0.04727498330175876, + "epoch": 6.392819675952908, + "grad_norm": 1.625, + "learning_rate": 4.8708244083857044e-05, + "loss": 0.0229, + "mean_token_accuracy": 0.993421071767807, + "num_tokens": 79050082.0, + "step": 27425 + }, + { + "entropy": 0.06903645731508731, + "epoch": 6.39398531297354, + "grad_norm": 0.44140625, + "learning_rate": 4.870756822758097e-05, + "loss": 0.0483, + "mean_token_accuracy": 0.9889946520328522, + "num_tokens": 79065697.0, + "step": 27430 + }, + { + "entropy": 0.09842782281339169, + "epoch": 6.395150949994171, + "grad_norm": 2.5, + "learning_rate": 4.870689220418319e-05, + "loss": 0.0994, + "mean_token_accuracy": 0.9789209306240082, + "num_tokens": 79084356.0, + "step": 27435 + }, + { + "entropy": 0.05659789480268955, + "epoch": 6.3963165870148035, + "grad_norm": 0.34765625, + "learning_rate": 4.87062160136738e-05, + "loss": 0.0254, + "mean_token_accuracy": 0.9913104176521301, + "num_tokens": 79106442.0, + "step": 27440 + }, + { + "entropy": 0.05858164490200579, + "epoch": 6.397482224035436, + "grad_norm": 0.8359375, + "learning_rate": 4.8705539656062874e-05, + "loss": 0.0177, + "mean_token_accuracy": 0.9934798002243042, + "num_tokens": 79136838.0, + "step": 27445 + }, + { + "entropy": 0.04104721006006003, + "epoch": 6.398647861056067, + "grad_norm": 5.03125, + "learning_rate": 4.870486313136049e-05, + "loss": 0.0285, + "mean_token_accuracy": 0.9911053597927093, + "num_tokens": 79159904.0, + "step": 27450 + }, + { + "entropy": 0.05417780401185155, + "epoch": 6.399813498076699, + "grad_norm": 3.140625, + "learning_rate": 4.870418643957675e-05, + "loss": 0.0331, + "mean_token_accuracy": 0.9912892341613769, + "num_tokens": 79182211.0, + "step": 27455 + }, + { + "entropy": 0.07004196234047413, + "epoch": 6.400979135097331, + "grad_norm": 0.369140625, + "learning_rate": 4.870350958072173e-05, + "loss": 0.0425, + "mean_token_accuracy": 0.9870667815208435, + "num_tokens": 79200837.0, + "step": 27460 + }, + { + "entropy": 0.07679205816239118, + "epoch": 6.402144772117962, + "grad_norm": 1.65625, + "learning_rate": 4.870283255480554e-05, + "loss": 0.048, + "mean_token_accuracy": 0.9877889931201935, + "num_tokens": 79225260.0, + "step": 27465 + }, + { + "entropy": 0.06059764139354229, + "epoch": 6.403310409138594, + "grad_norm": 2.546875, + "learning_rate": 4.8702155361838265e-05, + "loss": 0.0436, + "mean_token_accuracy": 0.9852454006671906, + "num_tokens": 79236989.0, + "step": 27470 + }, + { + "entropy": 0.07255728393793107, + "epoch": 6.404476046159226, + "grad_norm": 1.234375, + "learning_rate": 4.8701478001830006e-05, + "loss": 0.0516, + "mean_token_accuracy": 0.9845960021018982, + "num_tokens": 79249351.0, + "step": 27475 + }, + { + "entropy": 0.09389685951173306, + "epoch": 6.405641683179858, + "grad_norm": 1.7578125, + "learning_rate": 4.870080047479086e-05, + "loss": 0.036, + "mean_token_accuracy": 0.99000563621521, + "num_tokens": 79267778.0, + "step": 27480 + }, + { + "entropy": 0.062156752310693265, + "epoch": 6.40680732020049, + "grad_norm": 3.875, + "learning_rate": 4.870012278073093e-05, + "loss": 0.0437, + "mean_token_accuracy": 0.9860499918460846, + "num_tokens": 79284570.0, + "step": 27485 + }, + { + "entropy": 0.06316896006464959, + "epoch": 6.407972957221121, + "grad_norm": 0.51171875, + "learning_rate": 4.869944491966033e-05, + "loss": 0.0252, + "mean_token_accuracy": 0.9924411833286285, + "num_tokens": 79301528.0, + "step": 27490 + }, + { + "entropy": 0.06141756055876613, + "epoch": 6.409138594241753, + "grad_norm": 0.2275390625, + "learning_rate": 4.8698766891589146e-05, + "loss": 0.034, + "mean_token_accuracy": 0.987896740436554, + "num_tokens": 79334446.0, + "step": 27495 + }, + { + "entropy": 0.0701604936271906, + "epoch": 6.410304231262385, + "grad_norm": 1.5234375, + "learning_rate": 4.869808869652752e-05, + "loss": 0.0435, + "mean_token_accuracy": 0.9874770641326904, + "num_tokens": 79345554.0, + "step": 27500 + }, + { + "entropy": 0.050602398626506326, + "epoch": 6.4114698682830165, + "grad_norm": 0.6640625, + "learning_rate": 4.869741033448554e-05, + "loss": 0.0382, + "mean_token_accuracy": 0.9874916076660156, + "num_tokens": 79368799.0, + "step": 27505 + }, + { + "entropy": 0.06348597463220358, + "epoch": 6.412635505303649, + "grad_norm": 2.703125, + "learning_rate": 4.869673180547333e-05, + "loss": 0.0301, + "mean_token_accuracy": 0.9886243999004364, + "num_tokens": 79393263.0, + "step": 27510 + }, + { + "entropy": 0.05499905683100224, + "epoch": 6.41380114232428, + "grad_norm": 0.8359375, + "learning_rate": 4.869605310950102e-05, + "loss": 0.0266, + "mean_token_accuracy": 0.9914214432239532, + "num_tokens": 79414468.0, + "step": 27515 + }, + { + "entropy": 0.07658125725574791, + "epoch": 6.414966779344912, + "grad_norm": 1.8203125, + "learning_rate": 4.86953742465787e-05, + "loss": 0.0358, + "mean_token_accuracy": 0.989252644777298, + "num_tokens": 79432609.0, + "step": 27520 + }, + { + "entropy": 0.0815320173278451, + "epoch": 6.416132416365544, + "grad_norm": 3.28125, + "learning_rate": 4.869469521671652e-05, + "loss": 0.0501, + "mean_token_accuracy": 0.9842431008815765, + "num_tokens": 79443455.0, + "step": 27525 + }, + { + "entropy": 0.07601859644055367, + "epoch": 6.417298053386175, + "grad_norm": 2.59375, + "learning_rate": 4.8694016019924596e-05, + "loss": 0.0503, + "mean_token_accuracy": 0.9878701090812683, + "num_tokens": 79456083.0, + "step": 27530 + }, + { + "entropy": 0.06871461626142264, + "epoch": 6.418463690406807, + "grad_norm": 0.2255859375, + "learning_rate": 4.869333665621306e-05, + "loss": 0.0361, + "mean_token_accuracy": 0.984860771894455, + "num_tokens": 79477148.0, + "step": 27535 + }, + { + "entropy": 0.06119240690022707, + "epoch": 6.419629327427439, + "grad_norm": 1.3828125, + "learning_rate": 4.869265712559203e-05, + "loss": 0.0278, + "mean_token_accuracy": 0.9898676156997681, + "num_tokens": 79508657.0, + "step": 27540 + }, + { + "entropy": 0.08302809465676546, + "epoch": 6.420794964448071, + "grad_norm": 1.3359375, + "learning_rate": 4.869197742807166e-05, + "loss": 0.0471, + "mean_token_accuracy": 0.9878687500953675, + "num_tokens": 79519230.0, + "step": 27545 + }, + { + "entropy": 0.08217885680496692, + "epoch": 6.421960601468703, + "grad_norm": 1.078125, + "learning_rate": 4.8691297563662064e-05, + "loss": 0.0377, + "mean_token_accuracy": 0.9881730794906616, + "num_tokens": 79529426.0, + "step": 27550 + }, + { + "entropy": 0.07849093452095986, + "epoch": 6.423126238489335, + "grad_norm": 3.140625, + "learning_rate": 4.86906175323734e-05, + "loss": 0.0419, + "mean_token_accuracy": 0.9865849614143372, + "num_tokens": 79539544.0, + "step": 27555 + }, + { + "entropy": 0.06886114776134492, + "epoch": 6.424291875509966, + "grad_norm": 2.078125, + "learning_rate": 4.868993733421578e-05, + "loss": 0.0396, + "mean_token_accuracy": 0.9901333630084992, + "num_tokens": 79552090.0, + "step": 27560 + }, + { + "entropy": 0.06919397301971912, + "epoch": 6.425457512530598, + "grad_norm": 0.90625, + "learning_rate": 4.868925696919937e-05, + "loss": 0.0192, + "mean_token_accuracy": 0.9921321332454681, + "num_tokens": 79570648.0, + "step": 27565 + }, + { + "entropy": 0.057403205148875716, + "epoch": 6.426623149551229, + "grad_norm": 1.359375, + "learning_rate": 4.868857643733431e-05, + "loss": 0.0176, + "mean_token_accuracy": 0.9934298872947693, + "num_tokens": 79589140.0, + "step": 27570 + }, + { + "entropy": 0.0688036672770977, + "epoch": 6.4277887865718615, + "grad_norm": 0.384765625, + "learning_rate": 4.868789573863075e-05, + "loss": 0.0297, + "mean_token_accuracy": 0.9875211656093598, + "num_tokens": 79616950.0, + "step": 27575 + }, + { + "entropy": 0.07488919515162706, + "epoch": 6.428954423592494, + "grad_norm": 2.28125, + "learning_rate": 4.8687214873098836e-05, + "loss": 0.0421, + "mean_token_accuracy": 0.9881720423698426, + "num_tokens": 79633601.0, + "step": 27580 + }, + { + "entropy": 0.05924738338217139, + "epoch": 6.430120060613125, + "grad_norm": 1.078125, + "learning_rate": 4.8686533840748714e-05, + "loss": 0.0269, + "mean_token_accuracy": 0.988178825378418, + "num_tokens": 79659234.0, + "step": 27585 + }, + { + "entropy": 0.07275372557342052, + "epoch": 6.431285697633757, + "grad_norm": 1.5546875, + "learning_rate": 4.868585264159056e-05, + "loss": 0.0413, + "mean_token_accuracy": 0.9883818924427032, + "num_tokens": 79669822.0, + "step": 27590 + }, + { + "entropy": 0.08007224500179291, + "epoch": 6.432451334654389, + "grad_norm": 2.6875, + "learning_rate": 4.868517127563451e-05, + "loss": 0.0564, + "mean_token_accuracy": 0.985856169462204, + "num_tokens": 79681441.0, + "step": 27595 + }, + { + "entropy": 0.09746268084272743, + "epoch": 6.43361697167502, + "grad_norm": 0.50390625, + "learning_rate": 4.8684489742890735e-05, + "loss": 0.1101, + "mean_token_accuracy": 0.9700672090053558, + "num_tokens": 79707024.0, + "step": 27600 + }, + { + "entropy": 0.08292770921252668, + "epoch": 6.434782608695652, + "grad_norm": 0.7578125, + "learning_rate": 4.868380804336939e-05, + "loss": 0.044, + "mean_token_accuracy": 0.9842049777507782, + "num_tokens": 79735662.0, + "step": 27605 + }, + { + "entropy": 0.06323742168024182, + "epoch": 6.4359482457162835, + "grad_norm": 1.625, + "learning_rate": 4.868312617708065e-05, + "loss": 0.036, + "mean_token_accuracy": 0.9877136886119843, + "num_tokens": 79753008.0, + "step": 27610 + }, + { + "entropy": 0.054892979096621275, + "epoch": 6.437113882736916, + "grad_norm": 0.578125, + "learning_rate": 4.868244414403468e-05, + "loss": 0.0304, + "mean_token_accuracy": 0.9911454975605011, + "num_tokens": 79774283.0, + "step": 27615 + }, + { + "entropy": 0.060715962387621406, + "epoch": 6.438279519757548, + "grad_norm": 0.7578125, + "learning_rate": 4.868176194424165e-05, + "loss": 0.022, + "mean_token_accuracy": 0.9909097194671631, + "num_tokens": 79802163.0, + "step": 27620 + }, + { + "entropy": 0.07650736030191183, + "epoch": 6.439445156778179, + "grad_norm": 2.203125, + "learning_rate": 4.8681079577711733e-05, + "loss": 0.0462, + "mean_token_accuracy": 0.9869279861450195, + "num_tokens": 79817232.0, + "step": 27625 + }, + { + "entropy": 0.05185747491195798, + "epoch": 6.440610793798811, + "grad_norm": 0.546875, + "learning_rate": 4.8680397044455095e-05, + "loss": 0.017, + "mean_token_accuracy": 0.9912841856479645, + "num_tokens": 79850502.0, + "step": 27630 + }, + { + "entropy": 0.07319100480526686, + "epoch": 6.441776430819443, + "grad_norm": 2.921875, + "learning_rate": 4.867971434448192e-05, + "loss": 0.0441, + "mean_token_accuracy": 0.9871128141880036, + "num_tokens": 79865854.0, + "step": 27635 + }, + { + "entropy": 0.059058552328497174, + "epoch": 6.442942067840074, + "grad_norm": 0.328125, + "learning_rate": 4.867903147780239e-05, + "loss": 0.0147, + "mean_token_accuracy": 0.991374397277832, + "num_tokens": 79893417.0, + "step": 27640 + }, + { + "entropy": 0.06549884639680385, + "epoch": 6.4441077048607065, + "grad_norm": 1.296875, + "learning_rate": 4.867834844442669e-05, + "loss": 0.0292, + "mean_token_accuracy": 0.9903046309947967, + "num_tokens": 79910987.0, + "step": 27645 + }, + { + "entropy": 0.061291656084358694, + "epoch": 6.445273341881338, + "grad_norm": 2.75, + "learning_rate": 4.8677665244365e-05, + "loss": 0.025, + "mean_token_accuracy": 0.9875168144702912, + "num_tokens": 79938899.0, + "step": 27650 + }, + { + "entropy": 0.1144998598843813, + "epoch": 6.44643897890197, + "grad_norm": 0.271484375, + "learning_rate": 4.8676981877627516e-05, + "loss": 0.0399, + "mean_token_accuracy": 0.9874226093292237, + "num_tokens": 79964068.0, + "step": 27655 + }, + { + "entropy": 0.06761856637895107, + "epoch": 6.447604615922602, + "grad_norm": 0.78125, + "learning_rate": 4.867629834422441e-05, + "loss": 0.0201, + "mean_token_accuracy": 0.9924596011638641, + "num_tokens": 79990446.0, + "step": 27660 + }, + { + "entropy": 0.07056233957409859, + "epoch": 6.448770252943233, + "grad_norm": 0.55078125, + "learning_rate": 4.867561464416589e-05, + "loss": 0.0274, + "mean_token_accuracy": 0.9904316186904907, + "num_tokens": 80013369.0, + "step": 27665 + }, + { + "entropy": 0.05911567583680153, + "epoch": 6.449935889963865, + "grad_norm": 0.5, + "learning_rate": 4.8674930777462146e-05, + "loss": 0.0177, + "mean_token_accuracy": 0.9893089294433594, + "num_tokens": 80043038.0, + "step": 27670 + }, + { + "entropy": 0.061127489618957045, + "epoch": 6.451101526984497, + "grad_norm": 0.439453125, + "learning_rate": 4.867424674412337e-05, + "loss": 0.0178, + "mean_token_accuracy": 0.9948661983013153, + "num_tokens": 80074697.0, + "step": 27675 + }, + { + "entropy": 0.08972432166337967, + "epoch": 6.452267164005129, + "grad_norm": 4.40625, + "learning_rate": 4.867356254415979e-05, + "loss": 0.059, + "mean_token_accuracy": 0.9820292592048645, + "num_tokens": 80091290.0, + "step": 27680 + }, + { + "entropy": 0.056455508526414636, + "epoch": 6.453432801025761, + "grad_norm": 1.4921875, + "learning_rate": 4.867287817758157e-05, + "loss": 0.0406, + "mean_token_accuracy": 0.9857260882854462, + "num_tokens": 80120900.0, + "step": 27685 + }, + { + "entropy": 0.16505700461566447, + "epoch": 6.454598438046393, + "grad_norm": 1.078125, + "learning_rate": 4.867219364439893e-05, + "loss": 0.2416, + "mean_token_accuracy": 0.9665460169315339, + "num_tokens": 80142769.0, + "step": 27690 + }, + { + "entropy": 0.05633875224739313, + "epoch": 6.455764075067024, + "grad_norm": 0.5, + "learning_rate": 4.867150894462208e-05, + "loss": 0.0103, + "mean_token_accuracy": 0.9942323207855225, + "num_tokens": 80169351.0, + "step": 27695 + }, + { + "entropy": 0.06843089256435633, + "epoch": 6.456929712087656, + "grad_norm": 0.8359375, + "learning_rate": 4.867082407826123e-05, + "loss": 0.045, + "mean_token_accuracy": 0.9859276533126831, + "num_tokens": 80181397.0, + "step": 27700 + }, + { + "entropy": 0.07748326919972896, + "epoch": 6.458095349108287, + "grad_norm": 1.4140625, + "learning_rate": 4.867013904532659e-05, + "loss": 0.0426, + "mean_token_accuracy": 0.98861523270607, + "num_tokens": 80191864.0, + "step": 27705 + }, + { + "entropy": 0.07460166187956929, + "epoch": 6.459260986128919, + "grad_norm": 1.4921875, + "learning_rate": 4.8669453845828375e-05, + "loss": 0.0461, + "mean_token_accuracy": 0.9854443371295929, + "num_tokens": 80209907.0, + "step": 27710 + }, + { + "entropy": 0.06753936400637031, + "epoch": 6.4604266231495515, + "grad_norm": 2.359375, + "learning_rate": 4.8668768479776805e-05, + "loss": 0.0257, + "mean_token_accuracy": 0.988853371143341, + "num_tokens": 80239473.0, + "step": 27715 + }, + { + "entropy": 0.08805846348404885, + "epoch": 6.461592260170183, + "grad_norm": 0.6015625, + "learning_rate": 4.8668082947182104e-05, + "loss": 0.0344, + "mean_token_accuracy": 0.9911517918109893, + "num_tokens": 80251800.0, + "step": 27720 + }, + { + "entropy": 0.05560419810935855, + "epoch": 6.462757897190815, + "grad_norm": 2.078125, + "learning_rate": 4.866739724805448e-05, + "loss": 0.0204, + "mean_token_accuracy": 0.9919722378253937, + "num_tokens": 80281701.0, + "step": 27725 + }, + { + "entropy": 0.04412915213033557, + "epoch": 6.463923534211447, + "grad_norm": 0.328125, + "learning_rate": 4.8666711382404174e-05, + "loss": 0.0236, + "mean_token_accuracy": 0.9937281429767608, + "num_tokens": 80316543.0, + "step": 27730 + }, + { + "entropy": 0.06135749835520983, + "epoch": 6.465089171232078, + "grad_norm": 0.86328125, + "learning_rate": 4.8666025350241394e-05, + "loss": 0.0355, + "mean_token_accuracy": 0.9881977498531341, + "num_tokens": 80330948.0, + "step": 27735 + }, + { + "entropy": 0.060386568494141105, + "epoch": 6.46625480825271, + "grad_norm": 0.9921875, + "learning_rate": 4.866533915157639e-05, + "loss": 0.019, + "mean_token_accuracy": 0.9925783932209015, + "num_tokens": 80356120.0, + "step": 27740 + }, + { + "entropy": 0.08376160068437458, + "epoch": 6.4674204452733415, + "grad_norm": 2.140625, + "learning_rate": 4.866465278641938e-05, + "loss": 0.0304, + "mean_token_accuracy": 0.9902998149394989, + "num_tokens": 80370511.0, + "step": 27745 + }, + { + "entropy": 0.07058606464415788, + "epoch": 6.468586082293974, + "grad_norm": 3.671875, + "learning_rate": 4.866396625478061e-05, + "loss": 0.0397, + "mean_token_accuracy": 0.9877101600170135, + "num_tokens": 80391356.0, + "step": 27750 + }, + { + "entropy": 0.09883666504174471, + "epoch": 6.469751719314606, + "grad_norm": 0.59375, + "learning_rate": 4.866327955667032e-05, + "loss": 0.0804, + "mean_token_accuracy": 0.9778140246868133, + "num_tokens": 80414428.0, + "step": 27755 + }, + { + "entropy": 0.06909555848687887, + "epoch": 6.470917356335237, + "grad_norm": 1.2421875, + "learning_rate": 4.866259269209873e-05, + "loss": 0.0526, + "mean_token_accuracy": 0.9838371276855469, + "num_tokens": 80432420.0, + "step": 27760 + }, + { + "entropy": 0.07560903541743755, + "epoch": 6.472082993355869, + "grad_norm": 2.6875, + "learning_rate": 4.866190566107609e-05, + "loss": 0.0531, + "mean_token_accuracy": 0.9859624147415161, + "num_tokens": 80441410.0, + "step": 27765 + }, + { + "entropy": 0.06336052715778351, + "epoch": 6.473248630376501, + "grad_norm": 0.71484375, + "learning_rate": 4.866121846361266e-05, + "loss": 0.0276, + "mean_token_accuracy": 0.9916476845741272, + "num_tokens": 80455283.0, + "step": 27770 + }, + { + "entropy": 0.05840042941272259, + "epoch": 6.474414267397132, + "grad_norm": 0.5, + "learning_rate": 4.8660531099718666e-05, + "loss": 0.024, + "mean_token_accuracy": 0.9917756974697113, + "num_tokens": 80476389.0, + "step": 27775 + }, + { + "entropy": 0.061544339545071124, + "epoch": 6.475579904417764, + "grad_norm": 0.55078125, + "learning_rate": 4.865984356940437e-05, + "loss": 0.0275, + "mean_token_accuracy": 0.9913029193878173, + "num_tokens": 80494281.0, + "step": 27780 + }, + { + "entropy": 0.07006441093981267, + "epoch": 6.476745541438396, + "grad_norm": 1.2734375, + "learning_rate": 4.865915587268002e-05, + "loss": 0.0356, + "mean_token_accuracy": 0.988695627450943, + "num_tokens": 80516753.0, + "step": 27785 + }, + { + "entropy": 0.06810265779495239, + "epoch": 6.477911178459028, + "grad_norm": 2.515625, + "learning_rate": 4.865846800955587e-05, + "loss": 0.0402, + "mean_token_accuracy": 0.9881290256977081, + "num_tokens": 80528537.0, + "step": 27790 + }, + { + "entropy": 0.06130484715104103, + "epoch": 6.47907681547966, + "grad_norm": 1.3984375, + "learning_rate": 4.865777998004218e-05, + "loss": 0.0268, + "mean_token_accuracy": 0.9925431370735168, + "num_tokens": 80554545.0, + "step": 27795 + }, + { + "entropy": 0.08006125790998339, + "epoch": 6.480242452500291, + "grad_norm": 0.232421875, + "learning_rate": 4.865709178414921e-05, + "loss": 0.0241, + "mean_token_accuracy": 0.9917748987674713, + "num_tokens": 80572200.0, + "step": 27800 + }, + { + "entropy": 0.09746038131415843, + "epoch": 6.481408089520923, + "grad_norm": 1.1484375, + "learning_rate": 4.865640342188722e-05, + "loss": 0.0326, + "mean_token_accuracy": 0.9908531725406646, + "num_tokens": 80602063.0, + "step": 27805 + }, + { + "entropy": 0.08190167564898729, + "epoch": 6.482573726541555, + "grad_norm": 2.765625, + "learning_rate": 4.865571489326647e-05, + "loss": 0.0393, + "mean_token_accuracy": 0.9881881058216095, + "num_tokens": 80619369.0, + "step": 27810 + }, + { + "entropy": 0.06538397213444114, + "epoch": 6.4837393635621865, + "grad_norm": 2.46875, + "learning_rate": 4.8655026198297235e-05, + "loss": 0.0428, + "mean_token_accuracy": 0.9888033509254456, + "num_tokens": 80641276.0, + "step": 27815 + }, + { + "entropy": 0.06671316362917423, + "epoch": 6.484905000582819, + "grad_norm": 1.6875, + "learning_rate": 4.865433733698979e-05, + "loss": 0.0388, + "mean_token_accuracy": 0.9907189249992371, + "num_tokens": 80654599.0, + "step": 27820 + }, + { + "entropy": 0.08046202724799514, + "epoch": 6.486070637603451, + "grad_norm": 1.5234375, + "learning_rate": 4.8653648309354385e-05, + "loss": 0.0443, + "mean_token_accuracy": 0.9860710978507996, + "num_tokens": 80675635.0, + "step": 27825 + }, + { + "entropy": 0.06237729825079441, + "epoch": 6.487236274624082, + "grad_norm": 1.703125, + "learning_rate": 4.865295911540131e-05, + "loss": 0.0445, + "mean_token_accuracy": 0.9872939884662628, + "num_tokens": 80690700.0, + "step": 27830 + }, + { + "entropy": 0.07674776520580054, + "epoch": 6.488401911644714, + "grad_norm": 4.28125, + "learning_rate": 4.865226975514085e-05, + "loss": 0.0397, + "mean_token_accuracy": 0.9856312572956085, + "num_tokens": 80714560.0, + "step": 27835 + }, + { + "entropy": 0.11107652802020311, + "epoch": 6.489567548665345, + "grad_norm": 0.298828125, + "learning_rate": 4.8651580228583264e-05, + "loss": 0.0416, + "mean_token_accuracy": 0.987369966506958, + "num_tokens": 80744196.0, + "step": 27840 + }, + { + "entropy": 0.07360722348093987, + "epoch": 6.490733185685977, + "grad_norm": 0.7734375, + "learning_rate": 4.8650890535738844e-05, + "loss": 0.0301, + "mean_token_accuracy": 0.9905798494815826, + "num_tokens": 80763300.0, + "step": 27845 + }, + { + "entropy": 0.0837981840595603, + "epoch": 6.4918988227066095, + "grad_norm": 1.4765625, + "learning_rate": 4.865020067661788e-05, + "loss": 0.043, + "mean_token_accuracy": 0.9866504311561585, + "num_tokens": 80777853.0, + "step": 27850 + }, + { + "entropy": 0.07153965644538403, + "epoch": 6.493064459727241, + "grad_norm": 3.3125, + "learning_rate": 4.864951065123065e-05, + "loss": 0.0294, + "mean_token_accuracy": 0.9883228242397308, + "num_tokens": 80796982.0, + "step": 27855 + }, + { + "entropy": 0.07343727601692081, + "epoch": 6.494230096747873, + "grad_norm": 1.4296875, + "learning_rate": 4.864882045958745e-05, + "loss": 0.0518, + "mean_token_accuracy": 0.9857357382774353, + "num_tokens": 80831583.0, + "step": 27860 + }, + { + "entropy": 0.06243647811934352, + "epoch": 6.495395733768505, + "grad_norm": 0.59765625, + "learning_rate": 4.8648130101698565e-05, + "loss": 0.03, + "mean_token_accuracy": 0.9885487198829651, + "num_tokens": 80852559.0, + "step": 27865 + }, + { + "entropy": 0.06593287736177444, + "epoch": 6.496561370789136, + "grad_norm": 3.25, + "learning_rate": 4.864743957757429e-05, + "loss": 0.0309, + "mean_token_accuracy": 0.9913531363010406, + "num_tokens": 80869445.0, + "step": 27870 + }, + { + "entropy": 0.06547895520925522, + "epoch": 6.497727007809768, + "grad_norm": 0.69921875, + "learning_rate": 4.8646748887224926e-05, + "loss": 0.0183, + "mean_token_accuracy": 0.9927921533584595, + "num_tokens": 80894845.0, + "step": 27875 + }, + { + "entropy": 0.05576323997229338, + "epoch": 6.498892644830399, + "grad_norm": 0.50390625, + "learning_rate": 4.864605803066077e-05, + "loss": 0.0451, + "mean_token_accuracy": 0.9896509110927582, + "num_tokens": 80930602.0, + "step": 27880 + }, + { + "entropy": 0.05927497670054436, + "epoch": 6.5000582818510315, + "grad_norm": 4.28125, + "learning_rate": 4.864536700789212e-05, + "loss": 0.0416, + "mean_token_accuracy": 0.9875927090644836, + "num_tokens": 80951872.0, + "step": 27885 + }, + { + "entropy": 0.057063156738877296, + "epoch": 6.501223918871664, + "grad_norm": 0.216796875, + "learning_rate": 4.864467581892929e-05, + "loss": 0.0295, + "mean_token_accuracy": 0.9899273931980133, + "num_tokens": 80977099.0, + "step": 27890 + }, + { + "entropy": 0.08189502377063036, + "epoch": 6.502389555892295, + "grad_norm": 1.1484375, + "learning_rate": 4.8643984463782584e-05, + "loss": 0.0484, + "mean_token_accuracy": 0.9839639067649841, + "num_tokens": 80989763.0, + "step": 27895 + }, + { + "entropy": 0.08151528304442764, + "epoch": 6.503555192912927, + "grad_norm": 2.015625, + "learning_rate": 4.86432929424623e-05, + "loss": 0.044, + "mean_token_accuracy": 0.9860894799232482, + "num_tokens": 81006752.0, + "step": 27900 + }, + { + "entropy": 0.05777513347566128, + "epoch": 6.504720829933559, + "grad_norm": 1.3984375, + "learning_rate": 4.864260125497877e-05, + "loss": 0.025, + "mean_token_accuracy": 0.9919376611709595, + "num_tokens": 81029463.0, + "step": 27905 + }, + { + "entropy": 0.052465210948139426, + "epoch": 6.50588646695419, + "grad_norm": 0.640625, + "learning_rate": 4.864190940134228e-05, + "loss": 0.02, + "mean_token_accuracy": 0.9926451206207275, + "num_tokens": 81056765.0, + "step": 27910 + }, + { + "entropy": 0.053254136629402635, + "epoch": 6.507052103974822, + "grad_norm": 0.2490234375, + "learning_rate": 4.864121738156317e-05, + "loss": 0.023, + "mean_token_accuracy": 0.9922501981258393, + "num_tokens": 81082674.0, + "step": 27915 + }, + { + "entropy": 0.059919987060129645, + "epoch": 6.508217740995454, + "grad_norm": 0.2490234375, + "learning_rate": 4.8640525195651754e-05, + "loss": 0.0229, + "mean_token_accuracy": 0.988700020313263, + "num_tokens": 81117981.0, + "step": 27920 + }, + { + "entropy": 0.06886968342587352, + "epoch": 6.509383378016086, + "grad_norm": 1.546875, + "learning_rate": 4.863983284361835e-05, + "loss": 0.0388, + "mean_token_accuracy": 0.9891575574874878, + "num_tokens": 81142768.0, + "step": 27925 + }, + { + "entropy": 0.06728349346667528, + "epoch": 6.510549015036718, + "grad_norm": 0.474609375, + "learning_rate": 4.863914032547328e-05, + "loss": 0.0352, + "mean_token_accuracy": 0.9899146974086761, + "num_tokens": 81164421.0, + "step": 27930 + }, + { + "entropy": 0.07344648716971278, + "epoch": 6.511714652057349, + "grad_norm": 0.365234375, + "learning_rate": 4.863844764122687e-05, + "loss": 0.0304, + "mean_token_accuracy": 0.9872526228427887, + "num_tokens": 81193947.0, + "step": 27935 + }, + { + "entropy": 0.052133775874972345, + "epoch": 6.512880289077981, + "grad_norm": 0.224609375, + "learning_rate": 4.863775479088946e-05, + "loss": 0.0254, + "mean_token_accuracy": 0.9909688651561737, + "num_tokens": 81221469.0, + "step": 27940 + }, + { + "entropy": 0.06797988787293434, + "epoch": 6.514045926098613, + "grad_norm": 1.84375, + "learning_rate": 4.863706177447138e-05, + "loss": 0.0468, + "mean_token_accuracy": 0.9872564733028412, + "num_tokens": 81232706.0, + "step": 27945 + }, + { + "entropy": 0.07692260686308146, + "epoch": 6.515211563119244, + "grad_norm": 1.578125, + "learning_rate": 4.8636368591982944e-05, + "loss": 0.0416, + "mean_token_accuracy": 0.9876651465892792, + "num_tokens": 81244650.0, + "step": 27950 + }, + { + "entropy": 0.07264660159125924, + "epoch": 6.5163772001398765, + "grad_norm": 1.15625, + "learning_rate": 4.863567524343451e-05, + "loss": 0.0346, + "mean_token_accuracy": 0.9892261922359467, + "num_tokens": 81260414.0, + "step": 27955 + }, + { + "entropy": 0.06311188600957393, + "epoch": 6.517542837160509, + "grad_norm": 0.484375, + "learning_rate": 4.8634981728836404e-05, + "loss": 0.0203, + "mean_token_accuracy": 0.9921561360359192, + "num_tokens": 81282441.0, + "step": 27960 + }, + { + "entropy": 0.0678584418259561, + "epoch": 6.51870847418114, + "grad_norm": 1.8046875, + "learning_rate": 4.863428804819898e-05, + "loss": 0.0386, + "mean_token_accuracy": 0.9875958859920502, + "num_tokens": 81300933.0, + "step": 27965 + }, + { + "entropy": 0.06609183494001628, + "epoch": 6.519874111201772, + "grad_norm": 1.03125, + "learning_rate": 4.863359420153257e-05, + "loss": 0.0206, + "mean_token_accuracy": 0.9928387343883515, + "num_tokens": 81318505.0, + "step": 27970 + }, + { + "entropy": 0.06018575457856059, + "epoch": 6.521039748222403, + "grad_norm": 3.21875, + "learning_rate": 4.863290018884752e-05, + "loss": 0.0296, + "mean_token_accuracy": 0.9921539068222046, + "num_tokens": 81342932.0, + "step": 27975 + }, + { + "entropy": 0.09531013956293463, + "epoch": 6.522205385243035, + "grad_norm": 4.25, + "learning_rate": 4.863220601015419e-05, + "loss": 0.0503, + "mean_token_accuracy": 0.9850192785263061, + "num_tokens": 81361075.0, + "step": 27980 + }, + { + "entropy": 0.07941121272742749, + "epoch": 6.523371022263667, + "grad_norm": 2.515625, + "learning_rate": 4.863151166546292e-05, + "loss": 0.0497, + "mean_token_accuracy": 0.9868826925754547, + "num_tokens": 81369715.0, + "step": 27985 + }, + { + "entropy": 0.06422848673537374, + "epoch": 6.524536659284299, + "grad_norm": 0.3359375, + "learning_rate": 4.863081715478407e-05, + "loss": 0.0259, + "mean_token_accuracy": 0.9898504614830017, + "num_tokens": 81400861.0, + "step": 27990 + }, + { + "entropy": 0.06149542648345232, + "epoch": 6.525702296304931, + "grad_norm": 2.296875, + "learning_rate": 4.8630122478127995e-05, + "loss": 0.0255, + "mean_token_accuracy": 0.9882985472679138, + "num_tokens": 81432239.0, + "step": 27995 + }, + { + "entropy": 0.07662085331976413, + "epoch": 6.526867933325562, + "grad_norm": 2.359375, + "learning_rate": 4.8629427635505055e-05, + "loss": 0.0576, + "mean_token_accuracy": 0.9864896059036254, + "num_tokens": 81444100.0, + "step": 28000 + }, + { + "entropy": 0.07400370575487614, + "epoch": 6.528033570346194, + "grad_norm": 1.40625, + "learning_rate": 4.8628732626925613e-05, + "loss": 0.0409, + "mean_token_accuracy": 0.9867150366306305, + "num_tokens": 81463262.0, + "step": 28005 + }, + { + "entropy": 0.06608630102127791, + "epoch": 6.529199207366826, + "grad_norm": 6.15625, + "learning_rate": 4.862803745240002e-05, + "loss": 0.0449, + "mean_token_accuracy": 0.9875483572483063, + "num_tokens": 81484054.0, + "step": 28010 + }, + { + "entropy": 0.06695858966559172, + "epoch": 6.530364844387458, + "grad_norm": 0.361328125, + "learning_rate": 4.862734211193866e-05, + "loss": 0.0257, + "mean_token_accuracy": 0.9882546544075013, + "num_tokens": 81510572.0, + "step": 28015 + }, + { + "entropy": 0.06558367498219013, + "epoch": 6.5315304814080895, + "grad_norm": 1.0859375, + "learning_rate": 4.8626646605551876e-05, + "loss": 0.0299, + "mean_token_accuracy": 0.9903232753276825, + "num_tokens": 81534531.0, + "step": 28020 + }, + { + "entropy": 0.06639163857325911, + "epoch": 6.5326961184287216, + "grad_norm": 1.4921875, + "learning_rate": 4.862595093325007e-05, + "loss": 0.0307, + "mean_token_accuracy": 0.9907230257987976, + "num_tokens": 81555787.0, + "step": 28025 + }, + { + "entropy": 0.06413227356970311, + "epoch": 6.533861755449353, + "grad_norm": 0.75, + "learning_rate": 4.8625255095043595e-05, + "loss": 0.0281, + "mean_token_accuracy": 0.9888515174388885, + "num_tokens": 81568551.0, + "step": 28030 + }, + { + "entropy": 0.061216467432677744, + "epoch": 6.535027392469985, + "grad_norm": 1.3984375, + "learning_rate": 4.862455909094284e-05, + "loss": 0.0185, + "mean_token_accuracy": 0.9905256271362305, + "num_tokens": 81600493.0, + "step": 28035 + }, + { + "entropy": 0.08351697884500027, + "epoch": 6.536193029490617, + "grad_norm": 1.59375, + "learning_rate": 4.862386292095817e-05, + "loss": 0.0363, + "mean_token_accuracy": 0.9892892718315125, + "num_tokens": 81612813.0, + "step": 28040 + }, + { + "entropy": 0.06674035713076591, + "epoch": 6.537358666511248, + "grad_norm": 0.87890625, + "learning_rate": 4.8623166585099974e-05, + "loss": 0.0298, + "mean_token_accuracy": 0.9903652191162109, + "num_tokens": 81627906.0, + "step": 28045 + }, + { + "entropy": 0.06334521155804396, + "epoch": 6.53852430353188, + "grad_norm": 0.8046875, + "learning_rate": 4.862247008337864e-05, + "loss": 0.0342, + "mean_token_accuracy": 0.9886260449886322, + "num_tokens": 81641286.0, + "step": 28050 + }, + { + "entropy": 0.06510356459766627, + "epoch": 6.5396899405525115, + "grad_norm": 3.046875, + "learning_rate": 4.8621773415804546e-05, + "loss": 0.0293, + "mean_token_accuracy": 0.9888158679008484, + "num_tokens": 81657218.0, + "step": 28055 + }, + { + "entropy": 0.06543186036869883, + "epoch": 6.540855577573144, + "grad_norm": 0.376953125, + "learning_rate": 4.862107658238808e-05, + "loss": 0.0394, + "mean_token_accuracy": 0.988468474149704, + "num_tokens": 81679608.0, + "step": 28060 + }, + { + "entropy": 0.08569641970098019, + "epoch": 6.542021214593776, + "grad_norm": 2.859375, + "learning_rate": 4.862037958313964e-05, + "loss": 0.027, + "mean_token_accuracy": 0.988139945268631, + "num_tokens": 81708410.0, + "step": 28065 + }, + { + "entropy": 0.0672588437795639, + "epoch": 6.543186851614407, + "grad_norm": 0.90234375, + "learning_rate": 4.861968241806961e-05, + "loss": 0.0339, + "mean_token_accuracy": 0.9860668420791626, + "num_tokens": 81732815.0, + "step": 28070 + }, + { + "entropy": 0.07395201176404953, + "epoch": 6.544352488635039, + "grad_norm": 1.0234375, + "learning_rate": 4.861898508718838e-05, + "loss": 0.046, + "mean_token_accuracy": 0.9865674555301667, + "num_tokens": 81743650.0, + "step": 28075 + }, + { + "entropy": 0.06611506836488842, + "epoch": 6.545518125655671, + "grad_norm": 3.6875, + "learning_rate": 4.8618287590506376e-05, + "loss": 0.038, + "mean_token_accuracy": 0.9851749837398529, + "num_tokens": 81766370.0, + "step": 28080 + }, + { + "entropy": 0.03880115207284689, + "epoch": 6.546683762676302, + "grad_norm": 0.337890625, + "learning_rate": 4.8617589928033966e-05, + "loss": 0.0166, + "mean_token_accuracy": 0.994249552488327, + "num_tokens": 81796871.0, + "step": 28085 + }, + { + "entropy": 0.06620198376476764, + "epoch": 6.5478493996969345, + "grad_norm": 0.2041015625, + "learning_rate": 4.861689209978158e-05, + "loss": 0.0294, + "mean_token_accuracy": 0.9911263763904572, + "num_tokens": 81816362.0, + "step": 28090 + }, + { + "entropy": 0.06923009911552072, + "epoch": 6.549015036717567, + "grad_norm": 1.3203125, + "learning_rate": 4.8616194105759606e-05, + "loss": 0.0107, + "mean_token_accuracy": 0.9917510986328125, + "num_tokens": 81855342.0, + "step": 28095 + }, + { + "entropy": 0.06260980144143105, + "epoch": 6.550180673738198, + "grad_norm": 1.46875, + "learning_rate": 4.861549594597846e-05, + "loss": 0.0252, + "mean_token_accuracy": 0.9904425501823425, + "num_tokens": 81871751.0, + "step": 28100 + }, + { + "entropy": 0.0857225801795721, + "epoch": 6.55134631075883, + "grad_norm": 1.265625, + "learning_rate": 4.861479762044856e-05, + "loss": 0.0714, + "mean_token_accuracy": 0.983836007118225, + "num_tokens": 81881802.0, + "step": 28105 + }, + { + "entropy": 0.06633962662890554, + "epoch": 6.552511947779461, + "grad_norm": 0.8359375, + "learning_rate": 4.861409912918029e-05, + "loss": 0.0307, + "mean_token_accuracy": 0.9916288614273071, + "num_tokens": 81898680.0, + "step": 28110 + }, + { + "entropy": 0.08444469049572945, + "epoch": 6.553677584800093, + "grad_norm": 2.15625, + "learning_rate": 4.86134004721841e-05, + "loss": 0.0417, + "mean_token_accuracy": 0.9863658368587493, + "num_tokens": 81918320.0, + "step": 28115 + }, + { + "entropy": 0.08844755683094263, + "epoch": 6.554843221820725, + "grad_norm": 1.3515625, + "learning_rate": 4.8612701649470385e-05, + "loss": 0.0625, + "mean_token_accuracy": 0.9853982567787171, + "num_tokens": 81938033.0, + "step": 28120 + }, + { + "entropy": 0.08071080623194575, + "epoch": 6.5560088588413565, + "grad_norm": 1.75, + "learning_rate": 4.8612002661049584e-05, + "loss": 0.0346, + "mean_token_accuracy": 0.9872838437557221, + "num_tokens": 81957727.0, + "step": 28125 + }, + { + "entropy": 0.11229084599763155, + "epoch": 6.557174495861989, + "grad_norm": 2.203125, + "learning_rate": 4.8611303506932104e-05, + "loss": 0.1357, + "mean_token_accuracy": 0.9701371610164642, + "num_tokens": 81993913.0, + "step": 28130 + }, + { + "entropy": 0.06542156785726547, + "epoch": 6.55834013288262, + "grad_norm": 0.291015625, + "learning_rate": 4.861060418712837e-05, + "loss": 0.0297, + "mean_token_accuracy": 0.9911216259002685, + "num_tokens": 82017676.0, + "step": 28135 + }, + { + "entropy": 0.0545942329801619, + "epoch": 6.559505769903252, + "grad_norm": 0.2412109375, + "learning_rate": 4.860990470164883e-05, + "loss": 0.0177, + "mean_token_accuracy": 0.9939680576324463, + "num_tokens": 82041966.0, + "step": 28140 + }, + { + "entropy": 0.13166623264551164, + "epoch": 6.560671406923884, + "grad_norm": 0.466796875, + "learning_rate": 4.8609205050503895e-05, + "loss": 0.1511, + "mean_token_accuracy": 0.9682539939880371, + "num_tokens": 82075203.0, + "step": 28145 + }, + { + "entropy": 0.07135718585923315, + "epoch": 6.561837043944516, + "grad_norm": 0.8359375, + "learning_rate": 4.8608505233704e-05, + "loss": 0.0502, + "mean_token_accuracy": 0.9858476459980011, + "num_tokens": 82098960.0, + "step": 28150 + }, + { + "entropy": 0.08739682212471962, + "epoch": 6.563002680965147, + "grad_norm": 0.455078125, + "learning_rate": 4.8607805251259584e-05, + "loss": 0.0293, + "mean_token_accuracy": 0.9880187332630157, + "num_tokens": 82131078.0, + "step": 28155 + }, + { + "entropy": 0.058482589572668074, + "epoch": 6.5641683179857795, + "grad_norm": 1.0390625, + "learning_rate": 4.8607105103181086e-05, + "loss": 0.0258, + "mean_token_accuracy": 0.990253335237503, + "num_tokens": 82167502.0, + "step": 28160 + }, + { + "entropy": 0.08452012352645397, + "epoch": 6.565333955006411, + "grad_norm": 1.84375, + "learning_rate": 4.860640478947895e-05, + "loss": 0.0601, + "mean_token_accuracy": 0.9837320148944855, + "num_tokens": 82177026.0, + "step": 28165 + }, + { + "entropy": 0.06720562288537621, + "epoch": 6.566499592027043, + "grad_norm": 2.453125, + "learning_rate": 4.860570431016361e-05, + "loss": 0.0301, + "mean_token_accuracy": 0.9914529860019684, + "num_tokens": 82213975.0, + "step": 28170 + }, + { + "entropy": 0.04906615521758795, + "epoch": 6.567665229047675, + "grad_norm": 0.443359375, + "learning_rate": 4.8605003665245516e-05, + "loss": 0.0165, + "mean_token_accuracy": 0.9915839433670044, + "num_tokens": 82249364.0, + "step": 28175 + }, + { + "entropy": 0.06979955788701772, + "epoch": 6.568830866068306, + "grad_norm": 2.5625, + "learning_rate": 4.860430285473511e-05, + "loss": 0.0487, + "mean_token_accuracy": 0.9855699956417083, + "num_tokens": 82267527.0, + "step": 28180 + }, + { + "entropy": 0.06681377086788416, + "epoch": 6.569996503088938, + "grad_norm": 0.1005859375, + "learning_rate": 4.860360187864285e-05, + "loss": 0.044, + "mean_token_accuracy": 0.9887908697128296, + "num_tokens": 82287597.0, + "step": 28185 + }, + { + "entropy": 0.08111393954604865, + "epoch": 6.5711621401095694, + "grad_norm": 1.4296875, + "learning_rate": 4.8602900736979185e-05, + "loss": 0.0386, + "mean_token_accuracy": 0.986400431394577, + "num_tokens": 82306790.0, + "step": 28190 + }, + { + "entropy": 0.07878834493458271, + "epoch": 6.5723277771302016, + "grad_norm": 2.875, + "learning_rate": 4.860219942975457e-05, + "loss": 0.0545, + "mean_token_accuracy": 0.9820395827293396, + "num_tokens": 82323050.0, + "step": 28195 + }, + { + "entropy": 0.03850213307887316, + "epoch": 6.573493414150834, + "grad_norm": 0.2392578125, + "learning_rate": 4.860149795697946e-05, + "loss": 0.0104, + "mean_token_accuracy": 0.9948647737503051, + "num_tokens": 82357011.0, + "step": 28200 + }, + { + "entropy": 0.06725431568920612, + "epoch": 6.574659051171465, + "grad_norm": 2.6875, + "learning_rate": 4.860079631866432e-05, + "loss": 0.0364, + "mean_token_accuracy": 0.9887121975421905, + "num_tokens": 82372585.0, + "step": 28205 + }, + { + "entropy": 0.05918426923453808, + "epoch": 6.575824688192097, + "grad_norm": 1.2421875, + "learning_rate": 4.860009451481961e-05, + "loss": 0.0211, + "mean_token_accuracy": 0.9932548403739929, + "num_tokens": 82408282.0, + "step": 28210 + }, + { + "entropy": 0.05952431866899133, + "epoch": 6.576990325212729, + "grad_norm": 0.15625, + "learning_rate": 4.8599392545455797e-05, + "loss": 0.0282, + "mean_token_accuracy": 0.9910712659358978, + "num_tokens": 82440321.0, + "step": 28215 + }, + { + "entropy": 0.07454060800373555, + "epoch": 6.57815596223336, + "grad_norm": 1.984375, + "learning_rate": 4.859869041058335e-05, + "loss": 0.0395, + "mean_token_accuracy": 0.9880998790264129, + "num_tokens": 82453169.0, + "step": 28220 + }, + { + "entropy": 0.08988696299493312, + "epoch": 6.579321599253992, + "grad_norm": 2.34375, + "learning_rate": 4.859798811021273e-05, + "loss": 0.0599, + "mean_token_accuracy": 0.9867674469947815, + "num_tokens": 82471385.0, + "step": 28225 + }, + { + "entropy": 0.07569172158837319, + "epoch": 6.5804872362746245, + "grad_norm": 1.671875, + "learning_rate": 4.859728564435441e-05, + "loss": 0.0423, + "mean_token_accuracy": 0.9849360704421997, + "num_tokens": 82482967.0, + "step": 28230 + }, + { + "entropy": 0.06420651264488697, + "epoch": 6.581652873295256, + "grad_norm": 0.66796875, + "learning_rate": 4.8596583013018885e-05, + "loss": 0.0427, + "mean_token_accuracy": 0.9870175302028656, + "num_tokens": 82495378.0, + "step": 28235 + }, + { + "entropy": 0.07353771440684795, + "epoch": 6.582818510315888, + "grad_norm": 1.5234375, + "learning_rate": 4.8595880216216604e-05, + "loss": 0.0427, + "mean_token_accuracy": 0.9905295133590698, + "num_tokens": 82506088.0, + "step": 28240 + }, + { + "entropy": 0.05138236228376627, + "epoch": 6.583984147336519, + "grad_norm": 1.46875, + "learning_rate": 4.8595177253958064e-05, + "loss": 0.024, + "mean_token_accuracy": 0.9912134110927582, + "num_tokens": 82535805.0, + "step": 28245 + }, + { + "entropy": 0.057565122842788696, + "epoch": 6.585149784357151, + "grad_norm": 1.0078125, + "learning_rate": 4.859447412625374e-05, + "loss": 0.0339, + "mean_token_accuracy": 0.9907476007938385, + "num_tokens": 82549312.0, + "step": 28250 + }, + { + "entropy": 0.07886847332119942, + "epoch": 6.586315421377783, + "grad_norm": 4.09375, + "learning_rate": 4.859377083311413e-05, + "loss": 0.0484, + "mean_token_accuracy": 0.9851777493953705, + "num_tokens": 82558924.0, + "step": 28255 + }, + { + "entropy": 0.06298988554626703, + "epoch": 6.5874810583984145, + "grad_norm": 2.921875, + "learning_rate": 4.85930673745497e-05, + "loss": 0.0399, + "mean_token_accuracy": 0.9884525418281556, + "num_tokens": 82582432.0, + "step": 28260 + }, + { + "entropy": 0.08036144189536572, + "epoch": 6.588646695419047, + "grad_norm": 1.8125, + "learning_rate": 4.859236375057095e-05, + "loss": 0.0465, + "mean_token_accuracy": 0.987638258934021, + "num_tokens": 82602132.0, + "step": 28265 + }, + { + "entropy": 0.06753709372133017, + "epoch": 6.589812332439678, + "grad_norm": 1.3125, + "learning_rate": 4.859165996118838e-05, + "loss": 0.0328, + "mean_token_accuracy": 0.9888114511966706, + "num_tokens": 82618780.0, + "step": 28270 + }, + { + "entropy": 0.05341739971190691, + "epoch": 6.59097796946031, + "grad_norm": 0.28125, + "learning_rate": 4.8590956006412476e-05, + "loss": 0.0255, + "mean_token_accuracy": 0.9922696411609649, + "num_tokens": 82648437.0, + "step": 28275 + }, + { + "entropy": 0.0667111149057746, + "epoch": 6.592143606480942, + "grad_norm": 0.408203125, + "learning_rate": 4.859025188625374e-05, + "loss": 0.0302, + "mean_token_accuracy": 0.9906487345695496, + "num_tokens": 82669457.0, + "step": 28280 + }, + { + "entropy": 0.06441701222211123, + "epoch": 6.593309243501574, + "grad_norm": 1.7265625, + "learning_rate": 4.858954760072265e-05, + "loss": 0.0451, + "mean_token_accuracy": 0.9849387526512146, + "num_tokens": 82682232.0, + "step": 28285 + }, + { + "entropy": 0.08769273720681667, + "epoch": 6.594474880522205, + "grad_norm": 2.84375, + "learning_rate": 4.858884314982974e-05, + "loss": 0.065, + "mean_token_accuracy": 0.9820234298706054, + "num_tokens": 82690955.0, + "step": 28290 + }, + { + "entropy": 0.06150923212990165, + "epoch": 6.595640517542837, + "grad_norm": 1.5625, + "learning_rate": 4.85881385335855e-05, + "loss": 0.0347, + "mean_token_accuracy": 0.9915253400802613, + "num_tokens": 82710026.0, + "step": 28295 + }, + { + "entropy": 0.058764266222715376, + "epoch": 6.596806154563469, + "grad_norm": 0.94140625, + "learning_rate": 4.858743375200043e-05, + "loss": 0.0274, + "mean_token_accuracy": 0.9908628404140473, + "num_tokens": 82736692.0, + "step": 28300 + }, + { + "entropy": 0.07299914155155421, + "epoch": 6.597971791584101, + "grad_norm": 1.484375, + "learning_rate": 4.858672880508506e-05, + "loss": 0.0383, + "mean_token_accuracy": 0.9905340611934662, + "num_tokens": 82758849.0, + "step": 28305 + }, + { + "entropy": 0.05621396470814943, + "epoch": 6.599137428604733, + "grad_norm": 1.65625, + "learning_rate": 4.858602369284987e-05, + "loss": 0.0233, + "mean_token_accuracy": 0.9891703724861145, + "num_tokens": 82783679.0, + "step": 28310 + }, + { + "entropy": 0.06540145818144083, + "epoch": 6.600303065625364, + "grad_norm": 2.15625, + "learning_rate": 4.8585318415305404e-05, + "loss": 0.0413, + "mean_token_accuracy": 0.9880593121051788, + "num_tokens": 82797141.0, + "step": 28315 + }, + { + "entropy": 0.07652120906859636, + "epoch": 6.601468702645996, + "grad_norm": 2.203125, + "learning_rate": 4.858461297246217e-05, + "loss": 0.0591, + "mean_token_accuracy": 0.9846980929374695, + "num_tokens": 82813933.0, + "step": 28320 + }, + { + "entropy": 0.061003577709197995, + "epoch": 6.602634339666627, + "grad_norm": 1.15625, + "learning_rate": 4.8583907364330677e-05, + "loss": 0.0356, + "mean_token_accuracy": 0.9893332183361053, + "num_tokens": 82828846.0, + "step": 28325 + }, + { + "entropy": 0.08375898487865925, + "epoch": 6.6037999766872595, + "grad_norm": 2.671875, + "learning_rate": 4.858320159092146e-05, + "loss": 0.0502, + "mean_token_accuracy": 0.9873512983322144, + "num_tokens": 82837492.0, + "step": 28330 + }, + { + "entropy": 0.07190875075757504, + "epoch": 6.604965613707892, + "grad_norm": 1.7421875, + "learning_rate": 4.858249565224503e-05, + "loss": 0.0307, + "mean_token_accuracy": 0.9873040735721588, + "num_tokens": 82849052.0, + "step": 28335 + }, + { + "entropy": 0.0680878208950162, + "epoch": 6.606131250728523, + "grad_norm": 0.64453125, + "learning_rate": 4.8581789548311924e-05, + "loss": 0.0497, + "mean_token_accuracy": 0.9851384818553924, + "num_tokens": 82885860.0, + "step": 28340 + }, + { + "entropy": 0.05567469764500856, + "epoch": 6.607296887749155, + "grad_norm": 0.6796875, + "learning_rate": 4.858108327913267e-05, + "loss": 0.0194, + "mean_token_accuracy": 0.9858059644699096, + "num_tokens": 82911685.0, + "step": 28345 + }, + { + "entropy": 0.07293141707777977, + "epoch": 6.608462524769787, + "grad_norm": 3.65625, + "learning_rate": 4.858037684471779e-05, + "loss": 0.041, + "mean_token_accuracy": 0.9894559681415558, + "num_tokens": 82924003.0, + "step": 28350 + }, + { + "entropy": 0.05879962723702192, + "epoch": 6.609628161790418, + "grad_norm": 2.96875, + "learning_rate": 4.857967024507783e-05, + "loss": 0.0248, + "mean_token_accuracy": 0.9917656898498535, + "num_tokens": 82958399.0, + "step": 28355 + }, + { + "entropy": 0.04448529947549105, + "epoch": 6.61079379881105, + "grad_norm": 0.7890625, + "learning_rate": 4.8578963480223326e-05, + "loss": 0.0196, + "mean_token_accuracy": 0.9942450284957886, + "num_tokens": 82998038.0, + "step": 28360 + }, + { + "entropy": 0.06249931612983346, + "epoch": 6.611959435831682, + "grad_norm": 2.03125, + "learning_rate": 4.857825655016481e-05, + "loss": 0.0256, + "mean_token_accuracy": 0.9889679789543152, + "num_tokens": 83025588.0, + "step": 28365 + }, + { + "entropy": 0.08115007188171149, + "epoch": 6.613125072852314, + "grad_norm": 0.478515625, + "learning_rate": 4.857754945491282e-05, + "loss": 0.0458, + "mean_token_accuracy": 0.9878861665725708, + "num_tokens": 83041745.0, + "step": 28370 + }, + { + "entropy": 0.06632826328277588, + "epoch": 6.614290709872946, + "grad_norm": 1.6796875, + "learning_rate": 4.857684219447792e-05, + "loss": 0.0247, + "mean_token_accuracy": 0.9895213901996612, + "num_tokens": 83073646.0, + "step": 28375 + }, + { + "entropy": 0.05887887412682176, + "epoch": 6.615456346893577, + "grad_norm": 1.6171875, + "learning_rate": 4.857613476887063e-05, + "loss": 0.0214, + "mean_token_accuracy": 0.9936350464820862, + "num_tokens": 83101095.0, + "step": 28380 + }, + { + "entropy": 0.06550744706764818, + "epoch": 6.616621983914209, + "grad_norm": 1.09375, + "learning_rate": 4.857542717810152e-05, + "loss": 0.0333, + "mean_token_accuracy": 0.98823082447052, + "num_tokens": 83117537.0, + "step": 28385 + }, + { + "entropy": 0.09187786467373371, + "epoch": 6.617787620934841, + "grad_norm": 2.296875, + "learning_rate": 4.857471942218112e-05, + "loss": 0.0488, + "mean_token_accuracy": 0.9861462652683258, + "num_tokens": 83141574.0, + "step": 28390 + }, + { + "entropy": 0.06437947321683168, + "epoch": 6.618953257955472, + "grad_norm": 0.8125, + "learning_rate": 4.857401150112001e-05, + "loss": 0.0297, + "mean_token_accuracy": 0.9912596046924591, + "num_tokens": 83163578.0, + "step": 28395 + }, + { + "entropy": 0.07269996423274279, + "epoch": 6.6201188949761045, + "grad_norm": 0.6640625, + "learning_rate": 4.8573303414928725e-05, + "loss": 0.0301, + "mean_token_accuracy": 0.990373021364212, + "num_tokens": 83179042.0, + "step": 28400 + }, + { + "entropy": 0.22393528148531913, + "epoch": 6.621284531996736, + "grad_norm": 2.890625, + "learning_rate": 4.857259516361783e-05, + "loss": 0.3705, + "mean_token_accuracy": 0.9582188367843628, + "num_tokens": 83213305.0, + "step": 28405 + }, + { + "entropy": 0.06759192235767841, + "epoch": 6.622450169017368, + "grad_norm": 0.2099609375, + "learning_rate": 4.8571886747197893e-05, + "loss": 0.0361, + "mean_token_accuracy": 0.9905477106571198, + "num_tokens": 83229930.0, + "step": 28410 + }, + { + "entropy": 0.06327125979587436, + "epoch": 6.623615806038, + "grad_norm": 3.515625, + "learning_rate": 4.857117816567947e-05, + "loss": 0.0431, + "mean_token_accuracy": 0.985412847995758, + "num_tokens": 83246877.0, + "step": 28415 + }, + { + "entropy": 0.0795931302011013, + "epoch": 6.624781443058632, + "grad_norm": 4.5625, + "learning_rate": 4.857046941907312e-05, + "loss": 0.0357, + "mean_token_accuracy": 0.9869856595993042, + "num_tokens": 83261191.0, + "step": 28420 + }, + { + "entropy": 0.07875011954456568, + "epoch": 6.625947080079263, + "grad_norm": 3.65625, + "learning_rate": 4.8569760507389426e-05, + "loss": 0.0442, + "mean_token_accuracy": 0.9851264238357544, + "num_tokens": 83285754.0, + "step": 28425 + }, + { + "entropy": 0.0738863229751587, + "epoch": 6.627112717099895, + "grad_norm": 1.0390625, + "learning_rate": 4.856905143063896e-05, + "loss": 0.0335, + "mean_token_accuracy": 0.9872049629688263, + "num_tokens": 83300747.0, + "step": 28430 + }, + { + "entropy": 0.08566249124705791, + "epoch": 6.628278354120527, + "grad_norm": 3.921875, + "learning_rate": 4.8568342188832276e-05, + "loss": 0.0469, + "mean_token_accuracy": 0.9827580094337464, + "num_tokens": 83318653.0, + "step": 28435 + }, + { + "entropy": 0.06706524565815926, + "epoch": 6.629443991141159, + "grad_norm": 1.203125, + "learning_rate": 4.856763278197996e-05, + "loss": 0.0243, + "mean_token_accuracy": 0.9923423230648041, + "num_tokens": 83337335.0, + "step": 28440 + }, + { + "entropy": 0.06407251004129648, + "epoch": 6.630609628161791, + "grad_norm": 0.61328125, + "learning_rate": 4.8566923210092605e-05, + "loss": 0.0239, + "mean_token_accuracy": 0.992389714717865, + "num_tokens": 83360624.0, + "step": 28445 + }, + { + "entropy": 0.0974472158588469, + "epoch": 6.631775265182422, + "grad_norm": 1.8203125, + "learning_rate": 4.856621347318078e-05, + "loss": 0.0313, + "mean_token_accuracy": 0.987954068183899, + "num_tokens": 83399635.0, + "step": 28450 + }, + { + "entropy": 0.08182252198457718, + "epoch": 6.632940902203054, + "grad_norm": 0.5546875, + "learning_rate": 4.856550357125506e-05, + "loss": 0.0521, + "mean_token_accuracy": 0.9852348983287811, + "num_tokens": 83410321.0, + "step": 28455 + }, + { + "entropy": 0.038520860578864814, + "epoch": 6.634106539223685, + "grad_norm": 0.447265625, + "learning_rate": 4.856479350432604e-05, + "loss": 0.0218, + "mean_token_accuracy": 0.9928028047084808, + "num_tokens": 83451475.0, + "step": 28460 + }, + { + "entropy": 0.07194502120837569, + "epoch": 6.635272176244317, + "grad_norm": 2.921875, + "learning_rate": 4.85640832724043e-05, + "loss": 0.028, + "mean_token_accuracy": 0.9891715884208679, + "num_tokens": 83472506.0, + "step": 28465 + }, + { + "entropy": 0.06827894113957882, + "epoch": 6.6364378132649495, + "grad_norm": 3.34375, + "learning_rate": 4.856337287550045e-05, + "loss": 0.0306, + "mean_token_accuracy": 0.9915993750095368, + "num_tokens": 83494023.0, + "step": 28470 + }, + { + "entropy": 0.0858086671680212, + "epoch": 6.637603450285581, + "grad_norm": 1.0078125, + "learning_rate": 4.856266231362506e-05, + "loss": 0.0328, + "mean_token_accuracy": 0.9919438421726227, + "num_tokens": 83505466.0, + "step": 28475 + }, + { + "entropy": 0.08067451752722263, + "epoch": 6.638769087306213, + "grad_norm": 1.5078125, + "learning_rate": 4.856195158678875e-05, + "loss": 0.0409, + "mean_token_accuracy": 0.9864005327224732, + "num_tokens": 83525173.0, + "step": 28480 + }, + { + "entropy": 0.0665781082585454, + "epoch": 6.639934724326845, + "grad_norm": 5.0625, + "learning_rate": 4.856124069500209e-05, + "loss": 0.0447, + "mean_token_accuracy": 0.9896150350570678, + "num_tokens": 83545423.0, + "step": 28485 + }, + { + "entropy": 0.05508313453756273, + "epoch": 6.641100361347476, + "grad_norm": 0.205078125, + "learning_rate": 4.85605296382757e-05, + "loss": 0.0309, + "mean_token_accuracy": 0.9883796334266662, + "num_tokens": 83574737.0, + "step": 28490 + }, + { + "entropy": 0.07376216007396579, + "epoch": 6.642265998368108, + "grad_norm": 1.015625, + "learning_rate": 4.855981841662017e-05, + "loss": 0.064, + "mean_token_accuracy": 0.9849434971809388, + "num_tokens": 83611091.0, + "step": 28495 + }, + { + "entropy": 0.10243667252361774, + "epoch": 6.64343163538874, + "grad_norm": 4.0, + "learning_rate": 4.855910703004612e-05, + "loss": 0.0562, + "mean_token_accuracy": 0.9861430406570435, + "num_tokens": 83620959.0, + "step": 28500 + }, + { + "entropy": 0.07161519918590784, + "epoch": 6.644597272409372, + "grad_norm": 1.84375, + "learning_rate": 4.855839547856415e-05, + "loss": 0.0319, + "mean_token_accuracy": 0.9895271420478821, + "num_tokens": 83632086.0, + "step": 28505 + }, + { + "entropy": 0.09283050457015633, + "epoch": 6.645762909430004, + "grad_norm": 1.671875, + "learning_rate": 4.855768376218487e-05, + "loss": 0.0362, + "mean_token_accuracy": 0.9906116843223571, + "num_tokens": 83662501.0, + "step": 28510 + }, + { + "entropy": 0.08217054158449173, + "epoch": 6.646928546450635, + "grad_norm": 2.1875, + "learning_rate": 4.855697188091889e-05, + "loss": 0.024, + "mean_token_accuracy": 0.9915976047515869, + "num_tokens": 83681165.0, + "step": 28515 + }, + { + "entropy": 0.06361790988594293, + "epoch": 6.648094183471267, + "grad_norm": 1.75, + "learning_rate": 4.855625983477683e-05, + "loss": 0.0299, + "mean_token_accuracy": 0.9915489137172699, + "num_tokens": 83711612.0, + "step": 28520 + }, + { + "entropy": 0.07515859454870225, + "epoch": 6.649259820491899, + "grad_norm": 2.203125, + "learning_rate": 4.85555476237693e-05, + "loss": 0.0676, + "mean_token_accuracy": 0.9828903555870057, + "num_tokens": 83720807.0, + "step": 28525 + }, + { + "entropy": 0.07741277245804667, + "epoch": 6.65042545751253, + "grad_norm": 0.24609375, + "learning_rate": 4.855483524790694e-05, + "loss": 0.0434, + "mean_token_accuracy": 0.9873224675655365, + "num_tokens": 83734105.0, + "step": 28530 + }, + { + "entropy": 0.07927368283271789, + "epoch": 6.651591094533162, + "grad_norm": 0.890625, + "learning_rate": 4.855412270720035e-05, + "loss": 0.0381, + "mean_token_accuracy": 0.9878321468830109, + "num_tokens": 83744752.0, + "step": 28535 + }, + { + "entropy": 0.05704822298139334, + "epoch": 6.652756731553794, + "grad_norm": 0.162109375, + "learning_rate": 4.8553410001660173e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.9922565579414367, + "num_tokens": 83781512.0, + "step": 28540 + }, + { + "entropy": 0.09766516759991646, + "epoch": 6.653922368574426, + "grad_norm": 2.03125, + "learning_rate": 4.855269713129702e-05, + "loss": 0.0484, + "mean_token_accuracy": 0.987651264667511, + "num_tokens": 83791362.0, + "step": 28545 + }, + { + "entropy": 0.0650267457589507, + "epoch": 6.655088005595058, + "grad_norm": 3.59375, + "learning_rate": 4.855198409612153e-05, + "loss": 0.053, + "mean_token_accuracy": 0.9855973184108734, + "num_tokens": 83813283.0, + "step": 28550 + }, + { + "entropy": 0.06372864125296474, + "epoch": 6.65625364261569, + "grad_norm": 0.75390625, + "learning_rate": 4.855127089614433e-05, + "loss": 0.0328, + "mean_token_accuracy": 0.9887797713279725, + "num_tokens": 83832747.0, + "step": 28555 + }, + { + "entropy": 0.037981690280139445, + "epoch": 6.657419279636321, + "grad_norm": 1.28125, + "learning_rate": 4.855055753137606e-05, + "loss": 0.0137, + "mean_token_accuracy": 0.9955213725566864, + "num_tokens": 83873690.0, + "step": 28560 + }, + { + "entropy": 0.06714175110682845, + "epoch": 6.658584916656953, + "grad_norm": 1.5625, + "learning_rate": 4.854984400182736e-05, + "loss": 0.0347, + "mean_token_accuracy": 0.9886840522289276, + "num_tokens": 83905288.0, + "step": 28565 + }, + { + "entropy": 0.07562700193375349, + "epoch": 6.6597505536775845, + "grad_norm": 4.0625, + "learning_rate": 4.854913030750887e-05, + "loss": 0.0442, + "mean_token_accuracy": 0.986762797832489, + "num_tokens": 83924393.0, + "step": 28570 + }, + { + "entropy": 0.05191770112141967, + "epoch": 6.660916190698217, + "grad_norm": 0.44140625, + "learning_rate": 4.8548416448431224e-05, + "loss": 0.0138, + "mean_token_accuracy": 0.9877704203128814, + "num_tokens": 83954953.0, + "step": 28575 + }, + { + "entropy": 0.0616036182269454, + "epoch": 6.662081827718849, + "grad_norm": 5.46875, + "learning_rate": 4.854770242460507e-05, + "loss": 0.0383, + "mean_token_accuracy": 0.9877343237400055, + "num_tokens": 83969020.0, + "step": 28580 + }, + { + "entropy": 0.06464450052008033, + "epoch": 6.66324746473948, + "grad_norm": 3.34375, + "learning_rate": 4.8546988236041054e-05, + "loss": 0.04, + "mean_token_accuracy": 0.9866331398487092, + "num_tokens": 83994057.0, + "step": 28585 + }, + { + "entropy": 0.07502462603151798, + "epoch": 6.664413101760112, + "grad_norm": 0.138671875, + "learning_rate": 4.8546273882749825e-05, + "loss": 0.0506, + "mean_token_accuracy": 0.9836625099182129, + "num_tokens": 84011629.0, + "step": 28590 + }, + { + "entropy": 0.06252543712034822, + "epoch": 6.665578738780743, + "grad_norm": 1.0078125, + "learning_rate": 4.854555936474204e-05, + "loss": 0.0237, + "mean_token_accuracy": 0.9895300209522248, + "num_tokens": 84030753.0, + "step": 28595 + }, + { + "entropy": 0.0681413403712213, + "epoch": 6.666744375801375, + "grad_norm": 1.015625, + "learning_rate": 4.854484468202836e-05, + "loss": 0.0365, + "mean_token_accuracy": 0.9867991805076599, + "num_tokens": 84050125.0, + "step": 28600 + }, + { + "entropy": 0.08328782804310322, + "epoch": 6.6679100128220075, + "grad_norm": 2.5625, + "learning_rate": 4.854412983461943e-05, + "loss": 0.0478, + "mean_token_accuracy": 0.9859695374965668, + "num_tokens": 84069321.0, + "step": 28605 + }, + { + "entropy": 0.07345206961035729, + "epoch": 6.669075649842639, + "grad_norm": 0.5390625, + "learning_rate": 4.8543414822525904e-05, + "loss": 0.0619, + "mean_token_accuracy": 0.9833361506462097, + "num_tokens": 84078921.0, + "step": 28610 + }, + { + "entropy": 0.04605873627588153, + "epoch": 6.670241286863271, + "grad_norm": 0.125, + "learning_rate": 4.854269964575846e-05, + "loss": 0.0225, + "mean_token_accuracy": 0.9940562188625336, + "num_tokens": 84104775.0, + "step": 28615 + }, + { + "entropy": 0.0720367580652237, + "epoch": 6.671406923883903, + "grad_norm": 2.328125, + "learning_rate": 4.854198430432776e-05, + "loss": 0.0367, + "mean_token_accuracy": 0.991383183002472, + "num_tokens": 84118238.0, + "step": 28620 + }, + { + "entropy": 0.06562741016969084, + "epoch": 6.672572560904534, + "grad_norm": 0.267578125, + "learning_rate": 4.854126879824446e-05, + "loss": 0.0207, + "mean_token_accuracy": 0.9923710107803345, + "num_tokens": 84146884.0, + "step": 28625 + }, + { + "entropy": 0.06544369319453835, + "epoch": 6.673738197925166, + "grad_norm": 1.0390625, + "learning_rate": 4.854055312751924e-05, + "loss": 0.0188, + "mean_token_accuracy": 0.9867700815200806, + "num_tokens": 84167785.0, + "step": 28630 + }, + { + "entropy": 0.05859682857990265, + "epoch": 6.674903834945798, + "grad_norm": 0.8984375, + "learning_rate": 4.853983729216276e-05, + "loss": 0.0242, + "mean_token_accuracy": 0.9904904246330262, + "num_tokens": 84185300.0, + "step": 28635 + }, + { + "entropy": 0.07093516178429127, + "epoch": 6.6760694719664295, + "grad_norm": 0.51171875, + "learning_rate": 4.8539121292185704e-05, + "loss": 0.027, + "mean_token_accuracy": 0.9912785172462464, + "num_tokens": 84199854.0, + "step": 28640 + }, + { + "entropy": 0.08791411444544792, + "epoch": 6.677235108987062, + "grad_norm": 0.275390625, + "learning_rate": 4.853840512759874e-05, + "loss": 0.052, + "mean_token_accuracy": 0.9832299053668976, + "num_tokens": 84215162.0, + "step": 28645 + }, + { + "entropy": 0.08093488197773695, + "epoch": 6.678400746007693, + "grad_norm": 0.1552734375, + "learning_rate": 4.853768879841256e-05, + "loss": 0.0584, + "mean_token_accuracy": 0.9838843226432801, + "num_tokens": 84232657.0, + "step": 28650 + }, + { + "entropy": 0.05880971970036626, + "epoch": 6.679566383028325, + "grad_norm": 0.61328125, + "learning_rate": 4.853697230463784e-05, + "loss": 0.0196, + "mean_token_accuracy": 0.992617392539978, + "num_tokens": 84263155.0, + "step": 28655 + }, + { + "entropy": 0.07369680106639862, + "epoch": 6.680732020048957, + "grad_norm": 2.28125, + "learning_rate": 4.853625564628525e-05, + "loss": 0.0162, + "mean_token_accuracy": 0.9927162766456604, + "num_tokens": 84288594.0, + "step": 28660 + }, + { + "entropy": 0.0617294343188405, + "epoch": 6.681897657069588, + "grad_norm": 0.341796875, + "learning_rate": 4.85355388233655e-05, + "loss": 0.0339, + "mean_token_accuracy": 0.9909321308135987, + "num_tokens": 84302921.0, + "step": 28665 + }, + { + "entropy": 0.05793065996840596, + "epoch": 6.68306329409022, + "grad_norm": 0.28125, + "learning_rate": 4.853482183588927e-05, + "loss": 0.0263, + "mean_token_accuracy": 0.9909238398075104, + "num_tokens": 84330261.0, + "step": 28670 + }, + { + "entropy": 0.07100810557603836, + "epoch": 6.684228931110852, + "grad_norm": 0.7109375, + "learning_rate": 4.853410468386724e-05, + "loss": 0.0401, + "mean_token_accuracy": 0.9874763488769531, + "num_tokens": 84347584.0, + "step": 28675 + }, + { + "entropy": 0.09726348333060741, + "epoch": 6.685394568131484, + "grad_norm": 2.65625, + "learning_rate": 4.853338736731012e-05, + "loss": 0.1012, + "mean_token_accuracy": 0.9809386074543, + "num_tokens": 84369145.0, + "step": 28680 + }, + { + "entropy": 0.06574146244674921, + "epoch": 6.686560205152116, + "grad_norm": 0.859375, + "learning_rate": 4.8532669886228596e-05, + "loss": 0.0259, + "mean_token_accuracy": 0.9907530903816223, + "num_tokens": 84387512.0, + "step": 28685 + }, + { + "entropy": 0.06407738225534558, + "epoch": 6.687725842172748, + "grad_norm": 0.2138671875, + "learning_rate": 4.853195224063337e-05, + "loss": 0.0372, + "mean_token_accuracy": 0.9909645438194274, + "num_tokens": 84415777.0, + "step": 28690 + }, + { + "entropy": 0.09450783599168062, + "epoch": 6.688891479193379, + "grad_norm": 0.2109375, + "learning_rate": 4.853123443053515e-05, + "loss": 0.0401, + "mean_token_accuracy": 0.990818876028061, + "num_tokens": 84434248.0, + "step": 28695 + }, + { + "entropy": 0.08516130037605762, + "epoch": 6.690057116214011, + "grad_norm": 4.25, + "learning_rate": 4.8530516455944625e-05, + "loss": 0.0547, + "mean_token_accuracy": 0.9852405250072479, + "num_tokens": 84447974.0, + "step": 28700 + }, + { + "entropy": 0.05056046452373266, + "epoch": 6.691222753234642, + "grad_norm": 1.6015625, + "learning_rate": 4.852979831687251e-05, + "loss": 0.0428, + "mean_token_accuracy": 0.9874833226203918, + "num_tokens": 84478389.0, + "step": 28705 + }, + { + "entropy": 0.10436076521873475, + "epoch": 6.6923883902552745, + "grad_norm": 2.328125, + "learning_rate": 4.8529080013329515e-05, + "loss": 0.0711, + "mean_token_accuracy": 0.986644196510315, + "num_tokens": 84497684.0, + "step": 28710 + }, + { + "entropy": 0.06655118707567453, + "epoch": 6.693554027275907, + "grad_norm": 2.90625, + "learning_rate": 4.8528361545326345e-05, + "loss": 0.0321, + "mean_token_accuracy": 0.9903276085853576, + "num_tokens": 84516211.0, + "step": 28715 + }, + { + "entropy": 0.060899481642991304, + "epoch": 6.694719664296538, + "grad_norm": 1.328125, + "learning_rate": 4.8527642912873714e-05, + "loss": 0.0244, + "mean_token_accuracy": 0.9928938448429108, + "num_tokens": 84544470.0, + "step": 28720 + }, + { + "entropy": 0.0634287366643548, + "epoch": 6.69588530131717, + "grad_norm": 0.66796875, + "learning_rate": 4.852692411598235e-05, + "loss": 0.0307, + "mean_token_accuracy": 0.9917833507061005, + "num_tokens": 84560366.0, + "step": 28725 + }, + { + "entropy": 0.05534190842881799, + "epoch": 6.697050938337801, + "grad_norm": 0.263671875, + "learning_rate": 4.8526205154662954e-05, + "loss": 0.0274, + "mean_token_accuracy": 0.992123681306839, + "num_tokens": 84584648.0, + "step": 28730 + }, + { + "entropy": 0.05186046920716762, + "epoch": 6.698216575358433, + "grad_norm": 0.47265625, + "learning_rate": 4.852548602892626e-05, + "loss": 0.0264, + "mean_token_accuracy": 0.9879955351352692, + "num_tokens": 84612860.0, + "step": 28735 + }, + { + "entropy": 0.09077054243534803, + "epoch": 6.699382212379065, + "grad_norm": 0.9765625, + "learning_rate": 4.8524766738782984e-05, + "loss": 0.0486, + "mean_token_accuracy": 0.9869544804096222, + "num_tokens": 84633335.0, + "step": 28740 + }, + { + "entropy": 0.08246908560395241, + "epoch": 6.700547849399697, + "grad_norm": 3.046875, + "learning_rate": 4.8524047284243854e-05, + "loss": 0.0512, + "mean_token_accuracy": 0.9850801050662994, + "num_tokens": 84643579.0, + "step": 28745 + }, + { + "entropy": 0.08006817158311605, + "epoch": 6.701713486420329, + "grad_norm": 2.828125, + "learning_rate": 4.8523327665319597e-05, + "loss": 0.0413, + "mean_token_accuracy": 0.9881218016147614, + "num_tokens": 84662846.0, + "step": 28750 + }, + { + "entropy": 0.0593781216070056, + "epoch": 6.702879123440961, + "grad_norm": 3.5625, + "learning_rate": 4.8522607882020945e-05, + "loss": 0.0396, + "mean_token_accuracy": 0.987310266494751, + "num_tokens": 84691789.0, + "step": 28755 + }, + { + "entropy": 0.08961878903210163, + "epoch": 6.704044760461592, + "grad_norm": 4.875, + "learning_rate": 4.852188793435863e-05, + "loss": 0.0451, + "mean_token_accuracy": 0.9865108668804169, + "num_tokens": 84719711.0, + "step": 28760 + }, + { + "entropy": 0.051850343402475116, + "epoch": 6.705210397482224, + "grad_norm": 0.2001953125, + "learning_rate": 4.852116782234338e-05, + "loss": 0.0189, + "mean_token_accuracy": 0.9914988338947296, + "num_tokens": 84742705.0, + "step": 28765 + }, + { + "entropy": 0.07821713835000992, + "epoch": 6.706376034502856, + "grad_norm": 1.9375, + "learning_rate": 4.852044754598595e-05, + "loss": 0.0429, + "mean_token_accuracy": 0.9877715647220612, + "num_tokens": 84753776.0, + "step": 28770 + }, + { + "entropy": 0.08759090341627598, + "epoch": 6.7075416715234875, + "grad_norm": 1.03125, + "learning_rate": 4.8519727105297074e-05, + "loss": 0.0473, + "mean_token_accuracy": 0.9881977081298828, + "num_tokens": 84763735.0, + "step": 28775 + }, + { + "entropy": 0.08235547505319118, + "epoch": 6.70870730854412, + "grad_norm": 2.359375, + "learning_rate": 4.851900650028749e-05, + "loss": 0.0516, + "mean_token_accuracy": 0.9870765507221222, + "num_tokens": 84781247.0, + "step": 28780 + }, + { + "entropy": 0.07376531232148409, + "epoch": 6.709872945564751, + "grad_norm": 0.2890625, + "learning_rate": 4.8518285730967944e-05, + "loss": 0.0458, + "mean_token_accuracy": 0.9874989748001098, + "num_tokens": 84806872.0, + "step": 28785 + }, + { + "entropy": 0.08102515386417508, + "epoch": 6.711038582585383, + "grad_norm": 0.91796875, + "learning_rate": 4.8517564797349185e-05, + "loss": 0.0224, + "mean_token_accuracy": 0.9898236274719239, + "num_tokens": 84834660.0, + "step": 28790 + }, + { + "entropy": 0.06574399825185537, + "epoch": 6.712204219606015, + "grad_norm": 1.6328125, + "learning_rate": 4.851684369944197e-05, + "loss": 0.0202, + "mean_token_accuracy": 0.9922596752643585, + "num_tokens": 84849216.0, + "step": 28795 + }, + { + "entropy": 0.07711246721446514, + "epoch": 6.713369856626646, + "grad_norm": 1.3828125, + "learning_rate": 4.851612243725703e-05, + "loss": 0.0459, + "mean_token_accuracy": 0.9878857433795929, + "num_tokens": 84861506.0, + "step": 28800 + }, + { + "entropy": 0.03763336762785911, + "epoch": 6.714535493647278, + "grad_norm": 1.5625, + "learning_rate": 4.851540101080515e-05, + "loss": 0.0149, + "mean_token_accuracy": 0.9945106983184815, + "num_tokens": 84908241.0, + "step": 28805 + }, + { + "entropy": 0.06442044954746962, + "epoch": 6.7157011306679095, + "grad_norm": 0.3671875, + "learning_rate": 4.851467942009706e-05, + "loss": 0.0265, + "mean_token_accuracy": 0.9908295273780823, + "num_tokens": 84928831.0, + "step": 28810 + }, + { + "entropy": 0.1289056757465005, + "epoch": 6.716866767688542, + "grad_norm": 1.5234375, + "learning_rate": 4.851395766514355e-05, + "loss": 0.229, + "mean_token_accuracy": 0.9561670780181885, + "num_tokens": 84960498.0, + "step": 28815 + }, + { + "entropy": 0.07786722630262374, + "epoch": 6.718032404709174, + "grad_norm": 2.453125, + "learning_rate": 4.851323574595535e-05, + "loss": 0.0418, + "mean_token_accuracy": 0.9877800464630127, + "num_tokens": 84978889.0, + "step": 28820 + }, + { + "entropy": 0.06431259289383888, + "epoch": 6.719198041729806, + "grad_norm": 0.50390625, + "learning_rate": 4.8512513662543244e-05, + "loss": 0.0261, + "mean_token_accuracy": 0.991853529214859, + "num_tokens": 85008818.0, + "step": 28825 + }, + { + "entropy": 0.05381359262391925, + "epoch": 6.720363678750437, + "grad_norm": 0.388671875, + "learning_rate": 4.8511791414918006e-05, + "loss": 0.0148, + "mean_token_accuracy": 0.990543258190155, + "num_tokens": 85041514.0, + "step": 28830 + }, + { + "entropy": 0.0829438241198659, + "epoch": 6.721529315771069, + "grad_norm": 0.48828125, + "learning_rate": 4.851106900309038e-05, + "loss": 0.0734, + "mean_token_accuracy": 0.9828771650791168, + "num_tokens": 85058710.0, + "step": 28835 + }, + { + "entropy": 0.07714390307664871, + "epoch": 6.7226949527917, + "grad_norm": 0.77734375, + "learning_rate": 4.851034642707116e-05, + "loss": 0.0398, + "mean_token_accuracy": 0.9864890038967132, + "num_tokens": 85075490.0, + "step": 28840 + }, + { + "entropy": 0.07695426233112812, + "epoch": 6.7238605898123325, + "grad_norm": 0.51953125, + "learning_rate": 4.850962368687112e-05, + "loss": 0.0572, + "mean_token_accuracy": 0.9874246895313263, + "num_tokens": 85090486.0, + "step": 28845 + }, + { + "entropy": 0.04998069824650884, + "epoch": 6.725026226832965, + "grad_norm": 2.34375, + "learning_rate": 4.850890078250103e-05, + "loss": 0.0238, + "mean_token_accuracy": 0.9922325253486634, + "num_tokens": 85128029.0, + "step": 28850 + }, + { + "entropy": 0.0514289460144937, + "epoch": 6.726191863853596, + "grad_norm": 0.447265625, + "learning_rate": 4.850817771397166e-05, + "loss": 0.0218, + "mean_token_accuracy": 0.9909493029117584, + "num_tokens": 85151400.0, + "step": 28855 + }, + { + "entropy": 0.0715357482433319, + "epoch": 6.727357500874228, + "grad_norm": 0.578125, + "learning_rate": 4.8507454481293814e-05, + "loss": 0.0366, + "mean_token_accuracy": 0.9905698835849762, + "num_tokens": 85175731.0, + "step": 28860 + }, + { + "entropy": 0.07424063235521317, + "epoch": 6.728523137894859, + "grad_norm": 0.71875, + "learning_rate": 4.8506731084478254e-05, + "loss": 0.0255, + "mean_token_accuracy": 0.9909955024719238, + "num_tokens": 85197952.0, + "step": 28865 + }, + { + "entropy": 0.12476393207907677, + "epoch": 6.729688774915491, + "grad_norm": 1.3671875, + "learning_rate": 4.850600752353579e-05, + "loss": 0.1117, + "mean_token_accuracy": 0.975234842300415, + "num_tokens": 85218193.0, + "step": 28870 + }, + { + "entropy": 0.08583814539015293, + "epoch": 6.730854411936123, + "grad_norm": 5.53125, + "learning_rate": 4.8505283798477195e-05, + "loss": 0.0451, + "mean_token_accuracy": 0.9879949271678925, + "num_tokens": 85229458.0, + "step": 28875 + }, + { + "entropy": 0.0823191050440073, + "epoch": 6.7320200489567545, + "grad_norm": 2.140625, + "learning_rate": 4.850455990931327e-05, + "loss": 0.0145, + "mean_token_accuracy": 0.9918103396892548, + "num_tokens": 85259716.0, + "step": 28880 + }, + { + "entropy": 0.06291626095771789, + "epoch": 6.733185685977387, + "grad_norm": 1.0390625, + "learning_rate": 4.85038358560548e-05, + "loss": 0.0315, + "mean_token_accuracy": 0.9903146743774414, + "num_tokens": 85270424.0, + "step": 28885 + }, + { + "entropy": 0.06514295479282736, + "epoch": 6.734351322998019, + "grad_norm": 0.74609375, + "learning_rate": 4.850311163871259e-05, + "loss": 0.0362, + "mean_token_accuracy": 0.9864876389503479, + "num_tokens": 85288155.0, + "step": 28890 + }, + { + "entropy": 0.051016226317733525, + "epoch": 6.73551696001865, + "grad_norm": 0.80078125, + "learning_rate": 4.8502387257297435e-05, + "loss": 0.0129, + "mean_token_accuracy": 0.9911475658416748, + "num_tokens": 85321624.0, + "step": 28895 + }, + { + "entropy": 0.0672379924915731, + "epoch": 6.736682597039282, + "grad_norm": 0.353515625, + "learning_rate": 4.8501662711820136e-05, + "loss": 0.0431, + "mean_token_accuracy": 0.9873587965965271, + "num_tokens": 85341463.0, + "step": 28900 + }, + { + "entropy": 0.07106356099247932, + "epoch": 6.737848234059914, + "grad_norm": 1.1640625, + "learning_rate": 4.8500938002291494e-05, + "loss": 0.0357, + "mean_token_accuracy": 0.991501921415329, + "num_tokens": 85352796.0, + "step": 28905 + }, + { + "entropy": 0.07119618002325297, + "epoch": 6.739013871080545, + "grad_norm": 0.470703125, + "learning_rate": 4.8500213128722326e-05, + "loss": 0.0365, + "mean_token_accuracy": 0.9895273089408875, + "num_tokens": 85368371.0, + "step": 28910 + }, + { + "entropy": 0.07297233371064067, + "epoch": 6.7401795081011775, + "grad_norm": 1.5625, + "learning_rate": 4.849948809112344e-05, + "loss": 0.0362, + "mean_token_accuracy": 0.9880306363105774, + "num_tokens": 85386310.0, + "step": 28915 + }, + { + "entropy": 0.05042272610589862, + "epoch": 6.741345145121809, + "grad_norm": 0.5390625, + "learning_rate": 4.849876288950563e-05, + "loss": 0.0113, + "mean_token_accuracy": 0.9967696309089661, + "num_tokens": 85415012.0, + "step": 28920 + }, + { + "entropy": 0.07025924921035767, + "epoch": 6.742510782142441, + "grad_norm": 2.859375, + "learning_rate": 4.849803752387974e-05, + "loss": 0.0318, + "mean_token_accuracy": 0.9872465670108795, + "num_tokens": 85427889.0, + "step": 28925 + }, + { + "entropy": 0.058800591714680196, + "epoch": 6.743676419163073, + "grad_norm": 0.1767578125, + "learning_rate": 4.849731199425655e-05, + "loss": 0.0409, + "mean_token_accuracy": 0.9868711709976197, + "num_tokens": 85453298.0, + "step": 28930 + }, + { + "entropy": 0.07147304005920888, + "epoch": 6.744842056183704, + "grad_norm": 1.3125, + "learning_rate": 4.8496586300646905e-05, + "loss": 0.0376, + "mean_token_accuracy": 0.9878616869449616, + "num_tokens": 85469989.0, + "step": 28935 + }, + { + "entropy": 0.03608826128765941, + "epoch": 6.746007693204336, + "grad_norm": 0.19921875, + "learning_rate": 4.849586044306162e-05, + "loss": 0.0078, + "mean_token_accuracy": 0.9953897356986999, + "num_tokens": 85507209.0, + "step": 28940 + }, + { + "entropy": 0.06290267184376716, + "epoch": 6.7471733302249675, + "grad_norm": 3.921875, + "learning_rate": 4.8495134421511515e-05, + "loss": 0.0496, + "mean_token_accuracy": 0.9858272433280945, + "num_tokens": 85518328.0, + "step": 28945 + }, + { + "entropy": 0.07403470184653997, + "epoch": 6.7483389672456, + "grad_norm": 2.46875, + "learning_rate": 4.849440823600742e-05, + "loss": 0.0458, + "mean_token_accuracy": 0.9867697477340698, + "num_tokens": 85533094.0, + "step": 28950 + }, + { + "entropy": 0.07181532364338636, + "epoch": 6.749504604266232, + "grad_norm": 3.359375, + "learning_rate": 4.8493681886560154e-05, + "loss": 0.0359, + "mean_token_accuracy": 0.9885821998119354, + "num_tokens": 85557546.0, + "step": 28955 + }, + { + "entropy": 0.0670968891121447, + "epoch": 6.750670241286863, + "grad_norm": 0.30078125, + "learning_rate": 4.8492955373180567e-05, + "loss": 0.0211, + "mean_token_accuracy": 0.992115980386734, + "num_tokens": 85583928.0, + "step": 28960 + }, + { + "entropy": 0.08538498897105455, + "epoch": 6.751835878307495, + "grad_norm": 3.4375, + "learning_rate": 4.849222869587947e-05, + "loss": 0.0511, + "mean_token_accuracy": 0.9852666079998016, + "num_tokens": 85597053.0, + "step": 28965 + }, + { + "entropy": 0.07742054816335439, + "epoch": 6.753001515328127, + "grad_norm": 2.921875, + "learning_rate": 4.849150185466772e-05, + "loss": 0.0627, + "mean_token_accuracy": 0.9815660178661346, + "num_tokens": 85622712.0, + "step": 28970 + }, + { + "entropy": 0.0654791722074151, + "epoch": 6.754167152348758, + "grad_norm": 0.20703125, + "learning_rate": 4.849077484955614e-05, + "loss": 0.0419, + "mean_token_accuracy": 0.9875000894069672, + "num_tokens": 85647033.0, + "step": 28975 + }, + { + "entropy": 0.062252599932253364, + "epoch": 6.75533278936939, + "grad_norm": 1.21875, + "learning_rate": 4.8490047680555574e-05, + "loss": 0.0224, + "mean_token_accuracy": 0.9896842062473297, + "num_tokens": 85670834.0, + "step": 28980 + }, + { + "entropy": 0.0574477035086602, + "epoch": 6.7564984263900225, + "grad_norm": 0.8671875, + "learning_rate": 4.848932034767687e-05, + "loss": 0.0186, + "mean_token_accuracy": 0.991297823190689, + "num_tokens": 85698076.0, + "step": 28985 + }, + { + "entropy": 0.06765312943607568, + "epoch": 6.757664063410654, + "grad_norm": 4.625, + "learning_rate": 4.848859285093087e-05, + "loss": 0.034, + "mean_token_accuracy": 0.9909755229949951, + "num_tokens": 85717080.0, + "step": 28990 + }, + { + "entropy": 0.07195999156683683, + "epoch": 6.758829700431286, + "grad_norm": 0.42578125, + "learning_rate": 4.848786519032842e-05, + "loss": 0.047, + "mean_token_accuracy": 0.9886180579662323, + "num_tokens": 85737727.0, + "step": 28995 + }, + { + "entropy": 0.06367543712258339, + "epoch": 6.759995337451917, + "grad_norm": 1.2578125, + "learning_rate": 4.848713736588038e-05, + "loss": 0.0382, + "mean_token_accuracy": 0.9904224634170532, + "num_tokens": 85752008.0, + "step": 29000 + }, + { + "entropy": 0.06906843390315771, + "epoch": 6.761160974472549, + "grad_norm": 0.99609375, + "learning_rate": 4.84864093775976e-05, + "loss": 0.0297, + "mean_token_accuracy": 0.990409255027771, + "num_tokens": 85766147.0, + "step": 29005 + }, + { + "entropy": 0.07358511816710234, + "epoch": 6.762326611493181, + "grad_norm": 4.0, + "learning_rate": 4.848568122549092e-05, + "loss": 0.0349, + "mean_token_accuracy": 0.9889043509960175, + "num_tokens": 85779365.0, + "step": 29010 + }, + { + "entropy": 0.05689704436808825, + "epoch": 6.7634922485138125, + "grad_norm": 2.8125, + "learning_rate": 4.8484952909571215e-05, + "loss": 0.0246, + "mean_token_accuracy": 0.9909472405910492, + "num_tokens": 85805043.0, + "step": 29015 + }, + { + "entropy": 0.061226178891956806, + "epoch": 6.764657885534445, + "grad_norm": 0.4609375, + "learning_rate": 4.848422442984934e-05, + "loss": 0.0279, + "mean_token_accuracy": 0.9857422173023224, + "num_tokens": 85822023.0, + "step": 29020 + }, + { + "entropy": 0.08178901560604572, + "epoch": 6.765823522555077, + "grad_norm": 2.140625, + "learning_rate": 4.8483495786336156e-05, + "loss": 0.0496, + "mean_token_accuracy": 0.9885387241840362, + "num_tokens": 85831669.0, + "step": 29025 + }, + { + "entropy": 0.06862952103838324, + "epoch": 6.766989159575708, + "grad_norm": 1.2421875, + "learning_rate": 4.848276697904253e-05, + "loss": 0.0373, + "mean_token_accuracy": 0.989341801404953, + "num_tokens": 85854445.0, + "step": 29030 + }, + { + "entropy": 0.06690819151699542, + "epoch": 6.76815479659634, + "grad_norm": 2.359375, + "learning_rate": 4.848203800797933e-05, + "loss": 0.0267, + "mean_token_accuracy": 0.9925275266170501, + "num_tokens": 85883012.0, + "step": 29035 + }, + { + "entropy": 0.05663586547598243, + "epoch": 6.769320433616972, + "grad_norm": 0.98828125, + "learning_rate": 4.848130887315743e-05, + "loss": 0.019, + "mean_token_accuracy": 0.9927861154079437, + "num_tokens": 85904037.0, + "step": 29040 + }, + { + "entropy": 0.06998476311564446, + "epoch": 6.770486070637603, + "grad_norm": 0.431640625, + "learning_rate": 4.848057957458769e-05, + "loss": 0.0492, + "mean_token_accuracy": 0.9872413158416748, + "num_tokens": 85917549.0, + "step": 29045 + }, + { + "entropy": 0.07485338505357504, + "epoch": 6.771651707658235, + "grad_norm": 2.515625, + "learning_rate": 4.847985011228099e-05, + "loss": 0.032, + "mean_token_accuracy": 0.9896779417991638, + "num_tokens": 85929221.0, + "step": 29050 + }, + { + "entropy": 0.06818428202532231, + "epoch": 6.772817344678867, + "grad_norm": 0.90625, + "learning_rate": 4.847912048624822e-05, + "loss": 0.0182, + "mean_token_accuracy": 0.9925785660743713, + "num_tokens": 85966358.0, + "step": 29055 + }, + { + "entropy": 0.05105516081675887, + "epoch": 6.773982981699499, + "grad_norm": 0.67578125, + "learning_rate": 4.847839069650024e-05, + "loss": 0.0181, + "mean_token_accuracy": 0.9904293179512024, + "num_tokens": 85995848.0, + "step": 29060 + }, + { + "entropy": 0.05594795113429427, + "epoch": 6.775148618720131, + "grad_norm": 0.443359375, + "learning_rate": 4.847766074304795e-05, + "loss": 0.0315, + "mean_token_accuracy": 0.9890788733959198, + "num_tokens": 86021444.0, + "step": 29065 + }, + { + "entropy": 0.2655148051679134, + "epoch": 6.776314255740762, + "grad_norm": 4.09375, + "learning_rate": 4.847693062590223e-05, + "loss": 0.4223, + "mean_token_accuracy": 0.9431075990200043, + "num_tokens": 86041173.0, + "step": 29070 + }, + { + "entropy": 0.06743866577744484, + "epoch": 6.777479892761394, + "grad_norm": 1.109375, + "learning_rate": 4.847620034507396e-05, + "loss": 0.0372, + "mean_token_accuracy": 0.9891158044338226, + "num_tokens": 86051811.0, + "step": 29075 + }, + { + "entropy": 0.062434398010373114, + "epoch": 6.778645529782025, + "grad_norm": 0.6015625, + "learning_rate": 4.847546990057403e-05, + "loss": 0.0236, + "mean_token_accuracy": 0.9920885801315308, + "num_tokens": 86069985.0, + "step": 29080 + }, + { + "entropy": 0.05638897055760026, + "epoch": 6.7798111668026575, + "grad_norm": 0.53515625, + "learning_rate": 4.847473929241334e-05, + "loss": 0.0227, + "mean_token_accuracy": 0.9927825272083283, + "num_tokens": 86100719.0, + "step": 29085 + }, + { + "entropy": 0.05632878141477704, + "epoch": 6.78097680382329, + "grad_norm": 1.6640625, + "learning_rate": 4.847400852060278e-05, + "loss": 0.0294, + "mean_token_accuracy": 0.9924284636974334, + "num_tokens": 86124617.0, + "step": 29090 + }, + { + "entropy": 0.07136726304888726, + "epoch": 6.782142440843921, + "grad_norm": 0.3828125, + "learning_rate": 4.847327758515324e-05, + "loss": 0.0329, + "mean_token_accuracy": 0.9870093047618866, + "num_tokens": 86147085.0, + "step": 29095 + }, + { + "entropy": 0.06836878564208745, + "epoch": 6.783308077864553, + "grad_norm": 0.63671875, + "learning_rate": 4.8472546486075637e-05, + "loss": 0.0482, + "mean_token_accuracy": 0.9857183873653412, + "num_tokens": 86159179.0, + "step": 29100 + }, + { + "entropy": 0.06574140526354313, + "epoch": 6.784473714885185, + "grad_norm": 1.546875, + "learning_rate": 4.847181522338086e-05, + "loss": 0.0339, + "mean_token_accuracy": 0.9897574663162232, + "num_tokens": 86176824.0, + "step": 29105 + }, + { + "entropy": 0.09539928231388331, + "epoch": 6.785639351905816, + "grad_norm": 2.296875, + "learning_rate": 4.8471083797079814e-05, + "loss": 0.0424, + "mean_token_accuracy": 0.9886773109436036, + "num_tokens": 86190608.0, + "step": 29110 + }, + { + "entropy": 0.05910975374281406, + "epoch": 6.786804988926448, + "grad_norm": 1.765625, + "learning_rate": 4.84703522071834e-05, + "loss": 0.0246, + "mean_token_accuracy": 0.9930814146995545, + "num_tokens": 86214225.0, + "step": 29115 + }, + { + "entropy": 0.10080632586032152, + "epoch": 6.7879706259470804, + "grad_norm": 1.9609375, + "learning_rate": 4.846962045370255e-05, + "loss": 0.0418, + "mean_token_accuracy": 0.9877125442028045, + "num_tokens": 86233522.0, + "step": 29120 + }, + { + "entropy": 0.08956030709668994, + "epoch": 6.789136262967712, + "grad_norm": 1.203125, + "learning_rate": 4.8468888536648146e-05, + "loss": 0.0597, + "mean_token_accuracy": 0.9848473966121674, + "num_tokens": 86252004.0, + "step": 29125 + }, + { + "entropy": 0.06908920761197805, + "epoch": 6.790301899988344, + "grad_norm": 0.95703125, + "learning_rate": 4.8468156456031125e-05, + "loss": 0.0331, + "mean_token_accuracy": 0.987659215927124, + "num_tokens": 86267780.0, + "step": 29130 + }, + { + "entropy": 0.08533047027885914, + "epoch": 6.791467537008975, + "grad_norm": 1.9453125, + "learning_rate": 4.846742421186238e-05, + "loss": 0.0671, + "mean_token_accuracy": 0.9829756557941437, + "num_tokens": 86277473.0, + "step": 29135 + }, + { + "entropy": 0.11114938296377659, + "epoch": 6.792633174029607, + "grad_norm": 3.796875, + "learning_rate": 4.8466691804152856e-05, + "loss": 0.1287, + "mean_token_accuracy": 0.9729100704193115, + "num_tokens": 86298359.0, + "step": 29140 + }, + { + "entropy": 0.05975300762802362, + "epoch": 6.793798811050239, + "grad_norm": 0.828125, + "learning_rate": 4.846595923291346e-05, + "loss": 0.0355, + "mean_token_accuracy": 0.9907340705394745, + "num_tokens": 86312783.0, + "step": 29145 + }, + { + "entropy": 0.0666638569906354, + "epoch": 6.79496444807087, + "grad_norm": 0.44140625, + "learning_rate": 4.846522649815512e-05, + "loss": 0.0402, + "mean_token_accuracy": 0.9895487248897552, + "num_tokens": 86330191.0, + "step": 29150 + }, + { + "entropy": 0.06604398051276802, + "epoch": 6.7961300850915025, + "grad_norm": 2.4375, + "learning_rate": 4.846449359988876e-05, + "loss": 0.0348, + "mean_token_accuracy": 0.988951587677002, + "num_tokens": 86346131.0, + "step": 29155 + }, + { + "entropy": 0.07143577151000499, + "epoch": 6.797295722112135, + "grad_norm": 3.015625, + "learning_rate": 4.846376053812531e-05, + "loss": 0.0434, + "mean_token_accuracy": 0.9879807472229004, + "num_tokens": 86357456.0, + "step": 29160 + }, + { + "entropy": 0.07275726459920406, + "epoch": 6.798461359132766, + "grad_norm": 0.5859375, + "learning_rate": 4.846302731287569e-05, + "loss": 0.0333, + "mean_token_accuracy": 0.9879595756530761, + "num_tokens": 86374022.0, + "step": 29165 + }, + { + "entropy": 0.06351196058094502, + "epoch": 6.799626996153398, + "grad_norm": 2.796875, + "learning_rate": 4.846229392415085e-05, + "loss": 0.0215, + "mean_token_accuracy": 0.9883199453353881, + "num_tokens": 86406372.0, + "step": 29170 + }, + { + "entropy": 0.06369423121213913, + "epoch": 6.80079263317403, + "grad_norm": 0.4765625, + "learning_rate": 4.8461560371961726e-05, + "loss": 0.0393, + "mean_token_accuracy": 0.9890810310840606, + "num_tokens": 86428485.0, + "step": 29175 + }, + { + "entropy": 0.088828045129776, + "epoch": 6.801958270194661, + "grad_norm": 2.390625, + "learning_rate": 4.8460826656319244e-05, + "loss": 0.0467, + "mean_token_accuracy": 0.9871364951133728, + "num_tokens": 86446053.0, + "step": 29180 + }, + { + "entropy": 0.06377274207770825, + "epoch": 6.803123907215293, + "grad_norm": 1.3359375, + "learning_rate": 4.846009277723435e-05, + "loss": 0.0247, + "mean_token_accuracy": 0.9909086942672729, + "num_tokens": 86472828.0, + "step": 29185 + }, + { + "entropy": 0.06937651876360178, + "epoch": 6.804289544235925, + "grad_norm": 0.51953125, + "learning_rate": 4.845935873471799e-05, + "loss": 0.0465, + "mean_token_accuracy": 0.988141006231308, + "num_tokens": 86488960.0, + "step": 29190 + }, + { + "entropy": 0.052994494885206224, + "epoch": 6.805455181256557, + "grad_norm": 0.470703125, + "learning_rate": 4.84586245287811e-05, + "loss": 0.023, + "mean_token_accuracy": 0.9916663050651551, + "num_tokens": 86524601.0, + "step": 29195 + }, + { + "entropy": 0.0682718912139535, + "epoch": 6.806620818277189, + "grad_norm": 1.8671875, + "learning_rate": 4.845789015943464e-05, + "loss": 0.0414, + "mean_token_accuracy": 0.9885375797748566, + "num_tokens": 86543402.0, + "step": 29200 + }, + { + "entropy": 0.06436384730041027, + "epoch": 6.80778645529782, + "grad_norm": 1.53125, + "learning_rate": 4.845715562668956e-05, + "loss": 0.0251, + "mean_token_accuracy": 0.9882299304008484, + "num_tokens": 86564820.0, + "step": 29205 + }, + { + "entropy": 0.0728614155203104, + "epoch": 6.808952092318452, + "grad_norm": 0.87890625, + "learning_rate": 4.845642093055681e-05, + "loss": 0.0372, + "mean_token_accuracy": 0.9917413115501403, + "num_tokens": 86578411.0, + "step": 29210 + }, + { + "entropy": 0.05707103759050369, + "epoch": 6.810117729339083, + "grad_norm": 1.7578125, + "learning_rate": 4.8455686071047334e-05, + "loss": 0.0209, + "mean_token_accuracy": 0.9903227090835571, + "num_tokens": 86598438.0, + "step": 29215 + }, + { + "entropy": 0.054839993361383677, + "epoch": 6.811283366359715, + "grad_norm": 1.6171875, + "learning_rate": 4.845495104817211e-05, + "loss": 0.0329, + "mean_token_accuracy": 0.9889629364013672, + "num_tokens": 86618133.0, + "step": 29220 + }, + { + "entropy": 0.0673223901540041, + "epoch": 6.8124490033803475, + "grad_norm": 3.671875, + "learning_rate": 4.8454215861942084e-05, + "loss": 0.0365, + "mean_token_accuracy": 0.9903973639011383, + "num_tokens": 86629789.0, + "step": 29225 + }, + { + "entropy": 0.05427380716428161, + "epoch": 6.813614640400979, + "grad_norm": 1.3359375, + "learning_rate": 4.845348051236823e-05, + "loss": 0.0224, + "mean_token_accuracy": 0.9940382957458496, + "num_tokens": 86664693.0, + "step": 29230 + }, + { + "entropy": 0.06686010584235191, + "epoch": 6.814780277421611, + "grad_norm": 0.349609375, + "learning_rate": 4.84527449994615e-05, + "loss": 0.0403, + "mean_token_accuracy": 0.9875875532627105, + "num_tokens": 86688064.0, + "step": 29235 + }, + { + "entropy": 0.07085557524114847, + "epoch": 6.815945914442243, + "grad_norm": 1.46875, + "learning_rate": 4.845200932323287e-05, + "loss": 0.0301, + "mean_token_accuracy": 0.9859791278839112, + "num_tokens": 86702004.0, + "step": 29240 + }, + { + "entropy": 0.0595591738820076, + "epoch": 6.817111551462874, + "grad_norm": 1.09375, + "learning_rate": 4.845127348369331e-05, + "loss": 0.0311, + "mean_token_accuracy": 0.9883405685424804, + "num_tokens": 86723765.0, + "step": 29245 + }, + { + "entropy": 0.060622527822852136, + "epoch": 6.818277188483506, + "grad_norm": 1.09375, + "learning_rate": 4.845053748085379e-05, + "loss": 0.0381, + "mean_token_accuracy": 0.9894052445888519, + "num_tokens": 86736897.0, + "step": 29250 + }, + { + "entropy": 0.06368033736944198, + "epoch": 6.819442825504138, + "grad_norm": 1.640625, + "learning_rate": 4.8449801314725284e-05, + "loss": 0.0358, + "mean_token_accuracy": 0.9864093840122223, + "num_tokens": 86754372.0, + "step": 29255 + }, + { + "entropy": 0.05678990054875612, + "epoch": 6.82060846252477, + "grad_norm": 1.5234375, + "learning_rate": 4.844906498531877e-05, + "loss": 0.0377, + "mean_token_accuracy": 0.9898879945278167, + "num_tokens": 86787915.0, + "step": 29260 + }, + { + "entropy": 0.0435803915373981, + "epoch": 6.821774099545402, + "grad_norm": 0.275390625, + "learning_rate": 4.8448328492645236e-05, + "loss": 0.0175, + "mean_token_accuracy": 0.9927274286746979, + "num_tokens": 86815853.0, + "step": 29265 + }, + { + "entropy": 0.06827406398952007, + "epoch": 6.822939736566033, + "grad_norm": 1.9609375, + "learning_rate": 4.844759183671565e-05, + "loss": 0.0418, + "mean_token_accuracy": 0.9852350890636444, + "num_tokens": 86834030.0, + "step": 29270 + }, + { + "entropy": 0.0832410730421543, + "epoch": 6.824105373586665, + "grad_norm": 6.15625, + "learning_rate": 4.8446855017541004e-05, + "loss": 0.0632, + "mean_token_accuracy": 0.9872022151947022, + "num_tokens": 86842714.0, + "step": 29275 + }, + { + "entropy": 0.06686999946832657, + "epoch": 6.825271010607297, + "grad_norm": 0.328125, + "learning_rate": 4.844611803513228e-05, + "loss": 0.0193, + "mean_token_accuracy": 0.9939013659954071, + "num_tokens": 86860947.0, + "step": 29280 + }, + { + "entropy": 0.053580792341381314, + "epoch": 6.826436647627928, + "grad_norm": 0.96484375, + "learning_rate": 4.844538088950048e-05, + "loss": 0.0213, + "mean_token_accuracy": 0.9897189140319824, + "num_tokens": 86889554.0, + "step": 29285 + }, + { + "entropy": 0.0600953180808574, + "epoch": 6.8276022846485604, + "grad_norm": 1.9609375, + "learning_rate": 4.844464358065659e-05, + "loss": 0.0439, + "mean_token_accuracy": 0.9867989003658295, + "num_tokens": 86914705.0, + "step": 29290 + }, + { + "entropy": 0.05365408333018422, + "epoch": 6.8287679216691926, + "grad_norm": 0.2041015625, + "learning_rate": 4.8443906108611594e-05, + "loss": 0.0169, + "mean_token_accuracy": 0.9927012145519256, + "num_tokens": 86948313.0, + "step": 29295 + }, + { + "entropy": 0.04468016289174557, + "epoch": 6.829933558689824, + "grad_norm": 0.78125, + "learning_rate": 4.84431684733765e-05, + "loss": 0.0153, + "mean_token_accuracy": 0.9934628009796143, + "num_tokens": 86981371.0, + "step": 29300 + }, + { + "entropy": 0.060497371945530176, + "epoch": 6.831099195710456, + "grad_norm": 2.359375, + "learning_rate": 4.8442430674962315e-05, + "loss": 0.0183, + "mean_token_accuracy": 0.9893570840358734, + "num_tokens": 87012118.0, + "step": 29305 + }, + { + "entropy": 0.05356447799131274, + "epoch": 6.832264832731088, + "grad_norm": 0.474609375, + "learning_rate": 4.844169271338002e-05, + "loss": 0.0093, + "mean_token_accuracy": 0.9939198553562164, + "num_tokens": 87045066.0, + "step": 29310 + }, + { + "entropy": 0.08580188415944576, + "epoch": 6.833430469751719, + "grad_norm": 1.828125, + "learning_rate": 4.844095458864063e-05, + "loss": 0.042, + "mean_token_accuracy": 0.9881276488304138, + "num_tokens": 87055976.0, + "step": 29315 + }, + { + "entropy": 0.058746106829494237, + "epoch": 6.834596106772351, + "grad_norm": 0.5078125, + "learning_rate": 4.8440216300755156e-05, + "loss": 0.0228, + "mean_token_accuracy": 0.9936113774776458, + "num_tokens": 87085259.0, + "step": 29320 + }, + { + "entropy": 0.06592875272035599, + "epoch": 6.8357617437929825, + "grad_norm": 3.25, + "learning_rate": 4.8439477849734596e-05, + "loss": 0.0287, + "mean_token_accuracy": 0.9858220040798187, + "num_tokens": 87107554.0, + "step": 29325 + }, + { + "entropy": 0.06077197715640068, + "epoch": 6.836927380813615, + "grad_norm": 3.15625, + "learning_rate": 4.843873923558997e-05, + "loss": 0.0373, + "mean_token_accuracy": 0.9901536464691162, + "num_tokens": 87125825.0, + "step": 29330 + }, + { + "entropy": 0.06383555121719837, + "epoch": 6.838093017834247, + "grad_norm": 2.015625, + "learning_rate": 4.843800045833229e-05, + "loss": 0.0246, + "mean_token_accuracy": 0.992045420408249, + "num_tokens": 87148897.0, + "step": 29335 + }, + { + "entropy": 0.06727636214345693, + "epoch": 6.839258654854878, + "grad_norm": 2.796875, + "learning_rate": 4.8437261517972565e-05, + "loss": 0.0405, + "mean_token_accuracy": 0.9885002434253692, + "num_tokens": 87177972.0, + "step": 29340 + }, + { + "entropy": 0.07472562920302153, + "epoch": 6.84042429187551, + "grad_norm": 1.75, + "learning_rate": 4.843652241452183e-05, + "loss": 0.0477, + "mean_token_accuracy": 0.9853593826293945, + "num_tokens": 87188899.0, + "step": 29345 + }, + { + "entropy": 0.045274854823946956, + "epoch": 6.841589928896141, + "grad_norm": 0.66015625, + "learning_rate": 4.8435783147991084e-05, + "loss": 0.0232, + "mean_token_accuracy": 0.9936028838157653, + "num_tokens": 87219880.0, + "step": 29350 + }, + { + "entropy": 0.07913060076534748, + "epoch": 6.842755565916773, + "grad_norm": 0.40625, + "learning_rate": 4.8435043718391374e-05, + "loss": 0.0466, + "mean_token_accuracy": 0.9835161387920379, + "num_tokens": 87233980.0, + "step": 29355 + }, + { + "entropy": 0.07882125917822122, + "epoch": 6.8439212029374055, + "grad_norm": 0.92578125, + "learning_rate": 4.8434304125733715e-05, + "loss": 0.0413, + "mean_token_accuracy": 0.9837761163711548, + "num_tokens": 87249683.0, + "step": 29360 + }, + { + "entropy": 0.08739354386925698, + "epoch": 6.845086839958037, + "grad_norm": 2.796875, + "learning_rate": 4.8433564370029126e-05, + "loss": 0.0455, + "mean_token_accuracy": 0.9867386996746064, + "num_tokens": 87260438.0, + "step": 29365 + }, + { + "entropy": 0.09364954633638263, + "epoch": 6.846252476978669, + "grad_norm": 1.3828125, + "learning_rate": 4.843282445128866e-05, + "loss": 0.0433, + "mean_token_accuracy": 0.9867539525032043, + "num_tokens": 87277536.0, + "step": 29370 + }, + { + "entropy": 0.09267938621342182, + "epoch": 6.847418113999301, + "grad_norm": 1.8125, + "learning_rate": 4.843208436952333e-05, + "loss": 0.0578, + "mean_token_accuracy": 0.9815105080604554, + "num_tokens": 87287407.0, + "step": 29375 + }, + { + "entropy": 0.08081861436367035, + "epoch": 6.848583751019932, + "grad_norm": 2.1875, + "learning_rate": 4.8431344124744174e-05, + "loss": 0.0534, + "mean_token_accuracy": 0.9844126164913177, + "num_tokens": 87297492.0, + "step": 29380 + }, + { + "entropy": 0.11397622730582953, + "epoch": 6.849749388040564, + "grad_norm": 1.71875, + "learning_rate": 4.843060371696225e-05, + "loss": 0.1607, + "mean_token_accuracy": 0.9661115884780884, + "num_tokens": 87322709.0, + "step": 29385 + }, + { + "entropy": 0.04962232355028391, + "epoch": 6.850915025061196, + "grad_norm": 0.337890625, + "learning_rate": 4.842986314618857e-05, + "loss": 0.0259, + "mean_token_accuracy": 0.990403151512146, + "num_tokens": 87350847.0, + "step": 29390 + }, + { + "entropy": 0.0795666191726923, + "epoch": 6.8520806620818275, + "grad_norm": 1.2109375, + "learning_rate": 4.84291224124342e-05, + "loss": 0.0304, + "mean_token_accuracy": 0.9844847023487091, + "num_tokens": 87374215.0, + "step": 29395 + }, + { + "entropy": 0.12139319926500321, + "epoch": 6.85324629910246, + "grad_norm": 0.74609375, + "learning_rate": 4.842838151571017e-05, + "loss": 0.0829, + "mean_token_accuracy": 0.978274130821228, + "num_tokens": 87383830.0, + "step": 29400 + }, + { + "entropy": 0.09214160200208425, + "epoch": 6.854411936123091, + "grad_norm": 1.953125, + "learning_rate": 4.842764045602754e-05, + "loss": 0.0208, + "mean_token_accuracy": 0.9909944117069245, + "num_tokens": 87410168.0, + "step": 29405 + }, + { + "entropy": 0.07607862763106824, + "epoch": 6.855577573143723, + "grad_norm": 2.5625, + "learning_rate": 4.8426899233397346e-05, + "loss": 0.0454, + "mean_token_accuracy": 0.9829370498657226, + "num_tokens": 87428573.0, + "step": 29410 + }, + { + "entropy": 0.06585808522067964, + "epoch": 6.856743210164355, + "grad_norm": 0.412109375, + "learning_rate": 4.842615784783066e-05, + "loss": 0.0294, + "mean_token_accuracy": 0.9886744976043701, + "num_tokens": 87450620.0, + "step": 29415 + }, + { + "entropy": 0.06280166539363563, + "epoch": 6.857908847184986, + "grad_norm": 1.6328125, + "learning_rate": 4.842541629933852e-05, + "loss": 0.0122, + "mean_token_accuracy": 0.9930438220500946, + "num_tokens": 87491928.0, + "step": 29420 + }, + { + "entropy": 0.05158753078430891, + "epoch": 6.859074484205618, + "grad_norm": 0.77734375, + "learning_rate": 4.842467458793199e-05, + "loss": 0.0226, + "mean_token_accuracy": 0.9912914633750916, + "num_tokens": 87512955.0, + "step": 29425 + }, + { + "entropy": 0.07368856463581323, + "epoch": 6.8602401212262505, + "grad_norm": 1.3984375, + "learning_rate": 4.842393271362214e-05, + "loss": 0.048, + "mean_token_accuracy": 0.9881428837776184, + "num_tokens": 87527515.0, + "step": 29430 + }, + { + "entropy": 0.07875376977026463, + "epoch": 6.861405758246882, + "grad_norm": 1.2109375, + "learning_rate": 4.8423190676420014e-05, + "loss": 0.0414, + "mean_token_accuracy": 0.988014304637909, + "num_tokens": 87537921.0, + "step": 29435 + }, + { + "entropy": 0.05999251157045364, + "epoch": 6.862571395267514, + "grad_norm": 0.333984375, + "learning_rate": 4.842244847633669e-05, + "loss": 0.0221, + "mean_token_accuracy": 0.9940233111381531, + "num_tokens": 87554565.0, + "step": 29440 + }, + { + "entropy": 0.10110383518040181, + "epoch": 6.863737032288146, + "grad_norm": 4.40625, + "learning_rate": 4.842170611338323e-05, + "loss": 0.0724, + "mean_token_accuracy": 0.9790929615497589, + "num_tokens": 87572406.0, + "step": 29445 + }, + { + "entropy": 0.042115158122032884, + "epoch": 6.864902669308777, + "grad_norm": 0.2421875, + "learning_rate": 4.84209635875707e-05, + "loss": 0.0132, + "mean_token_accuracy": 0.9942797899246216, + "num_tokens": 87612902.0, + "step": 29450 + }, + { + "entropy": 0.1387539068236947, + "epoch": 6.866068306329409, + "grad_norm": 7.40625, + "learning_rate": 4.8420220898910174e-05, + "loss": 0.1285, + "mean_token_accuracy": 0.9800481796264648, + "num_tokens": 87655268.0, + "step": 29455 + }, + { + "entropy": 0.05575822722166777, + "epoch": 6.8672339433500404, + "grad_norm": 0.40625, + "learning_rate": 4.8419478047412736e-05, + "loss": 0.0225, + "mean_token_accuracy": 0.9901275217533112, + "num_tokens": 87683649.0, + "step": 29460 + }, + { + "entropy": 0.05922813983634114, + "epoch": 6.8683995803706726, + "grad_norm": 0.7734375, + "learning_rate": 4.841873503308946e-05, + "loss": 0.0268, + "mean_token_accuracy": 0.9899829208850861, + "num_tokens": 87703718.0, + "step": 29465 + }, + { + "entropy": 0.09449698030948639, + "epoch": 6.869565217391305, + "grad_norm": 2.71875, + "learning_rate": 4.8417991855951416e-05, + "loss": 0.0735, + "mean_token_accuracy": 0.9808722853660583, + "num_tokens": 87712649.0, + "step": 29470 + }, + { + "entropy": 0.07578764576464891, + "epoch": 6.870730854411936, + "grad_norm": 2.296875, + "learning_rate": 4.841724851600969e-05, + "loss": 0.0337, + "mean_token_accuracy": 0.9895140945911407, + "num_tokens": 87727487.0, + "step": 29475 + }, + { + "entropy": 0.059785145334899426, + "epoch": 6.871896491432568, + "grad_norm": 2.0625, + "learning_rate": 4.841650501327537e-05, + "loss": 0.0436, + "mean_token_accuracy": 0.9882297396659852, + "num_tokens": 87748033.0, + "step": 29480 + }, + { + "entropy": 0.05755780637264252, + "epoch": 6.873062128453199, + "grad_norm": 1.6953125, + "learning_rate": 4.8415761347759546e-05, + "loss": 0.0304, + "mean_token_accuracy": 0.9886388182640076, + "num_tokens": 87765201.0, + "step": 29485 + }, + { + "entropy": 0.06052704788744449, + "epoch": 6.874227765473831, + "grad_norm": 1.671875, + "learning_rate": 4.84150175194733e-05, + "loss": 0.027, + "mean_token_accuracy": 0.9908796191215515, + "num_tokens": 87794500.0, + "step": 29490 + }, + { + "entropy": 0.058810063265264036, + "epoch": 6.875393402494463, + "grad_norm": 1.578125, + "learning_rate": 4.8414273528427726e-05, + "loss": 0.0276, + "mean_token_accuracy": 0.9919278442859649, + "num_tokens": 87824833.0, + "step": 29495 + }, + { + "entropy": 0.05831845905631781, + "epoch": 6.876559039515095, + "grad_norm": 1.578125, + "learning_rate": 4.841352937463392e-05, + "loss": 0.0241, + "mean_token_accuracy": 0.991849547624588, + "num_tokens": 87841593.0, + "step": 29500 + }, + { + "entropy": 0.0872832141816616, + "epoch": 6.877724676535727, + "grad_norm": 3.0625, + "learning_rate": 4.841278505810296e-05, + "loss": 0.0489, + "mean_token_accuracy": 0.9851008772850036, + "num_tokens": 87852284.0, + "step": 29505 + }, + { + "entropy": 0.0699134774506092, + "epoch": 6.878890313556359, + "grad_norm": 1.1171875, + "learning_rate": 4.841204057884597e-05, + "loss": 0.0457, + "mean_token_accuracy": 0.9868590772151947, + "num_tokens": 87863119.0, + "step": 29510 + }, + { + "entropy": 0.0703543471172452, + "epoch": 6.88005595057699, + "grad_norm": 1.609375, + "learning_rate": 4.8411295936874054e-05, + "loss": 0.0359, + "mean_token_accuracy": 0.9903105676174164, + "num_tokens": 87878182.0, + "step": 29515 + }, + { + "entropy": 0.08448899425566196, + "epoch": 6.881221587597622, + "grad_norm": 1.6171875, + "learning_rate": 4.8410551132198295e-05, + "loss": 0.0394, + "mean_token_accuracy": 0.988205897808075, + "num_tokens": 87887563.0, + "step": 29520 + }, + { + "entropy": 0.06482985820621252, + "epoch": 6.882387224618254, + "grad_norm": 1.4296875, + "learning_rate": 4.840980616482981e-05, + "loss": 0.0354, + "mean_token_accuracy": 0.9864917457103729, + "num_tokens": 87907092.0, + "step": 29525 + }, + { + "entropy": 0.0666579614393413, + "epoch": 6.8835528616388855, + "grad_norm": 1.046875, + "learning_rate": 4.84090610347797e-05, + "loss": 0.0329, + "mean_token_accuracy": 0.9885077834129333, + "num_tokens": 87926240.0, + "step": 29530 + }, + { + "entropy": 0.06210904810577631, + "epoch": 6.884718498659518, + "grad_norm": 2.015625, + "learning_rate": 4.8408315742059087e-05, + "loss": 0.0463, + "mean_token_accuracy": 0.9860225260257721, + "num_tokens": 87943552.0, + "step": 29535 + }, + { + "entropy": 0.05175962019711733, + "epoch": 6.885884135680149, + "grad_norm": 1.03125, + "learning_rate": 4.8407570286679085e-05, + "loss": 0.0178, + "mean_token_accuracy": 0.995280122756958, + "num_tokens": 87963262.0, + "step": 29540 + }, + { + "entropy": 0.07738391607999802, + "epoch": 6.887049772700781, + "grad_norm": 1.4296875, + "learning_rate": 4.840682466865079e-05, + "loss": 0.0372, + "mean_token_accuracy": 0.9886998891830444, + "num_tokens": 87982910.0, + "step": 29545 + }, + { + "entropy": 0.0720852023921907, + "epoch": 6.888215409721413, + "grad_norm": 0.212890625, + "learning_rate": 4.840607888798535e-05, + "loss": 0.0289, + "mean_token_accuracy": 0.99115749001503, + "num_tokens": 88011529.0, + "step": 29550 + }, + { + "entropy": 0.05431947279721498, + "epoch": 6.889381046742044, + "grad_norm": 2.265625, + "learning_rate": 4.840533294469386e-05, + "loss": 0.023, + "mean_token_accuracy": 0.9919369578361511, + "num_tokens": 88041197.0, + "step": 29555 + }, + { + "entropy": 0.08043484576046467, + "epoch": 6.890546683762676, + "grad_norm": 0.80078125, + "learning_rate": 4.840458683878745e-05, + "loss": 0.0233, + "mean_token_accuracy": 0.992747563123703, + "num_tokens": 88055226.0, + "step": 29560 + }, + { + "entropy": 0.06275735646486283, + "epoch": 6.891712320783308, + "grad_norm": 1.7578125, + "learning_rate": 4.840384057027726e-05, + "loss": 0.0407, + "mean_token_accuracy": 0.9876458764076232, + "num_tokens": 88068457.0, + "step": 29565 + }, + { + "entropy": 0.059214035701006654, + "epoch": 6.89287795780394, + "grad_norm": 2.09375, + "learning_rate": 4.84030941391744e-05, + "loss": 0.03, + "mean_token_accuracy": 0.9914735913276672, + "num_tokens": 88102115.0, + "step": 29570 + }, + { + "entropy": 0.08597388043999672, + "epoch": 6.894043594824572, + "grad_norm": 1.7421875, + "learning_rate": 4.840234754549001e-05, + "loss": 0.0486, + "mean_token_accuracy": 0.9860342562198638, + "num_tokens": 88121858.0, + "step": 29575 + }, + { + "entropy": 0.07845104150474072, + "epoch": 6.895209231845204, + "grad_norm": 2.796875, + "learning_rate": 4.8401600789235215e-05, + "loss": 0.0421, + "mean_token_accuracy": 0.9856066465377807, + "num_tokens": 88138167.0, + "step": 29580 + }, + { + "entropy": 0.06135543161071837, + "epoch": 6.896374868865835, + "grad_norm": 1.7109375, + "learning_rate": 4.840085387042116e-05, + "loss": 0.0334, + "mean_token_accuracy": 0.9912704825401306, + "num_tokens": 88161014.0, + "step": 29585 + }, + { + "entropy": 0.04807945257052779, + "epoch": 6.897540505886467, + "grad_norm": 1.8984375, + "learning_rate": 4.840010678905898e-05, + "loss": 0.0217, + "mean_token_accuracy": 0.9900756359100342, + "num_tokens": 88187263.0, + "step": 29590 + }, + { + "entropy": 0.06505886856466532, + "epoch": 6.898706142907098, + "grad_norm": 2.984375, + "learning_rate": 4.839935954515981e-05, + "loss": 0.0498, + "mean_token_accuracy": 0.9886844336986542, + "num_tokens": 88205357.0, + "step": 29595 + }, + { + "entropy": 0.07731727361679078, + "epoch": 6.8998717799277305, + "grad_norm": 0.271484375, + "learning_rate": 4.839861213873479e-05, + "loss": 0.035, + "mean_token_accuracy": 0.9831252992153168, + "num_tokens": 88234551.0, + "step": 29600 + }, + { + "entropy": 0.06955102737993002, + "epoch": 6.901037416948363, + "grad_norm": 0.3828125, + "learning_rate": 4.8397864569795074e-05, + "loss": 0.0688, + "mean_token_accuracy": 0.9834696292877197, + "num_tokens": 88248261.0, + "step": 29605 + }, + { + "entropy": 0.08642586246132851, + "epoch": 6.902203053968994, + "grad_norm": 2.34375, + "learning_rate": 4.839711683835181e-05, + "loss": 0.0707, + "mean_token_accuracy": 0.9796038687229156, + "num_tokens": 88270591.0, + "step": 29610 + }, + { + "entropy": 0.08347761034965515, + "epoch": 6.903368690989626, + "grad_norm": 1.9296875, + "learning_rate": 4.839636894441614e-05, + "loss": 0.0355, + "mean_token_accuracy": 0.9871561110019684, + "num_tokens": 88290667.0, + "step": 29615 + }, + { + "entropy": 0.07617175304330885, + "epoch": 6.904534328010257, + "grad_norm": 1.9921875, + "learning_rate": 4.8395620887999224e-05, + "loss": 0.0406, + "mean_token_accuracy": 0.988088744878769, + "num_tokens": 88311904.0, + "step": 29620 + }, + { + "entropy": 0.0650593439117074, + "epoch": 6.905699965030889, + "grad_norm": 2.15625, + "learning_rate": 4.839487266911221e-05, + "loss": 0.051, + "mean_token_accuracy": 0.988483190536499, + "num_tokens": 88334225.0, + "step": 29625 + }, + { + "entropy": 0.07525872401893138, + "epoch": 6.906865602051521, + "grad_norm": 1.3359375, + "learning_rate": 4.8394124287766254e-05, + "loss": 0.0374, + "mean_token_accuracy": 0.9889621555805206, + "num_tokens": 88356933.0, + "step": 29630 + }, + { + "entropy": 0.07050515934824944, + "epoch": 6.9080312390721526, + "grad_norm": 1.9296875, + "learning_rate": 4.8393375743972526e-05, + "loss": 0.0343, + "mean_token_accuracy": 0.986613517999649, + "num_tokens": 88368806.0, + "step": 29635 + }, + { + "entropy": 0.04098109249025583, + "epoch": 6.909196876092785, + "grad_norm": 0.609375, + "learning_rate": 4.839262703774218e-05, + "loss": 0.0122, + "mean_token_accuracy": 0.9954572081565857, + "num_tokens": 88396176.0, + "step": 29640 + }, + { + "entropy": 0.09802703186869621, + "epoch": 6.910362513113417, + "grad_norm": 1.6875, + "learning_rate": 4.839187816908638e-05, + "loss": 0.0434, + "mean_token_accuracy": 0.9861961781978608, + "num_tokens": 88406007.0, + "step": 29645 + }, + { + "entropy": 0.07688620202243328, + "epoch": 6.911528150134048, + "grad_norm": 1.5625, + "learning_rate": 4.839112913801629e-05, + "loss": 0.0568, + "mean_token_accuracy": 0.9860326409339905, + "num_tokens": 88416726.0, + "step": 29650 + }, + { + "entropy": 0.06696718856692314, + "epoch": 6.91269378715468, + "grad_norm": 2.109375, + "learning_rate": 4.839037994454309e-05, + "loss": 0.0358, + "mean_token_accuracy": 0.9892750978469849, + "num_tokens": 88436628.0, + "step": 29655 + }, + { + "entropy": 0.061067532189190386, + "epoch": 6.913859424175312, + "grad_norm": 2.1875, + "learning_rate": 4.8389630588677934e-05, + "loss": 0.0264, + "mean_token_accuracy": 0.9902947187423706, + "num_tokens": 88455253.0, + "step": 29660 + }, + { + "entropy": 0.06965431291610003, + "epoch": 6.915025061195943, + "grad_norm": 1.515625, + "learning_rate": 4.838888107043202e-05, + "loss": 0.0269, + "mean_token_accuracy": 0.9907572269439697, + "num_tokens": 88471514.0, + "step": 29665 + }, + { + "entropy": 0.0637282345443964, + "epoch": 6.9161906982165755, + "grad_norm": 0.89453125, + "learning_rate": 4.8388131389816505e-05, + "loss": 0.0372, + "mean_token_accuracy": 0.9888510942459107, + "num_tokens": 88487253.0, + "step": 29670 + }, + { + "entropy": 0.06704912669956684, + "epoch": 6.917356335237207, + "grad_norm": 1.75, + "learning_rate": 4.838738154684258e-05, + "loss": 0.0458, + "mean_token_accuracy": 0.9872386932373047, + "num_tokens": 88497642.0, + "step": 29675 + }, + { + "entropy": 0.07361556626856328, + "epoch": 6.918521972257839, + "grad_norm": 2.9375, + "learning_rate": 4.838663154152141e-05, + "loss": 0.0355, + "mean_token_accuracy": 0.9899929702281952, + "num_tokens": 88516389.0, + "step": 29680 + }, + { + "entropy": 0.07161520551890135, + "epoch": 6.919687609278471, + "grad_norm": 0.5, + "learning_rate": 4.83858813738642e-05, + "loss": 0.0284, + "mean_token_accuracy": 0.9888733863830567, + "num_tokens": 88542247.0, + "step": 29685 + }, + { + "entropy": 0.14276986196637154, + "epoch": 6.920853246299102, + "grad_norm": 2.21875, + "learning_rate": 4.838513104388212e-05, + "loss": 0.1822, + "mean_token_accuracy": 0.9694925010204315, + "num_tokens": 88570753.0, + "step": 29690 + }, + { + "entropy": 0.03625276731327176, + "epoch": 6.922018883319734, + "grad_norm": 0.1201171875, + "learning_rate": 4.838438055158636e-05, + "loss": 0.0113, + "mean_token_accuracy": 0.9950872659683228, + "num_tokens": 88612661.0, + "step": 29695 + }, + { + "entropy": 0.04458741853013635, + "epoch": 6.923184520340366, + "grad_norm": 0.478515625, + "learning_rate": 4.8383629896988126e-05, + "loss": 0.0244, + "mean_token_accuracy": 0.991793304681778, + "num_tokens": 88649892.0, + "step": 29700 + }, + { + "entropy": 0.05614535929635167, + "epoch": 6.924350157360998, + "grad_norm": 0.45703125, + "learning_rate": 4.838287908009859e-05, + "loss": 0.0321, + "mean_token_accuracy": 0.9873059391975403, + "num_tokens": 88674555.0, + "step": 29705 + }, + { + "entropy": 0.07231198623776436, + "epoch": 6.92551579438163, + "grad_norm": 1.296875, + "learning_rate": 4.8382128100928965e-05, + "loss": 0.0338, + "mean_token_accuracy": 0.9906602203845978, + "num_tokens": 88687291.0, + "step": 29710 + }, + { + "entropy": 0.06145105315372348, + "epoch": 6.926681431402262, + "grad_norm": 0.63671875, + "learning_rate": 4.838137695949044e-05, + "loss": 0.0335, + "mean_token_accuracy": 0.9919077694416046, + "num_tokens": 88716467.0, + "step": 29715 + }, + { + "entropy": 0.06858117748051881, + "epoch": 6.927847068422893, + "grad_norm": 2.359375, + "learning_rate": 4.8380625655794216e-05, + "loss": 0.0487, + "mean_token_accuracy": 0.9860714495182037, + "num_tokens": 88731304.0, + "step": 29720 + }, + { + "entropy": 0.06362678054720164, + "epoch": 6.929012705443525, + "grad_norm": 1.9453125, + "learning_rate": 4.83798741898515e-05, + "loss": 0.0281, + "mean_token_accuracy": 0.9902177393436432, + "num_tokens": 88753419.0, + "step": 29725 + }, + { + "entropy": 0.07195475585758686, + "epoch": 6.930178342464156, + "grad_norm": 0.83984375, + "learning_rate": 4.8379122561673496e-05, + "loss": 0.0417, + "mean_token_accuracy": 0.9873518884181977, + "num_tokens": 88762473.0, + "step": 29730 + }, + { + "entropy": 0.05631625261157751, + "epoch": 6.931343979484788, + "grad_norm": 0.1982421875, + "learning_rate": 4.837837077127141e-05, + "loss": 0.0233, + "mean_token_accuracy": 0.9895540356636048, + "num_tokens": 88793402.0, + "step": 29735 + }, + { + "entropy": 0.08471834966912865, + "epoch": 6.9325096165054205, + "grad_norm": 2.1875, + "learning_rate": 4.8377618818656454e-05, + "loss": 0.0406, + "mean_token_accuracy": 0.9833233654499054, + "num_tokens": 88811724.0, + "step": 29740 + }, + { + "entropy": 0.06793053429573774, + "epoch": 6.933675253526052, + "grad_norm": 2.15625, + "learning_rate": 4.837686670383984e-05, + "loss": 0.0441, + "mean_token_accuracy": 0.9840839326381683, + "num_tokens": 88825353.0, + "step": 29745 + }, + { + "entropy": 0.06626611463725567, + "epoch": 6.934840890546684, + "grad_norm": 3.609375, + "learning_rate": 4.837611442683279e-05, + "loss": 0.0518, + "mean_token_accuracy": 0.9845434069633484, + "num_tokens": 88835435.0, + "step": 29750 + }, + { + "entropy": 0.0747138449922204, + "epoch": 6.936006527567315, + "grad_norm": 0.4296875, + "learning_rate": 4.8375361987646506e-05, + "loss": 0.0437, + "mean_token_accuracy": 0.9863695621490478, + "num_tokens": 88846040.0, + "step": 29755 + }, + { + "entropy": 0.05634311148896813, + "epoch": 6.937172164587947, + "grad_norm": 1.546875, + "learning_rate": 4.837460938629222e-05, + "loss": 0.0249, + "mean_token_accuracy": 0.9908220171928406, + "num_tokens": 88880039.0, + "step": 29760 + }, + { + "entropy": 0.06870086174458265, + "epoch": 6.938337801608579, + "grad_norm": 1.71875, + "learning_rate": 4.837385662278116e-05, + "loss": 0.0392, + "mean_token_accuracy": 0.9845155358314515, + "num_tokens": 88894671.0, + "step": 29765 + }, + { + "entropy": 0.06056760400533676, + "epoch": 6.9395034386292105, + "grad_norm": 1.234375, + "learning_rate": 4.8373103697124535e-05, + "loss": 0.0272, + "mean_token_accuracy": 0.9929500162601471, + "num_tokens": 88918540.0, + "step": 29770 + }, + { + "entropy": 0.07818248439580203, + "epoch": 6.940669075649843, + "grad_norm": 3.6875, + "learning_rate": 4.837235060933358e-05, + "loss": 0.0348, + "mean_token_accuracy": 0.9911899507045746, + "num_tokens": 88947902.0, + "step": 29775 + }, + { + "entropy": 0.08559078127145767, + "epoch": 6.941834712670475, + "grad_norm": 2.453125, + "learning_rate": 4.837159735941953e-05, + "loss": 0.0427, + "mean_token_accuracy": 0.988016277551651, + "num_tokens": 88959447.0, + "step": 29780 + }, + { + "entropy": 0.06198177421465516, + "epoch": 6.943000349691106, + "grad_norm": 0.29296875, + "learning_rate": 4.8370843947393604e-05, + "loss": 0.0516, + "mean_token_accuracy": 0.9868219137191773, + "num_tokens": 88990661.0, + "step": 29785 + }, + { + "entropy": 0.11046778000891208, + "epoch": 6.944165986711738, + "grad_norm": 4.03125, + "learning_rate": 4.837009037326705e-05, + "loss": 0.0414, + "mean_token_accuracy": 0.9880511343479157, + "num_tokens": 89009245.0, + "step": 29790 + }, + { + "entropy": 0.0634513407945633, + "epoch": 6.94533162373237, + "grad_norm": 2.0625, + "learning_rate": 4.836933663705109e-05, + "loss": 0.0465, + "mean_token_accuracy": 0.9846070647239685, + "num_tokens": 89019910.0, + "step": 29795 + }, + { + "entropy": 0.057706226408481595, + "epoch": 6.946497260753001, + "grad_norm": 1.3828125, + "learning_rate": 4.836858273875698e-05, + "loss": 0.0395, + "mean_token_accuracy": 0.9864011585712433, + "num_tokens": 89035451.0, + "step": 29800 + }, + { + "entropy": 0.06847372064366937, + "epoch": 6.947662897773633, + "grad_norm": 1.890625, + "learning_rate": 4.836782867839595e-05, + "loss": 0.0292, + "mean_token_accuracy": 0.9899077951908112, + "num_tokens": 89059892.0, + "step": 29805 + }, + { + "entropy": 0.16696815257892011, + "epoch": 6.948828534794265, + "grad_norm": 1.546875, + "learning_rate": 4.836707445597925e-05, + "loss": 0.3172, + "mean_token_accuracy": 0.9391492247581482, + "num_tokens": 89083414.0, + "step": 29810 + }, + { + "entropy": 0.06408511018380522, + "epoch": 6.949994171814897, + "grad_norm": 1.21875, + "learning_rate": 4.836632007151813e-05, + "loss": 0.037, + "mean_token_accuracy": 0.9896316051483154, + "num_tokens": 89100519.0, + "step": 29815 + }, + { + "entropy": 0.06854904610663652, + "epoch": 6.951159808835529, + "grad_norm": 1.9375, + "learning_rate": 4.8365565525023825e-05, + "loss": 0.0428, + "mean_token_accuracy": 0.987441337108612, + "num_tokens": 89115773.0, + "step": 29820 + }, + { + "entropy": 0.07351719280704856, + "epoch": 6.95232544585616, + "grad_norm": 2.09375, + "learning_rate": 4.8364810816507596e-05, + "loss": 0.0264, + "mean_token_accuracy": 0.9904137134552002, + "num_tokens": 89140065.0, + "step": 29825 + }, + { + "entropy": 0.06502757361158729, + "epoch": 6.953491082876792, + "grad_norm": 0.3671875, + "learning_rate": 4.8364055945980704e-05, + "loss": 0.0302, + "mean_token_accuracy": 0.9865997076034546, + "num_tokens": 89167133.0, + "step": 29830 + }, + { + "entropy": 0.058092600852251056, + "epoch": 6.954656719897424, + "grad_norm": 2.34375, + "learning_rate": 4.8363300913454396e-05, + "loss": 0.0307, + "mean_token_accuracy": 0.9902624368667603, + "num_tokens": 89185996.0, + "step": 29835 + }, + { + "entropy": 0.07471430338919163, + "epoch": 6.9558223569180555, + "grad_norm": 2.65625, + "learning_rate": 4.836254571893993e-05, + "loss": 0.0326, + "mean_token_accuracy": 0.9865080058574677, + "num_tokens": 89210643.0, + "step": 29840 + }, + { + "entropy": 0.06368034984916449, + "epoch": 6.956987993938688, + "grad_norm": 1.21875, + "learning_rate": 4.8361790362448564e-05, + "loss": 0.0261, + "mean_token_accuracy": 0.9888711094856262, + "num_tokens": 89233234.0, + "step": 29845 + }, + { + "entropy": 0.06988657917827368, + "epoch": 6.95815363095932, + "grad_norm": 1.3671875, + "learning_rate": 4.836103484399157e-05, + "loss": 0.0356, + "mean_token_accuracy": 0.9887575566768646, + "num_tokens": 89252582.0, + "step": 29850 + }, + { + "entropy": 0.05337027087807655, + "epoch": 6.959319267979951, + "grad_norm": 0.396484375, + "learning_rate": 4.836027916358021e-05, + "loss": 0.0233, + "mean_token_accuracy": 0.9896778404712677, + "num_tokens": 89286224.0, + "step": 29855 + }, + { + "entropy": 0.06208705846220255, + "epoch": 6.960484905000583, + "grad_norm": 0.25390625, + "learning_rate": 4.835952332122576e-05, + "loss": 0.0223, + "mean_token_accuracy": 0.9893724799156189, + "num_tokens": 89310665.0, + "step": 29860 + }, + { + "entropy": 0.07674986347556115, + "epoch": 6.961650542021214, + "grad_norm": 1.828125, + "learning_rate": 4.835876731693948e-05, + "loss": 0.0564, + "mean_token_accuracy": 0.9860752344131469, + "num_tokens": 89323958.0, + "step": 29865 + }, + { + "entropy": 0.05415382776409387, + "epoch": 6.962816179041846, + "grad_norm": 0.47265625, + "learning_rate": 4.835801115073264e-05, + "loss": 0.0375, + "mean_token_accuracy": 0.9895894765853882, + "num_tokens": 89336539.0, + "step": 29870 + }, + { + "entropy": 0.09419967234134674, + "epoch": 6.9639818160624785, + "grad_norm": 1.7578125, + "learning_rate": 4.8357254822616524e-05, + "loss": 0.054, + "mean_token_accuracy": 0.9822405338287353, + "num_tokens": 89345356.0, + "step": 29875 + }, + { + "entropy": 0.08086830032989382, + "epoch": 6.96514745308311, + "grad_norm": 1.421875, + "learning_rate": 4.835649833260242e-05, + "loss": 0.0447, + "mean_token_accuracy": 0.9842866778373718, + "num_tokens": 89363463.0, + "step": 29880 + }, + { + "entropy": 0.07272738628089429, + "epoch": 6.966313090103742, + "grad_norm": 2.53125, + "learning_rate": 4.835574168070158e-05, + "loss": 0.047, + "mean_token_accuracy": 0.9860274016857147, + "num_tokens": 89373757.0, + "step": 29885 + }, + { + "entropy": 0.06665248805657029, + "epoch": 6.967478727124373, + "grad_norm": 1.703125, + "learning_rate": 4.835498486692531e-05, + "loss": 0.0366, + "mean_token_accuracy": 0.9895995616912842, + "num_tokens": 89402671.0, + "step": 29890 + }, + { + "entropy": 0.07039319984614849, + "epoch": 6.968644364145005, + "grad_norm": 1.2890625, + "learning_rate": 4.8354227891284895e-05, + "loss": 0.0314, + "mean_token_accuracy": 0.9923740804195404, + "num_tokens": 89420719.0, + "step": 29895 + }, + { + "entropy": 0.07703676130622625, + "epoch": 6.969810001165637, + "grad_norm": 2.28125, + "learning_rate": 4.8353470753791616e-05, + "loss": 0.047, + "mean_token_accuracy": 0.9882120728492737, + "num_tokens": 89441246.0, + "step": 29900 + }, + { + "entropy": 0.08545290175825357, + "epoch": 6.970975638186268, + "grad_norm": 3.5625, + "learning_rate": 4.8352713454456755e-05, + "loss": 0.045, + "mean_token_accuracy": 0.987491762638092, + "num_tokens": 89455017.0, + "step": 29905 + }, + { + "entropy": 0.08963761646300554, + "epoch": 6.9721412752069005, + "grad_norm": 1.0390625, + "learning_rate": 4.835195599329162e-05, + "loss": 0.0597, + "mean_token_accuracy": 0.9822163105010986, + "num_tokens": 89472799.0, + "step": 29910 + }, + { + "entropy": 0.10005468633025885, + "epoch": 6.973306912227533, + "grad_norm": 1.5078125, + "learning_rate": 4.83511983703075e-05, + "loss": 0.0376, + "mean_token_accuracy": 0.9900958836078644, + "num_tokens": 89487215.0, + "step": 29915 + }, + { + "entropy": 0.07748437076807022, + "epoch": 6.974472549248164, + "grad_norm": 2.46875, + "learning_rate": 4.8350440585515685e-05, + "loss": 0.0452, + "mean_token_accuracy": 0.9832747519016266, + "num_tokens": 89498468.0, + "step": 29920 + }, + { + "entropy": 0.06895052138715982, + "epoch": 6.975638186268796, + "grad_norm": 2.234375, + "learning_rate": 4.834968263892748e-05, + "loss": 0.0433, + "mean_token_accuracy": 0.9874395668506623, + "num_tokens": 89518035.0, + "step": 29925 + }, + { + "entropy": 0.06828312175348401, + "epoch": 6.976803823289428, + "grad_norm": 2.0625, + "learning_rate": 4.83489245305542e-05, + "loss": 0.0297, + "mean_token_accuracy": 0.9890495955944061, + "num_tokens": 89546377.0, + "step": 29930 + }, + { + "entropy": 0.06735577872022987, + "epoch": 6.977969460310059, + "grad_norm": 1.7265625, + "learning_rate": 4.8348166260407126e-05, + "loss": 0.0359, + "mean_token_accuracy": 0.9898211777210235, + "num_tokens": 89563052.0, + "step": 29935 + }, + { + "entropy": 0.07529946230351925, + "epoch": 6.979135097330691, + "grad_norm": 0.70703125, + "learning_rate": 4.834740782849758e-05, + "loss": 0.0206, + "mean_token_accuracy": 0.9889273405075073, + "num_tokens": 89585304.0, + "step": 29940 + }, + { + "entropy": 0.06895999712869524, + "epoch": 6.980300734351323, + "grad_norm": 1.1171875, + "learning_rate": 4.8346649234836865e-05, + "loss": 0.029, + "mean_token_accuracy": 0.9874065518379211, + "num_tokens": 89606457.0, + "step": 29945 + }, + { + "entropy": 0.07593924328684806, + "epoch": 6.981466371371955, + "grad_norm": 2.296875, + "learning_rate": 4.8345890479436295e-05, + "loss": 0.042, + "mean_token_accuracy": 0.9867578148841858, + "num_tokens": 89618224.0, + "step": 29950 + }, + { + "entropy": 0.059674417972564696, + "epoch": 6.982632008392587, + "grad_norm": 1.25, + "learning_rate": 4.834513156230719e-05, + "loss": 0.028, + "mean_token_accuracy": 0.9930515587329865, + "num_tokens": 89636322.0, + "step": 29955 + }, + { + "entropy": 0.06889725560322404, + "epoch": 6.983797645413218, + "grad_norm": 0.3125, + "learning_rate": 4.834437248346086e-05, + "loss": 0.0429, + "mean_token_accuracy": 0.9854258477687836, + "num_tokens": 89655119.0, + "step": 29960 + }, + { + "entropy": 0.08493516966700554, + "epoch": 6.98496328243385, + "grad_norm": 1.859375, + "learning_rate": 4.8343613242908624e-05, + "loss": 0.0396, + "mean_token_accuracy": 0.9889319539070129, + "num_tokens": 89670553.0, + "step": 29965 + }, + { + "entropy": 0.06519560664892196, + "epoch": 6.986128919454482, + "grad_norm": 2.640625, + "learning_rate": 4.834285384066181e-05, + "loss": 0.04, + "mean_token_accuracy": 0.9877417862415314, + "num_tokens": 89690403.0, + "step": 29970 + }, + { + "entropy": 0.06538761556148528, + "epoch": 6.987294556475113, + "grad_norm": 1.0390625, + "learning_rate": 4.834209427673173e-05, + "loss": 0.0407, + "mean_token_accuracy": 0.9880840897560119, + "num_tokens": 89702398.0, + "step": 29975 + }, + { + "entropy": 0.061143916193395854, + "epoch": 6.9884601934957455, + "grad_norm": 2.078125, + "learning_rate": 4.8341334551129716e-05, + "loss": 0.035, + "mean_token_accuracy": 0.9870936453342438, + "num_tokens": 89724457.0, + "step": 29980 + }, + { + "entropy": 0.0721611020155251, + "epoch": 6.989625830516378, + "grad_norm": 1.2421875, + "learning_rate": 4.8340574663867105e-05, + "loss": 0.0287, + "mean_token_accuracy": 0.9912511527538299, + "num_tokens": 89741400.0, + "step": 29985 + }, + { + "entropy": 0.07327480353415013, + "epoch": 6.990791467537009, + "grad_norm": 2.3125, + "learning_rate": 4.8339814614955216e-05, + "loss": 0.06, + "mean_token_accuracy": 0.9843566417694092, + "num_tokens": 89751318.0, + "step": 29990 + }, + { + "entropy": 0.0823413584381342, + "epoch": 6.991957104557641, + "grad_norm": 2.140625, + "learning_rate": 4.8339054404405384e-05, + "loss": 0.0423, + "mean_token_accuracy": 0.9876652479171752, + "num_tokens": 89762386.0, + "step": 29995 + }, + { + "entropy": 0.07767243403941393, + "epoch": 6.993122741578272, + "grad_norm": 1.9140625, + "learning_rate": 4.8338294032228954e-05, + "loss": 0.0382, + "mean_token_accuracy": 0.9865681767463684, + "num_tokens": 89783433.0, + "step": 30000 + }, + { + "entropy": 0.058387274667620656, + "epoch": 6.994288378598904, + "grad_norm": 0.6875, + "learning_rate": 4.8337533498437256e-05, + "loss": 0.0255, + "mean_token_accuracy": 0.9925489664077759, + "num_tokens": 89813128.0, + "step": 30005 + }, + { + "entropy": 0.05891055092215538, + "epoch": 6.995454015619536, + "grad_norm": 0.55859375, + "learning_rate": 4.833677280304163e-05, + "loss": 0.043, + "mean_token_accuracy": 0.9888588845729828, + "num_tokens": 89827575.0, + "step": 30010 + }, + { + "entropy": 0.049707501847296956, + "epoch": 6.996619652640168, + "grad_norm": 0.8359375, + "learning_rate": 4.8336011946053426e-05, + "loss": 0.0265, + "mean_token_accuracy": 0.992890453338623, + "num_tokens": 89855135.0, + "step": 30015 + }, + { + "entropy": 0.0824981439858675, + "epoch": 6.9977852896608, + "grad_norm": 0.734375, + "learning_rate": 4.833525092748399e-05, + "loss": 0.0532, + "mean_token_accuracy": 0.983066338300705, + "num_tokens": 89867098.0, + "step": 30020 + }, + { + "entropy": 0.08736152742058038, + "epoch": 6.998950926681431, + "grad_norm": 0.455078125, + "learning_rate": 4.8334489747344656e-05, + "loss": 0.0337, + "mean_token_accuracy": 0.9897234261035919, + "num_tokens": 89879929.0, + "step": 30025 + }, + { + "entropy": 0.15436643744922346, + "epoch": 7.0, + "grad_norm": 1.4375, + "learning_rate": 4.8333728405646787e-05, + "loss": 0.2063, + "mean_token_accuracy": 0.9645602371957567, + "num_tokens": 89903750.0, + "step": 30030 + }, + { + "entropy": 0.056994407624006274, + "epoch": 7.001165637020632, + "grad_norm": 1.4140625, + "learning_rate": 4.8332966902401736e-05, + "loss": 0.0151, + "mean_token_accuracy": 0.9942268610000611, + "num_tokens": 89917618.0, + "step": 30035 + }, + { + "entropy": 0.05067940205335617, + "epoch": 7.002331274041263, + "grad_norm": 1.703125, + "learning_rate": 4.833220523762085e-05, + "loss": 0.0105, + "mean_token_accuracy": 0.9973496615886688, + "num_tokens": 89942256.0, + "step": 30040 + }, + { + "entropy": 0.05828417530283332, + "epoch": 7.003496911061895, + "grad_norm": 0.32421875, + "learning_rate": 4.833144341131549e-05, + "loss": 0.0105, + "mean_token_accuracy": 0.9979564428329468, + "num_tokens": 89962786.0, + "step": 30045 + }, + { + "entropy": 0.03842740654945374, + "epoch": 7.0046625480825275, + "grad_norm": 0.5390625, + "learning_rate": 4.833068142349703e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9983078181743622, + "num_tokens": 89993518.0, + "step": 30050 + }, + { + "entropy": 0.07764594964683055, + "epoch": 7.005828185103159, + "grad_norm": 1.390625, + "learning_rate": 4.8329919274176804e-05, + "loss": 0.0146, + "mean_token_accuracy": 0.9965053021907806, + "num_tokens": 90002798.0, + "step": 30055 + }, + { + "entropy": 0.04811737164855003, + "epoch": 7.006993822123791, + "grad_norm": 0.171875, + "learning_rate": 4.83291569633662e-05, + "loss": 0.0149, + "mean_token_accuracy": 0.9973034262657166, + "num_tokens": 90031177.0, + "step": 30060 + }, + { + "entropy": 0.05407378124073148, + "epoch": 7.008159459144422, + "grad_norm": 0.224609375, + "learning_rate": 4.832839449107658e-05, + "loss": 0.0072, + "mean_token_accuracy": 0.9978997349739075, + "num_tokens": 90061071.0, + "step": 30065 + }, + { + "entropy": 0.051250881422311066, + "epoch": 7.009325096165054, + "grad_norm": 0.65625, + "learning_rate": 4.832763185731931e-05, + "loss": 0.0058, + "mean_token_accuracy": 0.9979933738708496, + "num_tokens": 90092153.0, + "step": 30070 + }, + { + "entropy": 0.05817384775727987, + "epoch": 7.010490733185686, + "grad_norm": 0.3515625, + "learning_rate": 4.832686906210576e-05, + "loss": 0.0071, + "mean_token_accuracy": 0.9981231033802033, + "num_tokens": 90107207.0, + "step": 30075 + }, + { + "entropy": 0.061513442732393744, + "epoch": 7.0116563702063175, + "grad_norm": 1.1796875, + "learning_rate": 4.83261061054473e-05, + "loss": 0.005, + "mean_token_accuracy": 0.9992294490337372, + "num_tokens": 90120331.0, + "step": 30080 + }, + { + "entropy": 0.05991012919694185, + "epoch": 7.01282200722695, + "grad_norm": 3.09375, + "learning_rate": 4.832534298735532e-05, + "loss": 0.0233, + "mean_token_accuracy": 0.9938737094402313, + "num_tokens": 90134757.0, + "step": 30085 + }, + { + "entropy": 0.05594187341630459, + "epoch": 7.013987644247582, + "grad_norm": 1.1640625, + "learning_rate": 4.8324579707841206e-05, + "loss": 0.0175, + "mean_token_accuracy": 0.9964846134185791, + "num_tokens": 90148305.0, + "step": 30090 + }, + { + "entropy": 0.04562456281855702, + "epoch": 7.015153281268213, + "grad_norm": 0.296875, + "learning_rate": 4.832381626691632e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9982622146606446, + "num_tokens": 90177205.0, + "step": 30095 + }, + { + "entropy": 0.05958506986498833, + "epoch": 7.016318918288845, + "grad_norm": 1.0859375, + "learning_rate": 4.832305266459205e-05, + "loss": 0.0142, + "mean_token_accuracy": 0.9959238231182098, + "num_tokens": 90188118.0, + "step": 30100 + }, + { + "entropy": 0.03934578532353043, + "epoch": 7.017484555309476, + "grad_norm": 0.53515625, + "learning_rate": 4.832228890087979e-05, + "loss": 0.0085, + "mean_token_accuracy": 0.9973318040370941, + "num_tokens": 90217972.0, + "step": 30105 + }, + { + "entropy": 0.06448184214532375, + "epoch": 7.018650192330108, + "grad_norm": 1.8984375, + "learning_rate": 4.832152497579092e-05, + "loss": 0.0131, + "mean_token_accuracy": 0.9953521192073822, + "num_tokens": 90237568.0, + "step": 30110 + }, + { + "entropy": 0.07385947611182928, + "epoch": 7.01981582935074, + "grad_norm": 1.015625, + "learning_rate": 4.8320760889336846e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9970715284347534, + "num_tokens": 90262706.0, + "step": 30115 + }, + { + "entropy": 0.07642426621168852, + "epoch": 7.020981466371372, + "grad_norm": 1.640625, + "learning_rate": 4.8319996641528945e-05, + "loss": 0.0144, + "mean_token_accuracy": 0.996442312002182, + "num_tokens": 90279668.0, + "step": 30120 + }, + { + "entropy": 0.05433262949809432, + "epoch": 7.022147103392004, + "grad_norm": 0.2890625, + "learning_rate": 4.831923223237862e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9964205861091614, + "num_tokens": 90302213.0, + "step": 30125 + }, + { + "entropy": 0.05782925793901086, + "epoch": 7.023312740412636, + "grad_norm": 0.458984375, + "learning_rate": 4.831846766189727e-05, + "loss": 0.0183, + "mean_token_accuracy": 0.995810043811798, + "num_tokens": 90323727.0, + "step": 30130 + }, + { + "entropy": 0.05381921608932316, + "epoch": 7.024478377433267, + "grad_norm": 1.046875, + "learning_rate": 4.831770293009629e-05, + "loss": 0.0072, + "mean_token_accuracy": 0.9968258440494537, + "num_tokens": 90345446.0, + "step": 30135 + }, + { + "entropy": 0.05901498403400183, + "epoch": 7.025644014453899, + "grad_norm": 0.62109375, + "learning_rate": 4.83169380369871e-05, + "loss": 0.0191, + "mean_token_accuracy": 0.9951477110385895, + "num_tokens": 90368135.0, + "step": 30140 + }, + { + "entropy": 0.05866720397025347, + "epoch": 7.02680965147453, + "grad_norm": 1.6328125, + "learning_rate": 4.831617298258109e-05, + "loss": 0.0148, + "mean_token_accuracy": 0.996071708202362, + "num_tokens": 90387942.0, + "step": 30145 + }, + { + "entropy": 0.05969673302024603, + "epoch": 7.0279752884951625, + "grad_norm": 0.1318359375, + "learning_rate": 4.8315407766889665e-05, + "loss": 0.0133, + "mean_token_accuracy": 0.9976077020168305, + "num_tokens": 90441451.0, + "step": 30150 + }, + { + "entropy": 0.05562372365966439, + "epoch": 7.029140925515795, + "grad_norm": 1.6796875, + "learning_rate": 4.8314642389924246e-05, + "loss": 0.0118, + "mean_token_accuracy": 0.9983762323856353, + "num_tokens": 90455069.0, + "step": 30155 + }, + { + "entropy": 0.05172123843804002, + "epoch": 7.030306562536426, + "grad_norm": 0.26171875, + "learning_rate": 4.831387685169625e-05, + "loss": 0.0075, + "mean_token_accuracy": 0.996476697921753, + "num_tokens": 90473399.0, + "step": 30160 + }, + { + "entropy": 0.04368160245940089, + "epoch": 7.031472199557058, + "grad_norm": 0.287109375, + "learning_rate": 4.831311115221708e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.998294198513031, + "num_tokens": 90500992.0, + "step": 30165 + }, + { + "entropy": 0.08033382706344128, + "epoch": 7.03263783657769, + "grad_norm": 2.203125, + "learning_rate": 4.8312345291498154e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9958215236663819, + "num_tokens": 90514663.0, + "step": 30170 + }, + { + "entropy": 0.0787579096853733, + "epoch": 7.033803473598321, + "grad_norm": 4.125, + "learning_rate": 4.83115792695509e-05, + "loss": 0.0281, + "mean_token_accuracy": 0.9938448548316956, + "num_tokens": 90530567.0, + "step": 30175 + }, + { + "entropy": 0.05292491652071476, + "epoch": 7.034969110618953, + "grad_norm": 0.51953125, + "learning_rate": 4.831081308638674e-05, + "loss": 0.0132, + "mean_token_accuracy": 0.9956141471862793, + "num_tokens": 90559390.0, + "step": 30180 + }, + { + "entropy": 0.05749682649038732, + "epoch": 7.0361347476395855, + "grad_norm": 0.2890625, + "learning_rate": 4.83100467420171e-05, + "loss": 0.0096, + "mean_token_accuracy": 0.9966318845748902, + "num_tokens": 90593190.0, + "step": 30185 + }, + { + "entropy": 0.060346757806837556, + "epoch": 7.037300384660217, + "grad_norm": 0.84375, + "learning_rate": 4.8309280236453395e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9961776256561279, + "num_tokens": 90607243.0, + "step": 30190 + }, + { + "entropy": 0.0677456783130765, + "epoch": 7.038466021680849, + "grad_norm": 2.0, + "learning_rate": 4.830851356970707e-05, + "loss": 0.0147, + "mean_token_accuracy": 0.9954315066337586, + "num_tokens": 90619100.0, + "step": 30195 + }, + { + "entropy": 0.045301594864577056, + "epoch": 7.03963165870148, + "grad_norm": 1.375, + "learning_rate": 4.830774674178955e-05, + "loss": 0.0057, + "mean_token_accuracy": 0.9975920677185058, + "num_tokens": 90659501.0, + "step": 30200 + }, + { + "entropy": 0.05319325625896454, + "epoch": 7.040797295722112, + "grad_norm": 0.71875, + "learning_rate": 4.8306979752712264e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9953602194786072, + "num_tokens": 90682812.0, + "step": 30205 + }, + { + "entropy": 0.036827477253973485, + "epoch": 7.041962932742744, + "grad_norm": 1.4375, + "learning_rate": 4.830621260248667e-05, + "loss": 0.0075, + "mean_token_accuracy": 0.9976148486137391, + "num_tokens": 90725897.0, + "step": 30210 + }, + { + "entropy": 0.06985178105533123, + "epoch": 7.043128569763375, + "grad_norm": 0.90625, + "learning_rate": 4.830544529112418e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9991308927536011, + "num_tokens": 90745736.0, + "step": 30215 + }, + { + "entropy": 0.06130665661767125, + "epoch": 7.0442942067840075, + "grad_norm": 0.484375, + "learning_rate": 4.830467781863625e-05, + "loss": 0.018, + "mean_token_accuracy": 0.9954739928245544, + "num_tokens": 90770570.0, + "step": 30220 + }, + { + "entropy": 0.05789151154458523, + "epoch": 7.04545984380464, + "grad_norm": 1.21875, + "learning_rate": 4.830391018503433e-05, + "loss": 0.015, + "mean_token_accuracy": 0.9938674390316009, + "num_tokens": 90783819.0, + "step": 30225 + }, + { + "entropy": 0.10048946421593427, + "epoch": 7.046625480825271, + "grad_norm": 0.73046875, + "learning_rate": 4.830314239032985e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9974006175994873, + "num_tokens": 90802624.0, + "step": 30230 + }, + { + "entropy": 0.03963483748957515, + "epoch": 7.047791117845903, + "grad_norm": 0.349609375, + "learning_rate": 4.830237443453427e-05, + "loss": 0.005, + "mean_token_accuracy": 0.9980258822441102, + "num_tokens": 90834972.0, + "step": 30235 + }, + { + "entropy": 0.05992593262344599, + "epoch": 7.048956754866534, + "grad_norm": 0.97265625, + "learning_rate": 4.830160631765904e-05, + "loss": 0.0133, + "mean_token_accuracy": 0.9962817609310151, + "num_tokens": 90855001.0, + "step": 30240 + }, + { + "entropy": 0.052057741489261386, + "epoch": 7.050122391887166, + "grad_norm": 0.6640625, + "learning_rate": 4.830083803971562e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.999865609407425, + "num_tokens": 90872978.0, + "step": 30245 + }, + { + "entropy": 0.05767310978844762, + "epoch": 7.051288028907798, + "grad_norm": 0.392578125, + "learning_rate": 4.830006960071545e-05, + "loss": 0.0115, + "mean_token_accuracy": 0.9978439092636109, + "num_tokens": 90904123.0, + "step": 30250 + }, + { + "entropy": 0.24345403034240007, + "epoch": 7.05245366592843, + "grad_norm": 5.6875, + "learning_rate": 4.8299301000670006e-05, + "loss": 0.3557, + "mean_token_accuracy": 0.9590793073177337, + "num_tokens": 90939162.0, + "step": 30255 + }, + { + "entropy": 0.08307226374745369, + "epoch": 7.053619302949062, + "grad_norm": 1.09375, + "learning_rate": 4.829853223959073e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.9957392334938049, + "num_tokens": 90956827.0, + "step": 30260 + }, + { + "entropy": 0.050349775422364476, + "epoch": 7.054784939969694, + "grad_norm": 0.59765625, + "learning_rate": 4.8297763317489107e-05, + "loss": 0.0092, + "mean_token_accuracy": 0.9935024440288543, + "num_tokens": 90989495.0, + "step": 30265 + }, + { + "entropy": 0.0740093344822526, + "epoch": 7.055950576990325, + "grad_norm": 0.87890625, + "learning_rate": 4.829699423437659e-05, + "loss": 0.0069, + "mean_token_accuracy": 0.9964289367198944, + "num_tokens": 91002014.0, + "step": 30270 + }, + { + "entropy": 0.04686462339013815, + "epoch": 7.057116214010957, + "grad_norm": 1.203125, + "learning_rate": 4.829622499026465e-05, + "loss": 0.0107, + "mean_token_accuracy": 0.997639638185501, + "num_tokens": 91039458.0, + "step": 30275 + }, + { + "entropy": 0.060538587532937525, + "epoch": 7.058281851031588, + "grad_norm": 1.5078125, + "learning_rate": 4.829545558516475e-05, + "loss": 0.0121, + "mean_token_accuracy": 0.9965390801429749, + "num_tokens": 91062900.0, + "step": 30280 + }, + { + "entropy": 0.05883708633482456, + "epoch": 7.05944748805222, + "grad_norm": 4.0625, + "learning_rate": 4.8294686019088374e-05, + "loss": 0.0164, + "mean_token_accuracy": 0.9945460319519043, + "num_tokens": 91080738.0, + "step": 30285 + }, + { + "entropy": 0.05134750343859196, + "epoch": 7.0606131250728525, + "grad_norm": 4.40625, + "learning_rate": 4.829391629204699e-05, + "loss": 0.0223, + "mean_token_accuracy": 0.9939786851406097, + "num_tokens": 91115245.0, + "step": 30290 + }, + { + "entropy": 0.08587418049573899, + "epoch": 7.061778762093484, + "grad_norm": 1.1953125, + "learning_rate": 4.829314640405209e-05, + "loss": 0.0136, + "mean_token_accuracy": 0.9956977784633636, + "num_tokens": 91122351.0, + "step": 30295 + }, + { + "entropy": 0.06100914310663939, + "epoch": 7.062944399114116, + "grad_norm": 2.734375, + "learning_rate": 4.829237635511514e-05, + "loss": 0.0115, + "mean_token_accuracy": 0.9947611093521118, + "num_tokens": 91147872.0, + "step": 30300 + }, + { + "entropy": 0.07020860947668553, + "epoch": 7.064110036134748, + "grad_norm": 2.46875, + "learning_rate": 4.829160614524762e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9977278351783753, + "num_tokens": 91161175.0, + "step": 30305 + }, + { + "entropy": 0.052560966834425925, + "epoch": 7.065275673155379, + "grad_norm": 3.484375, + "learning_rate": 4.829083577446102e-05, + "loss": 0.0178, + "mean_token_accuracy": 0.9957786858081817, + "num_tokens": 91184931.0, + "step": 30310 + }, + { + "entropy": 0.06599666997790336, + "epoch": 7.066441310176011, + "grad_norm": 3.84375, + "learning_rate": 4.829006524276684e-05, + "loss": 0.0194, + "mean_token_accuracy": 0.9962544083595276, + "num_tokens": 91198511.0, + "step": 30315 + }, + { + "entropy": 0.061596107203513384, + "epoch": 7.067606947196643, + "grad_norm": 1.4296875, + "learning_rate": 4.8289294550176545e-05, + "loss": 0.013, + "mean_token_accuracy": 0.994150060415268, + "num_tokens": 91219731.0, + "step": 30320 + }, + { + "entropy": 0.07121949885040521, + "epoch": 7.068772584217275, + "grad_norm": 1.453125, + "learning_rate": 4.828852369670164e-05, + "loss": 0.0177, + "mean_token_accuracy": 0.9964398920536042, + "num_tokens": 91232224.0, + "step": 30325 + }, + { + "entropy": 0.06412322130054235, + "epoch": 7.069938221237907, + "grad_norm": 0.25390625, + "learning_rate": 4.8287752682353626e-05, + "loss": 0.0194, + "mean_token_accuracy": 0.9957955598831176, + "num_tokens": 91248088.0, + "step": 30330 + }, + { + "entropy": 0.07131289504468441, + "epoch": 7.071103858258538, + "grad_norm": 0.65625, + "learning_rate": 4.828698150714399e-05, + "loss": 0.013, + "mean_token_accuracy": 0.9965978026390075, + "num_tokens": 91257256.0, + "step": 30335 + }, + { + "entropy": 0.07580037731677294, + "epoch": 7.07226949527917, + "grad_norm": 0.75, + "learning_rate": 4.828621017108424e-05, + "loss": 0.0295, + "mean_token_accuracy": 0.9948435366153717, + "num_tokens": 91278451.0, + "step": 30340 + }, + { + "entropy": 0.11762396842241288, + "epoch": 7.073435132299802, + "grad_norm": 0.453125, + "learning_rate": 4.8285438674185873e-05, + "loss": 0.0959, + "mean_token_accuracy": 0.97947016954422, + "num_tokens": 91309859.0, + "step": 30345 + }, + { + "entropy": 0.05074176751077175, + "epoch": 7.074600769320433, + "grad_norm": 3.421875, + "learning_rate": 4.828466701646039e-05, + "loss": 0.0123, + "mean_token_accuracy": 0.9963578641414642, + "num_tokens": 91336540.0, + "step": 30350 + }, + { + "entropy": 0.06217975839972496, + "epoch": 7.0757664063410655, + "grad_norm": 0.5, + "learning_rate": 4.8283895197919304e-05, + "loss": 0.0191, + "mean_token_accuracy": 0.9958511829376221, + "num_tokens": 91347859.0, + "step": 30355 + }, + { + "entropy": 0.1270772408694029, + "epoch": 7.076932043361698, + "grad_norm": 1.4609375, + "learning_rate": 4.8283123218574116e-05, + "loss": 0.0067, + "mean_token_accuracy": 0.9971695721149445, + "num_tokens": 91357100.0, + "step": 30360 + }, + { + "entropy": 0.056195403542369605, + "epoch": 7.078097680382329, + "grad_norm": 0.2412109375, + "learning_rate": 4.8282351078436345e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9992462873458863, + "num_tokens": 91391188.0, + "step": 30365 + }, + { + "entropy": 0.06651196293532849, + "epoch": 7.079263317402961, + "grad_norm": 0.353515625, + "learning_rate": 4.82815787775175e-05, + "loss": 0.0132, + "mean_token_accuracy": 0.9964940130710602, + "num_tokens": 91402433.0, + "step": 30370 + }, + { + "entropy": 0.05127828456461429, + "epoch": 7.080428954423592, + "grad_norm": 1.3203125, + "learning_rate": 4.82808063158291e-05, + "loss": 0.0057, + "mean_token_accuracy": 0.9980728805065155, + "num_tokens": 91430005.0, + "step": 30375 + }, + { + "entropy": 0.05886371675878763, + "epoch": 7.081594591444224, + "grad_norm": 0.294921875, + "learning_rate": 4.8280033693382664e-05, + "loss": 0.0096, + "mean_token_accuracy": 0.997169828414917, + "num_tokens": 91456501.0, + "step": 30380 + }, + { + "entropy": 0.06507110595703125, + "epoch": 7.082760228464856, + "grad_norm": 0.85546875, + "learning_rate": 4.827926091018971e-05, + "loss": 0.0115, + "mean_token_accuracy": 0.995796662569046, + "num_tokens": 91476101.0, + "step": 30385 + }, + { + "entropy": 0.062191806919872764, + "epoch": 7.0839258654854875, + "grad_norm": 0.53125, + "learning_rate": 4.8278487966261765e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.9979954600334168, + "num_tokens": 91514310.0, + "step": 30390 + }, + { + "entropy": 0.058631454780697825, + "epoch": 7.08509150250612, + "grad_norm": 1.2265625, + "learning_rate": 4.827771486161035e-05, + "loss": 0.0115, + "mean_token_accuracy": 0.9947159767150879, + "num_tokens": 91537150.0, + "step": 30395 + }, + { + "entropy": 0.06342284716665744, + "epoch": 7.086257139526752, + "grad_norm": 0.435546875, + "learning_rate": 4.8276941596246994e-05, + "loss": 0.0179, + "mean_token_accuracy": 0.994976383447647, + "num_tokens": 91558164.0, + "step": 30400 + }, + { + "entropy": 0.0707466502673924, + "epoch": 7.087422776547383, + "grad_norm": 1.625, + "learning_rate": 4.8276168170183233e-05, + "loss": 0.0087, + "mean_token_accuracy": 0.998279196023941, + "num_tokens": 91586951.0, + "step": 30405 + }, + { + "entropy": 0.05743299555033445, + "epoch": 7.088588413568015, + "grad_norm": 1.296875, + "learning_rate": 4.8275394583430594e-05, + "loss": 0.0123, + "mean_token_accuracy": 0.9970780968666076, + "num_tokens": 91619188.0, + "step": 30410 + }, + { + "entropy": 0.05649949889630079, + "epoch": 7.089754050588646, + "grad_norm": 0.375, + "learning_rate": 4.8274620836000616e-05, + "loss": 0.0058, + "mean_token_accuracy": 0.9983665943145752, + "num_tokens": 91636413.0, + "step": 30415 + }, + { + "entropy": 0.05755188856273889, + "epoch": 7.090919687609278, + "grad_norm": 1.1171875, + "learning_rate": 4.827384692790484e-05, + "loss": 0.005, + "mean_token_accuracy": 0.9977169036865234, + "num_tokens": 91657970.0, + "step": 30420 + }, + { + "entropy": 0.0699540264904499, + "epoch": 7.0920853246299105, + "grad_norm": 0.275390625, + "learning_rate": 4.8273072859154796e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.9979713082313537, + "num_tokens": 91684805.0, + "step": 30425 + }, + { + "entropy": 0.060055936314165594, + "epoch": 7.093250961650542, + "grad_norm": 0.921875, + "learning_rate": 4.8272298629762033e-05, + "loss": 0.0112, + "mean_token_accuracy": 0.998055511713028, + "num_tokens": 91712342.0, + "step": 30430 + }, + { + "entropy": 0.0625823263078928, + "epoch": 7.094416598671174, + "grad_norm": 2.96875, + "learning_rate": 4.827152423973809e-05, + "loss": 0.0215, + "mean_token_accuracy": 0.995892733335495, + "num_tokens": 91722750.0, + "step": 30435 + }, + { + "entropy": 0.13746389281004667, + "epoch": 7.095582235691806, + "grad_norm": 0.53515625, + "learning_rate": 4.827074968909453e-05, + "loss": 0.1425, + "mean_token_accuracy": 0.9803636312484741, + "num_tokens": 91753933.0, + "step": 30440 + }, + { + "entropy": 0.07929829806089402, + "epoch": 7.096747872712437, + "grad_norm": 1.4453125, + "learning_rate": 4.826997497784289e-05, + "loss": 0.0238, + "mean_token_accuracy": 0.9956888616085052, + "num_tokens": 91775313.0, + "step": 30445 + }, + { + "entropy": 0.06876232139766217, + "epoch": 7.097913509733069, + "grad_norm": 2.796875, + "learning_rate": 4.826920010599472e-05, + "loss": 0.016, + "mean_token_accuracy": 0.996614670753479, + "num_tokens": 91799765.0, + "step": 30450 + }, + { + "entropy": 0.07557164933532476, + "epoch": 7.099079146753701, + "grad_norm": 0.65234375, + "learning_rate": 4.8268425073561574e-05, + "loss": 0.0144, + "mean_token_accuracy": 0.9956696331501007, + "num_tokens": 91812729.0, + "step": 30455 + }, + { + "entropy": 0.07304494511336088, + "epoch": 7.1002447837743325, + "grad_norm": 0.81640625, + "learning_rate": 4.826764988055502e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9968676388263702, + "num_tokens": 91834090.0, + "step": 30460 + }, + { + "entropy": 0.062426519207656385, + "epoch": 7.101410420794965, + "grad_norm": 0.26953125, + "learning_rate": 4.82668745269866e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.9980383157730103, + "num_tokens": 91849929.0, + "step": 30465 + }, + { + "entropy": 0.09062138125300408, + "epoch": 7.102576057815596, + "grad_norm": 3.3125, + "learning_rate": 4.826609901286791e-05, + "loss": 0.0194, + "mean_token_accuracy": 0.9943084239959716, + "num_tokens": 91859970.0, + "step": 30470 + }, + { + "entropy": 0.056807669810950756, + "epoch": 7.103741694836228, + "grad_norm": 0.2578125, + "learning_rate": 4.826532333821047e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.9982002913951874, + "num_tokens": 91886469.0, + "step": 30475 + }, + { + "entropy": 0.04765348536893725, + "epoch": 7.10490733185686, + "grad_norm": 0.3515625, + "learning_rate": 4.826454750302587e-05, + "loss": 0.0137, + "mean_token_accuracy": 0.9960334420204162, + "num_tokens": 91914766.0, + "step": 30480 + }, + { + "entropy": 0.08689562901854515, + "epoch": 7.106072968877491, + "grad_norm": 0.80859375, + "learning_rate": 4.8263771507325685e-05, + "loss": 0.0131, + "mean_token_accuracy": 0.9975087523460389, + "num_tokens": 91943211.0, + "step": 30485 + }, + { + "entropy": 0.0745643438771367, + "epoch": 7.107238605898123, + "grad_norm": 1.328125, + "learning_rate": 4.826299535112147e-05, + "loss": 0.0079, + "mean_token_accuracy": 0.9984472215175628, + "num_tokens": 91961538.0, + "step": 30490 + }, + { + "entropy": 0.06446758769452572, + "epoch": 7.1084042429187555, + "grad_norm": 0.88671875, + "learning_rate": 4.826221903442481e-05, + "loss": 0.0118, + "mean_token_accuracy": 0.9975184500217438, + "num_tokens": 91970955.0, + "step": 30495 + }, + { + "entropy": 0.042829998023808005, + "epoch": 7.109569879939387, + "grad_norm": 0.90625, + "learning_rate": 4.826144255724727e-05, + "loss": 0.0091, + "mean_token_accuracy": 0.9957435607910157, + "num_tokens": 92002665.0, + "step": 30500 + }, + { + "entropy": 0.08332742396742106, + "epoch": 7.110735516960019, + "grad_norm": 0.6640625, + "learning_rate": 4.8260665919600436e-05, + "loss": 0.0766, + "mean_token_accuracy": 0.9868705093860626, + "num_tokens": 92028846.0, + "step": 30505 + }, + { + "entropy": 0.05979088693857193, + "epoch": 7.11190115398065, + "grad_norm": 0.66015625, + "learning_rate": 4.82598891214959e-05, + "loss": 0.0177, + "mean_token_accuracy": 0.9947042167186737, + "num_tokens": 92044394.0, + "step": 30510 + }, + { + "entropy": 0.06625119373202323, + "epoch": 7.113066791001282, + "grad_norm": 0.306640625, + "learning_rate": 4.8259112162945225e-05, + "loss": 0.0181, + "mean_token_accuracy": 0.9953732848167419, + "num_tokens": 92058160.0, + "step": 30515 + }, + { + "entropy": 0.06746600233018399, + "epoch": 7.114232428021914, + "grad_norm": 0.3203125, + "learning_rate": 4.825833504396e-05, + "loss": 0.0099, + "mean_token_accuracy": 0.9972993493080139, + "num_tokens": 92081803.0, + "step": 30520 + }, + { + "entropy": 0.04053811193443835, + "epoch": 7.1153980650425455, + "grad_norm": 0.392578125, + "learning_rate": 4.8257557764551826e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9975557744503021, + "num_tokens": 92119868.0, + "step": 30525 + }, + { + "entropy": 0.05157309314236045, + "epoch": 7.116563702063178, + "grad_norm": 0.1845703125, + "learning_rate": 4.825678032473229e-05, + "loss": 0.0084, + "mean_token_accuracy": 0.9957024157047272, + "num_tokens": 92153375.0, + "step": 30530 + }, + { + "entropy": 0.06940113585442305, + "epoch": 7.11772933908381, + "grad_norm": 1.0859375, + "learning_rate": 4.8256002724512964e-05, + "loss": 0.0162, + "mean_token_accuracy": 0.9951586663722992, + "num_tokens": 92172516.0, + "step": 30535 + }, + { + "entropy": 0.07014297656714916, + "epoch": 7.118894976104441, + "grad_norm": 6.40625, + "learning_rate": 4.825522496390547e-05, + "loss": 0.0181, + "mean_token_accuracy": 0.9962278366088867, + "num_tokens": 92182097.0, + "step": 30540 + }, + { + "entropy": 0.052831696905195714, + "epoch": 7.120060613125073, + "grad_norm": 2.0625, + "learning_rate": 4.8254447042921394e-05, + "loss": 0.01, + "mean_token_accuracy": 0.9970639288425446, + "num_tokens": 92202430.0, + "step": 30545 + }, + { + "entropy": 0.061360220145434144, + "epoch": 7.121226250145704, + "grad_norm": 1.3203125, + "learning_rate": 4.825366896157234e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9982588768005372, + "num_tokens": 92225929.0, + "step": 30550 + }, + { + "entropy": 0.07295540943741799, + "epoch": 7.122391887166336, + "grad_norm": 0.890625, + "learning_rate": 4.82528907198699e-05, + "loss": 0.0172, + "mean_token_accuracy": 0.9961545705795288, + "num_tokens": 92242351.0, + "step": 30555 + }, + { + "entropy": 0.0615655729547143, + "epoch": 7.123557524186968, + "grad_norm": 0.78125, + "learning_rate": 4.825211231782569e-05, + "loss": 0.01, + "mean_token_accuracy": 0.9978924512863159, + "num_tokens": 92256604.0, + "step": 30560 + }, + { + "entropy": 0.07092278450727463, + "epoch": 7.1247231612076, + "grad_norm": 2.9375, + "learning_rate": 4.825133375545132e-05, + "loss": 0.0163, + "mean_token_accuracy": 0.9958444178104401, + "num_tokens": 92267376.0, + "step": 30565 + }, + { + "entropy": 0.05723753683269024, + "epoch": 7.125888798228232, + "grad_norm": 1.5625, + "learning_rate": 4.825055503275838e-05, + "loss": 0.0065, + "mean_token_accuracy": 0.9972610890865325, + "num_tokens": 92291685.0, + "step": 30570 + }, + { + "entropy": 0.056301530078053476, + "epoch": 7.127054435248864, + "grad_norm": 1.6796875, + "learning_rate": 4.82497761497585e-05, + "loss": 0.0059, + "mean_token_accuracy": 0.9979709684848785, + "num_tokens": 92316387.0, + "step": 30575 + }, + { + "entropy": 0.11123329075053334, + "epoch": 7.128220072269495, + "grad_norm": 7.5625, + "learning_rate": 4.824899710646329e-05, + "loss": 0.0204, + "mean_token_accuracy": 0.9949623763561248, + "num_tokens": 92329740.0, + "step": 30580 + }, + { + "entropy": 0.061691082268953326, + "epoch": 7.129385709290127, + "grad_norm": 1.3046875, + "learning_rate": 4.824821790288437e-05, + "loss": 0.0169, + "mean_token_accuracy": 0.9934255063533783, + "num_tokens": 92342801.0, + "step": 30585 + }, + { + "entropy": 0.037779290787875654, + "epoch": 7.130551346310758, + "grad_norm": 0.3671875, + "learning_rate": 4.824743853903335e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.9969915330410004, + "num_tokens": 92373646.0, + "step": 30590 + }, + { + "entropy": 0.06015878664329648, + "epoch": 7.1317169833313905, + "grad_norm": 2.234375, + "learning_rate": 4.8246659014921855e-05, + "loss": 0.0133, + "mean_token_accuracy": 0.9963972091674804, + "num_tokens": 92387023.0, + "step": 30595 + }, + { + "entropy": 0.09237681282684207, + "epoch": 7.132882620352023, + "grad_norm": 0.55859375, + "learning_rate": 4.8245879330561514e-05, + "loss": 0.0115, + "mean_token_accuracy": 0.994150698184967, + "num_tokens": 92408623.0, + "step": 30600 + }, + { + "entropy": 0.08946602549403906, + "epoch": 7.134048257372654, + "grad_norm": 1.90625, + "learning_rate": 4.8245099485963944e-05, + "loss": 0.0068, + "mean_token_accuracy": 0.9951623618602753, + "num_tokens": 92419455.0, + "step": 30605 + }, + { + "entropy": 0.08082111086696386, + "epoch": 7.135213894393286, + "grad_norm": 0.73828125, + "learning_rate": 4.824431948114079e-05, + "loss": 0.0116, + "mean_token_accuracy": 0.9975998342037201, + "num_tokens": 92429612.0, + "step": 30610 + }, + { + "entropy": 0.06038438286632299, + "epoch": 7.136379531413918, + "grad_norm": 0.341796875, + "learning_rate": 4.8243539316103656e-05, + "loss": 0.0062, + "mean_token_accuracy": 0.9981464624404908, + "num_tokens": 92456992.0, + "step": 30615 + }, + { + "entropy": 0.0631004961207509, + "epoch": 7.137545168434549, + "grad_norm": 1.1875, + "learning_rate": 4.82427589908642e-05, + "loss": 0.012, + "mean_token_accuracy": 0.9945440351963043, + "num_tokens": 92473120.0, + "step": 30620 + }, + { + "entropy": 0.06316535836085677, + "epoch": 7.138710805455181, + "grad_norm": 1.796875, + "learning_rate": 4.8241978505434056e-05, + "loss": 0.0218, + "mean_token_accuracy": 0.9941232442855835, + "num_tokens": 92494820.0, + "step": 30625 + }, + { + "entropy": 0.05482297632843256, + "epoch": 7.139876442475813, + "grad_norm": 0.388671875, + "learning_rate": 4.824119785982485e-05, + "loss": 0.0082, + "mean_token_accuracy": 0.9979051113128662, + "num_tokens": 92519947.0, + "step": 30630 + }, + { + "entropy": 0.04249793980270624, + "epoch": 7.141042079496445, + "grad_norm": 1.8515625, + "learning_rate": 4.824041705404822e-05, + "loss": 0.0073, + "mean_token_accuracy": 0.9972292304039001, + "num_tokens": 92551694.0, + "step": 30635 + }, + { + "entropy": 0.052819710597395896, + "epoch": 7.142207716517077, + "grad_norm": 1.2421875, + "learning_rate": 4.823963608811583e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.99895738363266, + "num_tokens": 92585379.0, + "step": 30640 + }, + { + "entropy": 0.06095590833574534, + "epoch": 7.143373353537708, + "grad_norm": 2.8125, + "learning_rate": 4.823885496203931e-05, + "loss": 0.0138, + "mean_token_accuracy": 0.9947236835956573, + "num_tokens": 92605361.0, + "step": 30645 + }, + { + "entropy": 0.07070263214409352, + "epoch": 7.14453899055834, + "grad_norm": 2.90625, + "learning_rate": 4.82380736758303e-05, + "loss": 0.0223, + "mean_token_accuracy": 0.9947317957878112, + "num_tokens": 92614248.0, + "step": 30650 + }, + { + "entropy": 0.04973842538893223, + "epoch": 7.145704627578972, + "grad_norm": 0.32421875, + "learning_rate": 4.823729222950047e-05, + "loss": 0.0057, + "mean_token_accuracy": 0.9984661042690277, + "num_tokens": 92642526.0, + "step": 30655 + }, + { + "entropy": 0.05830220179632306, + "epoch": 7.146870264599603, + "grad_norm": 2.765625, + "learning_rate": 4.823651062306146e-05, + "loss": 0.0081, + "mean_token_accuracy": 0.9965169250965118, + "num_tokens": 92673422.0, + "step": 30660 + }, + { + "entropy": 0.05718004386872053, + "epoch": 7.1480359016202355, + "grad_norm": 0.234375, + "learning_rate": 4.8235728856524934e-05, + "loss": 0.0093, + "mean_token_accuracy": 0.9970381677150726, + "num_tokens": 92694855.0, + "step": 30665 + }, + { + "entropy": 0.07355883046984672, + "epoch": 7.149201538640868, + "grad_norm": 0.265625, + "learning_rate": 4.823494692990254e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.995150500535965, + "num_tokens": 92706677.0, + "step": 30670 + }, + { + "entropy": 0.07787001654505729, + "epoch": 7.150367175661499, + "grad_norm": 0.9453125, + "learning_rate": 4.823416484320594e-05, + "loss": 0.0156, + "mean_token_accuracy": 0.9955798864364624, + "num_tokens": 92728270.0, + "step": 30675 + }, + { + "entropy": 0.07409054469317197, + "epoch": 7.151532812682131, + "grad_norm": 0.482421875, + "learning_rate": 4.82333825964468e-05, + "loss": 0.0127, + "mean_token_accuracy": 0.9979703485965729, + "num_tokens": 92741655.0, + "step": 30680 + }, + { + "entropy": 0.06822916008532047, + "epoch": 7.152698449702762, + "grad_norm": 6.5, + "learning_rate": 4.8232600189636775e-05, + "loss": 0.0165, + "mean_token_accuracy": 0.995071417093277, + "num_tokens": 92769257.0, + "step": 30685 + }, + { + "entropy": 0.07879522778093814, + "epoch": 7.153864086723394, + "grad_norm": 1.453125, + "learning_rate": 4.823181762278754e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9965352594852448, + "num_tokens": 92786415.0, + "step": 30690 + }, + { + "entropy": 0.06394462883472443, + "epoch": 7.155029723744026, + "grad_norm": 2.765625, + "learning_rate": 4.8231034895910766e-05, + "loss": 0.0059, + "mean_token_accuracy": 0.9973495721817016, + "num_tokens": 92813736.0, + "step": 30695 + }, + { + "entropy": 0.07654921635985375, + "epoch": 7.156195360764658, + "grad_norm": 1.890625, + "learning_rate": 4.8230252009018116e-05, + "loss": 0.0154, + "mean_token_accuracy": 0.9948624432086944, + "num_tokens": 92825599.0, + "step": 30700 + }, + { + "entropy": 0.07231663931161166, + "epoch": 7.15736099778529, + "grad_norm": 1.4765625, + "learning_rate": 4.8229468962121274e-05, + "loss": 0.0146, + "mean_token_accuracy": 0.9950795829296112, + "num_tokens": 92837209.0, + "step": 30705 + }, + { + "entropy": 0.05991370286792517, + "epoch": 7.158526634805922, + "grad_norm": 4.4375, + "learning_rate": 4.8228685755231916e-05, + "loss": 0.0072, + "mean_token_accuracy": 0.997044563293457, + "num_tokens": 92852078.0, + "step": 30710 + }, + { + "entropy": 0.07421232592314482, + "epoch": 7.159692271826553, + "grad_norm": 0.248046875, + "learning_rate": 4.822790238836171e-05, + "loss": 0.0123, + "mean_token_accuracy": 0.9958754122257233, + "num_tokens": 92876834.0, + "step": 30715 + }, + { + "entropy": 0.05952296406030655, + "epoch": 7.160857908847185, + "grad_norm": 2.953125, + "learning_rate": 4.822711886152235e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.9964723646640777, + "num_tokens": 92908025.0, + "step": 30720 + }, + { + "entropy": 0.06198753379285336, + "epoch": 7.162023545867816, + "grad_norm": 0.271484375, + "learning_rate": 4.82263351747255e-05, + "loss": 0.0111, + "mean_token_accuracy": 0.9940212309360504, + "num_tokens": 92935393.0, + "step": 30725 + }, + { + "entropy": 0.05733865359798074, + "epoch": 7.163189182888448, + "grad_norm": 0.296875, + "learning_rate": 4.8225551327982874e-05, + "loss": 0.0128, + "mean_token_accuracy": 0.9962757170200348, + "num_tokens": 92975482.0, + "step": 30730 + }, + { + "entropy": 0.07880000211298466, + "epoch": 7.1643548199090805, + "grad_norm": 3.625, + "learning_rate": 4.8224767321306135e-05, + "loss": 0.0196, + "mean_token_accuracy": 0.9946611404418946, + "num_tokens": 92983434.0, + "step": 30735 + }, + { + "entropy": 0.0455553256906569, + "epoch": 7.165520456929712, + "grad_norm": 0.306640625, + "learning_rate": 4.822398315470699e-05, + "loss": 0.0061, + "mean_token_accuracy": 0.9972747147083283, + "num_tokens": 93012570.0, + "step": 30740 + }, + { + "entropy": 0.06350925117731095, + "epoch": 7.166686093950344, + "grad_norm": 0.765625, + "learning_rate": 4.822319882819713e-05, + "loss": 0.0207, + "mean_token_accuracy": 0.9927310764789581, + "num_tokens": 93026257.0, + "step": 30745 + }, + { + "entropy": 0.07950380994006992, + "epoch": 7.167851730970976, + "grad_norm": 1.0234375, + "learning_rate": 4.8222414341788236e-05, + "loss": 0.0097, + "mean_token_accuracy": 0.9974476456642151, + "num_tokens": 93038257.0, + "step": 30750 + }, + { + "entropy": 0.07047421857714653, + "epoch": 7.169017367991607, + "grad_norm": 2.09375, + "learning_rate": 4.822162969549202e-05, + "loss": 0.0133, + "mean_token_accuracy": 0.9962194740772248, + "num_tokens": 93051971.0, + "step": 30755 + }, + { + "entropy": 0.07937701903283596, + "epoch": 7.170183005012239, + "grad_norm": 0.65234375, + "learning_rate": 4.8220844889320184e-05, + "loss": 0.0227, + "mean_token_accuracy": 0.9954998016357421, + "num_tokens": 93061600.0, + "step": 30760 + }, + { + "entropy": 0.06413892963901162, + "epoch": 7.171348642032871, + "grad_norm": 0.341796875, + "learning_rate": 4.822005992328442e-05, + "loss": 0.0243, + "mean_token_accuracy": 0.9955684006214142, + "num_tokens": 93086529.0, + "step": 30765 + }, + { + "entropy": 0.05696111330762506, + "epoch": 7.172514279053503, + "grad_norm": 0.8125, + "learning_rate": 4.821927479739645e-05, + "loss": 0.0105, + "mean_token_accuracy": 0.9965245842933654, + "num_tokens": 93106437.0, + "step": 30770 + }, + { + "entropy": 0.0651092673651874, + "epoch": 7.173679916074135, + "grad_norm": 1.34375, + "learning_rate": 4.821848951166796e-05, + "loss": 0.0072, + "mean_token_accuracy": 0.9980071961879731, + "num_tokens": 93128982.0, + "step": 30775 + }, + { + "entropy": 0.04683809150010347, + "epoch": 7.174845553094766, + "grad_norm": 1.734375, + "learning_rate": 4.821770406611067e-05, + "loss": 0.0073, + "mean_token_accuracy": 0.9978996217250824, + "num_tokens": 93156822.0, + "step": 30780 + }, + { + "entropy": 0.09004619605839252, + "epoch": 7.176011190115398, + "grad_norm": 2.71875, + "learning_rate": 4.821691846073629e-05, + "loss": 0.0213, + "mean_token_accuracy": 0.9955195188522339, + "num_tokens": 93164597.0, + "step": 30785 + }, + { + "entropy": 0.0713108105584979, + "epoch": 7.17717682713603, + "grad_norm": 0.90625, + "learning_rate": 4.821613269555654e-05, + "loss": 0.0093, + "mean_token_accuracy": 0.996299684047699, + "num_tokens": 93185720.0, + "step": 30790 + }, + { + "entropy": 0.08426001332700253, + "epoch": 7.178342464156661, + "grad_norm": 3.78125, + "learning_rate": 4.821534677058314e-05, + "loss": 0.0268, + "mean_token_accuracy": 0.9953032076358795, + "num_tokens": 93202802.0, + "step": 30795 + }, + { + "entropy": 0.05344439307227731, + "epoch": 7.179508101177293, + "grad_norm": 2.703125, + "learning_rate": 4.821456068582779e-05, + "loss": 0.0099, + "mean_token_accuracy": 0.9985413789749146, + "num_tokens": 93235860.0, + "step": 30800 + }, + { + "entropy": 0.06476940959692001, + "epoch": 7.1806737381979255, + "grad_norm": 0.84375, + "learning_rate": 4.821377444130223e-05, + "loss": 0.0155, + "mean_token_accuracy": 0.9960738599300385, + "num_tokens": 93246928.0, + "step": 30805 + }, + { + "entropy": 0.08522103652358055, + "epoch": 7.181839375218557, + "grad_norm": 4.34375, + "learning_rate": 4.821298803701819e-05, + "loss": 0.0307, + "mean_token_accuracy": 0.9906064748764039, + "num_tokens": 93256045.0, + "step": 30810 + }, + { + "entropy": 0.06372003946453333, + "epoch": 7.183005012239189, + "grad_norm": 2.703125, + "learning_rate": 4.8212201472987374e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9962263941764832, + "num_tokens": 93272200.0, + "step": 30815 + }, + { + "entropy": 0.06631615618243814, + "epoch": 7.18417064925982, + "grad_norm": 0.98828125, + "learning_rate": 4.8211414749221515e-05, + "loss": 0.0139, + "mean_token_accuracy": 0.9977785289287567, + "num_tokens": 93303496.0, + "step": 30820 + }, + { + "entropy": 0.07567157708108425, + "epoch": 7.185336286280452, + "grad_norm": 2.140625, + "learning_rate": 4.821062786573236e-05, + "loss": 0.0147, + "mean_token_accuracy": 0.9937583327293396, + "num_tokens": 93314712.0, + "step": 30825 + }, + { + "entropy": 0.08115587830543518, + "epoch": 7.186501923301084, + "grad_norm": 3.15625, + "learning_rate": 4.8209840822531635e-05, + "loss": 0.013, + "mean_token_accuracy": 0.994137454032898, + "num_tokens": 93326234.0, + "step": 30830 + }, + { + "entropy": 0.05947356568649411, + "epoch": 7.1876675603217155, + "grad_norm": 0.375, + "learning_rate": 4.820905361963107e-05, + "loss": 0.0178, + "mean_token_accuracy": 0.995909696817398, + "num_tokens": 93358945.0, + "step": 30835 + }, + { + "entropy": 0.07598287798464298, + "epoch": 7.188833197342348, + "grad_norm": 2.828125, + "learning_rate": 4.820826625704242e-05, + "loss": 0.0094, + "mean_token_accuracy": 0.9963949680328369, + "num_tokens": 93374097.0, + "step": 30840 + }, + { + "entropy": 0.06978492699563503, + "epoch": 7.18999883436298, + "grad_norm": 2.734375, + "learning_rate": 4.82074787347774e-05, + "loss": 0.0144, + "mean_token_accuracy": 0.9953686773777009, + "num_tokens": 93390116.0, + "step": 30845 + }, + { + "entropy": 0.0937451772391796, + "epoch": 7.191164471383611, + "grad_norm": 0.9140625, + "learning_rate": 4.820669105284778e-05, + "loss": 0.0216, + "mean_token_accuracy": 0.9946691036224365, + "num_tokens": 93400332.0, + "step": 30850 + }, + { + "entropy": 0.05569373117759824, + "epoch": 7.192330108404243, + "grad_norm": 0.470703125, + "learning_rate": 4.820590321126528e-05, + "loss": 0.0095, + "mean_token_accuracy": 0.9965024709701538, + "num_tokens": 93424812.0, + "step": 30855 + }, + { + "entropy": 0.05078269252553582, + "epoch": 7.193495745424874, + "grad_norm": 1.5546875, + "learning_rate": 4.8205115210041665e-05, + "loss": 0.0096, + "mean_token_accuracy": 0.9979369282722473, + "num_tokens": 93443474.0, + "step": 30860 + }, + { + "entropy": 0.07130427304655314, + "epoch": 7.194661382445506, + "grad_norm": 0.9296875, + "learning_rate": 4.820432704918868e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9947200953960419, + "num_tokens": 93457213.0, + "step": 30865 + }, + { + "entropy": 0.0634447991847992, + "epoch": 7.1958270194661385, + "grad_norm": 1.171875, + "learning_rate": 4.820353872871808e-05, + "loss": 0.0107, + "mean_token_accuracy": 0.9964709997177124, + "num_tokens": 93475313.0, + "step": 30870 + }, + { + "entropy": 0.07064968943595887, + "epoch": 7.19699265648677, + "grad_norm": 0.92578125, + "learning_rate": 4.820275024864162e-05, + "loss": 0.014, + "mean_token_accuracy": 0.9950363337993622, + "num_tokens": 93484861.0, + "step": 30875 + }, + { + "entropy": 0.068212578445673, + "epoch": 7.198158293507402, + "grad_norm": 1.59375, + "learning_rate": 4.8201961608971055e-05, + "loss": 0.0108, + "mean_token_accuracy": 0.9966789484024048, + "num_tokens": 93497639.0, + "step": 30880 + }, + { + "entropy": 0.08882842306047678, + "epoch": 7.199323930528034, + "grad_norm": 2.921875, + "learning_rate": 4.820117280971814e-05, + "loss": 0.0233, + "mean_token_accuracy": 0.9924453973770142, + "num_tokens": 93514110.0, + "step": 30885 + }, + { + "entropy": 0.07468615574762225, + "epoch": 7.200489567548665, + "grad_norm": 3.203125, + "learning_rate": 4.8200383850894645e-05, + "loss": 0.0238, + "mean_token_accuracy": 0.9929708957672119, + "num_tokens": 93540971.0, + "step": 30890 + }, + { + "entropy": 0.07364825969561935, + "epoch": 7.201655204569297, + "grad_norm": 2.328125, + "learning_rate": 4.819959473251234e-05, + "loss": 0.0132, + "mean_token_accuracy": 0.9956311583518982, + "num_tokens": 93561027.0, + "step": 30895 + }, + { + "entropy": 0.05888860169798136, + "epoch": 7.202820841589929, + "grad_norm": 0.341796875, + "learning_rate": 4.8198805454582976e-05, + "loss": 0.0088, + "mean_token_accuracy": 0.9973181843757629, + "num_tokens": 93592104.0, + "step": 30900 + }, + { + "entropy": 0.09244950031861662, + "epoch": 7.2039864786105605, + "grad_norm": 0.310546875, + "learning_rate": 4.819801601711834e-05, + "loss": 0.0398, + "mean_token_accuracy": 0.9923376441001892, + "num_tokens": 93616408.0, + "step": 30905 + }, + { + "entropy": 0.03939249962568283, + "epoch": 7.205152115631193, + "grad_norm": 0.2275390625, + "learning_rate": 4.819722642013019e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9981208801269531, + "num_tokens": 93655274.0, + "step": 30910 + }, + { + "entropy": 0.06765931397676468, + "epoch": 7.206317752651824, + "grad_norm": 0.65234375, + "learning_rate": 4.81964366636303e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9975310564041138, + "num_tokens": 93670281.0, + "step": 30915 + }, + { + "entropy": 0.07954985983669757, + "epoch": 7.207483389672456, + "grad_norm": 1.3046875, + "learning_rate": 4.8195646747630455e-05, + "loss": 0.0216, + "mean_token_accuracy": 0.9954479992389679, + "num_tokens": 93679604.0, + "step": 30920 + }, + { + "entropy": 0.05836689174175262, + "epoch": 7.208649026693088, + "grad_norm": 0.62109375, + "learning_rate": 4.819485667214243e-05, + "loss": 0.0189, + "mean_token_accuracy": 0.9950154960155487, + "num_tokens": 93695595.0, + "step": 30925 + }, + { + "entropy": 0.06158583052456379, + "epoch": 7.209814663713719, + "grad_norm": 0.80859375, + "learning_rate": 4.8194066437178004e-05, + "loss": 0.0084, + "mean_token_accuracy": 0.9972615122795105, + "num_tokens": 93727751.0, + "step": 30930 + }, + { + "entropy": 0.0682966934517026, + "epoch": 7.210980300734351, + "grad_norm": 1.53125, + "learning_rate": 4.8193276042748966e-05, + "loss": 0.0115, + "mean_token_accuracy": 0.9970169782638549, + "num_tokens": 93738365.0, + "step": 30935 + }, + { + "entropy": 0.07385541684925556, + "epoch": 7.2121459377549835, + "grad_norm": 1.203125, + "learning_rate": 4.8192485488867094e-05, + "loss": 0.0262, + "mean_token_accuracy": 0.9942777693271637, + "num_tokens": 93756002.0, + "step": 30940 + }, + { + "entropy": 0.07035349495708942, + "epoch": 7.213311574775615, + "grad_norm": 1.984375, + "learning_rate": 4.8191694775544185e-05, + "loss": 0.0183, + "mean_token_accuracy": 0.9962132215499878, + "num_tokens": 93788334.0, + "step": 30945 + }, + { + "entropy": 0.06736944075673819, + "epoch": 7.214477211796247, + "grad_norm": 0.71875, + "learning_rate": 4.819090390279202e-05, + "loss": 0.0074, + "mean_token_accuracy": 0.9972393333911895, + "num_tokens": 93816896.0, + "step": 30950 + }, + { + "entropy": 0.05105153433978558, + "epoch": 7.215642848816878, + "grad_norm": 2.671875, + "learning_rate": 4.8190112870622406e-05, + "loss": 0.0123, + "mean_token_accuracy": 0.9973272025585175, + "num_tokens": 93839774.0, + "step": 30955 + }, + { + "entropy": 0.0589878392405808, + "epoch": 7.21680848583751, + "grad_norm": 0.87109375, + "learning_rate": 4.818932167904713e-05, + "loss": 0.0154, + "mean_token_accuracy": 0.9960299611091614, + "num_tokens": 93854636.0, + "step": 30960 + }, + { + "entropy": 0.08665582574903966, + "epoch": 7.217974122858142, + "grad_norm": 1.65625, + "learning_rate": 4.818853032807799e-05, + "loss": 0.0165, + "mean_token_accuracy": 0.9953882753849029, + "num_tokens": 93864548.0, + "step": 30965 + }, + { + "entropy": 0.05970696024596691, + "epoch": 7.219139759878773, + "grad_norm": 1.6796875, + "learning_rate": 4.818773881772678e-05, + "loss": 0.0141, + "mean_token_accuracy": 0.9952605664730072, + "num_tokens": 93877575.0, + "step": 30970 + }, + { + "entropy": 0.07558971364051104, + "epoch": 7.2203053968994055, + "grad_norm": 0.51171875, + "learning_rate": 4.818694714800531e-05, + "loss": 0.0053, + "mean_token_accuracy": 0.9963312029838562, + "num_tokens": 93913383.0, + "step": 30975 + }, + { + "entropy": 0.06744187790900469, + "epoch": 7.221471033920038, + "grad_norm": 1.1484375, + "learning_rate": 4.818615531892539e-05, + "loss": 0.0094, + "mean_token_accuracy": 0.9975740015506744, + "num_tokens": 93931739.0, + "step": 30980 + }, + { + "entropy": 0.06372264893725514, + "epoch": 7.222636670940669, + "grad_norm": 2.625, + "learning_rate": 4.818536333049881e-05, + "loss": 0.014, + "mean_token_accuracy": 0.9937172710895539, + "num_tokens": 93954776.0, + "step": 30985 + }, + { + "entropy": 0.08202089443802833, + "epoch": 7.223802307961301, + "grad_norm": 1.4140625, + "learning_rate": 4.81845711827374e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.9972925662994385, + "num_tokens": 93974412.0, + "step": 30990 + }, + { + "entropy": 0.14377728216350077, + "epoch": 7.224967944981932, + "grad_norm": 0.3203125, + "learning_rate": 4.818377887565296e-05, + "loss": 0.2043, + "mean_token_accuracy": 0.9597078502178192, + "num_tokens": 94001058.0, + "step": 30995 + }, + { + "entropy": 0.04768119920045137, + "epoch": 7.226133582002564, + "grad_norm": 1.171875, + "learning_rate": 4.8182986409257315e-05, + "loss": 0.0141, + "mean_token_accuracy": 0.9953707456588745, + "num_tokens": 94022836.0, + "step": 31000 + }, + { + "entropy": 0.07950283214449883, + "epoch": 7.227299219023196, + "grad_norm": 2.546875, + "learning_rate": 4.818219378356226e-05, + "loss": 0.0199, + "mean_token_accuracy": 0.9949168682098388, + "num_tokens": 94031304.0, + "step": 31005 + }, + { + "entropy": 0.06591652268543839, + "epoch": 7.228464856043828, + "grad_norm": 1.6328125, + "learning_rate": 4.818140099857964e-05, + "loss": 0.0139, + "mean_token_accuracy": 0.9966680228710174, + "num_tokens": 94052316.0, + "step": 31010 + }, + { + "entropy": 0.055729389935731885, + "epoch": 7.22963049306446, + "grad_norm": 0.7109375, + "learning_rate": 4.8180608054321266e-05, + "loss": 0.0086, + "mean_token_accuracy": 0.9960857272148133, + "num_tokens": 94085005.0, + "step": 31015 + }, + { + "entropy": 0.031023029517382384, + "epoch": 7.230796130085092, + "grad_norm": 0.12109375, + "learning_rate": 4.8179814950798956e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.998980051279068, + "num_tokens": 94136607.0, + "step": 31020 + }, + { + "entropy": 0.06259532477706671, + "epoch": 7.231961767105723, + "grad_norm": 2.84375, + "learning_rate": 4.8179021688024546e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9964537560939789, + "num_tokens": 94150543.0, + "step": 31025 + }, + { + "entropy": 0.07360369060188532, + "epoch": 7.233127404126355, + "grad_norm": 0.37109375, + "learning_rate": 4.817822826600986e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9939245641231537, + "num_tokens": 94167749.0, + "step": 31030 + }, + { + "entropy": 0.0606883866712451, + "epoch": 7.234293041146987, + "grad_norm": 0.81640625, + "learning_rate": 4.817743468476672e-05, + "loss": 0.0129, + "mean_token_accuracy": 0.9957722008228302, + "num_tokens": 94180749.0, + "step": 31035 + }, + { + "entropy": 0.06940312702208758, + "epoch": 7.2354586781676185, + "grad_norm": 3.234375, + "learning_rate": 4.817664094430698e-05, + "loss": 0.0155, + "mean_token_accuracy": 0.9943855345249176, + "num_tokens": 94198809.0, + "step": 31040 + }, + { + "entropy": 0.0686707628890872, + "epoch": 7.2366243151882506, + "grad_norm": 0.2578125, + "learning_rate": 4.817584704464246e-05, + "loss": 0.0074, + "mean_token_accuracy": 0.998262470960617, + "num_tokens": 94218672.0, + "step": 31045 + }, + { + "entropy": 0.06387984249740838, + "epoch": 7.237789952208882, + "grad_norm": 1.8203125, + "learning_rate": 4.817505298578501e-05, + "loss": 0.0166, + "mean_token_accuracy": 0.9973008513450623, + "num_tokens": 94245039.0, + "step": 31050 + }, + { + "entropy": 0.07732600383460522, + "epoch": 7.238955589229514, + "grad_norm": 0.51171875, + "learning_rate": 4.817425876774646e-05, + "loss": 0.0173, + "mean_token_accuracy": 0.9943410992622376, + "num_tokens": 94254637.0, + "step": 31055 + }, + { + "entropy": 0.06010149214416742, + "epoch": 7.240121226250146, + "grad_norm": 2.734375, + "learning_rate": 4.817346439053865e-05, + "loss": 0.0215, + "mean_token_accuracy": 0.9950605452060699, + "num_tokens": 94282839.0, + "step": 31060 + }, + { + "entropy": 0.06231997692957521, + "epoch": 7.241286863270777, + "grad_norm": 0.384765625, + "learning_rate": 4.8172669854173444e-05, + "loss": 0.0128, + "mean_token_accuracy": 0.9954926490783691, + "num_tokens": 94304068.0, + "step": 31065 + }, + { + "entropy": 0.07848136788234114, + "epoch": 7.242452500291409, + "grad_norm": 4.34375, + "learning_rate": 4.8171875158662665e-05, + "loss": 0.0132, + "mean_token_accuracy": 0.9944968461990357, + "num_tokens": 94324087.0, + "step": 31070 + }, + { + "entropy": 0.06527646128088235, + "epoch": 7.243618137312041, + "grad_norm": 0.54296875, + "learning_rate": 4.8171080304018186e-05, + "loss": 0.0126, + "mean_token_accuracy": 0.9959727704524994, + "num_tokens": 94337863.0, + "step": 31075 + }, + { + "entropy": 0.07139136102050543, + "epoch": 7.244783774332673, + "grad_norm": 2.15625, + "learning_rate": 4.8170285290251846e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9945382833480835, + "num_tokens": 94352721.0, + "step": 31080 + }, + { + "entropy": 0.05915911886841059, + "epoch": 7.245949411353305, + "grad_norm": 1.1953125, + "learning_rate": 4.81694901173755e-05, + "loss": 0.007, + "mean_token_accuracy": 0.9973145842552185, + "num_tokens": 94366680.0, + "step": 31085 + }, + { + "entropy": 0.04861733708530665, + "epoch": 7.247115048373936, + "grad_norm": 0.376953125, + "learning_rate": 4.8168694785401016e-05, + "loss": 0.007, + "mean_token_accuracy": 0.9972374498844147, + "num_tokens": 94394164.0, + "step": 31090 + }, + { + "entropy": 0.053044412098824975, + "epoch": 7.248280685394568, + "grad_norm": 0.3203125, + "learning_rate": 4.816789929434024e-05, + "loss": 0.0084, + "mean_token_accuracy": 0.9966397881507874, + "num_tokens": 94415197.0, + "step": 31095 + }, + { + "entropy": 0.07270872667431831, + "epoch": 7.2494463224152, + "grad_norm": 3.046875, + "learning_rate": 4.816710364420504e-05, + "loss": 0.0159, + "mean_token_accuracy": 0.9957484424114227, + "num_tokens": 94435321.0, + "step": 31100 + }, + { + "entropy": 0.06985992044210435, + "epoch": 7.250611959435831, + "grad_norm": 0.78515625, + "learning_rate": 4.816630783500729e-05, + "loss": 0.0161, + "mean_token_accuracy": 0.9964280486106872, + "num_tokens": 94445779.0, + "step": 31105 + }, + { + "entropy": 0.04990783054381609, + "epoch": 7.2517775964564635, + "grad_norm": 0.2099609375, + "learning_rate": 4.8165511866758835e-05, + "loss": 0.0149, + "mean_token_accuracy": 0.9973464012145996, + "num_tokens": 94479579.0, + "step": 31110 + }, + { + "entropy": 0.042833476420491935, + "epoch": 7.252943233477096, + "grad_norm": 0.6328125, + "learning_rate": 4.816471573947156e-05, + "loss": 0.0165, + "mean_token_accuracy": 0.9955586969852448, + "num_tokens": 94510088.0, + "step": 31115 + }, + { + "entropy": 0.05222136527299881, + "epoch": 7.254108870497727, + "grad_norm": 0.314453125, + "learning_rate": 4.8163919453157335e-05, + "loss": 0.009, + "mean_token_accuracy": 0.9957200348377228, + "num_tokens": 94546329.0, + "step": 31120 + }, + { + "entropy": 0.08356106411665679, + "epoch": 7.255274507518359, + "grad_norm": 0.609375, + "learning_rate": 4.8163123007828024e-05, + "loss": 0.0613, + "mean_token_accuracy": 0.9861360251903534, + "num_tokens": 94571572.0, + "step": 31125 + }, + { + "entropy": 0.03666842384263873, + "epoch": 7.25644014453899, + "grad_norm": 0.310546875, + "learning_rate": 4.8162326403495524e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9983543157577515, + "num_tokens": 94598814.0, + "step": 31130 + }, + { + "entropy": 0.08895245492458344, + "epoch": 7.257605781559622, + "grad_norm": 1.4609375, + "learning_rate": 4.816152964017169e-05, + "loss": 0.0154, + "mean_token_accuracy": 0.995537132024765, + "num_tokens": 94608335.0, + "step": 31135 + }, + { + "entropy": 0.0762575170956552, + "epoch": 7.258771418580254, + "grad_norm": 2.625, + "learning_rate": 4.816073271786842e-05, + "loss": 0.0268, + "mean_token_accuracy": 0.9923174262046814, + "num_tokens": 94634806.0, + "step": 31140 + }, + { + "entropy": 0.0727899644523859, + "epoch": 7.2599370556008855, + "grad_norm": 0.51171875, + "learning_rate": 4.815993563659759e-05, + "loss": 0.0164, + "mean_token_accuracy": 0.9957876443862915, + "num_tokens": 94646234.0, + "step": 31145 + }, + { + "entropy": 0.052048580907285216, + "epoch": 7.261102692621518, + "grad_norm": 0.353515625, + "learning_rate": 4.8159138396371075e-05, + "loss": 0.009, + "mean_token_accuracy": 0.9969389081001282, + "num_tokens": 94662074.0, + "step": 31150 + }, + { + "entropy": 0.054339785035699605, + "epoch": 7.26226832964215, + "grad_norm": 0.65234375, + "learning_rate": 4.815834099720079e-05, + "loss": 0.0139, + "mean_token_accuracy": 0.9946308076381684, + "num_tokens": 94695848.0, + "step": 31155 + }, + { + "entropy": 0.088789024297148, + "epoch": 7.263433966662781, + "grad_norm": 0.390625, + "learning_rate": 4.81575434390986e-05, + "loss": 0.0118, + "mean_token_accuracy": 0.9980597138404846, + "num_tokens": 94712847.0, + "step": 31160 + }, + { + "entropy": 0.04380578193813563, + "epoch": 7.264599603683413, + "grad_norm": 0.51953125, + "learning_rate": 4.8156745722076406e-05, + "loss": 0.0086, + "mean_token_accuracy": 0.9967470228672027, + "num_tokens": 94743216.0, + "step": 31165 + }, + { + "entropy": 0.05533214872702956, + "epoch": 7.265765240704045, + "grad_norm": 3.203125, + "learning_rate": 4.815594784614611e-05, + "loss": 0.0222, + "mean_token_accuracy": 0.995652312040329, + "num_tokens": 94760494.0, + "step": 31170 + }, + { + "entropy": 0.06883603855967521, + "epoch": 7.266930877724676, + "grad_norm": 0.58984375, + "learning_rate": 4.81551498113196e-05, + "loss": 0.0202, + "mean_token_accuracy": 0.9935734689235687, + "num_tokens": 94773816.0, + "step": 31175 + }, + { + "entropy": 0.05732705658301711, + "epoch": 7.2680965147453085, + "grad_norm": 0.640625, + "learning_rate": 4.815435161760878e-05, + "loss": 0.0162, + "mean_token_accuracy": 0.9956275284290313, + "num_tokens": 94790719.0, + "step": 31180 + }, + { + "entropy": 0.07628957759588957, + "epoch": 7.26926215176594, + "grad_norm": 0.9453125, + "learning_rate": 4.815355326502556e-05, + "loss": 0.0157, + "mean_token_accuracy": 0.9964455723762512, + "num_tokens": 94803985.0, + "step": 31185 + }, + { + "entropy": 0.061471117474138734, + "epoch": 7.270427788786572, + "grad_norm": 1.4765625, + "learning_rate": 4.815275475358183e-05, + "loss": 0.0164, + "mean_token_accuracy": 0.9955154120922088, + "num_tokens": 94818813.0, + "step": 31190 + }, + { + "entropy": 0.046590684168040755, + "epoch": 7.271593425807204, + "grad_norm": 0.294921875, + "learning_rate": 4.8151956083289504e-05, + "loss": 0.0191, + "mean_token_accuracy": 0.9956287026405335, + "num_tokens": 94848573.0, + "step": 31195 + }, + { + "entropy": 0.10268030576407909, + "epoch": 7.272759062827835, + "grad_norm": 5.28125, + "learning_rate": 4.815115725416049e-05, + "loss": 0.1076, + "mean_token_accuracy": 0.9697431027889252, + "num_tokens": 94877708.0, + "step": 31200 + }, + { + "entropy": 0.044332510232925414, + "epoch": 7.273924699848467, + "grad_norm": 0.34375, + "learning_rate": 4.8150358266206705e-05, + "loss": 0.007, + "mean_token_accuracy": 0.9973241150379181, + "num_tokens": 94895674.0, + "step": 31205 + }, + { + "entropy": 0.05917203584685922, + "epoch": 7.275090336869099, + "grad_norm": 0.53125, + "learning_rate": 4.814955911944006e-05, + "loss": 0.0068, + "mean_token_accuracy": 0.9969590663909912, + "num_tokens": 94913506.0, + "step": 31210 + }, + { + "entropy": 0.07185657788068056, + "epoch": 7.2762559738897306, + "grad_norm": 1.2734375, + "learning_rate": 4.814875981387247e-05, + "loss": 0.0201, + "mean_token_accuracy": 0.9954153180122376, + "num_tokens": 94924274.0, + "step": 31215 + }, + { + "entropy": 0.05664256140589714, + "epoch": 7.277421610910363, + "grad_norm": 1.046875, + "learning_rate": 4.814796034951585e-05, + "loss": 0.011, + "mean_token_accuracy": 0.9930629730224609, + "num_tokens": 94949709.0, + "step": 31220 + }, + { + "entropy": 0.05459074741229415, + "epoch": 7.278587247930994, + "grad_norm": 2.890625, + "learning_rate": 4.814716072638213e-05, + "loss": 0.0189, + "mean_token_accuracy": 0.9927839577198029, + "num_tokens": 94985059.0, + "step": 31225 + }, + { + "entropy": 0.045205979235470296, + "epoch": 7.279752884951626, + "grad_norm": 2.21875, + "learning_rate": 4.8146360944483235e-05, + "loss": 0.0061, + "mean_token_accuracy": 0.9978555560111999, + "num_tokens": 95014556.0, + "step": 31230 + }, + { + "entropy": 0.0740995816886425, + "epoch": 7.280918521972258, + "grad_norm": 2.3125, + "learning_rate": 4.814556100383108e-05, + "loss": 0.0168, + "mean_token_accuracy": 0.9948262691497802, + "num_tokens": 95035974.0, + "step": 31235 + }, + { + "entropy": 0.03894362915307283, + "epoch": 7.282084158992889, + "grad_norm": 1.453125, + "learning_rate": 4.8144760904437594e-05, + "loss": 0.0065, + "mean_token_accuracy": 0.9980915129184723, + "num_tokens": 95080772.0, + "step": 31240 + }, + { + "entropy": 0.09444695636630059, + "epoch": 7.283249796013521, + "grad_norm": 0.5390625, + "learning_rate": 4.814396064631471e-05, + "loss": 0.0087, + "mean_token_accuracy": 0.9984256386756897, + "num_tokens": 95097268.0, + "step": 31245 + }, + { + "entropy": 0.07851285748183727, + "epoch": 7.2844154330341535, + "grad_norm": 0.314453125, + "learning_rate": 4.814316022947437e-05, + "loss": 0.0222, + "mean_token_accuracy": 0.9953002572059632, + "num_tokens": 95115624.0, + "step": 31250 + }, + { + "entropy": 0.0706376725807786, + "epoch": 7.285581070054785, + "grad_norm": 0.296875, + "learning_rate": 4.814235965392851e-05, + "loss": 0.0059, + "mean_token_accuracy": 0.9987712681293488, + "num_tokens": 95140592.0, + "step": 31255 + }, + { + "entropy": 0.06456660237163306, + "epoch": 7.286746707075417, + "grad_norm": 0.373046875, + "learning_rate": 4.814155891968905e-05, + "loss": 0.013, + "mean_token_accuracy": 0.9957951843738556, + "num_tokens": 95157676.0, + "step": 31260 + }, + { + "entropy": 0.04452027985826135, + "epoch": 7.287912344096048, + "grad_norm": 2.53125, + "learning_rate": 4.814075802676794e-05, + "loss": 0.0113, + "mean_token_accuracy": 0.9970516860485077, + "num_tokens": 95181335.0, + "step": 31265 + }, + { + "entropy": 0.06406694650650024, + "epoch": 7.28907798111668, + "grad_norm": 0.9921875, + "learning_rate": 4.813995697517712e-05, + "loss": 0.007, + "mean_token_accuracy": 0.9972039103507996, + "num_tokens": 95195683.0, + "step": 31270 + }, + { + "entropy": 0.043226651614531875, + "epoch": 7.290243618137312, + "grad_norm": 0.67578125, + "learning_rate": 4.813915576492855e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9987265706062317, + "num_tokens": 95224524.0, + "step": 31275 + }, + { + "entropy": 0.05426203664392233, + "epoch": 7.2914092551579435, + "grad_norm": 0.7890625, + "learning_rate": 4.8138354396034165e-05, + "loss": 0.0177, + "mean_token_accuracy": 0.9938402116298676, + "num_tokens": 95243533.0, + "step": 31280 + }, + { + "entropy": 0.05316199092194438, + "epoch": 7.292574892178576, + "grad_norm": 2.75, + "learning_rate": 4.813755286850591e-05, + "loss": 0.0129, + "mean_token_accuracy": 0.9960735082626343, + "num_tokens": 95266239.0, + "step": 31285 + }, + { + "entropy": 0.08508195830509066, + "epoch": 7.293740529199208, + "grad_norm": 4.5625, + "learning_rate": 4.813675118235574e-05, + "loss": 0.0252, + "mean_token_accuracy": 0.9939753770828247, + "num_tokens": 95294519.0, + "step": 31290 + }, + { + "entropy": 0.08679478149861097, + "epoch": 7.294906166219839, + "grad_norm": 0.890625, + "learning_rate": 4.813594933759561e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9938629746437073, + "num_tokens": 95313085.0, + "step": 31295 + }, + { + "entropy": 0.07704577697440981, + "epoch": 7.296071803240471, + "grad_norm": 0.2080078125, + "learning_rate": 4.813514733423749e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9971446514129638, + "num_tokens": 95331867.0, + "step": 31300 + }, + { + "entropy": 0.038349354080855845, + "epoch": 7.297237440261103, + "grad_norm": 1.5625, + "learning_rate": 4.813434517229331e-05, + "loss": 0.01, + "mean_token_accuracy": 0.9969878733158112, + "num_tokens": 95373062.0, + "step": 31305 + }, + { + "entropy": 0.05551351401954889, + "epoch": 7.298403077281734, + "grad_norm": 0.416015625, + "learning_rate": 4.813354285177506e-05, + "loss": 0.0133, + "mean_token_accuracy": 0.9945835828781128, + "num_tokens": 95394246.0, + "step": 31310 + }, + { + "entropy": 0.08316299207508564, + "epoch": 7.299568714302366, + "grad_norm": 3.328125, + "learning_rate": 4.813274037269468e-05, + "loss": 0.0285, + "mean_token_accuracy": 0.9920492529869079, + "num_tokens": 95402582.0, + "step": 31315 + }, + { + "entropy": 0.05540920048952103, + "epoch": 7.300734351322998, + "grad_norm": 1.3359375, + "learning_rate": 4.8131937735064164e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9974058926105499, + "num_tokens": 95426015.0, + "step": 31320 + }, + { + "entropy": 0.07322446051985025, + "epoch": 7.30189998834363, + "grad_norm": 2.9375, + "learning_rate": 4.813113493889546e-05, + "loss": 0.0159, + "mean_token_accuracy": 0.9926839590072631, + "num_tokens": 95442392.0, + "step": 31325 + }, + { + "entropy": 0.053282452002167704, + "epoch": 7.303065625364262, + "grad_norm": 0.50390625, + "learning_rate": 4.813033198420054e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9982658386230469, + "num_tokens": 95460704.0, + "step": 31330 + }, + { + "entropy": 0.06922962255775929, + "epoch": 7.304231262384893, + "grad_norm": 1.171875, + "learning_rate": 4.8129528870991386e-05, + "loss": 0.0128, + "mean_token_accuracy": 0.9957706928253174, + "num_tokens": 95484780.0, + "step": 31335 + }, + { + "entropy": 0.058275452349334955, + "epoch": 7.305396899405525, + "grad_norm": 0.33984375, + "learning_rate": 4.8128725599279965e-05, + "loss": 0.0067, + "mean_token_accuracy": 0.9980259537696838, + "num_tokens": 95505425.0, + "step": 31340 + }, + { + "entropy": 0.0893326872959733, + "epoch": 7.306562536426157, + "grad_norm": 1.046875, + "learning_rate": 4.812792216907826e-05, + "loss": 0.0666, + "mean_token_accuracy": 0.9879299819469451, + "num_tokens": 95527201.0, + "step": 31345 + }, + { + "entropy": 0.09231711477041245, + "epoch": 7.3077281734467885, + "grad_norm": 1.625, + "learning_rate": 4.812711858039825e-05, + "loss": 0.021, + "mean_token_accuracy": 0.9963175833225251, + "num_tokens": 95535423.0, + "step": 31350 + }, + { + "entropy": 0.06640668958425522, + "epoch": 7.308893810467421, + "grad_norm": 1.3984375, + "learning_rate": 4.8126314833251916e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9974156856536865, + "num_tokens": 95568929.0, + "step": 31355 + }, + { + "entropy": 0.06366755496710538, + "epoch": 7.310059447488052, + "grad_norm": 4.46875, + "learning_rate": 4.812551092765125e-05, + "loss": 0.03, + "mean_token_accuracy": 0.9931754112243653, + "num_tokens": 95583716.0, + "step": 31360 + }, + { + "entropy": 0.08571641780436039, + "epoch": 7.311225084508684, + "grad_norm": 0.671875, + "learning_rate": 4.812470686360823e-05, + "loss": 0.0187, + "mean_token_accuracy": 0.9947358131408691, + "num_tokens": 95594442.0, + "step": 31365 + }, + { + "entropy": 0.06556187737733125, + "epoch": 7.312390721529316, + "grad_norm": 1.5078125, + "learning_rate": 4.812390264113486e-05, + "loss": 0.0104, + "mean_token_accuracy": 0.9965194284915924, + "num_tokens": 95608186.0, + "step": 31370 + }, + { + "entropy": 0.059163821302354334, + "epoch": 7.313556358549947, + "grad_norm": 2.828125, + "learning_rate": 4.812309826024311e-05, + "loss": 0.0167, + "mean_token_accuracy": 0.9942824065685272, + "num_tokens": 95624376.0, + "step": 31375 + }, + { + "entropy": 0.050896511506289245, + "epoch": 7.314721995570579, + "grad_norm": 2.078125, + "learning_rate": 4.812229372094499e-05, + "loss": 0.0067, + "mean_token_accuracy": 0.9977230668067932, + "num_tokens": 95649652.0, + "step": 31380 + }, + { + "entropy": 0.07648697402328253, + "epoch": 7.315887632591211, + "grad_norm": 1.9140625, + "learning_rate": 4.81214890232525e-05, + "loss": 0.0091, + "mean_token_accuracy": 0.9963666439056397, + "num_tokens": 95670557.0, + "step": 31385 + }, + { + "entropy": 0.05390965268015861, + "epoch": 7.317053269611843, + "grad_norm": 0.26953125, + "learning_rate": 4.8120684167177623e-05, + "loss": 0.0114, + "mean_token_accuracy": 0.9976705610752106, + "num_tokens": 95692401.0, + "step": 31390 + }, + { + "entropy": 0.07451889421790839, + "epoch": 7.318218906632475, + "grad_norm": 0.2216796875, + "learning_rate": 4.811987915273237e-05, + "loss": 0.0196, + "mean_token_accuracy": 0.9942530870437623, + "num_tokens": 95710753.0, + "step": 31395 + }, + { + "entropy": 0.0531097125262022, + "epoch": 7.319384543653106, + "grad_norm": 3.953125, + "learning_rate": 4.8119073979928755e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9969158947467804, + "num_tokens": 95739383.0, + "step": 31400 + }, + { + "entropy": 0.07482300028204918, + "epoch": 7.320550180673738, + "grad_norm": 1.28125, + "learning_rate": 4.8118268648778776e-05, + "loss": 0.02, + "mean_token_accuracy": 0.9964318454265595, + "num_tokens": 95750170.0, + "step": 31405 + }, + { + "entropy": 0.06803834708407522, + "epoch": 7.32171581769437, + "grad_norm": 0.73828125, + "learning_rate": 4.811746315929443e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9968580782413483, + "num_tokens": 95785014.0, + "step": 31410 + }, + { + "entropy": 0.06311820847913623, + "epoch": 7.322881454715001, + "grad_norm": 0.52734375, + "learning_rate": 4.8116657511487745e-05, + "loss": 0.0066, + "mean_token_accuracy": 0.9981098234653473, + "num_tokens": 95804301.0, + "step": 31415 + }, + { + "entropy": 0.06809269674122334, + "epoch": 7.3240470917356335, + "grad_norm": 0.9296875, + "learning_rate": 4.811585170537073e-05, + "loss": 0.0104, + "mean_token_accuracy": 0.9975564420223236, + "num_tokens": 95816307.0, + "step": 31420 + }, + { + "entropy": 0.06396800447255373, + "epoch": 7.325212728756266, + "grad_norm": 0.365234375, + "learning_rate": 4.811504574095539e-05, + "loss": 0.0122, + "mean_token_accuracy": 0.9960817337036133, + "num_tokens": 95838870.0, + "step": 31425 + }, + { + "entropy": 0.07934959940612316, + "epoch": 7.326378365776897, + "grad_norm": 0.84765625, + "learning_rate": 4.811423961825377e-05, + "loss": 0.0274, + "mean_token_accuracy": 0.9933449506759644, + "num_tokens": 95847317.0, + "step": 31430 + }, + { + "entropy": 0.06323989136144519, + "epoch": 7.327544002797529, + "grad_norm": 1.7421875, + "learning_rate": 4.8113433337277857e-05, + "loss": 0.0147, + "mean_token_accuracy": 0.9955657243728637, + "num_tokens": 95867740.0, + "step": 31435 + }, + { + "entropy": 0.07817384712398052, + "epoch": 7.328709639818161, + "grad_norm": 2.0625, + "learning_rate": 4.8112626898039694e-05, + "loss": 0.0086, + "mean_token_accuracy": 0.9980550169944763, + "num_tokens": 95882846.0, + "step": 31440 + }, + { + "entropy": 0.07506803907454014, + "epoch": 7.329875276838792, + "grad_norm": 2.109375, + "learning_rate": 4.81118203005513e-05, + "loss": 0.0244, + "mean_token_accuracy": 0.9934002816677093, + "num_tokens": 95902766.0, + "step": 31445 + }, + { + "entropy": 0.06314781978726387, + "epoch": 7.331040913859424, + "grad_norm": 0.345703125, + "learning_rate": 4.811101354482471e-05, + "loss": 0.0109, + "mean_token_accuracy": 0.9966369867324829, + "num_tokens": 95921553.0, + "step": 31450 + }, + { + "entropy": 0.05526378992944956, + "epoch": 7.332206550880056, + "grad_norm": 0.52734375, + "learning_rate": 4.811020663087194e-05, + "loss": 0.0104, + "mean_token_accuracy": 0.9964692711830139, + "num_tokens": 95950474.0, + "step": 31455 + }, + { + "entropy": 0.06638829614967108, + "epoch": 7.333372187900688, + "grad_norm": 1.46875, + "learning_rate": 4.810939955870504e-05, + "loss": 0.0158, + "mean_token_accuracy": 0.994761723279953, + "num_tokens": 95968461.0, + "step": 31460 + }, + { + "entropy": 0.04524884703569114, + "epoch": 7.33453782492132, + "grad_norm": 0.1982421875, + "learning_rate": 4.810859232833602e-05, + "loss": 0.0049, + "mean_token_accuracy": 0.998431783914566, + "num_tokens": 95999830.0, + "step": 31465 + }, + { + "entropy": 0.070090286526829, + "epoch": 7.335703461941951, + "grad_norm": 0.333984375, + "learning_rate": 4.8107784939776946e-05, + "loss": 0.0167, + "mean_token_accuracy": 0.9949487507343292, + "num_tokens": 96032091.0, + "step": 31470 + }, + { + "entropy": 0.07050778605043888, + "epoch": 7.336869098962583, + "grad_norm": 2.234375, + "learning_rate": 4.8106977393039844e-05, + "loss": 0.0265, + "mean_token_accuracy": 0.9949378252029419, + "num_tokens": 96044365.0, + "step": 31475 + }, + { + "entropy": 0.0740006735548377, + "epoch": 7.338034735983215, + "grad_norm": 1.2109375, + "learning_rate": 4.810616968813675e-05, + "loss": 0.0173, + "mean_token_accuracy": 0.9964466452598572, + "num_tokens": 96058233.0, + "step": 31480 + }, + { + "entropy": 0.07905077319592238, + "epoch": 7.339200373003846, + "grad_norm": 1.9765625, + "learning_rate": 4.810536182507971e-05, + "loss": 0.009, + "mean_token_accuracy": 0.9967888534069062, + "num_tokens": 96069124.0, + "step": 31485 + }, + { + "entropy": 0.05249534146860242, + "epoch": 7.3403660100244785, + "grad_norm": 0.2890625, + "learning_rate": 4.810455380388078e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.9959168851375579, + "num_tokens": 96100217.0, + "step": 31490 + }, + { + "entropy": 0.08653380684554576, + "epoch": 7.34153164704511, + "grad_norm": 1.3046875, + "learning_rate": 4.8103745624552e-05, + "loss": 0.0142, + "mean_token_accuracy": 0.9960395634174347, + "num_tokens": 96110199.0, + "step": 31495 + }, + { + "entropy": 0.0720094045624137, + "epoch": 7.342697284065742, + "grad_norm": 1.609375, + "learning_rate": 4.810293728710542e-05, + "loss": 0.0172, + "mean_token_accuracy": 0.9962703585624695, + "num_tokens": 96129305.0, + "step": 31500 + }, + { + "entropy": 0.07461393307894468, + "epoch": 7.343862921086374, + "grad_norm": 2.515625, + "learning_rate": 4.81021287915531e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.9961204707622529, + "num_tokens": 96150839.0, + "step": 31505 + }, + { + "entropy": 0.06459999550133944, + "epoch": 7.345028558107005, + "grad_norm": 0.5703125, + "learning_rate": 4.8101320137907095e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.9957730710506439, + "num_tokens": 96182234.0, + "step": 31510 + }, + { + "entropy": 0.06358031257987022, + "epoch": 7.346194195127637, + "grad_norm": 0.65234375, + "learning_rate": 4.810051132617947e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9975331246852874, + "num_tokens": 96195513.0, + "step": 31515 + }, + { + "entropy": 0.06775630675256253, + "epoch": 7.347359832148269, + "grad_norm": 0.8046875, + "learning_rate": 4.8099702356382264e-05, + "loss": 0.013, + "mean_token_accuracy": 0.9957094848155975, + "num_tokens": 96212860.0, + "step": 31520 + }, + { + "entropy": 0.06747577143833042, + "epoch": 7.348525469168901, + "grad_norm": 0.361328125, + "learning_rate": 4.809889322852756e-05, + "loss": 0.0209, + "mean_token_accuracy": 0.9947060763835907, + "num_tokens": 96227657.0, + "step": 31525 + }, + { + "entropy": 0.06826590951532126, + "epoch": 7.349691106189533, + "grad_norm": 4.0, + "learning_rate": 4.809808394262741e-05, + "loss": 0.0212, + "mean_token_accuracy": 0.9943705976009369, + "num_tokens": 96239039.0, + "step": 31530 + }, + { + "entropy": 0.05712395738810301, + "epoch": 7.350856743210164, + "grad_norm": 0.353515625, + "learning_rate": 4.809727449869389e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9990525543689728, + "num_tokens": 96261487.0, + "step": 31535 + }, + { + "entropy": 0.06899423655122519, + "epoch": 7.352022380230796, + "grad_norm": 1.65625, + "learning_rate": 4.809646489673907e-05, + "loss": 0.0165, + "mean_token_accuracy": 0.9933532297611236, + "num_tokens": 96272372.0, + "step": 31540 + }, + { + "entropy": 0.06043548882007599, + "epoch": 7.353188017251428, + "grad_norm": 0.484375, + "learning_rate": 4.809565513677502e-05, + "loss": 0.0141, + "mean_token_accuracy": 0.995878529548645, + "num_tokens": 96303355.0, + "step": 31545 + }, + { + "entropy": 0.05828229039907455, + "epoch": 7.354353654272059, + "grad_norm": 0.314453125, + "learning_rate": 4.809484521881381e-05, + "loss": 0.0196, + "mean_token_accuracy": 0.9947497963905334, + "num_tokens": 96319372.0, + "step": 31550 + }, + { + "entropy": 0.0643775088712573, + "epoch": 7.355519291292691, + "grad_norm": 1.8984375, + "learning_rate": 4.8094035142867536e-05, + "loss": 0.0068, + "mean_token_accuracy": 0.9980500638484955, + "num_tokens": 96333139.0, + "step": 31555 + }, + { + "entropy": 0.0727043965831399, + "epoch": 7.3566849283133235, + "grad_norm": 4.78125, + "learning_rate": 4.809322490894825e-05, + "loss": 0.012, + "mean_token_accuracy": 0.9963323414325714, + "num_tokens": 96352988.0, + "step": 31560 + }, + { + "entropy": 0.05997302522882819, + "epoch": 7.357850565333955, + "grad_norm": 1.609375, + "learning_rate": 4.809241451706805e-05, + "loss": 0.0062, + "mean_token_accuracy": 0.9979426980018615, + "num_tokens": 96378601.0, + "step": 31565 + }, + { + "entropy": 0.058050360158085824, + "epoch": 7.359016202354587, + "grad_norm": 0.447265625, + "learning_rate": 4.8091603967239025e-05, + "loss": 0.0192, + "mean_token_accuracy": 0.9955137133598327, + "num_tokens": 96409377.0, + "step": 31570 + }, + { + "entropy": 0.06100328993052244, + "epoch": 7.360181839375219, + "grad_norm": 1.28125, + "learning_rate": 4.809079325947325e-05, + "loss": 0.0076, + "mean_token_accuracy": 0.9978339493274688, + "num_tokens": 96442510.0, + "step": 31575 + }, + { + "entropy": 0.04876450030133128, + "epoch": 7.36134747639585, + "grad_norm": 0.2041015625, + "learning_rate": 4.808998239378282e-05, + "loss": 0.0113, + "mean_token_accuracy": 0.995351231098175, + "num_tokens": 96465017.0, + "step": 31580 + }, + { + "entropy": 0.05196169055998325, + "epoch": 7.362513113416482, + "grad_norm": 1.5390625, + "learning_rate": 4.808917137017982e-05, + "loss": 0.0111, + "mean_token_accuracy": 0.9975907921791076, + "num_tokens": 96480390.0, + "step": 31585 + }, + { + "entropy": 0.06991583518683911, + "epoch": 7.3636787504371135, + "grad_norm": 1.6484375, + "learning_rate": 4.8088360188676354e-05, + "loss": 0.009, + "mean_token_accuracy": 0.9967041075229645, + "num_tokens": 96496176.0, + "step": 31590 + }, + { + "entropy": 0.0919257765635848, + "epoch": 7.364844387457746, + "grad_norm": 3.15625, + "learning_rate": 4.8087548849284504e-05, + "loss": 0.0168, + "mean_token_accuracy": 0.9965686082839966, + "num_tokens": 96512875.0, + "step": 31595 + }, + { + "entropy": 0.08283570520579815, + "epoch": 7.366010024478378, + "grad_norm": 1.5703125, + "learning_rate": 4.808673735201637e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.9972459256649018, + "num_tokens": 96521569.0, + "step": 31600 + }, + { + "entropy": 0.06374187842011451, + "epoch": 7.367175661499009, + "grad_norm": 0.86328125, + "learning_rate": 4.8085925696884074e-05, + "loss": 0.0162, + "mean_token_accuracy": 0.996217918395996, + "num_tokens": 96531666.0, + "step": 31605 + }, + { + "entropy": 0.07336053401231765, + "epoch": 7.368341298519641, + "grad_norm": 0.890625, + "learning_rate": 4.8085113883899704e-05, + "loss": 0.0061, + "mean_token_accuracy": 0.9984713077545166, + "num_tokens": 96556172.0, + "step": 31610 + }, + { + "entropy": 0.06423005685210229, + "epoch": 7.369506935540273, + "grad_norm": 0.490234375, + "learning_rate": 4.808430191307535e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.9968792617321014, + "num_tokens": 96572479.0, + "step": 31615 + }, + { + "entropy": 0.07545432206243277, + "epoch": 7.370672572560904, + "grad_norm": 0.87890625, + "learning_rate": 4.808348978442315e-05, + "loss": 0.0164, + "mean_token_accuracy": 0.9944819986820221, + "num_tokens": 96587508.0, + "step": 31620 + }, + { + "entropy": 0.07405749782919883, + "epoch": 7.3718382095815365, + "grad_norm": 3.890625, + "learning_rate": 4.808267749795519e-05, + "loss": 0.0209, + "mean_token_accuracy": 0.9963535487651825, + "num_tokens": 96599591.0, + "step": 31625 + }, + { + "entropy": 0.057722126320004466, + "epoch": 7.373003846602168, + "grad_norm": 2.375, + "learning_rate": 4.8081865053683595e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9980563938617706, + "num_tokens": 96629812.0, + "step": 31630 + }, + { + "entropy": 0.05824717171490192, + "epoch": 7.3741694836228, + "grad_norm": 0.9609375, + "learning_rate": 4.8081052451620476e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9970640122890473, + "num_tokens": 96661085.0, + "step": 31635 + }, + { + "entropy": 0.04979136511683464, + "epoch": 7.375335120643432, + "grad_norm": 0.244140625, + "learning_rate": 4.808023969177795e-05, + "loss": 0.0128, + "mean_token_accuracy": 0.9955516993999481, + "num_tokens": 96683599.0, + "step": 31640 + }, + { + "entropy": 0.06467875819653272, + "epoch": 7.376500757664063, + "grad_norm": 0.384765625, + "learning_rate": 4.8079426774168134e-05, + "loss": 0.0219, + "mean_token_accuracy": 0.9962655901908875, + "num_tokens": 96706960.0, + "step": 31645 + }, + { + "entropy": 0.06323386076837778, + "epoch": 7.377666394684695, + "grad_norm": 1.9921875, + "learning_rate": 4.807861369880316e-05, + "loss": 0.0142, + "mean_token_accuracy": 0.9936850070953369, + "num_tokens": 96738935.0, + "step": 31650 + }, + { + "entropy": 0.06506092166528106, + "epoch": 7.378832031705327, + "grad_norm": 3.671875, + "learning_rate": 4.807780046569513e-05, + "loss": 0.0248, + "mean_token_accuracy": 0.993993878364563, + "num_tokens": 96757812.0, + "step": 31655 + }, + { + "entropy": 0.07635737303644419, + "epoch": 7.3799976687259585, + "grad_norm": 0.42578125, + "learning_rate": 4.8076987074856196e-05, + "loss": 0.0111, + "mean_token_accuracy": 0.9954919457435608, + "num_tokens": 96778442.0, + "step": 31660 + }, + { + "entropy": 0.0630334172397852, + "epoch": 7.381163305746591, + "grad_norm": 0.5859375, + "learning_rate": 4.807617352629847e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9967043399810791, + "num_tokens": 96801329.0, + "step": 31665 + }, + { + "entropy": 0.0476308373734355, + "epoch": 7.382328942767222, + "grad_norm": 1.2265625, + "learning_rate": 4.80753598200341e-05, + "loss": 0.0061, + "mean_token_accuracy": 0.9978141784667969, + "num_tokens": 96817981.0, + "step": 31670 + }, + { + "entropy": 0.07085022889077663, + "epoch": 7.383494579787854, + "grad_norm": 5.46875, + "learning_rate": 4.8074545956075203e-05, + "loss": 0.0194, + "mean_token_accuracy": 0.9938320398330689, + "num_tokens": 96835143.0, + "step": 31675 + }, + { + "entropy": 0.06873668488115073, + "epoch": 7.384660216808486, + "grad_norm": 0.466796875, + "learning_rate": 4.807373193443392e-05, + "loss": 0.0314, + "mean_token_accuracy": 0.9937922835350037, + "num_tokens": 96847103.0, + "step": 31680 + }, + { + "entropy": 0.08670074734836816, + "epoch": 7.385825853829117, + "grad_norm": 2.640625, + "learning_rate": 4.807291775512239e-05, + "loss": 0.0589, + "mean_token_accuracy": 0.9884765923023224, + "num_tokens": 96876792.0, + "step": 31685 + }, + { + "entropy": 0.06278931275010109, + "epoch": 7.386991490849749, + "grad_norm": 0.478515625, + "learning_rate": 4.807210341815275e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9977745175361633, + "num_tokens": 96907806.0, + "step": 31690 + }, + { + "entropy": 0.05830715810880065, + "epoch": 7.3881571278703815, + "grad_norm": 0.26953125, + "learning_rate": 4.807128892353715e-05, + "loss": 0.0061, + "mean_token_accuracy": 0.9984943211078644, + "num_tokens": 96931150.0, + "step": 31695 + }, + { + "entropy": 0.09116251096129417, + "epoch": 7.389322764891013, + "grad_norm": 2.328125, + "learning_rate": 4.8070474271287735e-05, + "loss": 0.01, + "mean_token_accuracy": 0.996881228685379, + "num_tokens": 96941888.0, + "step": 31700 + }, + { + "entropy": 0.06741825370118022, + "epoch": 7.390488401911645, + "grad_norm": 1.7578125, + "learning_rate": 4.8069659461416644e-05, + "loss": 0.0171, + "mean_token_accuracy": 0.9934270083904266, + "num_tokens": 96964333.0, + "step": 31705 + }, + { + "entropy": 0.048117708042263986, + "epoch": 7.391654038932277, + "grad_norm": 1.3125, + "learning_rate": 4.806884449393604e-05, + "loss": 0.0098, + "mean_token_accuracy": 0.9970331609249115, + "num_tokens": 96991219.0, + "step": 31710 + }, + { + "entropy": 0.06609851317480206, + "epoch": 7.392819675952908, + "grad_norm": 1.953125, + "learning_rate": 4.806802936885806e-05, + "loss": 0.0141, + "mean_token_accuracy": 0.9967052280902863, + "num_tokens": 97021576.0, + "step": 31715 + }, + { + "entropy": 0.07261874917894602, + "epoch": 7.39398531297354, + "grad_norm": 0.2353515625, + "learning_rate": 4.806721408619487e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9963623523712158, + "num_tokens": 97040151.0, + "step": 31720 + }, + { + "entropy": 0.06871300302445889, + "epoch": 7.395150949994171, + "grad_norm": 0.86328125, + "learning_rate": 4.806639864595863e-05, + "loss": 0.0186, + "mean_token_accuracy": 0.9961176872253418, + "num_tokens": 97058961.0, + "step": 31725 + }, + { + "entropy": 0.06867877654731273, + "epoch": 7.3963165870148035, + "grad_norm": 1.671875, + "learning_rate": 4.806558304816148e-05, + "loss": 0.0182, + "mean_token_accuracy": 0.9950770616531373, + "num_tokens": 97072234.0, + "step": 31730 + }, + { + "entropy": 0.06973572825081646, + "epoch": 7.397482224035436, + "grad_norm": 1.7109375, + "learning_rate": 4.806476729281561e-05, + "loss": 0.0197, + "mean_token_accuracy": 0.9921569526195526, + "num_tokens": 97088397.0, + "step": 31735 + }, + { + "entropy": 0.06464776555076242, + "epoch": 7.398647861056067, + "grad_norm": 0.69921875, + "learning_rate": 4.806395137993316e-05, + "loss": 0.0124, + "mean_token_accuracy": 0.9959135890007019, + "num_tokens": 97111838.0, + "step": 31740 + }, + { + "entropy": 0.04916971167549491, + "epoch": 7.399813498076699, + "grad_norm": 0.1845703125, + "learning_rate": 4.806313530952631e-05, + "loss": 0.0056, + "mean_token_accuracy": 0.9973500072956085, + "num_tokens": 97146271.0, + "step": 31745 + }, + { + "entropy": 0.05610126769170165, + "epoch": 7.400979135097331, + "grad_norm": 1.8984375, + "learning_rate": 4.806231908160722e-05, + "loss": 0.0084, + "mean_token_accuracy": 0.9978776752948761, + "num_tokens": 97175592.0, + "step": 31750 + }, + { + "entropy": 0.06999030411243438, + "epoch": 7.402144772117962, + "grad_norm": 0.62890625, + "learning_rate": 4.806150269618807e-05, + "loss": 0.0108, + "mean_token_accuracy": 0.9972579479217529, + "num_tokens": 97197064.0, + "step": 31755 + }, + { + "entropy": 0.06999958539381623, + "epoch": 7.403310409138594, + "grad_norm": 0.279296875, + "learning_rate": 4.8060686153281034e-05, + "loss": 0.0194, + "mean_token_accuracy": 0.9947843551635742, + "num_tokens": 97219096.0, + "step": 31760 + }, + { + "entropy": 0.06066929465159774, + "epoch": 7.404476046159226, + "grad_norm": 0.396484375, + "learning_rate": 4.805986945289828e-05, + "loss": 0.0126, + "mean_token_accuracy": 0.9972352147102356, + "num_tokens": 97240817.0, + "step": 31765 + }, + { + "entropy": 0.053822552971541884, + "epoch": 7.405641683179858, + "grad_norm": 3.375, + "learning_rate": 4.8059052595051986e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9965937435626984, + "num_tokens": 97266739.0, + "step": 31770 + }, + { + "entropy": 0.0714537937194109, + "epoch": 7.40680732020049, + "grad_norm": 1.0078125, + "learning_rate": 4.805823557975433e-05, + "loss": 0.0287, + "mean_token_accuracy": 0.9925433158874511, + "num_tokens": 97278114.0, + "step": 31775 + }, + { + "entropy": 0.055256042256951333, + "epoch": 7.407972957221121, + "grad_norm": 1.2265625, + "learning_rate": 4.805741840701751e-05, + "loss": 0.0076, + "mean_token_accuracy": 0.9974486231803894, + "num_tokens": 97293752.0, + "step": 31780 + }, + { + "entropy": 0.057471389323472975, + "epoch": 7.409138594241753, + "grad_norm": 0.40625, + "learning_rate": 4.8056601076853704e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.995576798915863, + "num_tokens": 97311430.0, + "step": 31785 + }, + { + "entropy": 0.07471015406772494, + "epoch": 7.410304231262385, + "grad_norm": 1.0390625, + "learning_rate": 4.80557835892751e-05, + "loss": 0.0098, + "mean_token_accuracy": 0.9975471258163452, + "num_tokens": 97325200.0, + "step": 31790 + }, + { + "entropy": 0.05630330964922905, + "epoch": 7.4114698682830165, + "grad_norm": 1.5625, + "learning_rate": 4.8054965944293876e-05, + "loss": 0.0108, + "mean_token_accuracy": 0.9970169067382812, + "num_tokens": 97347193.0, + "step": 31795 + }, + { + "entropy": 0.05957003049552441, + "epoch": 7.412635505303649, + "grad_norm": 0.38671875, + "learning_rate": 4.8054148141922236e-05, + "loss": 0.0073, + "mean_token_accuracy": 0.9971933484077453, + "num_tokens": 97386951.0, + "step": 31800 + }, + { + "entropy": 0.0552212193608284, + "epoch": 7.41380114232428, + "grad_norm": 4.4375, + "learning_rate": 4.805333018217238e-05, + "loss": 0.015, + "mean_token_accuracy": 0.9929179668426513, + "num_tokens": 97410465.0, + "step": 31805 + }, + { + "entropy": 0.06529962951317429, + "epoch": 7.414966779344912, + "grad_norm": 0.2734375, + "learning_rate": 4.8052512065056495e-05, + "loss": 0.0116, + "mean_token_accuracy": 0.9965193450450898, + "num_tokens": 97437080.0, + "step": 31810 + }, + { + "entropy": 0.07579502649605274, + "epoch": 7.416132416365544, + "grad_norm": 2.953125, + "learning_rate": 4.805169379058678e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9949338555335998, + "num_tokens": 97458811.0, + "step": 31815 + }, + { + "entropy": 0.06360966097563506, + "epoch": 7.417298053386175, + "grad_norm": 1.5390625, + "learning_rate": 4.805087535877544e-05, + "loss": 0.0194, + "mean_token_accuracy": 0.994461327791214, + "num_tokens": 97474463.0, + "step": 31820 + }, + { + "entropy": 0.07172002922743559, + "epoch": 7.418463690406807, + "grad_norm": 0.7578125, + "learning_rate": 4.8050056769634684e-05, + "loss": 0.0059, + "mean_token_accuracy": 0.9970459818840027, + "num_tokens": 97496723.0, + "step": 31825 + }, + { + "entropy": 0.07104908116161823, + "epoch": 7.419629327427439, + "grad_norm": 0.482421875, + "learning_rate": 4.804923802317671e-05, + "loss": 0.0218, + "mean_token_accuracy": 0.9935750663280487, + "num_tokens": 97520510.0, + "step": 31830 + }, + { + "entropy": 0.05038301609456539, + "epoch": 7.420794964448071, + "grad_norm": 0.390625, + "learning_rate": 4.804841911941373e-05, + "loss": 0.0133, + "mean_token_accuracy": 0.9973720788955689, + "num_tokens": 97551599.0, + "step": 31835 + }, + { + "entropy": 0.07734383950009942, + "epoch": 7.421960601468703, + "grad_norm": 2.515625, + "learning_rate": 4.804760005835796e-05, + "loss": 0.0235, + "mean_token_accuracy": 0.994698303937912, + "num_tokens": 97569532.0, + "step": 31840 + }, + { + "entropy": 0.0816195654682815, + "epoch": 7.423126238489335, + "grad_norm": 1.1875, + "learning_rate": 4.80467808400216e-05, + "loss": 0.0288, + "mean_token_accuracy": 0.9956186056137085, + "num_tokens": 97582823.0, + "step": 31845 + }, + { + "entropy": 0.07614492103457451, + "epoch": 7.424291875509966, + "grad_norm": 1.859375, + "learning_rate": 4.8045961464416876e-05, + "loss": 0.0124, + "mean_token_accuracy": 0.9969462215900421, + "num_tokens": 97595668.0, + "step": 31850 + }, + { + "entropy": 0.052913818042725325, + "epoch": 7.425457512530598, + "grad_norm": 0.6796875, + "learning_rate": 4.804514193155601e-05, + "loss": 0.0084, + "mean_token_accuracy": 0.9972995042800903, + "num_tokens": 97625129.0, + "step": 31855 + }, + { + "entropy": 0.083005213085562, + "epoch": 7.426623149551229, + "grad_norm": 1.3203125, + "learning_rate": 4.804432224145121e-05, + "loss": 0.0199, + "mean_token_accuracy": 0.9942511141300201, + "num_tokens": 97640906.0, + "step": 31860 + }, + { + "entropy": 0.07574100028723478, + "epoch": 7.4277887865718615, + "grad_norm": 2.375, + "learning_rate": 4.8043502394114714e-05, + "loss": 0.0111, + "mean_token_accuracy": 0.9965871274471283, + "num_tokens": 97663290.0, + "step": 31865 + }, + { + "entropy": 0.06771274656057358, + "epoch": 7.428954423592494, + "grad_norm": 0.380859375, + "learning_rate": 4.804268238955874e-05, + "loss": 0.0068, + "mean_token_accuracy": 0.995803314447403, + "num_tokens": 97677727.0, + "step": 31870 + }, + { + "entropy": 0.06518225539475679, + "epoch": 7.430120060613125, + "grad_norm": 2.8125, + "learning_rate": 4.8041862227795506e-05, + "loss": 0.0264, + "mean_token_accuracy": 0.994991272687912, + "num_tokens": 97695900.0, + "step": 31875 + }, + { + "entropy": 0.06668935623019934, + "epoch": 7.431285697633757, + "grad_norm": 1.234375, + "learning_rate": 4.804104190883725e-05, + "loss": 0.0189, + "mean_token_accuracy": 0.9964043319225311, + "num_tokens": 97708289.0, + "step": 31880 + }, + { + "entropy": 0.06050615776330233, + "epoch": 7.432451334654389, + "grad_norm": 0.177734375, + "learning_rate": 4.804022143269621e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9986397385597229, + "num_tokens": 97739608.0, + "step": 31885 + }, + { + "entropy": 0.05716359736397862, + "epoch": 7.43361697167502, + "grad_norm": 0.2421875, + "learning_rate": 4.803940079938461e-05, + "loss": 0.0088, + "mean_token_accuracy": 0.9968883991241455, + "num_tokens": 97755159.0, + "step": 31890 + }, + { + "entropy": 0.08151376890018583, + "epoch": 7.434782608695652, + "grad_norm": 0.53125, + "learning_rate": 4.80385800089147e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9967722475528717, + "num_tokens": 97778369.0, + "step": 31895 + }, + { + "entropy": 0.060337006114423275, + "epoch": 7.4359482457162835, + "grad_norm": 2.421875, + "learning_rate": 4.80377590612987e-05, + "loss": 0.0121, + "mean_token_accuracy": 0.9959405541419983, + "num_tokens": 97788905.0, + "step": 31900 + }, + { + "entropy": 0.07208081735298037, + "epoch": 7.437113882736916, + "grad_norm": 2.0, + "learning_rate": 4.8036937956548875e-05, + "loss": 0.0245, + "mean_token_accuracy": 0.992925900220871, + "num_tokens": 97807287.0, + "step": 31905 + }, + { + "entropy": 0.05150127690285444, + "epoch": 7.438279519757548, + "grad_norm": 0.35546875, + "learning_rate": 4.8036116694677446e-05, + "loss": 0.0085, + "mean_token_accuracy": 0.9969991385936737, + "num_tokens": 97837029.0, + "step": 31910 + }, + { + "entropy": 0.0542297987267375, + "epoch": 7.439445156778179, + "grad_norm": 1.03125, + "learning_rate": 4.803529527569668e-05, + "loss": 0.0119, + "mean_token_accuracy": 0.9952615916728973, + "num_tokens": 97851701.0, + "step": 31915 + }, + { + "entropy": 0.06879463642835618, + "epoch": 7.440610793798811, + "grad_norm": 0.2470703125, + "learning_rate": 4.803447369961881e-05, + "loss": 0.0078, + "mean_token_accuracy": 0.9959939181804657, + "num_tokens": 97881684.0, + "step": 31920 + }, + { + "entropy": 0.048804645985364915, + "epoch": 7.441776430819443, + "grad_norm": 0.625, + "learning_rate": 4.803365196645609e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.9956522107124328, + "num_tokens": 97902591.0, + "step": 31925 + }, + { + "entropy": 0.05077600758522749, + "epoch": 7.442942067840074, + "grad_norm": 2.3125, + "learning_rate": 4.803283007622078e-05, + "loss": 0.0121, + "mean_token_accuracy": 0.9966121196746827, + "num_tokens": 97917592.0, + "step": 31930 + }, + { + "entropy": 0.06004767008125782, + "epoch": 7.4441077048607065, + "grad_norm": 2.390625, + "learning_rate": 4.803200802892513e-05, + "loss": 0.017, + "mean_token_accuracy": 0.9940854012966156, + "num_tokens": 97930297.0, + "step": 31935 + }, + { + "entropy": 0.07400213945657015, + "epoch": 7.445273341881338, + "grad_norm": 1.1953125, + "learning_rate": 4.803118582458139e-05, + "loss": 0.0148, + "mean_token_accuracy": 0.9951455473899842, + "num_tokens": 97940683.0, + "step": 31940 + }, + { + "entropy": 0.0650801496580243, + "epoch": 7.44643897890197, + "grad_norm": 1.6171875, + "learning_rate": 4.803036346320184e-05, + "loss": 0.0137, + "mean_token_accuracy": 0.9969987511634827, + "num_tokens": 97952741.0, + "step": 31945 + }, + { + "entropy": 0.09372318238019943, + "epoch": 7.447604615922602, + "grad_norm": 0.69921875, + "learning_rate": 4.802954094479873e-05, + "loss": 0.0159, + "mean_token_accuracy": 0.9942943751811981, + "num_tokens": 97965865.0, + "step": 31950 + }, + { + "entropy": 0.047209294699132445, + "epoch": 7.448770252943233, + "grad_norm": 0.4375, + "learning_rate": 4.8028718269384333e-05, + "loss": 0.0053, + "mean_token_accuracy": 0.9976045370101929, + "num_tokens": 97999982.0, + "step": 31955 + }, + { + "entropy": 0.07388658728450537, + "epoch": 7.449935889963865, + "grad_norm": 3.59375, + "learning_rate": 4.80278954369709e-05, + "loss": 0.015, + "mean_token_accuracy": 0.9949574530124664, + "num_tokens": 98014580.0, + "step": 31960 + }, + { + "entropy": 0.07657665573060513, + "epoch": 7.451101526984497, + "grad_norm": 0.44140625, + "learning_rate": 4.802707244757072e-05, + "loss": 0.0116, + "mean_token_accuracy": 0.9976080656051636, + "num_tokens": 98035444.0, + "step": 31965 + }, + { + "entropy": 0.05655227024108171, + "epoch": 7.452267164005129, + "grad_norm": 0.2314453125, + "learning_rate": 4.802624930119605e-05, + "loss": 0.0128, + "mean_token_accuracy": 0.9970692336559296, + "num_tokens": 98069827.0, + "step": 31970 + }, + { + "entropy": 0.061259434837847945, + "epoch": 7.453432801025761, + "grad_norm": 0.2294921875, + "learning_rate": 4.8025425997859176e-05, + "loss": 0.0086, + "mean_token_accuracy": 0.9976824820041656, + "num_tokens": 98097657.0, + "step": 31975 + }, + { + "entropy": 0.03952852394431829, + "epoch": 7.454598438046393, + "grad_norm": 0.2021484375, + "learning_rate": 4.802460253757237e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9980733573436738, + "num_tokens": 98127437.0, + "step": 31980 + }, + { + "entropy": 0.05817561913281679, + "epoch": 7.455764075067024, + "grad_norm": 2.984375, + "learning_rate": 4.80237789203479e-05, + "loss": 0.0128, + "mean_token_accuracy": 0.9965545058250427, + "num_tokens": 98150403.0, + "step": 31985 + }, + { + "entropy": 0.08537760525941848, + "epoch": 7.456929712087656, + "grad_norm": 2.8125, + "learning_rate": 4.802295514619807e-05, + "loss": 0.0349, + "mean_token_accuracy": 0.9901233434677124, + "num_tokens": 98157269.0, + "step": 31990 + }, + { + "entropy": 0.03773313369601965, + "epoch": 7.458095349108287, + "grad_norm": 0.3046875, + "learning_rate": 4.802213121513515e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9988882124423981, + "num_tokens": 98195898.0, + "step": 31995 + }, + { + "entropy": 0.05709927398711443, + "epoch": 7.459260986128919, + "grad_norm": 0.404296875, + "learning_rate": 4.802130712717142e-05, + "loss": 0.0097, + "mean_token_accuracy": 0.9951608777046204, + "num_tokens": 98216906.0, + "step": 32000 + }, + { + "entropy": 0.07515623532235623, + "epoch": 7.4604266231495515, + "grad_norm": 2.171875, + "learning_rate": 4.802048288231917e-05, + "loss": 0.0239, + "mean_token_accuracy": 0.9943017184734344, + "num_tokens": 98229341.0, + "step": 32005 + }, + { + "entropy": 0.046360192447900773, + "epoch": 7.461592260170183, + "grad_norm": 0.25390625, + "learning_rate": 4.8019658480590715e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9963595449924469, + "num_tokens": 98262472.0, + "step": 32010 + }, + { + "entropy": 0.06408070325851441, + "epoch": 7.462757897190815, + "grad_norm": 0.11767578125, + "learning_rate": 4.8018833921998316e-05, + "loss": 0.0097, + "mean_token_accuracy": 0.9963421761989594, + "num_tokens": 98282518.0, + "step": 32015 + }, + { + "entropy": 0.07481765616685151, + "epoch": 7.463923534211447, + "grad_norm": 1.390625, + "learning_rate": 4.801800920655429e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.9957054257392883, + "num_tokens": 98294251.0, + "step": 32020 + }, + { + "entropy": 0.061605995427817105, + "epoch": 7.465089171232078, + "grad_norm": 0.25390625, + "learning_rate": 4.801718433427092e-05, + "loss": 0.0159, + "mean_token_accuracy": 0.9957810342311859, + "num_tokens": 98320647.0, + "step": 32025 + }, + { + "entropy": 0.06579680051654577, + "epoch": 7.46625480825271, + "grad_norm": 0.28125, + "learning_rate": 4.801635930516051e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9973134338855744, + "num_tokens": 98339371.0, + "step": 32030 + }, + { + "entropy": 0.07847501374781132, + "epoch": 7.4674204452733415, + "grad_norm": 1.4609375, + "learning_rate": 4.801553411923537e-05, + "loss": 0.0193, + "mean_token_accuracy": 0.9948892652988434, + "num_tokens": 98351538.0, + "step": 32035 + }, + { + "entropy": 0.05895627625286579, + "epoch": 7.468586082293974, + "grad_norm": 0.703125, + "learning_rate": 4.8014708776507806e-05, + "loss": 0.009, + "mean_token_accuracy": 0.9976952493190765, + "num_tokens": 98369458.0, + "step": 32040 + }, + { + "entropy": 0.06033395798876882, + "epoch": 7.469751719314606, + "grad_norm": 0.2001953125, + "learning_rate": 4.801388327699011e-05, + "loss": 0.0098, + "mean_token_accuracy": 0.9974822103977203, + "num_tokens": 98394279.0, + "step": 32045 + }, + { + "entropy": 0.0509566405788064, + "epoch": 7.470917356335237, + "grad_norm": 0.515625, + "learning_rate": 4.801305762069461e-05, + "loss": 0.0082, + "mean_token_accuracy": 0.9969842314720154, + "num_tokens": 98415653.0, + "step": 32050 + }, + { + "entropy": 0.07163450829684734, + "epoch": 7.472082993355869, + "grad_norm": 1.4609375, + "learning_rate": 4.80122318076336e-05, + "loss": 0.009, + "mean_token_accuracy": 0.9957123637199402, + "num_tokens": 98444850.0, + "step": 32055 + }, + { + "entropy": 0.07031346326693892, + "epoch": 7.473248630376501, + "grad_norm": 2.9375, + "learning_rate": 4.8011405837819403e-05, + "loss": 0.017, + "mean_token_accuracy": 0.994475269317627, + "num_tokens": 98461675.0, + "step": 32060 + }, + { + "entropy": 0.10657533258199692, + "epoch": 7.474414267397132, + "grad_norm": 4.46875, + "learning_rate": 4.801057971126434e-05, + "loss": 0.0245, + "mean_token_accuracy": 0.9923931121826172, + "num_tokens": 98478969.0, + "step": 32065 + }, + { + "entropy": 0.0678681674413383, + "epoch": 7.475579904417764, + "grad_norm": 0.2294921875, + "learning_rate": 4.8009753427980724e-05, + "loss": 0.0176, + "mean_token_accuracy": 0.9935955286026001, + "num_tokens": 98494078.0, + "step": 32070 + }, + { + "entropy": 0.04767877887934446, + "epoch": 7.476745541438396, + "grad_norm": 0.318359375, + "learning_rate": 4.8008926987980874e-05, + "loss": 0.0062, + "mean_token_accuracy": 0.9986327946186065, + "num_tokens": 98534938.0, + "step": 32075 + }, + { + "entropy": 0.05674191527068615, + "epoch": 7.477911178459028, + "grad_norm": 0.53515625, + "learning_rate": 4.800810039127712e-05, + "loss": 0.0144, + "mean_token_accuracy": 0.9943017423152923, + "num_tokens": 98550014.0, + "step": 32080 + }, + { + "entropy": 0.07790519241243601, + "epoch": 7.47907681547966, + "grad_norm": 1.7109375, + "learning_rate": 4.800727363788178e-05, + "loss": 0.0164, + "mean_token_accuracy": 0.9978551805019379, + "num_tokens": 98567666.0, + "step": 32085 + }, + { + "entropy": 0.05544118182733655, + "epoch": 7.480242452500291, + "grad_norm": 0.74609375, + "learning_rate": 4.8006446727807186e-05, + "loss": 0.0105, + "mean_token_accuracy": 0.9966945469379425, + "num_tokens": 98609715.0, + "step": 32090 + }, + { + "entropy": 0.04747336655855179, + "epoch": 7.481408089520923, + "grad_norm": 0.431640625, + "learning_rate": 4.800561966106567e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9962104082107544, + "num_tokens": 98638761.0, + "step": 32095 + }, + { + "entropy": 0.08676061071455479, + "epoch": 7.482573726541555, + "grad_norm": 1.640625, + "learning_rate": 4.8004792437669564e-05, + "loss": 0.0164, + "mean_token_accuracy": 0.9954579770565033, + "num_tokens": 98652723.0, + "step": 32100 + }, + { + "entropy": 0.08409857619553804, + "epoch": 7.4837393635621865, + "grad_norm": 2.703125, + "learning_rate": 4.80039650576312e-05, + "loss": 0.0486, + "mean_token_accuracy": 0.9907464563846589, + "num_tokens": 98675617.0, + "step": 32105 + }, + { + "entropy": 0.0578452062793076, + "epoch": 7.484905000582819, + "grad_norm": 0.365234375, + "learning_rate": 4.800313752096292e-05, + "loss": 0.011, + "mean_token_accuracy": 0.9951134145259857, + "num_tokens": 98696778.0, + "step": 32110 + }, + { + "entropy": 0.06942653022706509, + "epoch": 7.486070637603451, + "grad_norm": 2.046875, + "learning_rate": 4.800230982767707e-05, + "loss": 0.0112, + "mean_token_accuracy": 0.9965254843235016, + "num_tokens": 98709420.0, + "step": 32115 + }, + { + "entropy": 0.06638341769576073, + "epoch": 7.487236274624082, + "grad_norm": 1.046875, + "learning_rate": 4.800148197778597e-05, + "loss": 0.0172, + "mean_token_accuracy": 0.9953685343265534, + "num_tokens": 98719108.0, + "step": 32120 + }, + { + "entropy": 0.06705099958926439, + "epoch": 7.488401911644714, + "grad_norm": 0.90625, + "learning_rate": 4.800065397130199e-05, + "loss": 0.0198, + "mean_token_accuracy": 0.9936238288879394, + "num_tokens": 98737993.0, + "step": 32125 + }, + { + "entropy": 0.06405184855684638, + "epoch": 7.489567548665345, + "grad_norm": 3.078125, + "learning_rate": 4.799982580823746e-05, + "loss": 0.0241, + "mean_token_accuracy": 0.9920284271240234, + "num_tokens": 98756576.0, + "step": 32130 + }, + { + "entropy": 0.0811619933694601, + "epoch": 7.490733185685977, + "grad_norm": 0.59765625, + "learning_rate": 4.799899748860473e-05, + "loss": 0.0228, + "mean_token_accuracy": 0.9951043605804444, + "num_tokens": 98773519.0, + "step": 32135 + }, + { + "entropy": 0.10115591492503881, + "epoch": 7.4918988227066095, + "grad_norm": 1.234375, + "learning_rate": 4.799816901241616e-05, + "loss": 0.0294, + "mean_token_accuracy": 0.9908878207206726, + "num_tokens": 98785515.0, + "step": 32140 + }, + { + "entropy": 0.06037537744268775, + "epoch": 7.493064459727241, + "grad_norm": 1.4765625, + "learning_rate": 4.79973403796841e-05, + "loss": 0.0121, + "mean_token_accuracy": 0.9955259382724762, + "num_tokens": 98802327.0, + "step": 32145 + }, + { + "entropy": 0.10647099614143371, + "epoch": 7.494230096747873, + "grad_norm": 2.390625, + "learning_rate": 4.799651159042091e-05, + "loss": 0.0377, + "mean_token_accuracy": 0.99305180311203, + "num_tokens": 98811942.0, + "step": 32150 + }, + { + "entropy": 0.06047349814325571, + "epoch": 7.495395733768505, + "grad_norm": 3.109375, + "learning_rate": 4.799568264463894e-05, + "loss": 0.0118, + "mean_token_accuracy": 0.9965644776821136, + "num_tokens": 98831430.0, + "step": 32155 + }, + { + "entropy": 0.08296026848256588, + "epoch": 7.496561370789136, + "grad_norm": 0.98828125, + "learning_rate": 4.799485354235056e-05, + "loss": 0.0152, + "mean_token_accuracy": 0.9960346400737763, + "num_tokens": 98840931.0, + "step": 32160 + }, + { + "entropy": 0.06511731203645468, + "epoch": 7.497727007809768, + "grad_norm": 0.333984375, + "learning_rate": 4.7994024283568125e-05, + "loss": 0.0079, + "mean_token_accuracy": 0.9975647747516632, + "num_tokens": 98857778.0, + "step": 32165 + }, + { + "entropy": 0.07737187128514052, + "epoch": 7.498892644830399, + "grad_norm": 1.7890625, + "learning_rate": 4.7993194868304e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.9938872277736663, + "num_tokens": 98875306.0, + "step": 32170 + }, + { + "entropy": 0.06764257559552789, + "epoch": 7.5000582818510315, + "grad_norm": 0.314453125, + "learning_rate": 4.7992365296570564e-05, + "loss": 0.0074, + "mean_token_accuracy": 0.9962385296821594, + "num_tokens": 98907649.0, + "step": 32175 + }, + { + "entropy": 0.05583788137882948, + "epoch": 7.501223918871664, + "grad_norm": 0.400390625, + "learning_rate": 4.799153556838018e-05, + "loss": 0.0301, + "mean_token_accuracy": 0.9933856427669525, + "num_tokens": 98938264.0, + "step": 32180 + }, + { + "entropy": 0.09082546047866344, + "epoch": 7.502389555892295, + "grad_norm": 2.59375, + "learning_rate": 4.799070568374522e-05, + "loss": 0.0166, + "mean_token_accuracy": 0.9947808086872101, + "num_tokens": 98948444.0, + "step": 32185 + }, + { + "entropy": 0.06142871137708426, + "epoch": 7.503555192912927, + "grad_norm": 2.109375, + "learning_rate": 4.7989875642678054e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9952453315258026, + "num_tokens": 98961866.0, + "step": 32190 + }, + { + "entropy": 0.05687323287129402, + "epoch": 7.504720829933559, + "grad_norm": 1.2109375, + "learning_rate": 4.798904544519107e-05, + "loss": 0.0111, + "mean_token_accuracy": 0.9948042273521424, + "num_tokens": 98982911.0, + "step": 32195 + }, + { + "entropy": 0.06204276205971837, + "epoch": 7.50588646695419, + "grad_norm": 0.9375, + "learning_rate": 4.7988215091296637e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9979773998260498, + "num_tokens": 99009256.0, + "step": 32200 + }, + { + "entropy": 0.05596322454512119, + "epoch": 7.507052103974822, + "grad_norm": 1.9375, + "learning_rate": 4.798738458100714e-05, + "loss": 0.0232, + "mean_token_accuracy": 0.9954159021377563, + "num_tokens": 99034387.0, + "step": 32205 + }, + { + "entropy": 0.07269154787063599, + "epoch": 7.508217740995454, + "grad_norm": 1.1796875, + "learning_rate": 4.798655391433498e-05, + "loss": 0.0139, + "mean_token_accuracy": 0.9955158352851867, + "num_tokens": 99051466.0, + "step": 32210 + }, + { + "entropy": 0.05379718169569969, + "epoch": 7.509383378016086, + "grad_norm": 0.66015625, + "learning_rate": 4.798572309129251e-05, + "loss": 0.0102, + "mean_token_accuracy": 0.9950558304786682, + "num_tokens": 99081520.0, + "step": 32215 + }, + { + "entropy": 0.05792910754680634, + "epoch": 7.510549015036718, + "grad_norm": 0.6953125, + "learning_rate": 4.798489211189215e-05, + "loss": 0.0115, + "mean_token_accuracy": 0.9956493020057678, + "num_tokens": 99107916.0, + "step": 32220 + }, + { + "entropy": 0.06945790387690068, + "epoch": 7.511714652057349, + "grad_norm": 3.578125, + "learning_rate": 4.798406097614627e-05, + "loss": 0.0228, + "mean_token_accuracy": 0.9951068162918091, + "num_tokens": 99117208.0, + "step": 32225 + }, + { + "entropy": 0.05028128759004176, + "epoch": 7.512880289077981, + "grad_norm": 0.3515625, + "learning_rate": 4.7983229684067275e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9985537052154541, + "num_tokens": 99151148.0, + "step": 32230 + }, + { + "entropy": 0.06575621776282788, + "epoch": 7.514045926098613, + "grad_norm": 0.83984375, + "learning_rate": 4.7982398235667556e-05, + "loss": 0.0162, + "mean_token_accuracy": 0.994884067773819, + "num_tokens": 99170846.0, + "step": 32235 + }, + { + "entropy": 0.06433080639690161, + "epoch": 7.515211563119244, + "grad_norm": 0.63671875, + "learning_rate": 4.798156663095952e-05, + "loss": 0.0168, + "mean_token_accuracy": 0.9959873199462891, + "num_tokens": 99184422.0, + "step": 32240 + }, + { + "entropy": 0.06489841155707836, + "epoch": 7.5163772001398765, + "grad_norm": 0.8984375, + "learning_rate": 4.7980734869955555e-05, + "loss": 0.0097, + "mean_token_accuracy": 0.9961045622825623, + "num_tokens": 99198234.0, + "step": 32245 + }, + { + "entropy": 0.060334295779466626, + "epoch": 7.517542837160509, + "grad_norm": 2.140625, + "learning_rate": 4.797990295266807e-05, + "loss": 0.0151, + "mean_token_accuracy": 0.9957007765769958, + "num_tokens": 99220548.0, + "step": 32250 + }, + { + "entropy": 0.045729524735361335, + "epoch": 7.51870847418114, + "grad_norm": 0.220703125, + "learning_rate": 4.797907087910947e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9980747818946838, + "num_tokens": 99249556.0, + "step": 32255 + }, + { + "entropy": 0.06361297219991684, + "epoch": 7.519874111201772, + "grad_norm": 0.96875, + "learning_rate": 4.797823864929216e-05, + "loss": 0.0084, + "mean_token_accuracy": 0.9971418261528016, + "num_tokens": 99260254.0, + "step": 32260 + }, + { + "entropy": 0.06000940101221204, + "epoch": 7.521039748222403, + "grad_norm": 0.73046875, + "learning_rate": 4.7977406263228555e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9969924569129944, + "num_tokens": 99282455.0, + "step": 32265 + }, + { + "entropy": 0.06560409255325794, + "epoch": 7.522205385243035, + "grad_norm": 2.421875, + "learning_rate": 4.797657372093106e-05, + "loss": 0.0213, + "mean_token_accuracy": 0.9930356025695801, + "num_tokens": 99300204.0, + "step": 32270 + }, + { + "entropy": 0.07295639859512448, + "epoch": 7.523371022263667, + "grad_norm": 0.330078125, + "learning_rate": 4.797574102241209e-05, + "loss": 0.0071, + "mean_token_accuracy": 0.9984568536281586, + "num_tokens": 99317838.0, + "step": 32275 + }, + { + "entropy": 0.07099381582811475, + "epoch": 7.524536659284299, + "grad_norm": 3.53125, + "learning_rate": 4.797490816768407e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9947207570075989, + "num_tokens": 99348614.0, + "step": 32280 + }, + { + "entropy": 0.0586695110425353, + "epoch": 7.525702296304931, + "grad_norm": 3.53125, + "learning_rate": 4.7974075156759415e-05, + "loss": 0.0112, + "mean_token_accuracy": 0.9969553589820862, + "num_tokens": 99363903.0, + "step": 32285 + }, + { + "entropy": 0.06191801391541958, + "epoch": 7.526867933325562, + "grad_norm": 0.337890625, + "learning_rate": 4.797324198965055e-05, + "loss": 0.0088, + "mean_token_accuracy": 0.9979340076446533, + "num_tokens": 99392391.0, + "step": 32290 + }, + { + "entropy": 0.07315607350319624, + "epoch": 7.528033570346194, + "grad_norm": 2.28125, + "learning_rate": 4.797240866636988e-05, + "loss": 0.0276, + "mean_token_accuracy": 0.990977531671524, + "num_tokens": 99403416.0, + "step": 32295 + }, + { + "entropy": 0.06841037645936013, + "epoch": 7.529199207366826, + "grad_norm": 2.3125, + "learning_rate": 4.797157518692986e-05, + "loss": 0.0204, + "mean_token_accuracy": 0.9952999293804169, + "num_tokens": 99414810.0, + "step": 32300 + }, + { + "entropy": 0.05617341762408614, + "epoch": 7.530364844387458, + "grad_norm": 1.4375, + "learning_rate": 4.79707415513429e-05, + "loss": 0.0095, + "mean_token_accuracy": 0.9951882123947143, + "num_tokens": 99442828.0, + "step": 32305 + }, + { + "entropy": 0.06965567506849765, + "epoch": 7.5315304814080895, + "grad_norm": 1.109375, + "learning_rate": 4.796990775962143e-05, + "loss": 0.023, + "mean_token_accuracy": 0.9933463513851166, + "num_tokens": 99451675.0, + "step": 32310 + }, + { + "entropy": 0.07169121066108346, + "epoch": 7.5326961184287216, + "grad_norm": 0.31640625, + "learning_rate": 4.7969073811777885e-05, + "loss": 0.0132, + "mean_token_accuracy": 0.9940380156040192, + "num_tokens": 99477402.0, + "step": 32315 + }, + { + "entropy": 0.05028648842126131, + "epoch": 7.533861755449353, + "grad_norm": 0.34375, + "learning_rate": 4.7968239707824715e-05, + "loss": 0.0135, + "mean_token_accuracy": 0.996212112903595, + "num_tokens": 99498726.0, + "step": 32320 + }, + { + "entropy": 0.09644905207678675, + "epoch": 7.535027392469985, + "grad_norm": 2.15625, + "learning_rate": 4.796740544777433e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.9962563216686249, + "num_tokens": 99516871.0, + "step": 32325 + }, + { + "entropy": 0.06047167964279652, + "epoch": 7.536193029490617, + "grad_norm": 0.6328125, + "learning_rate": 4.7966571031639205e-05, + "loss": 0.0082, + "mean_token_accuracy": 0.9959788024425507, + "num_tokens": 99547359.0, + "step": 32330 + }, + { + "entropy": 0.07342666406184435, + "epoch": 7.537358666511248, + "grad_norm": 0.54296875, + "learning_rate": 4.796573645943175e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9982951283454895, + "num_tokens": 99567827.0, + "step": 32335 + }, + { + "entropy": 0.07892006533220411, + "epoch": 7.53852430353188, + "grad_norm": 1.3515625, + "learning_rate": 4.7964901731164425e-05, + "loss": 0.005, + "mean_token_accuracy": 0.9984602570533753, + "num_tokens": 99590514.0, + "step": 32340 + }, + { + "entropy": 0.05502187833189964, + "epoch": 7.5396899405525115, + "grad_norm": 0.84765625, + "learning_rate": 4.7964066846849674e-05, + "loss": 0.0184, + "mean_token_accuracy": 0.995114940404892, + "num_tokens": 99607407.0, + "step": 32345 + }, + { + "entropy": 0.061845211498439315, + "epoch": 7.540855577573144, + "grad_norm": 0.671875, + "learning_rate": 4.796323180649995e-05, + "loss": 0.0053, + "mean_token_accuracy": 0.9959377884864807, + "num_tokens": 99629271.0, + "step": 32350 + }, + { + "entropy": 0.06737384842708707, + "epoch": 7.542021214593776, + "grad_norm": 0.373046875, + "learning_rate": 4.79623966101277e-05, + "loss": 0.0075, + "mean_token_accuracy": 0.9972203969955444, + "num_tokens": 99649342.0, + "step": 32355 + }, + { + "entropy": 0.06474924013018608, + "epoch": 7.543186851614407, + "grad_norm": 0.7734375, + "learning_rate": 4.796156125774538e-05, + "loss": 0.0147, + "mean_token_accuracy": 0.9958956420421601, + "num_tokens": 99664635.0, + "step": 32360 + }, + { + "entropy": 0.05531628727912903, + "epoch": 7.544352488635039, + "grad_norm": 0.5078125, + "learning_rate": 4.796072574936545e-05, + "loss": 0.0131, + "mean_token_accuracy": 0.994882071018219, + "num_tokens": 99692508.0, + "step": 32365 + }, + { + "entropy": 0.07443120554089547, + "epoch": 7.545518125655671, + "grad_norm": 2.265625, + "learning_rate": 4.795989008500037e-05, + "loss": 0.0348, + "mean_token_accuracy": 0.9899700045585632, + "num_tokens": 99714721.0, + "step": 32370 + }, + { + "entropy": 0.07098412998020649, + "epoch": 7.546683762676302, + "grad_norm": 4.90625, + "learning_rate": 4.795905426466259e-05, + "loss": 0.0211, + "mean_token_accuracy": 0.9950291216373444, + "num_tokens": 99732835.0, + "step": 32375 + }, + { + "entropy": 0.050486131198704244, + "epoch": 7.5478493996969345, + "grad_norm": 0.5859375, + "learning_rate": 4.7958218288364574e-05, + "loss": 0.0097, + "mean_token_accuracy": 0.9978505671024323, + "num_tokens": 99756728.0, + "step": 32380 + }, + { + "entropy": 0.06294616991654038, + "epoch": 7.549015036717567, + "grad_norm": 0.9609375, + "learning_rate": 4.7957382156118804e-05, + "loss": 0.0109, + "mean_token_accuracy": 0.9962819039821624, + "num_tokens": 99779085.0, + "step": 32385 + }, + { + "entropy": 0.049626897927373646, + "epoch": 7.550180673738198, + "grad_norm": 0.1328125, + "learning_rate": 4.795654586793773e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9979463160037995, + "num_tokens": 99816126.0, + "step": 32390 + }, + { + "entropy": 0.06566182002425194, + "epoch": 7.55134631075883, + "grad_norm": 2.703125, + "learning_rate": 4.795570942383383e-05, + "loss": 0.022, + "mean_token_accuracy": 0.9944956481456757, + "num_tokens": 99828398.0, + "step": 32395 + }, + { + "entropy": 0.06577919237315655, + "epoch": 7.552511947779461, + "grad_norm": 1.2890625, + "learning_rate": 4.7954872823819576e-05, + "loss": 0.0107, + "mean_token_accuracy": 0.9972432315349579, + "num_tokens": 99838581.0, + "step": 32400 + }, + { + "entropy": 0.18690302036702633, + "epoch": 7.553677584800093, + "grad_norm": 5.34375, + "learning_rate": 4.795403606790745e-05, + "loss": 0.2838, + "mean_token_accuracy": 0.9480214655399323, + "num_tokens": 99856832.0, + "step": 32405 + }, + { + "entropy": 0.08401225432753563, + "epoch": 7.554843221820725, + "grad_norm": 0.78125, + "learning_rate": 4.795319915610991e-05, + "loss": 0.0166, + "mean_token_accuracy": 0.9959981799125671, + "num_tokens": 99881690.0, + "step": 32410 + }, + { + "entropy": 0.08292626310139894, + "epoch": 7.5560088588413565, + "grad_norm": 0.89453125, + "learning_rate": 4.7952362088439463e-05, + "loss": 0.0102, + "mean_token_accuracy": 0.9968954145908355, + "num_tokens": 99901095.0, + "step": 32415 + }, + { + "entropy": 0.07522573880851269, + "epoch": 7.557174495861989, + "grad_norm": 3.515625, + "learning_rate": 4.7951524864908563e-05, + "loss": 0.0231, + "mean_token_accuracy": 0.9964980185031891, + "num_tokens": 99910958.0, + "step": 32420 + }, + { + "entropy": 0.07450153809040785, + "epoch": 7.55834013288262, + "grad_norm": 1.1796875, + "learning_rate": 4.795068748552971e-05, + "loss": 0.0196, + "mean_token_accuracy": 0.9937357306480408, + "num_tokens": 99922733.0, + "step": 32425 + }, + { + "entropy": 0.05930988015606999, + "epoch": 7.559505769903252, + "grad_norm": 0.33984375, + "learning_rate": 4.7949849950315395e-05, + "loss": 0.0081, + "mean_token_accuracy": 0.9973505437374115, + "num_tokens": 99956843.0, + "step": 32430 + }, + { + "entropy": 0.06503955796360969, + "epoch": 7.560671406923884, + "grad_norm": 0.84765625, + "learning_rate": 4.79490122592781e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9955211043357849, + "num_tokens": 99980564.0, + "step": 32435 + }, + { + "entropy": 0.06906016170978546, + "epoch": 7.561837043944516, + "grad_norm": 0.4296875, + "learning_rate": 4.794817441243031e-05, + "loss": 0.0108, + "mean_token_accuracy": 0.9956372857093811, + "num_tokens": 100001928.0, + "step": 32440 + }, + { + "entropy": 0.08328457288444042, + "epoch": 7.563002680965147, + "grad_norm": 2.1875, + "learning_rate": 4.7947336409784524e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9953269362449646, + "num_tokens": 100010956.0, + "step": 32445 + }, + { + "entropy": 0.05307028675451875, + "epoch": 7.5641683179857795, + "grad_norm": 1.6015625, + "learning_rate": 4.7946498251353246e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9949360966682435, + "num_tokens": 100035076.0, + "step": 32450 + }, + { + "entropy": 0.0713282000273466, + "epoch": 7.565333955006411, + "grad_norm": 1.8828125, + "learning_rate": 4.794565993714896e-05, + "loss": 0.0202, + "mean_token_accuracy": 0.9958377063274384, + "num_tokens": 100052207.0, + "step": 32455 + }, + { + "entropy": 0.06920521166175604, + "epoch": 7.566499592027043, + "grad_norm": 0.400390625, + "learning_rate": 4.794482146718417e-05, + "loss": 0.0217, + "mean_token_accuracy": 0.9941867649555206, + "num_tokens": 100078093.0, + "step": 32460 + }, + { + "entropy": 0.05242059859447181, + "epoch": 7.567665229047675, + "grad_norm": 2.984375, + "learning_rate": 4.794398284147139e-05, + "loss": 0.0095, + "mean_token_accuracy": 0.9971729397773743, + "num_tokens": 100106656.0, + "step": 32465 + }, + { + "entropy": 0.06839284533634782, + "epoch": 7.568830866068306, + "grad_norm": 1.7109375, + "learning_rate": 4.794314406002311e-05, + "loss": 0.0188, + "mean_token_accuracy": 0.9950945854187012, + "num_tokens": 100125560.0, + "step": 32470 + }, + { + "entropy": 0.09131706580519676, + "epoch": 7.569996503088938, + "grad_norm": 1.2578125, + "learning_rate": 4.7942305122851846e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.997002649307251, + "num_tokens": 100135364.0, + "step": 32475 + }, + { + "entropy": 0.05984076950699091, + "epoch": 7.5711621401095694, + "grad_norm": 0.310546875, + "learning_rate": 4.794146602997011e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9975899815559387, + "num_tokens": 100151513.0, + "step": 32480 + }, + { + "entropy": 0.08737356476485729, + "epoch": 7.5723277771302016, + "grad_norm": 1.625, + "learning_rate": 4.794062678139041e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9978051900863647, + "num_tokens": 100160893.0, + "step": 32485 + }, + { + "entropy": 0.07065956890583039, + "epoch": 7.573493414150834, + "grad_norm": 2.953125, + "learning_rate": 4.793978737712526e-05, + "loss": 0.0174, + "mean_token_accuracy": 0.9937400400638581, + "num_tokens": 100172491.0, + "step": 32490 + }, + { + "entropy": 0.05652084918692708, + "epoch": 7.574659051171465, + "grad_norm": 0.291015625, + "learning_rate": 4.793894781718718e-05, + "loss": 0.0177, + "mean_token_accuracy": 0.9928201317787171, + "num_tokens": 100195349.0, + "step": 32495 + }, + { + "entropy": 0.07366820741444827, + "epoch": 7.575824688192097, + "grad_norm": 0.61328125, + "learning_rate": 4.793810810158868e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9973739683628082, + "num_tokens": 100217402.0, + "step": 32500 + }, + { + "entropy": 0.048085775785148145, + "epoch": 7.576990325212729, + "grad_norm": 1.5234375, + "learning_rate": 4.79372682303423e-05, + "loss": 0.0116, + "mean_token_accuracy": 0.9953145444393158, + "num_tokens": 100248272.0, + "step": 32505 + }, + { + "entropy": 0.05929643930867314, + "epoch": 7.57815596223336, + "grad_norm": 0.302734375, + "learning_rate": 4.793642820346055e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9968596279621125, + "num_tokens": 100268834.0, + "step": 32510 + }, + { + "entropy": 0.05762045104056597, + "epoch": 7.579321599253992, + "grad_norm": 0.515625, + "learning_rate": 4.793558802095595e-05, + "loss": 0.009, + "mean_token_accuracy": 0.9958757042884827, + "num_tokens": 100292134.0, + "step": 32515 + }, + { + "entropy": 0.0705728504806757, + "epoch": 7.5804872362746245, + "grad_norm": 0.28515625, + "learning_rate": 4.793474768284104e-05, + "loss": 0.0187, + "mean_token_accuracy": 0.9952540934085846, + "num_tokens": 100302583.0, + "step": 32520 + }, + { + "entropy": 0.0648424707353115, + "epoch": 7.581652873295256, + "grad_norm": 1.0234375, + "learning_rate": 4.7933907189128353e-05, + "loss": 0.0076, + "mean_token_accuracy": 0.9974114298820496, + "num_tokens": 100318585.0, + "step": 32525 + }, + { + "entropy": 0.07678204700350762, + "epoch": 7.582818510315888, + "grad_norm": 1.328125, + "learning_rate": 4.7933066539830405e-05, + "loss": 0.0094, + "mean_token_accuracy": 0.9962783277034759, + "num_tokens": 100330717.0, + "step": 32530 + }, + { + "entropy": 0.05130081316456199, + "epoch": 7.583984147336519, + "grad_norm": 0.375, + "learning_rate": 4.793222573495975e-05, + "loss": 0.0082, + "mean_token_accuracy": 0.9958062887191772, + "num_tokens": 100363015.0, + "step": 32535 + }, + { + "entropy": 0.06532498020678759, + "epoch": 7.585149784357151, + "grad_norm": 0.9765625, + "learning_rate": 4.793138477452892e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9959288775920868, + "num_tokens": 100385936.0, + "step": 32540 + }, + { + "entropy": 0.04017478302121162, + "epoch": 7.586315421377783, + "grad_norm": 0.625, + "learning_rate": 4.793054365855045e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9981935679912567, + "num_tokens": 100422413.0, + "step": 32545 + }, + { + "entropy": 0.051121273264288905, + "epoch": 7.5874810583984145, + "grad_norm": 1.4140625, + "learning_rate": 4.792970238703689e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9979244947433472, + "num_tokens": 100452225.0, + "step": 32550 + }, + { + "entropy": 0.07465707622468472, + "epoch": 7.588646695419047, + "grad_norm": 0.416015625, + "learning_rate": 4.7928860960000774e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9957206606864929, + "num_tokens": 100464424.0, + "step": 32555 + }, + { + "entropy": 0.0724950728006661, + "epoch": 7.589812332439678, + "grad_norm": 2.203125, + "learning_rate": 4.792801937745466e-05, + "loss": 0.012, + "mean_token_accuracy": 0.9970667362213135, + "num_tokens": 100485939.0, + "step": 32560 + }, + { + "entropy": 0.05878008343279362, + "epoch": 7.59097796946031, + "grad_norm": 5.03125, + "learning_rate": 4.792717763941109e-05, + "loss": 0.0186, + "mean_token_accuracy": 0.9959011018276215, + "num_tokens": 100502937.0, + "step": 32565 + }, + { + "entropy": 0.07744065094739198, + "epoch": 7.592143606480942, + "grad_norm": 1.625, + "learning_rate": 4.7926335745882615e-05, + "loss": 0.0157, + "mean_token_accuracy": 0.9964656233787537, + "num_tokens": 100521009.0, + "step": 32570 + }, + { + "entropy": 0.04522500513121486, + "epoch": 7.593309243501574, + "grad_norm": 0.56640625, + "learning_rate": 4.7925493696881797e-05, + "loss": 0.0072, + "mean_token_accuracy": 0.998151433467865, + "num_tokens": 100562959.0, + "step": 32575 + }, + { + "entropy": 0.05111492648720741, + "epoch": 7.594474880522205, + "grad_norm": 0.609375, + "learning_rate": 4.7924651492421186e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9987885475158691, + "num_tokens": 100585745.0, + "step": 32580 + }, + { + "entropy": 0.06261113192886114, + "epoch": 7.595640517542837, + "grad_norm": 0.248046875, + "learning_rate": 4.792380913251334e-05, + "loss": 0.0114, + "mean_token_accuracy": 0.9969954133033753, + "num_tokens": 100605109.0, + "step": 32585 + }, + { + "entropy": 0.07228537742048502, + "epoch": 7.596806154563469, + "grad_norm": 1.640625, + "learning_rate": 4.792296661717082e-05, + "loss": 0.0086, + "mean_token_accuracy": 0.9956452965736389, + "num_tokens": 100625700.0, + "step": 32590 + }, + { + "entropy": 0.060614088317379355, + "epoch": 7.597971791584101, + "grad_norm": 2.703125, + "learning_rate": 4.792212394640619e-05, + "loss": 0.0185, + "mean_token_accuracy": 0.994023448228836, + "num_tokens": 100645065.0, + "step": 32595 + }, + { + "entropy": 0.05851653721183538, + "epoch": 7.599137428604733, + "grad_norm": 0.78515625, + "learning_rate": 4.792128112023203e-05, + "loss": 0.0117, + "mean_token_accuracy": 0.997266286611557, + "num_tokens": 100663742.0, + "step": 32600 + }, + { + "entropy": 0.054263474978506566, + "epoch": 7.600303065625364, + "grad_norm": 0.359375, + "learning_rate": 4.7920438138660886e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.9973829567432404, + "num_tokens": 100697972.0, + "step": 32605 + }, + { + "entropy": 0.04637954393401742, + "epoch": 7.601468702645996, + "grad_norm": 1.140625, + "learning_rate": 4.791959500170533e-05, + "loss": 0.0073, + "mean_token_accuracy": 0.9971019864082337, + "num_tokens": 100734309.0, + "step": 32610 + }, + { + "entropy": 0.08320282809436322, + "epoch": 7.602634339666627, + "grad_norm": 1.2109375, + "learning_rate": 4.791875170937794e-05, + "loss": 0.0132, + "mean_token_accuracy": 0.9958483457565308, + "num_tokens": 100744170.0, + "step": 32615 + }, + { + "entropy": 0.06456623747944831, + "epoch": 7.6037999766872595, + "grad_norm": 2.46875, + "learning_rate": 4.7917908261691296e-05, + "loss": 0.0147, + "mean_token_accuracy": 0.9935942471027375, + "num_tokens": 100765054.0, + "step": 32620 + }, + { + "entropy": 0.06546109467744828, + "epoch": 7.604965613707892, + "grad_norm": 0.45703125, + "learning_rate": 4.7917064658657974e-05, + "loss": 0.0055, + "mean_token_accuracy": 0.9976051330566407, + "num_tokens": 100780513.0, + "step": 32625 + }, + { + "entropy": 0.05621226644143462, + "epoch": 7.606131250728523, + "grad_norm": 0.515625, + "learning_rate": 4.7916220900290545e-05, + "loss": 0.011, + "mean_token_accuracy": 0.9961635172367096, + "num_tokens": 100798992.0, + "step": 32630 + }, + { + "entropy": 0.07289885049685836, + "epoch": 7.607296887749155, + "grad_norm": 1.578125, + "learning_rate": 4.7915376986601595e-05, + "loss": 0.0132, + "mean_token_accuracy": 0.9941843330860138, + "num_tokens": 100821013.0, + "step": 32635 + }, + { + "entropy": 0.0671339999884367, + "epoch": 7.608462524769787, + "grad_norm": 2.71875, + "learning_rate": 4.791453291760371e-05, + "loss": 0.024, + "mean_token_accuracy": 0.9942957162857056, + "num_tokens": 100831114.0, + "step": 32640 + }, + { + "entropy": 0.04521031361073256, + "epoch": 7.609628161790418, + "grad_norm": 1.609375, + "learning_rate": 4.791368869330948e-05, + "loss": 0.0099, + "mean_token_accuracy": 0.9969500958919525, + "num_tokens": 100865685.0, + "step": 32645 + }, + { + "entropy": 0.06600057780742645, + "epoch": 7.61079379881105, + "grad_norm": 0.828125, + "learning_rate": 4.791284431373148e-05, + "loss": 0.0078, + "mean_token_accuracy": 0.9975499629974365, + "num_tokens": 100887627.0, + "step": 32650 + }, + { + "entropy": 0.06534403078258037, + "epoch": 7.611959435831682, + "grad_norm": 3.0, + "learning_rate": 4.791199977888231e-05, + "loss": 0.017, + "mean_token_accuracy": 0.9944500923156738, + "num_tokens": 100900901.0, + "step": 32655 + }, + { + "entropy": 0.07990739308297634, + "epoch": 7.613125072852314, + "grad_norm": 1.7890625, + "learning_rate": 4.791115508877457e-05, + "loss": 0.0136, + "mean_token_accuracy": 0.9956984043121337, + "num_tokens": 100911816.0, + "step": 32660 + }, + { + "entropy": 0.05803863704204559, + "epoch": 7.614290709872946, + "grad_norm": 0.443359375, + "learning_rate": 4.7910310243420845e-05, + "loss": 0.0225, + "mean_token_accuracy": 0.9933572173118591, + "num_tokens": 100943353.0, + "step": 32665 + }, + { + "entropy": 0.05729225566610694, + "epoch": 7.615456346893577, + "grad_norm": 0.419921875, + "learning_rate": 4.790946524283373e-05, + "loss": 0.0092, + "mean_token_accuracy": 0.9972775459289551, + "num_tokens": 100969851.0, + "step": 32670 + }, + { + "entropy": 0.05942230150103569, + "epoch": 7.616621983914209, + "grad_norm": 1.265625, + "learning_rate": 4.790862008702584e-05, + "loss": 0.0166, + "mean_token_accuracy": 0.9958006918430329, + "num_tokens": 100985383.0, + "step": 32675 + }, + { + "entropy": 0.05730917723849416, + "epoch": 7.617787620934841, + "grad_norm": 1.9765625, + "learning_rate": 4.790777477600976e-05, + "loss": 0.0119, + "mean_token_accuracy": 0.9960092306137085, + "num_tokens": 101016757.0, + "step": 32680 + }, + { + "entropy": 0.0662191977724433, + "epoch": 7.618953257955472, + "grad_norm": 0.8125, + "learning_rate": 4.790692930979811e-05, + "loss": 0.0171, + "mean_token_accuracy": 0.993461686372757, + "num_tokens": 101029341.0, + "step": 32685 + }, + { + "entropy": 0.05978534407913685, + "epoch": 7.6201188949761045, + "grad_norm": 3.21875, + "learning_rate": 4.790608368840349e-05, + "loss": 0.0207, + "mean_token_accuracy": 0.9946033835411072, + "num_tokens": 101042189.0, + "step": 32690 + }, + { + "entropy": 0.08330644629895687, + "epoch": 7.621284531996736, + "grad_norm": 1.0703125, + "learning_rate": 4.790523791183852e-05, + "loss": 0.0099, + "mean_token_accuracy": 0.9977569222450257, + "num_tokens": 101053521.0, + "step": 32695 + }, + { + "entropy": 0.07074883468449115, + "epoch": 7.622450169017368, + "grad_norm": 2.953125, + "learning_rate": 4.7904391980115786e-05, + "loss": 0.0182, + "mean_token_accuracy": 0.9951606273651123, + "num_tokens": 101068673.0, + "step": 32700 + }, + { + "entropy": 0.06495154052972793, + "epoch": 7.623615806038, + "grad_norm": 2.046875, + "learning_rate": 4.790354589324793e-05, + "loss": 0.0102, + "mean_token_accuracy": 0.995569896697998, + "num_tokens": 101091515.0, + "step": 32705 + }, + { + "entropy": 0.06703095585107803, + "epoch": 7.624781443058632, + "grad_norm": 2.03125, + "learning_rate": 4.790269965124755e-05, + "loss": 0.0184, + "mean_token_accuracy": 0.9955197989940643, + "num_tokens": 101111464.0, + "step": 32710 + }, + { + "entropy": 0.11674296110868454, + "epoch": 7.625947080079263, + "grad_norm": 6.34375, + "learning_rate": 4.790185325412728e-05, + "loss": 0.0926, + "mean_token_accuracy": 0.987501859664917, + "num_tokens": 101134805.0, + "step": 32715 + }, + { + "entropy": 0.09656250309199095, + "epoch": 7.627112717099895, + "grad_norm": 2.25, + "learning_rate": 4.7901006701899724e-05, + "loss": 0.0162, + "mean_token_accuracy": 0.9959886252880097, + "num_tokens": 101155209.0, + "step": 32720 + }, + { + "entropy": 0.06006800103932619, + "epoch": 7.628278354120527, + "grad_norm": 2.796875, + "learning_rate": 4.790015999457752e-05, + "loss": 0.0085, + "mean_token_accuracy": 0.9979590594768524, + "num_tokens": 101173279.0, + "step": 32725 + }, + { + "entropy": 0.0692501813173294, + "epoch": 7.629443991141159, + "grad_norm": 0.5859375, + "learning_rate": 4.7899313132173284e-05, + "loss": 0.0085, + "mean_token_accuracy": 0.9972864508628845, + "num_tokens": 101199392.0, + "step": 32730 + }, + { + "entropy": 0.23568055331707, + "epoch": 7.630609628161791, + "grad_norm": 2.171875, + "learning_rate": 4.789846611469965e-05, + "loss": 0.2858, + "mean_token_accuracy": 0.9683476388454437, + "num_tokens": 101219041.0, + "step": 32735 + }, + { + "entropy": 0.07892981367185711, + "epoch": 7.631775265182422, + "grad_norm": 0.953125, + "learning_rate": 4.7897618942169245e-05, + "loss": 0.0115, + "mean_token_accuracy": 0.9962479889392852, + "num_tokens": 101240090.0, + "step": 32740 + }, + { + "entropy": 0.0884470010176301, + "epoch": 7.632940902203054, + "grad_norm": 2.90625, + "learning_rate": 4.7896771614594705e-05, + "loss": 0.0073, + "mean_token_accuracy": 0.9975970149040222, + "num_tokens": 101255685.0, + "step": 32745 + }, + { + "entropy": 0.052130958810448644, + "epoch": 7.634106539223685, + "grad_norm": 0.32421875, + "learning_rate": 4.7895924131988654e-05, + "loss": 0.0062, + "mean_token_accuracy": 0.997983181476593, + "num_tokens": 101292950.0, + "step": 32750 + }, + { + "entropy": 0.0583219145424664, + "epoch": 7.635272176244317, + "grad_norm": 1.125, + "learning_rate": 4.789507649436374e-05, + "loss": 0.0072, + "mean_token_accuracy": 0.9973702132701874, + "num_tokens": 101318481.0, + "step": 32755 + }, + { + "entropy": 0.06547697465866804, + "epoch": 7.6364378132649495, + "grad_norm": 1.953125, + "learning_rate": 4.7894228701732613e-05, + "loss": 0.0129, + "mean_token_accuracy": 0.9954418540000916, + "num_tokens": 101339941.0, + "step": 32760 + }, + { + "entropy": 0.11741905976086855, + "epoch": 7.637603450285581, + "grad_norm": 0.93359375, + "learning_rate": 4.789338075410789e-05, + "loss": 0.0696, + "mean_token_accuracy": 0.9896807491779327, + "num_tokens": 101366743.0, + "step": 32765 + }, + { + "entropy": 0.06383249973878265, + "epoch": 7.638769087306213, + "grad_norm": 3.375, + "learning_rate": 4.789253265150223e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9968828380107879, + "num_tokens": 101385178.0, + "step": 32770 + }, + { + "entropy": 0.06623956738039852, + "epoch": 7.639934724326845, + "grad_norm": 1.2109375, + "learning_rate": 4.789168439392828e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.9964322030544281, + "num_tokens": 101400186.0, + "step": 32775 + }, + { + "entropy": 0.0739117719233036, + "epoch": 7.641100361347476, + "grad_norm": 0.71875, + "learning_rate": 4.7890835981398686e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.9961394846439362, + "num_tokens": 101414902.0, + "step": 32780 + }, + { + "entropy": 0.05914564449340105, + "epoch": 7.642265998368108, + "grad_norm": 0.46875, + "learning_rate": 4.78899874139261e-05, + "loss": 0.0148, + "mean_token_accuracy": 0.9944905817508698, + "num_tokens": 101434462.0, + "step": 32785 + }, + { + "entropy": 0.06933594457805156, + "epoch": 7.64343163538874, + "grad_norm": 0.9453125, + "learning_rate": 4.7889138691523166e-05, + "loss": 0.009, + "mean_token_accuracy": 0.9975636959075928, + "num_tokens": 101446358.0, + "step": 32790 + }, + { + "entropy": 0.08477734699845314, + "epoch": 7.644597272409372, + "grad_norm": 1.0234375, + "learning_rate": 4.788828981420255e-05, + "loss": 0.0117, + "mean_token_accuracy": 0.9955092072486877, + "num_tokens": 101456184.0, + "step": 32795 + }, + { + "entropy": 0.058824803587049244, + "epoch": 7.645762909430004, + "grad_norm": 2.234375, + "learning_rate": 4.7887440781976915e-05, + "loss": 0.0124, + "mean_token_accuracy": 0.9953935146331787, + "num_tokens": 101488985.0, + "step": 32800 + }, + { + "entropy": 0.05272114276885986, + "epoch": 7.646928546450635, + "grad_norm": 1.3828125, + "learning_rate": 4.788659159485891e-05, + "loss": 0.0138, + "mean_token_accuracy": 0.9938934504985809, + "num_tokens": 101507299.0, + "step": 32805 + }, + { + "entropy": 0.0796303316950798, + "epoch": 7.648094183471267, + "grad_norm": 3.28125, + "learning_rate": 4.7885742252861205e-05, + "loss": 0.0185, + "mean_token_accuracy": 0.9956585705280304, + "num_tokens": 101518078.0, + "step": 32810 + }, + { + "entropy": 0.07179453428834677, + "epoch": 7.649259820491899, + "grad_norm": 3.34375, + "learning_rate": 4.788489275599646e-05, + "loss": 0.0091, + "mean_token_accuracy": 0.9973179042339325, + "num_tokens": 101540068.0, + "step": 32815 + }, + { + "entropy": 0.05889113489538431, + "epoch": 7.65042545751253, + "grad_norm": 3.328125, + "learning_rate": 4.788404310427734e-05, + "loss": 0.0128, + "mean_token_accuracy": 0.9952363848686219, + "num_tokens": 101560366.0, + "step": 32820 + }, + { + "entropy": 0.06677535912021995, + "epoch": 7.651591094533162, + "grad_norm": 0.37109375, + "learning_rate": 4.788319329771652e-05, + "loss": 0.006, + "mean_token_accuracy": 0.9987049341201782, + "num_tokens": 101578158.0, + "step": 32825 + }, + { + "entropy": 0.06673195157200099, + "epoch": 7.652756731553794, + "grad_norm": 0.6015625, + "learning_rate": 4.7882343336326675e-05, + "loss": 0.0122, + "mean_token_accuracy": 0.9957050621509552, + "num_tokens": 101590083.0, + "step": 32830 + }, + { + "entropy": 0.07282912842929364, + "epoch": 7.653922368574426, + "grad_norm": 1.3203125, + "learning_rate": 4.788149322012048e-05, + "loss": 0.0196, + "mean_token_accuracy": 0.994915121793747, + "num_tokens": 101610498.0, + "step": 32835 + }, + { + "entropy": 0.05825249031186104, + "epoch": 7.655088005595058, + "grad_norm": 2.921875, + "learning_rate": 4.7880642949110594e-05, + "loss": 0.0069, + "mean_token_accuracy": 0.9975422143936157, + "num_tokens": 101631178.0, + "step": 32840 + }, + { + "entropy": 0.083549792971462, + "epoch": 7.65625364261569, + "grad_norm": 0.365234375, + "learning_rate": 4.7879792523309715e-05, + "loss": 0.0113, + "mean_token_accuracy": 0.9959293603897095, + "num_tokens": 101647880.0, + "step": 32845 + }, + { + "entropy": 0.07544692466035485, + "epoch": 7.657419279636321, + "grad_norm": 0.359375, + "learning_rate": 4.787894194273052e-05, + "loss": 0.0189, + "mean_token_accuracy": 0.9943213880062103, + "num_tokens": 101666788.0, + "step": 32850 + }, + { + "entropy": 0.06447355775162578, + "epoch": 7.658584916656953, + "grad_norm": 1.0, + "learning_rate": 4.787809120738568e-05, + "loss": 0.0074, + "mean_token_accuracy": 0.9963658154010773, + "num_tokens": 101694306.0, + "step": 32855 + }, + { + "entropy": 0.060479212738573554, + "epoch": 7.6597505536775845, + "grad_norm": 1.6171875, + "learning_rate": 4.7877240317287896e-05, + "loss": 0.0113, + "mean_token_accuracy": 0.996816223859787, + "num_tokens": 101716460.0, + "step": 32860 + }, + { + "entropy": 0.0611457291059196, + "epoch": 7.660916190698217, + "grad_norm": 1.671875, + "learning_rate": 4.787638927244985e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.9953365206718445, + "num_tokens": 101740617.0, + "step": 32865 + }, + { + "entropy": 0.062300180457532404, + "epoch": 7.662081827718849, + "grad_norm": 2.515625, + "learning_rate": 4.7875538072884234e-05, + "loss": 0.0129, + "mean_token_accuracy": 0.9958526492118835, + "num_tokens": 101762005.0, + "step": 32870 + }, + { + "entropy": 0.06913834474980832, + "epoch": 7.66324746473948, + "grad_norm": 0.314453125, + "learning_rate": 4.787468671860374e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.995975774526596, + "num_tokens": 101783436.0, + "step": 32875 + }, + { + "entropy": 0.04262585397809744, + "epoch": 7.664413101760112, + "grad_norm": 1.6640625, + "learning_rate": 4.787383520962106e-05, + "loss": 0.0111, + "mean_token_accuracy": 0.9966529786586762, + "num_tokens": 101803270.0, + "step": 32880 + }, + { + "entropy": 0.05681615024805069, + "epoch": 7.665578738780743, + "grad_norm": 2.046875, + "learning_rate": 4.78729835459489e-05, + "loss": 0.0196, + "mean_token_accuracy": 0.9942091941833496, + "num_tokens": 101819136.0, + "step": 32885 + }, + { + "entropy": 0.06326032225042581, + "epoch": 7.666744375801375, + "grad_norm": 0.5390625, + "learning_rate": 4.787213172759995e-05, + "loss": 0.0168, + "mean_token_accuracy": 0.9958334505558014, + "num_tokens": 101833599.0, + "step": 32890 + }, + { + "entropy": 0.07607240015640855, + "epoch": 7.6679100128220075, + "grad_norm": 0.546875, + "learning_rate": 4.787127975458692e-05, + "loss": 0.0095, + "mean_token_accuracy": 0.9953685104846954, + "num_tokens": 101852948.0, + "step": 32895 + }, + { + "entropy": 0.06080687679350376, + "epoch": 7.669075649842639, + "grad_norm": 2.140625, + "learning_rate": 4.78704276269225e-05, + "loss": 0.0237, + "mean_token_accuracy": 0.9924868643283844, + "num_tokens": 101866462.0, + "step": 32900 + }, + { + "entropy": 0.05896556191146374, + "epoch": 7.670241286863271, + "grad_norm": 1.8671875, + "learning_rate": 4.786957534461941e-05, + "loss": 0.0118, + "mean_token_accuracy": 0.9962682902812958, + "num_tokens": 101878867.0, + "step": 32905 + }, + { + "entropy": 0.055679400265216825, + "epoch": 7.671406923883903, + "grad_norm": 1.0390625, + "learning_rate": 4.786872290769036e-05, + "loss": 0.0185, + "mean_token_accuracy": 0.9933323562145233, + "num_tokens": 101891123.0, + "step": 32910 + }, + { + "entropy": 0.044333659764379266, + "epoch": 7.672572560904534, + "grad_norm": 0.82421875, + "learning_rate": 4.786787031614804e-05, + "loss": 0.009, + "mean_token_accuracy": 0.996362054347992, + "num_tokens": 101920179.0, + "step": 32915 + }, + { + "entropy": 0.05670885499566793, + "epoch": 7.673738197925166, + "grad_norm": 0.77734375, + "learning_rate": 4.78670175700052e-05, + "loss": 0.0109, + "mean_token_accuracy": 0.9959345281124115, + "num_tokens": 101944715.0, + "step": 32920 + }, + { + "entropy": 0.07925061210989952, + "epoch": 7.674903834945798, + "grad_norm": 0.470703125, + "learning_rate": 4.7866164669274526e-05, + "loss": 0.0089, + "mean_token_accuracy": 0.9954618394374848, + "num_tokens": 101974097.0, + "step": 32925 + }, + { + "entropy": 0.05602764692157507, + "epoch": 7.6760694719664295, + "grad_norm": 0.80078125, + "learning_rate": 4.786531161396874e-05, + "loss": 0.0167, + "mean_token_accuracy": 0.9930595636367798, + "num_tokens": 101991808.0, + "step": 32930 + }, + { + "entropy": 0.06822612164542079, + "epoch": 7.677235108987062, + "grad_norm": 0.2294921875, + "learning_rate": 4.7864458404100575e-05, + "loss": 0.0188, + "mean_token_accuracy": 0.9941983044147491, + "num_tokens": 102010298.0, + "step": 32935 + }, + { + "entropy": 0.07209388744086027, + "epoch": 7.678400746007693, + "grad_norm": 2.296875, + "learning_rate": 4.786360503968275e-05, + "loss": 0.0276, + "mean_token_accuracy": 0.9932683050632477, + "num_tokens": 102024598.0, + "step": 32940 + }, + { + "entropy": 0.07343168566003441, + "epoch": 7.679566383028325, + "grad_norm": 1.6953125, + "learning_rate": 4.7862751520727976e-05, + "loss": 0.0225, + "mean_token_accuracy": 0.9909487366676331, + "num_tokens": 102042228.0, + "step": 32945 + }, + { + "entropy": 0.06603799071162939, + "epoch": 7.680732020048957, + "grad_norm": 1.2578125, + "learning_rate": 4.786189784724899e-05, + "loss": 0.0191, + "mean_token_accuracy": 0.9925129771232605, + "num_tokens": 102056086.0, + "step": 32950 + }, + { + "entropy": 0.04232637556269765, + "epoch": 7.681897657069588, + "grad_norm": 0.30078125, + "learning_rate": 4.7861044019258536e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9985785365104676, + "num_tokens": 102088710.0, + "step": 32955 + }, + { + "entropy": 0.046622142335399984, + "epoch": 7.68306329409022, + "grad_norm": 0.8515625, + "learning_rate": 4.786019003676931e-05, + "loss": 0.0112, + "mean_token_accuracy": 0.9971826553344727, + "num_tokens": 102117410.0, + "step": 32960 + }, + { + "entropy": 0.04436410292983055, + "epoch": 7.684228931110852, + "grad_norm": 0.35546875, + "learning_rate": 4.785933589979409e-05, + "loss": 0.0091, + "mean_token_accuracy": 0.9968259632587433, + "num_tokens": 102139165.0, + "step": 32965 + }, + { + "entropy": 0.05858845524489879, + "epoch": 7.685394568131484, + "grad_norm": 0.8671875, + "learning_rate": 4.785848160834558e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9990635514259338, + "num_tokens": 102149919.0, + "step": 32970 + }, + { + "entropy": 0.08302914574742318, + "epoch": 7.686560205152116, + "grad_norm": 1.6484375, + "learning_rate": 4.785762716243653e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9958298087120057, + "num_tokens": 102162836.0, + "step": 32975 + }, + { + "entropy": 0.061361842602491376, + "epoch": 7.687725842172748, + "grad_norm": 0.392578125, + "learning_rate": 4.7856772562079675e-05, + "loss": 0.0175, + "mean_token_accuracy": 0.9943647503852844, + "num_tokens": 102190595.0, + "step": 32980 + }, + { + "entropy": 0.06810544840991498, + "epoch": 7.688891479193379, + "grad_norm": 0.69140625, + "learning_rate": 4.785591780728777e-05, + "loss": 0.0174, + "mean_token_accuracy": 0.9955644488334656, + "num_tokens": 102200770.0, + "step": 32985 + }, + { + "entropy": 0.07041889689862728, + "epoch": 7.690057116214011, + "grad_norm": 1.75, + "learning_rate": 4.785506289807356e-05, + "loss": 0.0274, + "mean_token_accuracy": 0.9925129294395447, + "num_tokens": 102218653.0, + "step": 32990 + }, + { + "entropy": 0.04661326901987195, + "epoch": 7.691222753234642, + "grad_norm": 0.291015625, + "learning_rate": 4.785420783444978e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.9967496514320373, + "num_tokens": 102248406.0, + "step": 32995 + }, + { + "entropy": 0.06012960709631443, + "epoch": 7.6923883902552745, + "grad_norm": 3.8125, + "learning_rate": 4.785335261642918e-05, + "loss": 0.0259, + "mean_token_accuracy": 0.9954575836658478, + "num_tokens": 102264295.0, + "step": 33000 + }, + { + "entropy": 0.06518603842705488, + "epoch": 7.693554027275907, + "grad_norm": 1.53125, + "learning_rate": 4.785249724402453e-05, + "loss": 0.0142, + "mean_token_accuracy": 0.9950007736682892, + "num_tokens": 102283898.0, + "step": 33005 + }, + { + "entropy": 0.07277072016149759, + "epoch": 7.694719664296538, + "grad_norm": 0.265625, + "learning_rate": 4.7851641717248574e-05, + "loss": 0.0173, + "mean_token_accuracy": 0.9951655566692352, + "num_tokens": 102301866.0, + "step": 33010 + }, + { + "entropy": 0.04626034093089402, + "epoch": 7.69588530131717, + "grad_norm": 0.412109375, + "learning_rate": 4.785078603611407e-05, + "loss": 0.0067, + "mean_token_accuracy": 0.9977597951889038, + "num_tokens": 102326359.0, + "step": 33015 + }, + { + "entropy": 0.06575447116047144, + "epoch": 7.697050938337801, + "grad_norm": 3.203125, + "learning_rate": 4.784993020063377e-05, + "loss": 0.0205, + "mean_token_accuracy": 0.9942839860916137, + "num_tokens": 102344300.0, + "step": 33020 + }, + { + "entropy": 0.05121599268168211, + "epoch": 7.698216575358433, + "grad_norm": 0.1943359375, + "learning_rate": 4.784907421082046e-05, + "loss": 0.0076, + "mean_token_accuracy": 0.9972567915916443, + "num_tokens": 102369108.0, + "step": 33025 + }, + { + "entropy": 0.06364568127319217, + "epoch": 7.699382212379065, + "grad_norm": 0.423828125, + "learning_rate": 4.784821806668688e-05, + "loss": 0.0252, + "mean_token_accuracy": 0.9911703109741211, + "num_tokens": 102392748.0, + "step": 33030 + }, + { + "entropy": 0.08194961175322532, + "epoch": 7.700547849399697, + "grad_norm": 2.25, + "learning_rate": 4.7847361768245804e-05, + "loss": 0.0176, + "mean_token_accuracy": 0.9945807337760926, + "num_tokens": 102406579.0, + "step": 33035 + }, + { + "entropy": 0.05632104352116585, + "epoch": 7.701713486420329, + "grad_norm": 0.76171875, + "learning_rate": 4.784650531550999e-05, + "loss": 0.0122, + "mean_token_accuracy": 0.9953259468078614, + "num_tokens": 102423760.0, + "step": 33040 + }, + { + "entropy": 0.08072004318237305, + "epoch": 7.702879123440961, + "grad_norm": 0.345703125, + "learning_rate": 4.784564870849223e-05, + "loss": 0.0233, + "mean_token_accuracy": 0.9934710383415222, + "num_tokens": 102445159.0, + "step": 33045 + }, + { + "entropy": 0.0482025190256536, + "epoch": 7.704044760461592, + "grad_norm": 0.94140625, + "learning_rate": 4.7844791947205295e-05, + "loss": 0.0089, + "mean_token_accuracy": 0.9969426095485687, + "num_tokens": 102473665.0, + "step": 33050 + }, + { + "entropy": 0.04185782624408603, + "epoch": 7.705210397482224, + "grad_norm": 1.203125, + "learning_rate": 4.7843935031661936e-05, + "loss": 0.0099, + "mean_token_accuracy": 0.9968387722969055, + "num_tokens": 102502207.0, + "step": 33055 + }, + { + "entropy": 0.055229269759729506, + "epoch": 7.706376034502856, + "grad_norm": 0.240234375, + "learning_rate": 4.7843077961874955e-05, + "loss": 0.0173, + "mean_token_accuracy": 0.9945455610752105, + "num_tokens": 102522659.0, + "step": 33060 + }, + { + "entropy": 0.05512173883616924, + "epoch": 7.7075416715234875, + "grad_norm": 0.33203125, + "learning_rate": 4.7842220737857125e-05, + "loss": 0.0193, + "mean_token_accuracy": 0.9933101654052734, + "num_tokens": 102544870.0, + "step": 33065 + }, + { + "entropy": 0.05951900091022253, + "epoch": 7.70870730854412, + "grad_norm": 1.609375, + "learning_rate": 4.7841363359621225e-05, + "loss": 0.0197, + "mean_token_accuracy": 0.9944490492343903, + "num_tokens": 102556250.0, + "step": 33070 + }, + { + "entropy": 0.05257823131978512, + "epoch": 7.709872945564751, + "grad_norm": 0.37109375, + "learning_rate": 4.784050582718005e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.997491991519928, + "num_tokens": 102598786.0, + "step": 33075 + }, + { + "entropy": 0.0675898913294077, + "epoch": 7.711038582585383, + "grad_norm": 2.625, + "learning_rate": 4.783964814054638e-05, + "loss": 0.0138, + "mean_token_accuracy": 0.9958470046520234, + "num_tokens": 102625008.0, + "step": 33080 + }, + { + "entropy": 0.06182341612875462, + "epoch": 7.712204219606015, + "grad_norm": 0.51171875, + "learning_rate": 4.7838790299732996e-05, + "loss": 0.0105, + "mean_token_accuracy": 0.9968319714069367, + "num_tokens": 102638009.0, + "step": 33085 + }, + { + "entropy": 0.04808502923697233, + "epoch": 7.713369856626646, + "grad_norm": 4.6875, + "learning_rate": 4.78379323047527e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9957450985908508, + "num_tokens": 102660401.0, + "step": 33090 + }, + { + "entropy": 0.06274218112230301, + "epoch": 7.714535493647278, + "grad_norm": 2.0625, + "learning_rate": 4.783707415561829e-05, + "loss": 0.0291, + "mean_token_accuracy": 0.9942092418670654, + "num_tokens": 102683677.0, + "step": 33095 + }, + { + "entropy": 0.07062279777601362, + "epoch": 7.7157011306679095, + "grad_norm": 1.3359375, + "learning_rate": 4.783621585234255e-05, + "loss": 0.0153, + "mean_token_accuracy": 0.9945529520511627, + "num_tokens": 102704196.0, + "step": 33100 + }, + { + "entropy": 0.06789557654410601, + "epoch": 7.716866767688542, + "grad_norm": 0.6875, + "learning_rate": 4.7835357394938295e-05, + "loss": 0.0274, + "mean_token_accuracy": 0.9927381813526154, + "num_tokens": 102718236.0, + "step": 33105 + }, + { + "entropy": 0.0625371436122805, + "epoch": 7.718032404709174, + "grad_norm": 1.4296875, + "learning_rate": 4.7834498783418305e-05, + "loss": 0.0082, + "mean_token_accuracy": 0.9966192424297333, + "num_tokens": 102742988.0, + "step": 33110 + }, + { + "entropy": 0.07199394702911377, + "epoch": 7.719198041729806, + "grad_norm": 2.84375, + "learning_rate": 4.78336400177954e-05, + "loss": 0.0148, + "mean_token_accuracy": 0.9946137011051178, + "num_tokens": 102753814.0, + "step": 33115 + }, + { + "entropy": 0.0699772285297513, + "epoch": 7.720363678750437, + "grad_norm": 1.9375, + "learning_rate": 4.783278109808238e-05, + "loss": 0.0178, + "mean_token_accuracy": 0.9951143622398376, + "num_tokens": 102770221.0, + "step": 33120 + }, + { + "entropy": 0.09867776576429606, + "epoch": 7.721529315771069, + "grad_norm": 1.59375, + "learning_rate": 4.783192202429205e-05, + "loss": 0.0198, + "mean_token_accuracy": 0.994517570734024, + "num_tokens": 102782732.0, + "step": 33125 + }, + { + "entropy": 0.107102907076478, + "epoch": 7.7226949527917, + "grad_norm": 6.25, + "learning_rate": 4.783106279643722e-05, + "loss": 0.1153, + "mean_token_accuracy": 0.9768224596977234, + "num_tokens": 102803666.0, + "step": 33130 + }, + { + "entropy": 0.056758302915841344, + "epoch": 7.7238605898123325, + "grad_norm": 0.9765625, + "learning_rate": 4.783020341453071e-05, + "loss": 0.0275, + "mean_token_accuracy": 0.9949622571468353, + "num_tokens": 102825348.0, + "step": 33135 + }, + { + "entropy": 0.057881328649818896, + "epoch": 7.725026226832965, + "grad_norm": 0.4921875, + "learning_rate": 4.782934387858533e-05, + "loss": 0.012, + "mean_token_accuracy": 0.9954042911529541, + "num_tokens": 102848920.0, + "step": 33140 + }, + { + "entropy": 0.04482985120266676, + "epoch": 7.726191863853596, + "grad_norm": 3.640625, + "learning_rate": 4.7828484188613896e-05, + "loss": 0.0118, + "mean_token_accuracy": 0.9950476944446563, + "num_tokens": 102874383.0, + "step": 33145 + }, + { + "entropy": 0.047770484909415246, + "epoch": 7.727357500874228, + "grad_norm": 1.9921875, + "learning_rate": 4.782762434462922e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9952480673789978, + "num_tokens": 102902687.0, + "step": 33150 + }, + { + "entropy": 0.06177355572581291, + "epoch": 7.728523137894859, + "grad_norm": 0.228515625, + "learning_rate": 4.782676434664414e-05, + "loss": 0.007, + "mean_token_accuracy": 0.9983032703399658, + "num_tokens": 102923480.0, + "step": 33155 + }, + { + "entropy": 0.03888240284286439, + "epoch": 7.729688774915491, + "grad_norm": 0.5703125, + "learning_rate": 4.782590419467147e-05, + "loss": 0.0082, + "mean_token_accuracy": 0.9970541417598724, + "num_tokens": 102968286.0, + "step": 33160 + }, + { + "entropy": 0.05827982537448406, + "epoch": 7.730854411936123, + "grad_norm": 1.34375, + "learning_rate": 4.782504388872404e-05, + "loss": 0.018, + "mean_token_accuracy": 0.9947967290878296, + "num_tokens": 102980808.0, + "step": 33165 + }, + { + "entropy": 0.06765324827283621, + "epoch": 7.7320200489567545, + "grad_norm": 0.2138671875, + "learning_rate": 4.7824183428814674e-05, + "loss": 0.0093, + "mean_token_accuracy": 0.9967410266399384, + "num_tokens": 102994850.0, + "step": 33170 + }, + { + "entropy": 0.07319668047130108, + "epoch": 7.733185685977387, + "grad_norm": 1.4453125, + "learning_rate": 4.78233228149562e-05, + "loss": 0.0244, + "mean_token_accuracy": 0.9916601479053497, + "num_tokens": 103004716.0, + "step": 33175 + }, + { + "entropy": 0.06414531394839287, + "epoch": 7.734351322998019, + "grad_norm": 0.90625, + "learning_rate": 4.782246204716146e-05, + "loss": 0.0088, + "mean_token_accuracy": 0.9972757875919342, + "num_tokens": 103024745.0, + "step": 33180 + }, + { + "entropy": 0.07337948856875301, + "epoch": 7.73551696001865, + "grad_norm": 1.4765625, + "learning_rate": 4.782160112544328e-05, + "loss": 0.0204, + "mean_token_accuracy": 0.9944812595844269, + "num_tokens": 103043048.0, + "step": 33185 + }, + { + "entropy": 0.07922273678705097, + "epoch": 7.736682597039282, + "grad_norm": 0.82421875, + "learning_rate": 4.7820740049814506e-05, + "loss": 0.0154, + "mean_token_accuracy": 0.9938981294631958, + "num_tokens": 103064617.0, + "step": 33190 + }, + { + "entropy": 0.055163250956684354, + "epoch": 7.737848234059914, + "grad_norm": 0.65234375, + "learning_rate": 4.7819878820287976e-05, + "loss": 0.0087, + "mean_token_accuracy": 0.9972365856170654, + "num_tokens": 103092281.0, + "step": 33195 + }, + { + "entropy": 0.07809178866446018, + "epoch": 7.739013871080545, + "grad_norm": 2.734375, + "learning_rate": 4.781901743687653e-05, + "loss": 0.0176, + "mean_token_accuracy": 0.993689090013504, + "num_tokens": 103101742.0, + "step": 33200 + }, + { + "entropy": 0.05465510156936944, + "epoch": 7.7401795081011775, + "grad_norm": 1.4765625, + "learning_rate": 4.7818155899593015e-05, + "loss": 0.0141, + "mean_token_accuracy": 0.9954380929470062, + "num_tokens": 103137325.0, + "step": 33205 + }, + { + "entropy": 0.05313769178465009, + "epoch": 7.741345145121809, + "grad_norm": 0.5625, + "learning_rate": 4.781729420845027e-05, + "loss": 0.0198, + "mean_token_accuracy": 0.99625164270401, + "num_tokens": 103157786.0, + "step": 33210 + }, + { + "entropy": 0.07087142560631036, + "epoch": 7.742510782142441, + "grad_norm": 1.7421875, + "learning_rate": 4.781643236346115e-05, + "loss": 0.0223, + "mean_token_accuracy": 0.9946645021438598, + "num_tokens": 103174318.0, + "step": 33215 + }, + { + "entropy": 0.058909991011023524, + "epoch": 7.743676419163073, + "grad_norm": 1.75, + "learning_rate": 4.781557036463852e-05, + "loss": 0.0171, + "mean_token_accuracy": 0.9935064792633057, + "num_tokens": 103195835.0, + "step": 33220 + }, + { + "entropy": 0.06861430993303656, + "epoch": 7.744842056183704, + "grad_norm": 0.2255859375, + "learning_rate": 4.7814708211995206e-05, + "loss": 0.0162, + "mean_token_accuracy": 0.995832359790802, + "num_tokens": 103225740.0, + "step": 33225 + }, + { + "entropy": 0.07119888961315154, + "epoch": 7.746007693204336, + "grad_norm": 0.92578125, + "learning_rate": 4.781384590554409e-05, + "loss": 0.0195, + "mean_token_accuracy": 0.9929039359092713, + "num_tokens": 103240560.0, + "step": 33230 + }, + { + "entropy": 0.12398752514272929, + "epoch": 7.7471733302249675, + "grad_norm": 4.78125, + "learning_rate": 4.781298344529801e-05, + "loss": 0.146, + "mean_token_accuracy": 0.9746751546859741, + "num_tokens": 103269285.0, + "step": 33235 + }, + { + "entropy": 0.06968305222690105, + "epoch": 7.7483389672456, + "grad_norm": 2.578125, + "learning_rate": 4.781212083126984e-05, + "loss": 0.0212, + "mean_token_accuracy": 0.9939125597476959, + "num_tokens": 103278316.0, + "step": 33240 + }, + { + "entropy": 0.057888449355959895, + "epoch": 7.749504604266232, + "grad_norm": 4.15625, + "learning_rate": 4.7811258063472433e-05, + "loss": 0.0176, + "mean_token_accuracy": 0.9953479826450348, + "num_tokens": 103301857.0, + "step": 33245 + }, + { + "entropy": 0.05492489393800497, + "epoch": 7.750670241286863, + "grad_norm": 1.1640625, + "learning_rate": 4.781039514191866e-05, + "loss": 0.0132, + "mean_token_accuracy": 0.995477843284607, + "num_tokens": 103330999.0, + "step": 33250 + }, + { + "entropy": 0.05832458529621363, + "epoch": 7.751835878307495, + "grad_norm": 1.984375, + "learning_rate": 4.7809532066621396e-05, + "loss": 0.0085, + "mean_token_accuracy": 0.9947230100631714, + "num_tokens": 103353067.0, + "step": 33255 + }, + { + "entropy": 0.06644787210971118, + "epoch": 7.753001515328127, + "grad_norm": 1.5390625, + "learning_rate": 4.78086688375935e-05, + "loss": 0.0112, + "mean_token_accuracy": 0.9967861711978913, + "num_tokens": 103366334.0, + "step": 33260 + }, + { + "entropy": 0.09004889465868474, + "epoch": 7.754167152348758, + "grad_norm": 4.6875, + "learning_rate": 4.780780545484783e-05, + "loss": 0.0269, + "mean_token_accuracy": 0.992378157377243, + "num_tokens": 103374930.0, + "step": 33265 + }, + { + "entropy": 0.08529338352382183, + "epoch": 7.75533278936939, + "grad_norm": 0.70703125, + "learning_rate": 4.78069419183973e-05, + "loss": 0.0123, + "mean_token_accuracy": 0.9963854551315308, + "num_tokens": 103386330.0, + "step": 33270 + }, + { + "entropy": 0.05681054722517729, + "epoch": 7.7564984263900225, + "grad_norm": 0.6328125, + "learning_rate": 4.780607822825475e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.9969890892505646, + "num_tokens": 103401501.0, + "step": 33275 + }, + { + "entropy": 0.07758706733584404, + "epoch": 7.757664063410654, + "grad_norm": 0.7421875, + "learning_rate": 4.780521438443307e-05, + "loss": 0.0242, + "mean_token_accuracy": 0.9923904359340667, + "num_tokens": 103409858.0, + "step": 33280 + }, + { + "entropy": 0.08151622787117958, + "epoch": 7.758829700431286, + "grad_norm": 0.54296875, + "learning_rate": 4.780435038694515e-05, + "loss": 0.049, + "mean_token_accuracy": 0.9898855566978455, + "num_tokens": 103432904.0, + "step": 33285 + }, + { + "entropy": 0.11001786850392818, + "epoch": 7.759995337451917, + "grad_norm": 3.5, + "learning_rate": 4.780348623580387e-05, + "loss": 0.0171, + "mean_token_accuracy": 0.9956152141094208, + "num_tokens": 103440705.0, + "step": 33290 + }, + { + "entropy": 0.05951671497896314, + "epoch": 7.761160974472549, + "grad_norm": 0.310546875, + "learning_rate": 4.7802621931022105e-05, + "loss": 0.009, + "mean_token_accuracy": 0.9959829747676849, + "num_tokens": 103462340.0, + "step": 33295 + }, + { + "entropy": 0.07016182951629162, + "epoch": 7.762326611493181, + "grad_norm": 1.2578125, + "learning_rate": 4.7801757472612756e-05, + "loss": 0.0148, + "mean_token_accuracy": 0.9939250349998474, + "num_tokens": 103472282.0, + "step": 33300 + }, + { + "entropy": 0.06460611652582884, + "epoch": 7.7634922485138125, + "grad_norm": 0.275390625, + "learning_rate": 4.780089286058871e-05, + "loss": 0.0112, + "mean_token_accuracy": 0.9950837731361389, + "num_tokens": 103501179.0, + "step": 33305 + }, + { + "entropy": 0.06389118535444141, + "epoch": 7.764657885534445, + "grad_norm": 2.09375, + "learning_rate": 4.7800028094962856e-05, + "loss": 0.0089, + "mean_token_accuracy": 0.9971124291419983, + "num_tokens": 103523931.0, + "step": 33310 + }, + { + "entropy": 0.059523440059274436, + "epoch": 7.765823522555077, + "grad_norm": 0.609375, + "learning_rate": 4.779916317574809e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.9953795492649078, + "num_tokens": 103546158.0, + "step": 33315 + }, + { + "entropy": 0.047978917602449656, + "epoch": 7.766989159575708, + "grad_norm": 0.26171875, + "learning_rate": 4.779829810295731e-05, + "loss": 0.0112, + "mean_token_accuracy": 0.9941164374351501, + "num_tokens": 103591627.0, + "step": 33320 + }, + { + "entropy": 0.053133474104106425, + "epoch": 7.76815479659634, + "grad_norm": 0.73046875, + "learning_rate": 4.7797432876603415e-05, + "loss": 0.0101, + "mean_token_accuracy": 0.9951233327388763, + "num_tokens": 103625207.0, + "step": 33325 + }, + { + "entropy": 0.09868288524448872, + "epoch": 7.769320433616972, + "grad_norm": 0.33203125, + "learning_rate": 4.779656749669931e-05, + "loss": 0.0073, + "mean_token_accuracy": 0.9969569325447083, + "num_tokens": 103644136.0, + "step": 33330 + }, + { + "entropy": 0.06480435077100992, + "epoch": 7.770486070637603, + "grad_norm": 0.85546875, + "learning_rate": 4.77957019632579e-05, + "loss": 0.0132, + "mean_token_accuracy": 0.997107309103012, + "num_tokens": 103660756.0, + "step": 33335 + }, + { + "entropy": 0.06808798797428608, + "epoch": 7.771651707658235, + "grad_norm": 0.5, + "learning_rate": 4.779483627629208e-05, + "loss": 0.0135, + "mean_token_accuracy": 0.9981209456920623, + "num_tokens": 103674964.0, + "step": 33340 + }, + { + "entropy": 0.06963288504630327, + "epoch": 7.772817344678867, + "grad_norm": 0.33984375, + "learning_rate": 4.779397043581477e-05, + "loss": 0.0139, + "mean_token_accuracy": 0.996001148223877, + "num_tokens": 103701505.0, + "step": 33345 + }, + { + "entropy": 0.07459334554150701, + "epoch": 7.773982981699499, + "grad_norm": 1.765625, + "learning_rate": 4.779310444183888e-05, + "loss": 0.0245, + "mean_token_accuracy": 0.9941957771778107, + "num_tokens": 103715081.0, + "step": 33350 + }, + { + "entropy": 0.08988549634814262, + "epoch": 7.775148618720131, + "grad_norm": 1.71875, + "learning_rate": 4.7792238294377326e-05, + "loss": 0.0748, + "mean_token_accuracy": 0.9865087509155274, + "num_tokens": 103737406.0, + "step": 33355 + }, + { + "entropy": 0.0655953474342823, + "epoch": 7.776314255740762, + "grad_norm": 2.375, + "learning_rate": 4.7791371993443004e-05, + "loss": 0.0408, + "mean_token_accuracy": 0.9900922536849975, + "num_tokens": 103756899.0, + "step": 33360 + }, + { + "entropy": 0.0665599879808724, + "epoch": 7.777479892761394, + "grad_norm": 1.2421875, + "learning_rate": 4.779050553904886e-05, + "loss": 0.0272, + "mean_token_accuracy": 0.9948042571544647, + "num_tokens": 103776091.0, + "step": 33365 + }, + { + "entropy": 0.05449905479326844, + "epoch": 7.778645529782025, + "grad_norm": 0.4921875, + "learning_rate": 4.778963893120779e-05, + "loss": 0.0141, + "mean_token_accuracy": 0.9956022024154663, + "num_tokens": 103818424.0, + "step": 33370 + }, + { + "entropy": 0.05944770090281963, + "epoch": 7.7798111668026575, + "grad_norm": 2.125, + "learning_rate": 4.7788772169932735e-05, + "loss": 0.0276, + "mean_token_accuracy": 0.9925519466400147, + "num_tokens": 103838159.0, + "step": 33375 + }, + { + "entropy": 0.0551123920828104, + "epoch": 7.78097680382329, + "grad_norm": 0.7734375, + "learning_rate": 4.778790525523661e-05, + "loss": 0.0085, + "mean_token_accuracy": 0.997332113981247, + "num_tokens": 103877255.0, + "step": 33380 + }, + { + "entropy": 0.06190832667052746, + "epoch": 7.782142440843921, + "grad_norm": 0.216796875, + "learning_rate": 4.7787038187132345e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.9960220634937287, + "num_tokens": 103901221.0, + "step": 33385 + }, + { + "entropy": 0.060510965529829264, + "epoch": 7.783308077864553, + "grad_norm": 1.453125, + "learning_rate": 4.778617096563286e-05, + "loss": 0.0146, + "mean_token_accuracy": 0.9958796203136444, + "num_tokens": 103934690.0, + "step": 33390 + }, + { + "entropy": 0.06902567390352488, + "epoch": 7.784473714885185, + "grad_norm": 0.2314453125, + "learning_rate": 4.778530359075111e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9963374018669129, + "num_tokens": 103974900.0, + "step": 33395 + }, + { + "entropy": 0.062181627936661245, + "epoch": 7.785639351905816, + "grad_norm": 2.96875, + "learning_rate": 4.77844360625e-05, + "loss": 0.027, + "mean_token_accuracy": 0.9922132313251495, + "num_tokens": 103998158.0, + "step": 33400 + }, + { + "entropy": 0.06246403036639094, + "epoch": 7.786804988926448, + "grad_norm": 3.265625, + "learning_rate": 4.778356838089248e-05, + "loss": 0.0206, + "mean_token_accuracy": 0.9934264302253724, + "num_tokens": 104010695.0, + "step": 33405 + }, + { + "entropy": 0.058712884597480294, + "epoch": 7.7879706259470804, + "grad_norm": 0.64453125, + "learning_rate": 4.778270054594149e-05, + "loss": 0.02, + "mean_token_accuracy": 0.9937012672424317, + "num_tokens": 104022834.0, + "step": 33410 + }, + { + "entropy": 0.06700698286294937, + "epoch": 7.789136262967712, + "grad_norm": 0.345703125, + "learning_rate": 4.7781832557659975e-05, + "loss": 0.017, + "mean_token_accuracy": 0.9955753743648529, + "num_tokens": 104038301.0, + "step": 33415 + }, + { + "entropy": 0.06809659153223038, + "epoch": 7.790301899988344, + "grad_norm": 0.59375, + "learning_rate": 4.7780964416060866e-05, + "loss": 0.0218, + "mean_token_accuracy": 0.9940990149974823, + "num_tokens": 104049127.0, + "step": 33420 + }, + { + "entropy": 0.06645800778642297, + "epoch": 7.791467537008975, + "grad_norm": 1.46875, + "learning_rate": 4.7780096121157115e-05, + "loss": 0.0292, + "mean_token_accuracy": 0.9924209117889404, + "num_tokens": 104069094.0, + "step": 33425 + }, + { + "entropy": 0.07412951868027448, + "epoch": 7.792633174029607, + "grad_norm": 0.48828125, + "learning_rate": 4.777922767296167e-05, + "loss": 0.0079, + "mean_token_accuracy": 0.9977226316928863, + "num_tokens": 104095882.0, + "step": 33430 + }, + { + "entropy": 0.059772913716733454, + "epoch": 7.793798811050239, + "grad_norm": 0.609375, + "learning_rate": 4.777835907148748e-05, + "loss": 0.0155, + "mean_token_accuracy": 0.9950716972351075, + "num_tokens": 104110924.0, + "step": 33435 + }, + { + "entropy": 0.09825005661696196, + "epoch": 7.79496444807087, + "grad_norm": 1.328125, + "learning_rate": 4.77774903167475e-05, + "loss": 0.0728, + "mean_token_accuracy": 0.9886581361293793, + "num_tokens": 104144096.0, + "step": 33440 + }, + { + "entropy": 0.04443741850554943, + "epoch": 7.7961300850915025, + "grad_norm": 0.255859375, + "learning_rate": 4.777662140875467e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9977137744426727, + "num_tokens": 104184663.0, + "step": 33445 + }, + { + "entropy": 0.0897174721583724, + "epoch": 7.797295722112135, + "grad_norm": 0.97265625, + "learning_rate": 4.7775752347521965e-05, + "loss": 0.0179, + "mean_token_accuracy": 0.9940895915031434, + "num_tokens": 104205061.0, + "step": 33450 + }, + { + "entropy": 0.0796100415289402, + "epoch": 7.798461359132766, + "grad_norm": 1.2890625, + "learning_rate": 4.777488313306234e-05, + "loss": 0.03, + "mean_token_accuracy": 0.9946964383125305, + "num_tokens": 104216186.0, + "step": 33455 + }, + { + "entropy": 0.07518827449530363, + "epoch": 7.799626996153398, + "grad_norm": 3.15625, + "learning_rate": 4.7774013765388745e-05, + "loss": 0.0168, + "mean_token_accuracy": 0.9950263619422912, + "num_tokens": 104225544.0, + "step": 33460 + }, + { + "entropy": 0.06233833208680153, + "epoch": 7.80079263317403, + "grad_norm": 1.40625, + "learning_rate": 4.777314424451416e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.9979846239089966, + "num_tokens": 104249410.0, + "step": 33465 + }, + { + "entropy": 0.059140789741650227, + "epoch": 7.801958270194661, + "grad_norm": 1.453125, + "learning_rate": 4.7772274570451535e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.9955178260803222, + "num_tokens": 104286115.0, + "step": 33470 + }, + { + "entropy": 0.06378463245928287, + "epoch": 7.803123907215293, + "grad_norm": 2.203125, + "learning_rate": 4.777140474321385e-05, + "loss": 0.0189, + "mean_token_accuracy": 0.9947424292564392, + "num_tokens": 104319276.0, + "step": 33475 + }, + { + "entropy": 0.053663753625005486, + "epoch": 7.804289544235925, + "grad_norm": 0.392578125, + "learning_rate": 4.777053476281407e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.9964630961418152, + "num_tokens": 104339566.0, + "step": 33480 + }, + { + "entropy": 0.07534630130976439, + "epoch": 7.805455181256557, + "grad_norm": 1.8984375, + "learning_rate": 4.7769664629265174e-05, + "loss": 0.0159, + "mean_token_accuracy": 0.9969941914081574, + "num_tokens": 104354412.0, + "step": 33485 + }, + { + "entropy": 0.06900246925652027, + "epoch": 7.806620818277189, + "grad_norm": 1.640625, + "learning_rate": 4.7768794342580124e-05, + "loss": 0.0274, + "mean_token_accuracy": 0.9949246644973755, + "num_tokens": 104363262.0, + "step": 33490 + }, + { + "entropy": 0.06793273855000734, + "epoch": 7.80778645529782, + "grad_norm": 0.6953125, + "learning_rate": 4.776792390277191e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9973601698875427, + "num_tokens": 104384275.0, + "step": 33495 + }, + { + "entropy": 0.06260851919651031, + "epoch": 7.808952092318452, + "grad_norm": 2.203125, + "learning_rate": 4.776705330985351e-05, + "loss": 0.0123, + "mean_token_accuracy": 0.9972707688808441, + "num_tokens": 104404175.0, + "step": 33500 + }, + { + "entropy": 0.06454886039718985, + "epoch": 7.810117729339083, + "grad_norm": 1.0546875, + "learning_rate": 4.77661825638379e-05, + "loss": 0.0158, + "mean_token_accuracy": 0.9947829186916352, + "num_tokens": 104423332.0, + "step": 33505 + }, + { + "entropy": 0.037727932911366224, + "epoch": 7.811283366359715, + "grad_norm": 1.140625, + "learning_rate": 4.7765311664738065e-05, + "loss": 0.0053, + "mean_token_accuracy": 0.997829121351242, + "num_tokens": 104457532.0, + "step": 33510 + }, + { + "entropy": 0.06598635371774435, + "epoch": 7.8124490033803475, + "grad_norm": 1.4296875, + "learning_rate": 4.776444061256699e-05, + "loss": 0.0187, + "mean_token_accuracy": 0.9948759615421295, + "num_tokens": 104474783.0, + "step": 33515 + }, + { + "entropy": 0.043385568913072346, + "epoch": 7.813614640400979, + "grad_norm": 1.296875, + "learning_rate": 4.776356940733767e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9977335274219513, + "num_tokens": 104503133.0, + "step": 33520 + }, + { + "entropy": 0.06043797004967928, + "epoch": 7.814780277421611, + "grad_norm": 0.357421875, + "learning_rate": 4.776269804906309e-05, + "loss": 0.0068, + "mean_token_accuracy": 0.997977751493454, + "num_tokens": 104522082.0, + "step": 33525 + }, + { + "entropy": 0.06230600643903017, + "epoch": 7.815945914442243, + "grad_norm": 2.8125, + "learning_rate": 4.776182653775625e-05, + "loss": 0.0187, + "mean_token_accuracy": 0.9945006728172302, + "num_tokens": 104542746.0, + "step": 33530 + }, + { + "entropy": 0.06799766402691602, + "epoch": 7.817111551462874, + "grad_norm": 0.58203125, + "learning_rate": 4.7760954873430146e-05, + "loss": 0.0204, + "mean_token_accuracy": 0.9932066082954407, + "num_tokens": 104559473.0, + "step": 33535 + }, + { + "entropy": 0.059759671241045, + "epoch": 7.818277188483506, + "grad_norm": 0.30859375, + "learning_rate": 4.776008305609776e-05, + "loss": 0.0129, + "mean_token_accuracy": 0.9955129384994507, + "num_tokens": 104583010.0, + "step": 33540 + }, + { + "entropy": 0.06374518619850278, + "epoch": 7.819442825504138, + "grad_norm": 0.25, + "learning_rate": 4.775921108577211e-05, + "loss": 0.0175, + "mean_token_accuracy": 0.9949056446552277, + "num_tokens": 104598695.0, + "step": 33545 + }, + { + "entropy": 0.04200270352885127, + "epoch": 7.82060846252477, + "grad_norm": 0.349609375, + "learning_rate": 4.77583389624662e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.9944587707519531, + "num_tokens": 104624382.0, + "step": 33550 + }, + { + "entropy": 0.0661899745464325, + "epoch": 7.821774099545402, + "grad_norm": 0.82421875, + "learning_rate": 4.775746668619302e-05, + "loss": 0.0199, + "mean_token_accuracy": 0.9944428563117981, + "num_tokens": 104644759.0, + "step": 33555 + }, + { + "entropy": 0.09902186430990696, + "epoch": 7.822939736566033, + "grad_norm": 1.453125, + "learning_rate": 4.7756594256965584e-05, + "loss": 0.0204, + "mean_token_accuracy": 0.9947905838489532, + "num_tokens": 104656623.0, + "step": 33560 + }, + { + "entropy": 0.10063904188573361, + "epoch": 7.824105373586665, + "grad_norm": 1.140625, + "learning_rate": 4.775572167479689e-05, + "loss": 0.0133, + "mean_token_accuracy": 0.9956040382385254, + "num_tokens": 104667933.0, + "step": 33565 + }, + { + "entropy": 0.045405428390949965, + "epoch": 7.825271010607297, + "grad_norm": 0.83984375, + "learning_rate": 4.7754848939699975e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9983605444431305, + "num_tokens": 104690189.0, + "step": 33570 + }, + { + "entropy": 0.06263646613806487, + "epoch": 7.826436647627928, + "grad_norm": 1.4140625, + "learning_rate": 4.775397605168783e-05, + "loss": 0.0126, + "mean_token_accuracy": 0.99606973528862, + "num_tokens": 104713442.0, + "step": 33575 + }, + { + "entropy": 0.060155317559838294, + "epoch": 7.8276022846485604, + "grad_norm": 3.09375, + "learning_rate": 4.775310301077348e-05, + "loss": 0.0159, + "mean_token_accuracy": 0.994491446018219, + "num_tokens": 104734783.0, + "step": 33580 + }, + { + "entropy": 0.047922109439969066, + "epoch": 7.8287679216691926, + "grad_norm": 0.125, + "learning_rate": 4.775222981696995e-05, + "loss": 0.0103, + "mean_token_accuracy": 0.9948363065719604, + "num_tokens": 104763217.0, + "step": 33585 + }, + { + "entropy": 0.0798790443688631, + "epoch": 7.829933558689824, + "grad_norm": 0.98828125, + "learning_rate": 4.775135647029025e-05, + "loss": 0.0131, + "mean_token_accuracy": 0.9965567708015441, + "num_tokens": 104775163.0, + "step": 33590 + }, + { + "entropy": 0.062330286018550396, + "epoch": 7.831099195710456, + "grad_norm": 0.32421875, + "learning_rate": 4.77504829707474e-05, + "loss": 0.0103, + "mean_token_accuracy": 0.9961469352245331, + "num_tokens": 104803548.0, + "step": 33595 + }, + { + "entropy": 0.06328217554837465, + "epoch": 7.832264832731088, + "grad_norm": 2.78125, + "learning_rate": 4.774960931835444e-05, + "loss": 0.017, + "mean_token_accuracy": 0.9947004914283752, + "num_tokens": 104830200.0, + "step": 33600 + }, + { + "entropy": 0.06574259772896766, + "epoch": 7.833430469751719, + "grad_norm": 2.609375, + "learning_rate": 4.7748735513124375e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9949138760566711, + "num_tokens": 104850235.0, + "step": 33605 + }, + { + "entropy": 0.04857909232378006, + "epoch": 7.834596106772351, + "grad_norm": 0.63671875, + "learning_rate": 4.774786155507026e-05, + "loss": 0.007, + "mean_token_accuracy": 0.9977436184883117, + "num_tokens": 104875446.0, + "step": 33610 + }, + { + "entropy": 0.060628737695515154, + "epoch": 7.8357617437929825, + "grad_norm": 0.57421875, + "learning_rate": 4.774698744420512e-05, + "loss": 0.0156, + "mean_token_accuracy": 0.9954569041728973, + "num_tokens": 104894570.0, + "step": 33615 + }, + { + "entropy": 0.07210960108786821, + "epoch": 7.836927380813615, + "grad_norm": 0.359375, + "learning_rate": 4.774611318054197e-05, + "loss": 0.0136, + "mean_token_accuracy": 0.9972477376461029, + "num_tokens": 104905388.0, + "step": 33620 + }, + { + "entropy": 0.06413645837455988, + "epoch": 7.838093017834247, + "grad_norm": 2.078125, + "learning_rate": 4.774523876409387e-05, + "loss": 0.0268, + "mean_token_accuracy": 0.9931788384914398, + "num_tokens": 104924156.0, + "step": 33625 + }, + { + "entropy": 0.07827889760956168, + "epoch": 7.839258654854878, + "grad_norm": 0.51953125, + "learning_rate": 4.774436419487385e-05, + "loss": 0.035, + "mean_token_accuracy": 0.9918962776660919, + "num_tokens": 104947611.0, + "step": 33630 + }, + { + "entropy": 0.050841915979981424, + "epoch": 7.84042429187551, + "grad_norm": 1.34375, + "learning_rate": 4.7743489472894955e-05, + "loss": 0.0122, + "mean_token_accuracy": 0.9957148313522339, + "num_tokens": 104971187.0, + "step": 33635 + }, + { + "entropy": 0.07467871066182852, + "epoch": 7.841589928896141, + "grad_norm": 1.9765625, + "learning_rate": 4.774261459817022e-05, + "loss": 0.0136, + "mean_token_accuracy": 0.9967439770698547, + "num_tokens": 104982093.0, + "step": 33640 + }, + { + "entropy": 0.06240941435098648, + "epoch": 7.842755565916773, + "grad_norm": 3.59375, + "learning_rate": 4.77417395707127e-05, + "loss": 0.0165, + "mean_token_accuracy": 0.995259428024292, + "num_tokens": 105001148.0, + "step": 33645 + }, + { + "entropy": 0.06917629651725292, + "epoch": 7.8439212029374055, + "grad_norm": 3.234375, + "learning_rate": 4.7740864390535434e-05, + "loss": 0.0178, + "mean_token_accuracy": 0.994339919090271, + "num_tokens": 105015498.0, + "step": 33650 + }, + { + "entropy": 0.051767791528254746, + "epoch": 7.845086839958037, + "grad_norm": 0.6796875, + "learning_rate": 4.773998905765147e-05, + "loss": 0.0071, + "mean_token_accuracy": 0.9966930508613586, + "num_tokens": 105045073.0, + "step": 33655 + }, + { + "entropy": 0.06975855696946383, + "epoch": 7.846252476978669, + "grad_norm": 1.8203125, + "learning_rate": 4.7739113572073876e-05, + "loss": 0.0122, + "mean_token_accuracy": 0.9956272125244141, + "num_tokens": 105072520.0, + "step": 33660 + }, + { + "entropy": 0.07446997575461864, + "epoch": 7.847418113999301, + "grad_norm": 1.5546875, + "learning_rate": 4.77382379338157e-05, + "loss": 0.0257, + "mean_token_accuracy": 0.9942268848419189, + "num_tokens": 105083649.0, + "step": 33665 + }, + { + "entropy": 0.06689784284681081, + "epoch": 7.848583751019932, + "grad_norm": 0.39453125, + "learning_rate": 4.773736214289e-05, + "loss": 0.0076, + "mean_token_accuracy": 0.9982570469379425, + "num_tokens": 105107999.0, + "step": 33670 + }, + { + "entropy": 0.07380356825888157, + "epoch": 7.849749388040564, + "grad_norm": 0.470703125, + "learning_rate": 4.773648619930983e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.996808648109436, + "num_tokens": 105128184.0, + "step": 33675 + }, + { + "entropy": 0.06525698453187942, + "epoch": 7.850915025061196, + "grad_norm": 4.71875, + "learning_rate": 4.7735610103088245e-05, + "loss": 0.0144, + "mean_token_accuracy": 0.9928626179695129, + "num_tokens": 105155039.0, + "step": 33680 + }, + { + "entropy": 0.06640646066516638, + "epoch": 7.8520806620818275, + "grad_norm": 1.6015625, + "learning_rate": 4.7734733854238324e-05, + "loss": 0.0091, + "mean_token_accuracy": 0.9936692476272583, + "num_tokens": 105181828.0, + "step": 33685 + }, + { + "entropy": 0.08131216876208783, + "epoch": 7.85324629910246, + "grad_norm": 0.91015625, + "learning_rate": 4.773385745277313e-05, + "loss": 0.026, + "mean_token_accuracy": 0.9937609195709228, + "num_tokens": 105194395.0, + "step": 33690 + }, + { + "entropy": 0.05827680192887783, + "epoch": 7.854411936123091, + "grad_norm": 1.2734375, + "learning_rate": 4.773298089870573e-05, + "loss": 0.0159, + "mean_token_accuracy": 0.9963181138038635, + "num_tokens": 105207309.0, + "step": 33695 + }, + { + "entropy": 0.07168765515089034, + "epoch": 7.855577573143723, + "grad_norm": 1.703125, + "learning_rate": 4.773210419204919e-05, + "loss": 0.0124, + "mean_token_accuracy": 0.9961957156658172, + "num_tokens": 105220229.0, + "step": 33700 + }, + { + "entropy": 0.062171673867851496, + "epoch": 7.856743210164355, + "grad_norm": 0.2021484375, + "learning_rate": 4.773122733281659e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.9970798075199128, + "num_tokens": 105238170.0, + "step": 33705 + }, + { + "entropy": 0.04381891647353768, + "epoch": 7.857908847184986, + "grad_norm": 0.462890625, + "learning_rate": 4.773035032102099e-05, + "loss": 0.0055, + "mean_token_accuracy": 0.997516006231308, + "num_tokens": 105285631.0, + "step": 33710 + }, + { + "entropy": 0.04537671413272619, + "epoch": 7.859074484205618, + "grad_norm": 0.33203125, + "learning_rate": 4.772947315667549e-05, + "loss": 0.0139, + "mean_token_accuracy": 0.9939434051513671, + "num_tokens": 105310695.0, + "step": 33715 + }, + { + "entropy": 0.05652222605422139, + "epoch": 7.8602401212262505, + "grad_norm": 1.3828125, + "learning_rate": 4.7728595839793155e-05, + "loss": 0.0088, + "mean_token_accuracy": 0.9973005592823029, + "num_tokens": 105328469.0, + "step": 33720 + }, + { + "entropy": 0.0721471224911511, + "epoch": 7.861405758246882, + "grad_norm": 1.078125, + "learning_rate": 4.7727718370387074e-05, + "loss": 0.0242, + "mean_token_accuracy": 0.9938137114048005, + "num_tokens": 105348789.0, + "step": 33725 + }, + { + "entropy": 0.054934839438647035, + "epoch": 7.862571395267514, + "grad_norm": 0.275390625, + "learning_rate": 4.772684074847033e-05, + "loss": 0.0084, + "mean_token_accuracy": 0.9964746117591858, + "num_tokens": 105372261.0, + "step": 33730 + }, + { + "entropy": 0.060262033343315126, + "epoch": 7.863737032288146, + "grad_norm": 0.2255859375, + "learning_rate": 4.7725962974056007e-05, + "loss": 0.0074, + "mean_token_accuracy": 0.99892857670784, + "num_tokens": 105401147.0, + "step": 33735 + }, + { + "entropy": 0.07648681541904807, + "epoch": 7.864902669308777, + "grad_norm": 2.6875, + "learning_rate": 4.772508504715719e-05, + "loss": 0.0147, + "mean_token_accuracy": 0.9954568028450013, + "num_tokens": 105424983.0, + "step": 33740 + }, + { + "entropy": 0.06310393549501896, + "epoch": 7.866068306329409, + "grad_norm": 0.80859375, + "learning_rate": 4.772420696778699e-05, + "loss": 0.0171, + "mean_token_accuracy": 0.9937354981899261, + "num_tokens": 105435754.0, + "step": 33745 + }, + { + "entropy": 0.08068116065114736, + "epoch": 7.8672339433500404, + "grad_norm": 1.125, + "learning_rate": 4.7723328735958475e-05, + "loss": 0.022, + "mean_token_accuracy": 0.9926752507686615, + "num_tokens": 105456206.0, + "step": 33750 + }, + { + "entropy": 0.10241719000041485, + "epoch": 7.8683995803706726, + "grad_norm": 1.609375, + "learning_rate": 4.772245035168475e-05, + "loss": 0.0188, + "mean_token_accuracy": 0.9940951466560364, + "num_tokens": 105475697.0, + "step": 33755 + }, + { + "entropy": 0.05356983579695225, + "epoch": 7.869565217391305, + "grad_norm": 0.359375, + "learning_rate": 4.772157181497892e-05, + "loss": 0.0072, + "mean_token_accuracy": 0.9973737239837647, + "num_tokens": 105507022.0, + "step": 33760 + }, + { + "entropy": 0.08068993408232927, + "epoch": 7.870730854411936, + "grad_norm": 0.2294921875, + "learning_rate": 4.772069312585408e-05, + "loss": 0.0151, + "mean_token_accuracy": 0.9943787693977356, + "num_tokens": 105530124.0, + "step": 33765 + }, + { + "entropy": 0.0713021919131279, + "epoch": 7.871896491432568, + "grad_norm": 0.61328125, + "learning_rate": 4.771981428432333e-05, + "loss": 0.0104, + "mean_token_accuracy": 0.9964679121971131, + "num_tokens": 105542720.0, + "step": 33770 + }, + { + "entropy": 0.0819464897736907, + "epoch": 7.873062128453199, + "grad_norm": 4.3125, + "learning_rate": 4.771893529039978e-05, + "loss": 0.0303, + "mean_token_accuracy": 0.9911946177482605, + "num_tokens": 105563421.0, + "step": 33775 + }, + { + "entropy": 0.07584189437329769, + "epoch": 7.874227765473831, + "grad_norm": 1.484375, + "learning_rate": 4.7718056144096526e-05, + "loss": 0.0135, + "mean_token_accuracy": 0.9951583981513977, + "num_tokens": 105578641.0, + "step": 33780 + }, + { + "entropy": 0.058252211194485426, + "epoch": 7.875393402494463, + "grad_norm": 1.2421875, + "learning_rate": 4.77171768454267e-05, + "loss": 0.017, + "mean_token_accuracy": 0.9964474558830261, + "num_tokens": 105600238.0, + "step": 33785 + }, + { + "entropy": 0.08072509821504355, + "epoch": 7.876559039515095, + "grad_norm": 0.92578125, + "learning_rate": 4.771629739440339e-05, + "loss": 0.0243, + "mean_token_accuracy": 0.995055878162384, + "num_tokens": 105625046.0, + "step": 33790 + }, + { + "entropy": 0.0600465914234519, + "epoch": 7.877724676535727, + "grad_norm": 0.333984375, + "learning_rate": 4.7715417791039726e-05, + "loss": 0.0086, + "mean_token_accuracy": 0.9962651550769805, + "num_tokens": 105643980.0, + "step": 33795 + }, + { + "entropy": 0.08397406414151191, + "epoch": 7.878890313556359, + "grad_norm": 0.87890625, + "learning_rate": 4.771453803534881e-05, + "loss": 0.0222, + "mean_token_accuracy": 0.9933609664440155, + "num_tokens": 105662117.0, + "step": 33800 + }, + { + "entropy": 0.04541693087667227, + "epoch": 7.88005595057699, + "grad_norm": 0.314453125, + "learning_rate": 4.7713658127343776e-05, + "loss": 0.0065, + "mean_token_accuracy": 0.9967815399169921, + "num_tokens": 105695778.0, + "step": 33805 + }, + { + "entropy": 0.07490654885768891, + "epoch": 7.881221587597622, + "grad_norm": 2.296875, + "learning_rate": 4.771277806703773e-05, + "loss": 0.0176, + "mean_token_accuracy": 0.9951118290424347, + "num_tokens": 105707859.0, + "step": 33810 + }, + { + "entropy": 0.07078452426940203, + "epoch": 7.882387224618254, + "grad_norm": 0.95703125, + "learning_rate": 4.771189785444381e-05, + "loss": 0.0158, + "mean_token_accuracy": 0.9953491032123566, + "num_tokens": 105722345.0, + "step": 33815 + }, + { + "entropy": 0.06641866527497768, + "epoch": 7.8835528616388855, + "grad_norm": 0.296875, + "learning_rate": 4.7711017489575134e-05, + "loss": 0.0136, + "mean_token_accuracy": 0.995814448595047, + "num_tokens": 105741946.0, + "step": 33820 + }, + { + "entropy": 0.09044511755928397, + "epoch": 7.884718498659518, + "grad_norm": 1.421875, + "learning_rate": 4.7710136972444816e-05, + "loss": 0.0151, + "mean_token_accuracy": 0.9930476427078248, + "num_tokens": 105764921.0, + "step": 33825 + }, + { + "entropy": 0.05839778780937195, + "epoch": 7.885884135680149, + "grad_norm": 1.3515625, + "learning_rate": 4.770925630306601e-05, + "loss": 0.0093, + "mean_token_accuracy": 0.9965133607387543, + "num_tokens": 105797265.0, + "step": 33830 + }, + { + "entropy": 0.06722655799239874, + "epoch": 7.887049772700781, + "grad_norm": 2.15625, + "learning_rate": 4.770837548145184e-05, + "loss": 0.0203, + "mean_token_accuracy": 0.9924581229686738, + "num_tokens": 105809418.0, + "step": 33835 + }, + { + "entropy": 0.07799172587692738, + "epoch": 7.888215409721413, + "grad_norm": 4.8125, + "learning_rate": 4.770749450761543e-05, + "loss": 0.0218, + "mean_token_accuracy": 0.9934140980243683, + "num_tokens": 105827257.0, + "step": 33840 + }, + { + "entropy": 0.058118250127881764, + "epoch": 7.889381046742044, + "grad_norm": 3.625, + "learning_rate": 4.770661338156993e-05, + "loss": 0.0112, + "mean_token_accuracy": 0.9950781047344208, + "num_tokens": 105852545.0, + "step": 33845 + }, + { + "entropy": 0.06989272115752101, + "epoch": 7.890546683762676, + "grad_norm": 0.5703125, + "learning_rate": 4.7705732103328466e-05, + "loss": 0.0179, + "mean_token_accuracy": 0.9932778656482697, + "num_tokens": 105875208.0, + "step": 33850 + }, + { + "entropy": 0.05829099677503109, + "epoch": 7.891712320783308, + "grad_norm": 1.640625, + "learning_rate": 4.770485067290419e-05, + "loss": 0.0148, + "mean_token_accuracy": 0.9961495161056518, + "num_tokens": 105891957.0, + "step": 33855 + }, + { + "entropy": 0.06487886887043715, + "epoch": 7.89287795780394, + "grad_norm": 2.375, + "learning_rate": 4.7703969090310244e-05, + "loss": 0.0202, + "mean_token_accuracy": 0.9944393396377563, + "num_tokens": 105905309.0, + "step": 33860 + }, + { + "entropy": 0.07422820059582591, + "epoch": 7.894043594824572, + "grad_norm": 0.48046875, + "learning_rate": 4.7703087355559764e-05, + "loss": 0.0076, + "mean_token_accuracy": 0.996603262424469, + "num_tokens": 105929609.0, + "step": 33865 + }, + { + "entropy": 0.0708279337733984, + "epoch": 7.895209231845204, + "grad_norm": 1.3125, + "learning_rate": 4.7702205468665904e-05, + "loss": 0.008, + "mean_token_accuracy": 0.99549121260643, + "num_tokens": 105951474.0, + "step": 33870 + }, + { + "entropy": 0.04628618396818638, + "epoch": 7.896374868865835, + "grad_norm": 0.416015625, + "learning_rate": 4.770132342964182e-05, + "loss": 0.0072, + "mean_token_accuracy": 0.9972758531570435, + "num_tokens": 105982339.0, + "step": 33875 + }, + { + "entropy": 0.08831856437027455, + "epoch": 7.897540505886467, + "grad_norm": 0.73828125, + "learning_rate": 4.7700441238500667e-05, + "loss": 0.0176, + "mean_token_accuracy": 0.996027272939682, + "num_tokens": 105996369.0, + "step": 33880 + }, + { + "entropy": 0.07061892207711935, + "epoch": 7.898706142907098, + "grad_norm": 5.21875, + "learning_rate": 4.769955889525558e-05, + "loss": 0.0129, + "mean_token_accuracy": 0.9928837418556213, + "num_tokens": 106023378.0, + "step": 33885 + }, + { + "entropy": 0.05240669772028923, + "epoch": 7.8998717799277305, + "grad_norm": 0.3515625, + "learning_rate": 4.769867639991974e-05, + "loss": 0.0075, + "mean_token_accuracy": 0.9973884999752045, + "num_tokens": 106049844.0, + "step": 33890 + }, + { + "entropy": 0.05810522306710482, + "epoch": 7.901037416948363, + "grad_norm": 0.35546875, + "learning_rate": 4.769779375250629e-05, + "loss": 0.006, + "mean_token_accuracy": 0.9974211573600769, + "num_tokens": 106083133.0, + "step": 33895 + }, + { + "entropy": 0.057801909372210504, + "epoch": 7.902203053968994, + "grad_norm": 1.234375, + "learning_rate": 4.7696910953028395e-05, + "loss": 0.0072, + "mean_token_accuracy": 0.9975810825824738, + "num_tokens": 106108191.0, + "step": 33900 + }, + { + "entropy": 0.06211001239717007, + "epoch": 7.903368690989626, + "grad_norm": 3.4375, + "learning_rate": 4.769602800149922e-05, + "loss": 0.0133, + "mean_token_accuracy": 0.9950418591499328, + "num_tokens": 106123112.0, + "step": 33905 + }, + { + "entropy": 0.05631450889632106, + "epoch": 7.904534328010257, + "grad_norm": 0.35546875, + "learning_rate": 4.769514489793194e-05, + "loss": 0.0089, + "mean_token_accuracy": 0.9966521799564362, + "num_tokens": 106142631.0, + "step": 33910 + }, + { + "entropy": 0.06154237762093544, + "epoch": 7.905699965030889, + "grad_norm": 0.96484375, + "learning_rate": 4.7694261642339706e-05, + "loss": 0.0126, + "mean_token_accuracy": 0.9955787539482117, + "num_tokens": 106170787.0, + "step": 33915 + }, + { + "entropy": 0.05892001828178763, + "epoch": 7.906865602051521, + "grad_norm": 0.25390625, + "learning_rate": 4.76933782347357e-05, + "loss": 0.0173, + "mean_token_accuracy": 0.994269210100174, + "num_tokens": 106194157.0, + "step": 33920 + }, + { + "entropy": 0.058598081674426795, + "epoch": 7.9080312390721526, + "grad_norm": 1.109375, + "learning_rate": 4.7692494675133094e-05, + "loss": 0.0094, + "mean_token_accuracy": 0.9951113820075989, + "num_tokens": 106219023.0, + "step": 33925 + }, + { + "entropy": 0.07916519045829773, + "epoch": 7.909196876092785, + "grad_norm": 1.2421875, + "learning_rate": 4.769161096354506e-05, + "loss": 0.0256, + "mean_token_accuracy": 0.9923042953014374, + "num_tokens": 106239168.0, + "step": 33930 + }, + { + "entropy": 0.07624795585870743, + "epoch": 7.910362513113417, + "grad_norm": 1.796875, + "learning_rate": 4.769072709998478e-05, + "loss": 0.0191, + "mean_token_accuracy": 0.9953700006008148, + "num_tokens": 106263742.0, + "step": 33935 + }, + { + "entropy": 0.07141708973795176, + "epoch": 7.911528150134048, + "grad_norm": 1.0859375, + "learning_rate": 4.768984308446544e-05, + "loss": 0.0086, + "mean_token_accuracy": 0.9971637487411499, + "num_tokens": 106282014.0, + "step": 33940 + }, + { + "entropy": 0.07009767638519407, + "epoch": 7.91269378715468, + "grad_norm": 1.1796875, + "learning_rate": 4.7688958917000195e-05, + "loss": 0.0142, + "mean_token_accuracy": 0.9976205289363861, + "num_tokens": 106314706.0, + "step": 33945 + }, + { + "entropy": 0.05547423539683223, + "epoch": 7.913859424175312, + "grad_norm": 0.84765625, + "learning_rate": 4.768807459760226e-05, + "loss": 0.0142, + "mean_token_accuracy": 0.9966548085212708, + "num_tokens": 106356405.0, + "step": 33950 + }, + { + "entropy": 0.14163271840661765, + "epoch": 7.915025061195943, + "grad_norm": 2.71875, + "learning_rate": 4.768719012628481e-05, + "loss": 0.1602, + "mean_token_accuracy": 0.9762783288955689, + "num_tokens": 106383544.0, + "step": 33955 + }, + { + "entropy": 0.07611956689506769, + "epoch": 7.9161906982165755, + "grad_norm": 3.171875, + "learning_rate": 4.7686305503061024e-05, + "loss": 0.0154, + "mean_token_accuracy": 0.9944826066493988, + "num_tokens": 106400123.0, + "step": 33960 + }, + { + "entropy": 0.05730560040101409, + "epoch": 7.917356335237207, + "grad_norm": 0.353515625, + "learning_rate": 4.768542072794411e-05, + "loss": 0.0079, + "mean_token_accuracy": 0.9983284890651702, + "num_tokens": 106416830.0, + "step": 33965 + }, + { + "entropy": 0.04461151892319322, + "epoch": 7.918521972257839, + "grad_norm": 0.609375, + "learning_rate": 4.768453580094724e-05, + "loss": 0.0091, + "mean_token_accuracy": 0.9960005044937134, + "num_tokens": 106445076.0, + "step": 33970 + }, + { + "entropy": 0.07526362799108029, + "epoch": 7.919687609278471, + "grad_norm": 1.2734375, + "learning_rate": 4.7683650722083636e-05, + "loss": 0.0117, + "mean_token_accuracy": 0.9961690485477448, + "num_tokens": 106462778.0, + "step": 33975 + }, + { + "entropy": 0.06253966316580772, + "epoch": 7.920853246299102, + "grad_norm": 0.447265625, + "learning_rate": 4.7682765491366484e-05, + "loss": 0.0131, + "mean_token_accuracy": 0.9950137913227082, + "num_tokens": 106496519.0, + "step": 33980 + }, + { + "entropy": 0.05731949470937252, + "epoch": 7.922018883319734, + "grad_norm": 0.267578125, + "learning_rate": 4.768188010880897e-05, + "loss": 0.018, + "mean_token_accuracy": 0.9952464640140534, + "num_tokens": 106523851.0, + "step": 33985 + }, + { + "entropy": 0.0510101712308824, + "epoch": 7.923184520340366, + "grad_norm": 0.341796875, + "learning_rate": 4.768099457442432e-05, + "loss": 0.0107, + "mean_token_accuracy": 0.996779203414917, + "num_tokens": 106546787.0, + "step": 33990 + }, + { + "entropy": 0.0640136405825615, + "epoch": 7.924350157360998, + "grad_norm": 0.462890625, + "learning_rate": 4.7680108888225724e-05, + "loss": 0.0175, + "mean_token_accuracy": 0.9934268534183502, + "num_tokens": 106558030.0, + "step": 33995 + }, + { + "entropy": 0.06371219847351313, + "epoch": 7.92551579438163, + "grad_norm": 1.890625, + "learning_rate": 4.7679223050226396e-05, + "loss": 0.0104, + "mean_token_accuracy": 0.9967857897281647, + "num_tokens": 106575073.0, + "step": 34000 + }, + { + "entropy": 0.06602727882564068, + "epoch": 7.926681431402262, + "grad_norm": 1.2734375, + "learning_rate": 4.7678337060439536e-05, + "loss": 0.0159, + "mean_token_accuracy": 0.9948717474937439, + "num_tokens": 106587825.0, + "step": 34005 + }, + { + "entropy": 0.058292522095143795, + "epoch": 7.927847068422893, + "grad_norm": 1.4140625, + "learning_rate": 4.767745091887837e-05, + "loss": 0.0204, + "mean_token_accuracy": 0.9944105803966522, + "num_tokens": 106617867.0, + "step": 34010 + }, + { + "entropy": 0.07741288328543305, + "epoch": 7.929012705443525, + "grad_norm": 1.921875, + "learning_rate": 4.76765646255561e-05, + "loss": 0.019, + "mean_token_accuracy": 0.9938458442687989, + "num_tokens": 106635957.0, + "step": 34015 + }, + { + "entropy": 0.06108499057590962, + "epoch": 7.930178342464156, + "grad_norm": 1.8984375, + "learning_rate": 4.767567818048594e-05, + "loss": 0.015, + "mean_token_accuracy": 0.9948182106018066, + "num_tokens": 106653606.0, + "step": 34020 + }, + { + "entropy": 0.07961751464754344, + "epoch": 7.931343979484788, + "grad_norm": 0.416015625, + "learning_rate": 4.7674791583681115e-05, + "loss": 0.0084, + "mean_token_accuracy": 0.9968226969242096, + "num_tokens": 106676928.0, + "step": 34025 + }, + { + "entropy": 0.08433617986738681, + "epoch": 7.9325096165054205, + "grad_norm": 0.91015625, + "learning_rate": 4.767390483515485e-05, + "loss": 0.0266, + "mean_token_accuracy": 0.9931154727935791, + "num_tokens": 106687073.0, + "step": 34030 + }, + { + "entropy": 0.05512349987402558, + "epoch": 7.933675253526052, + "grad_norm": 0.5234375, + "learning_rate": 4.767301793492035e-05, + "loss": 0.0156, + "mean_token_accuracy": 0.9950350284576416, + "num_tokens": 106707843.0, + "step": 34035 + }, + { + "entropy": 0.053560327272862196, + "epoch": 7.934840890546684, + "grad_norm": 1.859375, + "learning_rate": 4.767213088299086e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9937402963638305, + "num_tokens": 106739062.0, + "step": 34040 + }, + { + "entropy": 0.06507365573197603, + "epoch": 7.936006527567315, + "grad_norm": 1.3125, + "learning_rate": 4.76712436793796e-05, + "loss": 0.0093, + "mean_token_accuracy": 0.9949987173080445, + "num_tokens": 106753445.0, + "step": 34045 + }, + { + "entropy": 0.05009383130818605, + "epoch": 7.937172164587947, + "grad_norm": 1.0390625, + "learning_rate": 4.767035632409979e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.9968866109848022, + "num_tokens": 106771659.0, + "step": 34050 + }, + { + "entropy": 0.06271151565015316, + "epoch": 7.938337801608579, + "grad_norm": 0.875, + "learning_rate": 4.7669468817164676e-05, + "loss": 0.0102, + "mean_token_accuracy": 0.9934817552566528, + "num_tokens": 106784329.0, + "step": 34055 + }, + { + "entropy": 0.06390997301787138, + "epoch": 7.9395034386292105, + "grad_norm": 0.92578125, + "learning_rate": 4.7668581158587486e-05, + "loss": 0.0174, + "mean_token_accuracy": 0.9963392674922943, + "num_tokens": 106807272.0, + "step": 34060 + }, + { + "entropy": 0.05805385177955032, + "epoch": 7.940669075649843, + "grad_norm": 2.375, + "learning_rate": 4.766769334838146e-05, + "loss": 0.0144, + "mean_token_accuracy": 0.9943688631057739, + "num_tokens": 106835599.0, + "step": 34065 + }, + { + "entropy": 0.06601453013718128, + "epoch": 7.941834712670475, + "grad_norm": 1.0859375, + "learning_rate": 4.7666805386559826e-05, + "loss": 0.0137, + "mean_token_accuracy": 0.9940655350685119, + "num_tokens": 106856149.0, + "step": 34070 + }, + { + "entropy": 0.062201010249555114, + "epoch": 7.943000349691106, + "grad_norm": 0.3125, + "learning_rate": 4.766591727313584e-05, + "loss": 0.0073, + "mean_token_accuracy": 0.9955526292324066, + "num_tokens": 106890142.0, + "step": 34075 + }, + { + "entropy": 0.06021309243515134, + "epoch": 7.944165986711738, + "grad_norm": 0.2431640625, + "learning_rate": 4.7665029008122725e-05, + "loss": 0.0243, + "mean_token_accuracy": 0.994538813829422, + "num_tokens": 106916598.0, + "step": 34080 + }, + { + "entropy": 0.06776752769947052, + "epoch": 7.94533162373237, + "grad_norm": 0.94140625, + "learning_rate": 4.7664140591533745e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9975357413291931, + "num_tokens": 106937426.0, + "step": 34085 + }, + { + "entropy": 0.06775364875793458, + "epoch": 7.946497260753001, + "grad_norm": 2.15625, + "learning_rate": 4.7663252023382145e-05, + "loss": 0.0163, + "mean_token_accuracy": 0.9948492228984833, + "num_tokens": 106948882.0, + "step": 34090 + }, + { + "entropy": 0.07773761413991451, + "epoch": 7.947662897773633, + "grad_norm": 1.8671875, + "learning_rate": 4.766236330368117e-05, + "loss": 0.0231, + "mean_token_accuracy": 0.991237860918045, + "num_tokens": 106963377.0, + "step": 34095 + }, + { + "entropy": 0.06178758256137371, + "epoch": 7.948828534794265, + "grad_norm": 0.87109375, + "learning_rate": 4.7661474432444065e-05, + "loss": 0.015, + "mean_token_accuracy": 0.9954757153987884, + "num_tokens": 106979240.0, + "step": 34100 + }, + { + "entropy": 0.13852319195866586, + "epoch": 7.949994171814897, + "grad_norm": 3.5, + "learning_rate": 4.76605854096841e-05, + "loss": 0.1877, + "mean_token_accuracy": 0.9742585599422455, + "num_tokens": 107002385.0, + "step": 34105 + }, + { + "entropy": 0.08786424957215785, + "epoch": 7.951159808835529, + "grad_norm": 2.59375, + "learning_rate": 4.765969623541452e-05, + "loss": 0.0205, + "mean_token_accuracy": 0.9931396842002869, + "num_tokens": 107011802.0, + "step": 34110 + }, + { + "entropy": 0.06776084508746863, + "epoch": 7.95232544585616, + "grad_norm": 1.125, + "learning_rate": 4.765880690964859e-05, + "loss": 0.0206, + "mean_token_accuracy": 0.9957813143730163, + "num_tokens": 107028043.0, + "step": 34115 + }, + { + "entropy": 0.04915742976590991, + "epoch": 7.953491082876792, + "grad_norm": 0.3671875, + "learning_rate": 4.765791743239957e-05, + "loss": 0.0126, + "mean_token_accuracy": 0.9925608456134796, + "num_tokens": 107059800.0, + "step": 34120 + }, + { + "entropy": 0.056366760190576314, + "epoch": 7.954656719897424, + "grad_norm": 0.46484375, + "learning_rate": 4.765702780368072e-05, + "loss": 0.0092, + "mean_token_accuracy": 0.9966596603393555, + "num_tokens": 107083510.0, + "step": 34125 + }, + { + "entropy": 0.0461616107262671, + "epoch": 7.9558223569180555, + "grad_norm": 0.35546875, + "learning_rate": 4.765613802350532e-05, + "loss": 0.0076, + "mean_token_accuracy": 0.9946865618228913, + "num_tokens": 107123417.0, + "step": 34130 + }, + { + "entropy": 0.07643718775361777, + "epoch": 7.956987993938688, + "grad_norm": 2.640625, + "learning_rate": 4.765524809188661e-05, + "loss": 0.0243, + "mean_token_accuracy": 0.9927464604377747, + "num_tokens": 107142894.0, + "step": 34135 + }, + { + "entropy": 0.06084223799407482, + "epoch": 7.95815363095932, + "grad_norm": 1.0078125, + "learning_rate": 4.765435800883788e-05, + "loss": 0.0115, + "mean_token_accuracy": 0.996390950679779, + "num_tokens": 107163073.0, + "step": 34140 + }, + { + "entropy": 0.0730635855346918, + "epoch": 7.959319267979951, + "grad_norm": 4.9375, + "learning_rate": 4.7653467774372405e-05, + "loss": 0.0116, + "mean_token_accuracy": 0.9954229176044465, + "num_tokens": 107174723.0, + "step": 34145 + }, + { + "entropy": 0.05009271074086428, + "epoch": 7.960484905000583, + "grad_norm": 0.79296875, + "learning_rate": 4.765257738850345e-05, + "loss": 0.0155, + "mean_token_accuracy": 0.9943463981151581, + "num_tokens": 107200870.0, + "step": 34150 + }, + { + "entropy": 0.0630310207605362, + "epoch": 7.961650542021214, + "grad_norm": 1.2265625, + "learning_rate": 4.76516868512443e-05, + "loss": 0.0227, + "mean_token_accuracy": 0.9922509610652923, + "num_tokens": 107216538.0, + "step": 34155 + }, + { + "entropy": 0.05934207225218415, + "epoch": 7.962816179041846, + "grad_norm": 0.287109375, + "learning_rate": 4.7650796162608225e-05, + "loss": 0.0066, + "mean_token_accuracy": 0.9973390638828278, + "num_tokens": 107244740.0, + "step": 34160 + }, + { + "entropy": 0.06261946465820074, + "epoch": 7.9639818160624785, + "grad_norm": 0.71875, + "learning_rate": 4.764990532260851e-05, + "loss": 0.0111, + "mean_token_accuracy": 0.9966804802417755, + "num_tokens": 107268063.0, + "step": 34165 + }, + { + "entropy": 0.05479574520140886, + "epoch": 7.96514745308311, + "grad_norm": 3.34375, + "learning_rate": 4.7649014331258454e-05, + "loss": 0.0201, + "mean_token_accuracy": 0.994753235578537, + "num_tokens": 107284612.0, + "step": 34170 + }, + { + "entropy": 0.062203343212604525, + "epoch": 7.966313090103742, + "grad_norm": 0.400390625, + "learning_rate": 4.764812318857131e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9971283197402954, + "num_tokens": 107302774.0, + "step": 34175 + }, + { + "entropy": 0.10029302248731256, + "epoch": 7.967478727124373, + "grad_norm": 0.59765625, + "learning_rate": 4.7647231894560405e-05, + "loss": 0.0059, + "mean_token_accuracy": 0.998160594701767, + "num_tokens": 107341090.0, + "step": 34180 + }, + { + "entropy": 0.08249510303139687, + "epoch": 7.968644364145005, + "grad_norm": 4.1875, + "learning_rate": 4.7646340449238995e-05, + "loss": 0.024, + "mean_token_accuracy": 0.9922098696231842, + "num_tokens": 107350811.0, + "step": 34185 + }, + { + "entropy": 0.05340167321264744, + "epoch": 7.969810001165637, + "grad_norm": 0.42578125, + "learning_rate": 4.76454488526204e-05, + "loss": 0.0111, + "mean_token_accuracy": 0.9972420930862427, + "num_tokens": 107381142.0, + "step": 34190 + }, + { + "entropy": 0.056400989554822444, + "epoch": 7.970975638186268, + "grad_norm": 0.474609375, + "learning_rate": 4.7644557104717894e-05, + "loss": 0.0251, + "mean_token_accuracy": 0.994273841381073, + "num_tokens": 107403837.0, + "step": 34195 + }, + { + "entropy": 0.05342116076499224, + "epoch": 7.9721412752069005, + "grad_norm": 0.390625, + "learning_rate": 4.7643665205544786e-05, + "loss": 0.0067, + "mean_token_accuracy": 0.9962201118469238, + "num_tokens": 107430524.0, + "step": 34200 + }, + { + "entropy": 0.0673133933916688, + "epoch": 7.973306912227533, + "grad_norm": 0.53515625, + "learning_rate": 4.764277315511437e-05, + "loss": 0.0095, + "mean_token_accuracy": 0.9935947299003601, + "num_tokens": 107448914.0, + "step": 34205 + }, + { + "entropy": 0.06266302475705743, + "epoch": 7.974472549248164, + "grad_norm": 1.4921875, + "learning_rate": 4.764188095343996e-05, + "loss": 0.0123, + "mean_token_accuracy": 0.9956326484680176, + "num_tokens": 107467145.0, + "step": 34210 + }, + { + "entropy": 0.056773718632757667, + "epoch": 7.975638186268796, + "grad_norm": 0.79296875, + "learning_rate": 4.7640988600534834e-05, + "loss": 0.0135, + "mean_token_accuracy": 0.9961419343948364, + "num_tokens": 107484762.0, + "step": 34215 + }, + { + "entropy": 0.05827885894104838, + "epoch": 7.976803823289428, + "grad_norm": 0.75, + "learning_rate": 4.7640096096412326e-05, + "loss": 0.0176, + "mean_token_accuracy": 0.9965371251106262, + "num_tokens": 107507967.0, + "step": 34220 + }, + { + "entropy": 0.06455299574881793, + "epoch": 7.977969460310059, + "grad_norm": 1.6484375, + "learning_rate": 4.763920344108573e-05, + "loss": 0.0129, + "mean_token_accuracy": 0.994284588098526, + "num_tokens": 107524367.0, + "step": 34225 + }, + { + "entropy": 0.05216892771422863, + "epoch": 7.979135097330691, + "grad_norm": 1.875, + "learning_rate": 4.763831063456837e-05, + "loss": 0.0094, + "mean_token_accuracy": 0.9944133341312409, + "num_tokens": 107549902.0, + "step": 34230 + }, + { + "entropy": 0.046560865826904774, + "epoch": 7.980300734351323, + "grad_norm": 1.1484375, + "learning_rate": 4.7637417676873534e-05, + "loss": 0.0112, + "mean_token_accuracy": 0.996799236536026, + "num_tokens": 107573291.0, + "step": 34235 + }, + { + "entropy": 0.07189444862306119, + "epoch": 7.981466371371955, + "grad_norm": 0.671875, + "learning_rate": 4.7636524568014564e-05, + "loss": 0.0158, + "mean_token_accuracy": 0.9959348559379577, + "num_tokens": 107583447.0, + "step": 34240 + }, + { + "entropy": 0.06508129709400237, + "epoch": 7.982632008392587, + "grad_norm": 1.8984375, + "learning_rate": 4.763563130800476e-05, + "loss": 0.0138, + "mean_token_accuracy": 0.9959682047367096, + "num_tokens": 107604534.0, + "step": 34245 + }, + { + "entropy": 0.04465307705104351, + "epoch": 7.983797645413218, + "grad_norm": 0.35546875, + "learning_rate": 4.763473789685746e-05, + "loss": 0.0142, + "mean_token_accuracy": 0.9941910564899444, + "num_tokens": 107624770.0, + "step": 34250 + }, + { + "entropy": 0.08091672882437706, + "epoch": 7.98496328243385, + "grad_norm": 0.51171875, + "learning_rate": 4.763384433458596e-05, + "loss": 0.0211, + "mean_token_accuracy": 0.9940769791603088, + "num_tokens": 107634611.0, + "step": 34255 + }, + { + "entropy": 0.05613311324268579, + "epoch": 7.986128919454482, + "grad_norm": 1.5, + "learning_rate": 4.763295062120361e-05, + "loss": 0.0139, + "mean_token_accuracy": 0.9961461901664734, + "num_tokens": 107650710.0, + "step": 34260 + }, + { + "entropy": 0.049251684360206126, + "epoch": 7.987294556475113, + "grad_norm": 0.419921875, + "learning_rate": 4.763205675672372e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.9977784931659699, + "num_tokens": 107677762.0, + "step": 34265 + }, + { + "entropy": 0.06999004650861025, + "epoch": 7.9884601934957455, + "grad_norm": 2.484375, + "learning_rate": 4.763116274115963e-05, + "loss": 0.021, + "mean_token_accuracy": 0.9948745965957642, + "num_tokens": 107690370.0, + "step": 34270 + }, + { + "entropy": 0.07125404626131057, + "epoch": 7.989625830516378, + "grad_norm": 0.2353515625, + "learning_rate": 4.7630268574524656e-05, + "loss": 0.0099, + "mean_token_accuracy": 0.9984432697296143, + "num_tokens": 107704023.0, + "step": 34275 + }, + { + "entropy": 0.08081364408135414, + "epoch": 7.990791467537009, + "grad_norm": 2.046875, + "learning_rate": 4.762937425683215e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9953217029571533, + "num_tokens": 107720117.0, + "step": 34280 + }, + { + "entropy": 0.0741135323420167, + "epoch": 7.991957104557641, + "grad_norm": 1.6640625, + "learning_rate": 4.762847978809543e-05, + "loss": 0.0134, + "mean_token_accuracy": 0.9946883201599122, + "num_tokens": 107730835.0, + "step": 34285 + }, + { + "entropy": 0.07226054593920708, + "epoch": 7.993122741578272, + "grad_norm": 1.2265625, + "learning_rate": 4.7627585168327846e-05, + "loss": 0.0169, + "mean_token_accuracy": 0.9945705354213714, + "num_tokens": 107760637.0, + "step": 34290 + }, + { + "entropy": 0.0748120654374361, + "epoch": 7.994288378598904, + "grad_norm": 2.1875, + "learning_rate": 4.762669039754273e-05, + "loss": 0.0241, + "mean_token_accuracy": 0.9938279986381531, + "num_tokens": 107770993.0, + "step": 34295 + }, + { + "entropy": 0.0834178514778614, + "epoch": 7.995454015619536, + "grad_norm": 1.9375, + "learning_rate": 4.7625795475753436e-05, + "loss": 0.054, + "mean_token_accuracy": 0.9899437367916107, + "num_tokens": 107796597.0, + "step": 34300 + }, + { + "entropy": 0.0649189880117774, + "epoch": 7.996619652640168, + "grad_norm": 2.03125, + "learning_rate": 4.76249004029733e-05, + "loss": 0.0144, + "mean_token_accuracy": 0.9953900396823883, + "num_tokens": 107818585.0, + "step": 34305 + }, + { + "entropy": 0.06638593953102827, + "epoch": 7.9977852896608, + "grad_norm": 1.453125, + "learning_rate": 4.762400517921567e-05, + "loss": 0.0159, + "mean_token_accuracy": 0.9956220924854279, + "num_tokens": 107840086.0, + "step": 34310 + }, + { + "entropy": 0.0659774586558342, + "epoch": 7.998950926681431, + "grad_norm": 0.40625, + "learning_rate": 4.7623109804493895e-05, + "loss": 0.0168, + "mean_token_accuracy": 0.9956194818019867, + "num_tokens": 107861379.0, + "step": 34315 + }, + { + "entropy": 0.11034517497238186, + "epoch": 8.0, + "grad_norm": 0.28515625, + "learning_rate": 4.762221427882132e-05, + "loss": 0.0774, + "mean_token_accuracy": 0.9823514488008287, + "num_tokens": 107884500.0, + "step": 34320 + }, + { + "entropy": 0.0570672795176506, + "epoch": 8.001165637020632, + "grad_norm": 0.1298828125, + "learning_rate": 4.7621318602211315e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9987614572048187, + "num_tokens": 107910348.0, + "step": 34325 + }, + { + "entropy": 0.056932621914893386, + "epoch": 8.002331274041264, + "grad_norm": 0.4609375, + "learning_rate": 4.762042277467723e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9991077303886413, + "num_tokens": 107928390.0, + "step": 34330 + }, + { + "entropy": 0.06185147017240524, + "epoch": 8.003496911061895, + "grad_norm": 0.796875, + "learning_rate": 4.761952679623241e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.9989405035972595, + "num_tokens": 107945191.0, + "step": 34335 + }, + { + "entropy": 0.05853238175623119, + "epoch": 8.004662548082527, + "grad_norm": 0.38671875, + "learning_rate": 4.761863066689022e-05, + "loss": 0.0071, + "mean_token_accuracy": 0.9990803480148316, + "num_tokens": 107965503.0, + "step": 34340 + }, + { + "entropy": 0.07207233114168048, + "epoch": 8.005828185103159, + "grad_norm": 0.4765625, + "learning_rate": 4.761773438666404e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9984596371650696, + "num_tokens": 107983227.0, + "step": 34345 + }, + { + "entropy": 0.07162175551056862, + "epoch": 8.00699382212379, + "grad_norm": 1.3125, + "learning_rate": 4.761683795556722e-05, + "loss": 0.0103, + "mean_token_accuracy": 0.9979361593723297, + "num_tokens": 108005127.0, + "step": 34350 + }, + { + "entropy": 0.0725046245381236, + "epoch": 8.008159459144423, + "grad_norm": 0.267578125, + "learning_rate": 4.761594137361312e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9999236643314362, + "num_tokens": 108021479.0, + "step": 34355 + }, + { + "entropy": 0.04563398342579603, + "epoch": 8.009325096165055, + "grad_norm": 0.236328125, + "learning_rate": 4.7615044640815123e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9993497133255005, + "num_tokens": 108040627.0, + "step": 34360 + }, + { + "entropy": 0.047726039029657844, + "epoch": 8.010490733185685, + "grad_norm": 0.349609375, + "learning_rate": 4.76141477571866e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9991644740104675, + "num_tokens": 108073823.0, + "step": 34365 + }, + { + "entropy": 0.05970633877441287, + "epoch": 8.011656370206317, + "grad_norm": 0.8984375, + "learning_rate": 4.761325072274092e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9993616282939911, + "num_tokens": 108093022.0, + "step": 34370 + }, + { + "entropy": 0.049816728476434945, + "epoch": 8.01282200722695, + "grad_norm": 1.2578125, + "learning_rate": 4.761235353749146e-05, + "loss": 0.0056, + "mean_token_accuracy": 0.9986774921417236, + "num_tokens": 108119311.0, + "step": 34375 + }, + { + "entropy": 0.05257010804489255, + "epoch": 8.013987644247582, + "grad_norm": 0.26953125, + "learning_rate": 4.7611456201451604e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.9974354028701782, + "num_tokens": 108141035.0, + "step": 34380 + }, + { + "entropy": 0.06397520806640386, + "epoch": 8.015153281268214, + "grad_norm": 0.72265625, + "learning_rate": 4.7610558714634724e-05, + "loss": 0.0022, + "mean_token_accuracy": 1.0, + "num_tokens": 108155271.0, + "step": 34385 + }, + { + "entropy": 0.05282603679224849, + "epoch": 8.016318918288844, + "grad_norm": 0.1328125, + "learning_rate": 4.76096610770542e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9998047232627869, + "num_tokens": 108186139.0, + "step": 34390 + }, + { + "entropy": 0.05000446168705821, + "epoch": 8.017484555309476, + "grad_norm": 0.09423828125, + "learning_rate": 4.7608763288723434e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9998030006885529, + "num_tokens": 108205726.0, + "step": 34395 + }, + { + "entropy": 0.057174905110150574, + "epoch": 8.018650192330108, + "grad_norm": 0.201171875, + "learning_rate": 4.76078653496558e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9996845364570618, + "num_tokens": 108226331.0, + "step": 34400 + }, + { + "entropy": 0.06045698598027229, + "epoch": 8.01981582935074, + "grad_norm": 1.71875, + "learning_rate": 4.760696725986469e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9985334575176239, + "num_tokens": 108245610.0, + "step": 34405 + }, + { + "entropy": 0.042647901270538566, + "epoch": 8.020981466371373, + "grad_norm": 0.0546875, + "learning_rate": 4.760606901936349e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.999316680431366, + "num_tokens": 108277590.0, + "step": 34410 + }, + { + "entropy": 0.06458899248391389, + "epoch": 8.022147103392003, + "grad_norm": 1.9296875, + "learning_rate": 4.7605170628165604e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9993538677692413, + "num_tokens": 108304890.0, + "step": 34415 + }, + { + "entropy": 0.06744192838668824, + "epoch": 8.023312740412635, + "grad_norm": 0.1630859375, + "learning_rate": 4.760427208628443e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9989646315574646, + "num_tokens": 108323253.0, + "step": 34420 + }, + { + "entropy": 0.05032515674829483, + "epoch": 8.024478377433267, + "grad_norm": 0.453125, + "learning_rate": 4.760337339373336e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9982992768287658, + "num_tokens": 108358355.0, + "step": 34425 + }, + { + "entropy": 0.062434613704681396, + "epoch": 8.0256440144539, + "grad_norm": 0.5234375, + "learning_rate": 4.76024745505258e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9996152639389038, + "num_tokens": 108382542.0, + "step": 34430 + }, + { + "entropy": 0.05185540029779077, + "epoch": 8.026809651474531, + "grad_norm": 0.490234375, + "learning_rate": 4.7601575556675135e-05, + "loss": 0.0058, + "mean_token_accuracy": 0.9983523845672607, + "num_tokens": 108422836.0, + "step": 34435 + }, + { + "entropy": 0.07135622762143612, + "epoch": 8.027975288495163, + "grad_norm": 0.5, + "learning_rate": 4.76006764121948e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9981033325195312, + "num_tokens": 108436442.0, + "step": 34440 + }, + { + "entropy": 0.06478116279467941, + "epoch": 8.029140925515794, + "grad_norm": 0.064453125, + "learning_rate": 4.759977711709818e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9992549657821655, + "num_tokens": 108462678.0, + "step": 34445 + }, + { + "entropy": 0.049783976096659896, + "epoch": 8.030306562536426, + "grad_norm": 0.11962890625, + "learning_rate": 4.759887767139869e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9983406960964203, + "num_tokens": 108484473.0, + "step": 34450 + }, + { + "entropy": 0.07408700212836265, + "epoch": 8.031472199557058, + "grad_norm": 3.125, + "learning_rate": 4.759797807510975e-05, + "loss": 0.0135, + "mean_token_accuracy": 0.9985843777656556, + "num_tokens": 108492586.0, + "step": 34455 + }, + { + "entropy": 0.1893187090754509, + "epoch": 8.03263783657769, + "grad_norm": 0.1494140625, + "learning_rate": 4.759707832824477e-05, + "loss": 0.3292, + "mean_token_accuracy": 0.9610701024532318, + "num_tokens": 108528181.0, + "step": 34460 + }, + { + "entropy": 0.04200795106589794, + "epoch": 8.033803473598322, + "grad_norm": 0.2490234375, + "learning_rate": 4.7596178430817156e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9991814613342285, + "num_tokens": 108550517.0, + "step": 34465 + }, + { + "entropy": 0.06480789603665471, + "epoch": 8.034969110618952, + "grad_norm": 0.2001953125, + "learning_rate": 4.759527838284035e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9986863672733307, + "num_tokens": 108564419.0, + "step": 34470 + }, + { + "entropy": 0.07080358192324639, + "epoch": 8.036134747639585, + "grad_norm": 0.0517578125, + "learning_rate": 4.7594378184327745e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.9989554643630981, + "num_tokens": 108576162.0, + "step": 34475 + }, + { + "entropy": 0.051558745186775924, + "epoch": 8.037300384660217, + "grad_norm": 0.234375, + "learning_rate": 4.759347783529279e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.999577134847641, + "num_tokens": 108605987.0, + "step": 34480 + }, + { + "entropy": 0.05720545100048184, + "epoch": 8.038466021680849, + "grad_norm": 1.109375, + "learning_rate": 4.759257733574889e-05, + "loss": 0.0082, + "mean_token_accuracy": 0.9988365888595581, + "num_tokens": 108625363.0, + "step": 34485 + }, + { + "entropy": 0.0536104841157794, + "epoch": 8.03963165870148, + "grad_norm": 0.466796875, + "learning_rate": 4.7591676685709486e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9984508991241455, + "num_tokens": 108659582.0, + "step": 34490 + }, + { + "entropy": 0.06404474405571818, + "epoch": 8.040797295722113, + "grad_norm": 1.0546875, + "learning_rate": 4.7590775885188e-05, + "loss": 0.0071, + "mean_token_accuracy": 0.9984810829162598, + "num_tokens": 108676935.0, + "step": 34495 + }, + { + "entropy": 0.0648849243298173, + "epoch": 8.041962932742743, + "grad_norm": 0.279296875, + "learning_rate": 4.758987493419787e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.998368215560913, + "num_tokens": 108699414.0, + "step": 34500 + }, + { + "entropy": 0.05447390619665384, + "epoch": 8.043128569763375, + "grad_norm": 0.26171875, + "learning_rate": 4.758897383275253e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.9990447640419007, + "num_tokens": 108717800.0, + "step": 34505 + }, + { + "entropy": 0.04812881154939532, + "epoch": 8.044294206784008, + "grad_norm": 0.28515625, + "learning_rate": 4.7588072580865416e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9990270733833313, + "num_tokens": 108737812.0, + "step": 34510 + }, + { + "entropy": 0.030604275315999983, + "epoch": 8.04545984380464, + "grad_norm": 0.171875, + "learning_rate": 4.758717117854997e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9998698234558105, + "num_tokens": 108779109.0, + "step": 34515 + }, + { + "entropy": 0.06884174533188343, + "epoch": 8.046625480825272, + "grad_norm": 0.76171875, + "learning_rate": 4.7586269625819624e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9994652390480041, + "num_tokens": 108790310.0, + "step": 34520 + }, + { + "entropy": 0.06846872791647911, + "epoch": 8.047791117845902, + "grad_norm": 0.455078125, + "learning_rate": 4.7585367922687826e-05, + "loss": 0.0096, + "mean_token_accuracy": 0.996958750486374, + "num_tokens": 108808837.0, + "step": 34525 + }, + { + "entropy": 0.07946307212114334, + "epoch": 8.048956754866534, + "grad_norm": 0.423828125, + "learning_rate": 4.758446606916803e-05, + "loss": 0.0374, + "mean_token_accuracy": 0.9925918638706207, + "num_tokens": 108828868.0, + "step": 34530 + }, + { + "entropy": 0.0731971831060946, + "epoch": 8.050122391887166, + "grad_norm": 0.4609375, + "learning_rate": 4.758356406527367e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9987062692642212, + "num_tokens": 108845014.0, + "step": 34535 + }, + { + "entropy": 0.06874092482030392, + "epoch": 8.051288028907798, + "grad_norm": 0.9609375, + "learning_rate": 4.758266191101821e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9991285920143127, + "num_tokens": 108875202.0, + "step": 34540 + }, + { + "entropy": 0.07375312838703393, + "epoch": 8.05245366592843, + "grad_norm": 0.10546875, + "learning_rate": 4.7581759606415085e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9993589997291565, + "num_tokens": 108887896.0, + "step": 34545 + }, + { + "entropy": 0.06638930989429355, + "epoch": 8.05361930294906, + "grad_norm": 0.1943359375, + "learning_rate": 4.7580857151477775e-05, + "loss": 0.0069, + "mean_token_accuracy": 0.9990043401718139, + "num_tokens": 108906837.0, + "step": 34550 + }, + { + "entropy": 0.05511264381930232, + "epoch": 8.054784939969693, + "grad_norm": 0.1669921875, + "learning_rate": 4.7579954546219715e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9984936416149139, + "num_tokens": 108935559.0, + "step": 34555 + }, + { + "entropy": 0.08696108423173428, + "epoch": 8.055950576990325, + "grad_norm": 1.2890625, + "learning_rate": 4.757905179065437e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.9991724133491516, + "num_tokens": 108946896.0, + "step": 34560 + }, + { + "entropy": 0.06002172166481614, + "epoch": 8.057116214010957, + "grad_norm": 1.4765625, + "learning_rate": 4.75781488847952e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9990639746189117, + "num_tokens": 108973470.0, + "step": 34565 + }, + { + "entropy": 0.06480908133089543, + "epoch": 8.05828185103159, + "grad_norm": 0.21875, + "learning_rate": 4.757724582865568e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.999623441696167, + "num_tokens": 108994182.0, + "step": 34570 + }, + { + "entropy": 0.18375195730477573, + "epoch": 8.059447488052221, + "grad_norm": 4.6875, + "learning_rate": 4.7576342622249263e-05, + "loss": 0.2482, + "mean_token_accuracy": 0.9745356976985932, + "num_tokens": 109016581.0, + "step": 34575 + }, + { + "entropy": 0.049759179912507535, + "epoch": 8.060613125072852, + "grad_norm": 0.06591796875, + "learning_rate": 4.757543926558943e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9999882936477661, + "num_tokens": 109042756.0, + "step": 34580 + }, + { + "entropy": 0.06753350887447596, + "epoch": 8.061778762093484, + "grad_norm": 0.25390625, + "learning_rate": 4.757453575868963e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9990081429481507, + "num_tokens": 109061726.0, + "step": 34585 + }, + { + "entropy": 0.053464246727526185, + "epoch": 8.062944399114116, + "grad_norm": 0.240234375, + "learning_rate": 4.757363210156336e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9998391628265381, + "num_tokens": 109097127.0, + "step": 34590 + }, + { + "entropy": 0.061327016353607176, + "epoch": 8.064110036134748, + "grad_norm": 0.58203125, + "learning_rate": 4.757272829422407e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9997431099414825, + "num_tokens": 109122097.0, + "step": 34595 + }, + { + "entropy": 0.05932259801775217, + "epoch": 8.06527567315538, + "grad_norm": 2.328125, + "learning_rate": 4.7571824336685266e-05, + "loss": 0.0085, + "mean_token_accuracy": 0.9978394627571106, + "num_tokens": 109135503.0, + "step": 34600 + }, + { + "entropy": 0.05971184335649014, + "epoch": 8.06644131017601, + "grad_norm": 1.1015625, + "learning_rate": 4.75709202289604e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.999554580450058, + "num_tokens": 109160791.0, + "step": 34605 + }, + { + "entropy": 0.14114007484167815, + "epoch": 8.067606947196643, + "grad_norm": 3.96875, + "learning_rate": 4.757001597106297e-05, + "loss": 0.1574, + "mean_token_accuracy": 0.978978055715561, + "num_tokens": 109184846.0, + "step": 34610 + }, + { + "entropy": 0.0502942712046206, + "epoch": 8.068772584217275, + "grad_norm": 1.6484375, + "learning_rate": 4.7569111563006464e-05, + "loss": 0.0055, + "mean_token_accuracy": 0.9975858092308044, + "num_tokens": 109208910.0, + "step": 34615 + }, + { + "entropy": 0.06697574276477099, + "epoch": 8.069938221237907, + "grad_norm": 1.03125, + "learning_rate": 4.756820700480435e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.997665673494339, + "num_tokens": 109232222.0, + "step": 34620 + }, + { + "entropy": 0.05864384537562728, + "epoch": 8.071103858258539, + "grad_norm": 0.173828125, + "learning_rate": 4.756730229647014e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9985059797763824, + "num_tokens": 109261238.0, + "step": 34625 + }, + { + "entropy": 0.05310507658869028, + "epoch": 8.072269495279171, + "grad_norm": 2.8125, + "learning_rate": 4.756639743801729e-05, + "loss": 0.0096, + "mean_token_accuracy": 0.9981181919574738, + "num_tokens": 109293773.0, + "step": 34630 + }, + { + "entropy": 0.055432358756661415, + "epoch": 8.073435132299801, + "grad_norm": 0.31640625, + "learning_rate": 4.7565492429459327e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9994365215301514, + "num_tokens": 109305075.0, + "step": 34635 + }, + { + "entropy": 0.06921327225863934, + "epoch": 8.074600769320433, + "grad_norm": 0.1806640625, + "learning_rate": 4.756458727080973e-05, + "loss": 0.0364, + "mean_token_accuracy": 0.9931345105171203, + "num_tokens": 109328180.0, + "step": 34640 + }, + { + "entropy": 0.09400631617754698, + "epoch": 8.075766406341065, + "grad_norm": 0.11474609375, + "learning_rate": 4.7563681962082e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9998493969440461, + "num_tokens": 109340085.0, + "step": 34645 + }, + { + "entropy": 0.06292316848412156, + "epoch": 8.076932043361698, + "grad_norm": 0.78515625, + "learning_rate": 4.756277650328963e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9987617790699005, + "num_tokens": 109361790.0, + "step": 34650 + }, + { + "entropy": 0.04522802149876952, + "epoch": 8.07809768038233, + "grad_norm": 0.1240234375, + "learning_rate": 4.756187089444613e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.9972607851028442, + "num_tokens": 109391813.0, + "step": 34655 + }, + { + "entropy": 0.07555988952517509, + "epoch": 8.07926331740296, + "grad_norm": 0.35546875, + "learning_rate": 4.7560965135565e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9993071317672729, + "num_tokens": 109424065.0, + "step": 34660 + }, + { + "entropy": 0.07265737438574434, + "epoch": 8.080428954423592, + "grad_norm": 2.0625, + "learning_rate": 4.756005922665975e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.998919528722763, + "num_tokens": 109435647.0, + "step": 34665 + }, + { + "entropy": 0.06973394704982638, + "epoch": 8.081594591444224, + "grad_norm": 0.5, + "learning_rate": 4.7559153167743886e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9993018686771393, + "num_tokens": 109455917.0, + "step": 34670 + }, + { + "entropy": 0.06335199475288392, + "epoch": 8.082760228464856, + "grad_norm": 0.201171875, + "learning_rate": 4.7558246958830916e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9993453681468963, + "num_tokens": 109466263.0, + "step": 34675 + }, + { + "entropy": 0.058289121463894845, + "epoch": 8.083925865485488, + "grad_norm": 1.4453125, + "learning_rate": 4.755734059993436e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9986499547958374, + "num_tokens": 109479761.0, + "step": 34680 + }, + { + "entropy": 0.06979324482381344, + "epoch": 8.085091502506119, + "grad_norm": 2.59375, + "learning_rate": 4.755643409106772e-05, + "loss": 0.0075, + "mean_token_accuracy": 0.9977883517742157, + "num_tokens": 109493751.0, + "step": 34685 + }, + { + "entropy": 0.06401132261380553, + "epoch": 8.08625713952675, + "grad_norm": 0.263671875, + "learning_rate": 4.755552743224453e-05, + "loss": 0.0091, + "mean_token_accuracy": 0.9982850253582001, + "num_tokens": 109514572.0, + "step": 34690 + }, + { + "entropy": 0.07194490609690547, + "epoch": 8.087422776547383, + "grad_norm": 0.47265625, + "learning_rate": 4.7554620623478294e-05, + "loss": 0.0208, + "mean_token_accuracy": 0.9971080601215363, + "num_tokens": 109539657.0, + "step": 34695 + }, + { + "entropy": 0.06034817099571228, + "epoch": 8.088588413568015, + "grad_norm": 0.33203125, + "learning_rate": 4.755371366478255e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9994564294815064, + "num_tokens": 109560886.0, + "step": 34700 + }, + { + "entropy": 0.05557254049926996, + "epoch": 8.089754050588647, + "grad_norm": 0.39453125, + "learning_rate": 4.755280655617081e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9980686604976654, + "num_tokens": 109591798.0, + "step": 34705 + }, + { + "entropy": 0.05787776857614517, + "epoch": 8.09091968760928, + "grad_norm": 0.439453125, + "learning_rate": 4.75518992976566e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9983800172805786, + "num_tokens": 109614304.0, + "step": 34710 + }, + { + "entropy": 0.07849300522357225, + "epoch": 8.09208532462991, + "grad_norm": 0.58203125, + "learning_rate": 4.755099188925346e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.999658054113388, + "num_tokens": 109628888.0, + "step": 34715 + }, + { + "entropy": 0.05869809268042445, + "epoch": 8.093250961650542, + "grad_norm": 0.1923828125, + "learning_rate": 4.7550084330974906e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9989194750785828, + "num_tokens": 109655835.0, + "step": 34720 + }, + { + "entropy": 0.06568187102675438, + "epoch": 8.094416598671174, + "grad_norm": 1.84375, + "learning_rate": 4.7549176622834476e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9967410624027252, + "num_tokens": 109675362.0, + "step": 34725 + }, + { + "entropy": 0.07911296645179391, + "epoch": 8.095582235691806, + "grad_norm": 0.1171875, + "learning_rate": 4.754826876484572e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.998859316110611, + "num_tokens": 109695961.0, + "step": 34730 + }, + { + "entropy": 0.05620421562343836, + "epoch": 8.096747872712438, + "grad_norm": 0.6171875, + "learning_rate": 4.754736075702216e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9992101728916168, + "num_tokens": 109727604.0, + "step": 34735 + }, + { + "entropy": 0.06297731064260007, + "epoch": 8.097913509733068, + "grad_norm": 0.47265625, + "learning_rate": 4.754645259937733e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9980818867683411, + "num_tokens": 109747309.0, + "step": 34740 + }, + { + "entropy": 0.06897974638268352, + "epoch": 8.0990791467537, + "grad_norm": 0.59375, + "learning_rate": 4.754554429192479e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9988614618778229, + "num_tokens": 109769049.0, + "step": 34745 + }, + { + "entropy": 0.056776642613112924, + "epoch": 8.100244783774333, + "grad_norm": 1.3203125, + "learning_rate": 4.754463583467808e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9992092728614808, + "num_tokens": 109792119.0, + "step": 34750 + }, + { + "entropy": 0.07602698244154453, + "epoch": 8.101410420794965, + "grad_norm": 0.33203125, + "learning_rate": 4.754372722765073e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9994484543800354, + "num_tokens": 109801617.0, + "step": 34755 + }, + { + "entropy": 0.0649228509515524, + "epoch": 8.102576057815597, + "grad_norm": 1.4765625, + "learning_rate": 4.7542818470856295e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9990556597709656, + "num_tokens": 109812394.0, + "step": 34760 + }, + { + "entropy": 0.07589486986398697, + "epoch": 8.103741694836229, + "grad_norm": 0.6015625, + "learning_rate": 4.754190956430834e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9998491406440735, + "num_tokens": 109831667.0, + "step": 34765 + }, + { + "entropy": 0.0709638643078506, + "epoch": 8.10490733185686, + "grad_norm": 0.5703125, + "learning_rate": 4.7541000508020415e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9997025489807129, + "num_tokens": 109857628.0, + "step": 34770 + }, + { + "entropy": 0.09596365876495838, + "epoch": 8.106072968877491, + "grad_norm": 0.89453125, + "learning_rate": 4.7540091302006065e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9993020176887513, + "num_tokens": 109867038.0, + "step": 34775 + }, + { + "entropy": 0.04796134922653437, + "epoch": 8.107238605898123, + "grad_norm": 0.365234375, + "learning_rate": 4.753918194627886e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.999389237165451, + "num_tokens": 109894744.0, + "step": 34780 + }, + { + "entropy": 0.0685146126896143, + "epoch": 8.108404242918756, + "grad_norm": 1.59375, + "learning_rate": 4.753827244085234e-05, + "loss": 0.0105, + "mean_token_accuracy": 0.9986864030361176, + "num_tokens": 109905112.0, + "step": 34785 + }, + { + "entropy": 0.050441629253327847, + "epoch": 8.109569879939388, + "grad_norm": 0.275390625, + "learning_rate": 4.7537362785740084e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9994386732578278, + "num_tokens": 109927501.0, + "step": 34790 + }, + { + "entropy": 0.05915358606725931, + "epoch": 8.110735516960018, + "grad_norm": 0.2177734375, + "learning_rate": 4.7536452980955656e-05, + "loss": 0.0075, + "mean_token_accuracy": 0.9971899330615998, + "num_tokens": 109956240.0, + "step": 34795 + }, + { + "entropy": 0.058942811191082, + "epoch": 8.11190115398065, + "grad_norm": 0.466796875, + "learning_rate": 4.7535543026512616e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.9983623147010803, + "num_tokens": 109971126.0, + "step": 34800 + }, + { + "entropy": 0.06020897924900055, + "epoch": 8.113066791001282, + "grad_norm": 0.263671875, + "learning_rate": 4.753463292242453e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9993130803108216, + "num_tokens": 109992921.0, + "step": 34805 + }, + { + "entropy": 0.05796087365597487, + "epoch": 8.114232428021914, + "grad_norm": 1.1875, + "learning_rate": 4.7533722668704975e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.9987258195877076, + "num_tokens": 110003456.0, + "step": 34810 + }, + { + "entropy": 0.074308517947793, + "epoch": 8.115398065042546, + "grad_norm": 1.7890625, + "learning_rate": 4.7532812265367534e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9994267642498016, + "num_tokens": 110013095.0, + "step": 34815 + }, + { + "entropy": 0.06139457672834396, + "epoch": 8.116563702063177, + "grad_norm": 0.41015625, + "learning_rate": 4.753190171242576e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.998884916305542, + "num_tokens": 110026782.0, + "step": 34820 + }, + { + "entropy": 0.05485517792403698, + "epoch": 8.117729339083809, + "grad_norm": 1.109375, + "learning_rate": 4.753099100989324e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9990606665611267, + "num_tokens": 110039842.0, + "step": 34825 + }, + { + "entropy": 0.07333013117313385, + "epoch": 8.11889497610444, + "grad_norm": 0.51171875, + "learning_rate": 4.753008015778357e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9987470507621765, + "num_tokens": 110053856.0, + "step": 34830 + }, + { + "entropy": 0.05965627012774348, + "epoch": 8.120060613125073, + "grad_norm": 0.4140625, + "learning_rate": 4.752916915611031e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9996311247348786, + "num_tokens": 110069531.0, + "step": 34835 + }, + { + "entropy": 0.04760171640664339, + "epoch": 8.121226250145705, + "grad_norm": 0.6875, + "learning_rate": 4.7528258004887064e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9989043712615967, + "num_tokens": 110088907.0, + "step": 34840 + }, + { + "entropy": 0.050096172746270894, + "epoch": 8.122391887166337, + "grad_norm": 0.380859375, + "learning_rate": 4.75273467041274e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9991250276565552, + "num_tokens": 110114696.0, + "step": 34845 + }, + { + "entropy": 0.06456055156886578, + "epoch": 8.123557524186968, + "grad_norm": 1.1015625, + "learning_rate": 4.752643525384491e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9991252899169922, + "num_tokens": 110127399.0, + "step": 34850 + }, + { + "entropy": 0.059490696713328364, + "epoch": 8.1247231612076, + "grad_norm": 1.8828125, + "learning_rate": 4.7525523654053194e-05, + "loss": 0.0116, + "mean_token_accuracy": 0.9978362441062927, + "num_tokens": 110146673.0, + "step": 34855 + }, + { + "entropy": 0.07245905827730895, + "epoch": 8.125888798228232, + "grad_norm": 0.24609375, + "learning_rate": 4.752461190476584e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9997872352600098, + "num_tokens": 110158175.0, + "step": 34860 + }, + { + "entropy": 0.0838863905519247, + "epoch": 8.127054435248864, + "grad_norm": 0.8359375, + "learning_rate": 4.752370000599644e-05, + "loss": 0.0058, + "mean_token_accuracy": 0.9980123996734619, + "num_tokens": 110167746.0, + "step": 34865 + }, + { + "entropy": 0.06200479865074158, + "epoch": 8.128220072269496, + "grad_norm": 0.28515625, + "learning_rate": 4.752278795775861e-05, + "loss": 0.006, + "mean_token_accuracy": 0.9985685169696807, + "num_tokens": 110196220.0, + "step": 34870 + }, + { + "entropy": 0.08028090875595809, + "epoch": 8.129385709290126, + "grad_norm": 0.1845703125, + "learning_rate": 4.752187576006593e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9991875410079956, + "num_tokens": 110209683.0, + "step": 34875 + }, + { + "entropy": 0.05563276670873165, + "epoch": 8.130551346310758, + "grad_norm": 3.0, + "learning_rate": 4.7520963412932004e-05, + "loss": 0.007, + "mean_token_accuracy": 0.9985905766487122, + "num_tokens": 110240404.0, + "step": 34880 + }, + { + "entropy": 0.06231335056945682, + "epoch": 8.13171698333139, + "grad_norm": 0.1875, + "learning_rate": 4.752005091637044e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9995314836502075, + "num_tokens": 110267847.0, + "step": 34885 + }, + { + "entropy": 0.054864462465047836, + "epoch": 8.132882620352023, + "grad_norm": 0.2412109375, + "learning_rate": 4.751913827039485e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9984963834285736, + "num_tokens": 110281664.0, + "step": 34890 + }, + { + "entropy": 0.056126052886247633, + "epoch": 8.134048257372655, + "grad_norm": 0.2333984375, + "learning_rate": 4.751822547501884e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9994036316871643, + "num_tokens": 110311704.0, + "step": 34895 + }, + { + "entropy": 0.053292475175112486, + "epoch": 8.135213894393287, + "grad_norm": 2.25, + "learning_rate": 4.751731253025602e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9993162214756012, + "num_tokens": 110334636.0, + "step": 34900 + }, + { + "entropy": 0.046154945995658635, + "epoch": 8.136379531413917, + "grad_norm": 0.27734375, + "learning_rate": 4.751639943612e-05, + "loss": 0.006, + "mean_token_accuracy": 0.998881858587265, + "num_tokens": 110352633.0, + "step": 34905 + }, + { + "entropy": 0.07911163456737995, + "epoch": 8.13754516843455, + "grad_norm": 1.8359375, + "learning_rate": 4.7515486192624404e-05, + "loss": 0.0116, + "mean_token_accuracy": 0.9959693968296051, + "num_tokens": 110363195.0, + "step": 34910 + }, + { + "entropy": 0.06139131411910057, + "epoch": 8.138710805455181, + "grad_norm": 1.2109375, + "learning_rate": 4.7514572799782845e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9995906531810761, + "num_tokens": 110377365.0, + "step": 34915 + }, + { + "entropy": 0.05661859530955553, + "epoch": 8.139876442475813, + "grad_norm": 0.6640625, + "learning_rate": 4.7513659257608937e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9978150308132172, + "num_tokens": 110393155.0, + "step": 34920 + }, + { + "entropy": 0.059014659747481345, + "epoch": 8.141042079496446, + "grad_norm": 2.59375, + "learning_rate": 4.7512745566116306e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9981638669967652, + "num_tokens": 110415426.0, + "step": 34925 + }, + { + "entropy": 0.05055002924054861, + "epoch": 8.142207716517076, + "grad_norm": 0.74609375, + "learning_rate": 4.751183172531858e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9986936986446381, + "num_tokens": 110440385.0, + "step": 34930 + }, + { + "entropy": 0.09240026883780957, + "epoch": 8.143373353537708, + "grad_norm": 0.9296875, + "learning_rate": 4.7510917735229395e-05, + "loss": 0.0066, + "mean_token_accuracy": 0.9980834245681762, + "num_tokens": 110448900.0, + "step": 34935 + }, + { + "entropy": 0.0694688574410975, + "epoch": 8.14453899055834, + "grad_norm": 1.25, + "learning_rate": 4.7510003595862354e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.999321436882019, + "num_tokens": 110467815.0, + "step": 34940 + }, + { + "entropy": 0.06604335568845272, + "epoch": 8.145704627578972, + "grad_norm": 0.74609375, + "learning_rate": 4.7509089307231114e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9987660408020019, + "num_tokens": 110484450.0, + "step": 34945 + }, + { + "entropy": 0.04780478095635772, + "epoch": 8.146870264599604, + "grad_norm": 0.65625, + "learning_rate": 4.75081748693493e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9983278274536133, + "num_tokens": 110505707.0, + "step": 34950 + }, + { + "entropy": 0.08801990495994687, + "epoch": 8.148035901620235, + "grad_norm": 0.21484375, + "learning_rate": 4.750726028223054e-05, + "loss": 0.0361, + "mean_token_accuracy": 0.9941394925117493, + "num_tokens": 110533523.0, + "step": 34955 + }, + { + "entropy": 0.093077028170228, + "epoch": 8.149201538640867, + "grad_norm": 0.359375, + "learning_rate": 4.7506345545888475e-05, + "loss": 0.008, + "mean_token_accuracy": 0.997499966621399, + "num_tokens": 110541514.0, + "step": 34960 + }, + { + "entropy": 0.06018462534993887, + "epoch": 8.150367175661499, + "grad_norm": 0.408203125, + "learning_rate": 4.750543066033675e-05, + "loss": 0.0071, + "mean_token_accuracy": 0.9981537163257599, + "num_tokens": 110571866.0, + "step": 34965 + }, + { + "entropy": 0.06054795626550913, + "epoch": 8.151532812682131, + "grad_norm": 0.2060546875, + "learning_rate": 4.750451562558901e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9983173251152039, + "num_tokens": 110590691.0, + "step": 34970 + }, + { + "entropy": 0.08489367999136448, + "epoch": 8.152698449702763, + "grad_norm": 0.2001953125, + "learning_rate": 4.7503600441658886e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9990243017673492, + "num_tokens": 110607698.0, + "step": 34975 + }, + { + "entropy": 0.06136819664388895, + "epoch": 8.153864086723395, + "grad_norm": 2.328125, + "learning_rate": 4.750268510856003e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9964813113212585, + "num_tokens": 110622126.0, + "step": 34980 + }, + { + "entropy": 0.054417230654507875, + "epoch": 8.155029723744025, + "grad_norm": 0.28515625, + "learning_rate": 4.750176962630611e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9996447026729584, + "num_tokens": 110655232.0, + "step": 34985 + }, + { + "entropy": 0.08106433693319559, + "epoch": 8.156195360764658, + "grad_norm": 2.03125, + "learning_rate": 4.750085399491075e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9984701454639435, + "num_tokens": 110666627.0, + "step": 34990 + }, + { + "entropy": 0.06782528571784496, + "epoch": 8.15736099778529, + "grad_norm": 0.5546875, + "learning_rate": 4.7499938214387616e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9992859661579132, + "num_tokens": 110676515.0, + "step": 34995 + }, + { + "entropy": 0.059601149149239065, + "epoch": 8.158526634805922, + "grad_norm": 0.45703125, + "learning_rate": 4.7499022284750367e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9995229601860046, + "num_tokens": 110701167.0, + "step": 35000 + }, + { + "entropy": 0.061528265848755835, + "epoch": 8.159692271826554, + "grad_norm": 0.1337890625, + "learning_rate": 4.749810620601265e-05, + "loss": 0.0076, + "mean_token_accuracy": 0.9985450446605683, + "num_tokens": 110728803.0, + "step": 35005 + }, + { + "entropy": 0.0531819636002183, + "epoch": 8.160857908847184, + "grad_norm": 0.173828125, + "learning_rate": 4.749718997818814e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9998791217803955, + "num_tokens": 110749977.0, + "step": 35010 + }, + { + "entropy": 0.06418558629229665, + "epoch": 8.162023545867816, + "grad_norm": 0.267578125, + "learning_rate": 4.749627360129048e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9985103130340576, + "num_tokens": 110773354.0, + "step": 35015 + }, + { + "entropy": 0.05865657143294811, + "epoch": 8.163189182888448, + "grad_norm": 0.326171875, + "learning_rate": 4.749535707533335e-05, + "loss": 0.0111, + "mean_token_accuracy": 0.9982908666133881, + "num_tokens": 110793997.0, + "step": 35020 + }, + { + "entropy": 0.04337018961086869, + "epoch": 8.16435481990908, + "grad_norm": 1.09375, + "learning_rate": 4.749444040033042e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.9983787119388581, + "num_tokens": 110833879.0, + "step": 35025 + }, + { + "entropy": 0.05562132876366377, + "epoch": 8.165520456929713, + "grad_norm": 0.419921875, + "learning_rate": 4.749352357629534e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9996047914028168, + "num_tokens": 110858174.0, + "step": 35030 + }, + { + "entropy": 0.06040436141192913, + "epoch": 8.166686093950345, + "grad_norm": 0.64453125, + "learning_rate": 4.74926066032418e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9999040007591248, + "num_tokens": 110869837.0, + "step": 35035 + }, + { + "entropy": 0.05392250213772058, + "epoch": 8.167851730970975, + "grad_norm": 0.26953125, + "learning_rate": 4.7491689481183454e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9994827568531036, + "num_tokens": 110885984.0, + "step": 35040 + }, + { + "entropy": 0.05607498260214925, + "epoch": 8.169017367991607, + "grad_norm": 0.65625, + "learning_rate": 4.7490772210134007e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9998294651508332, + "num_tokens": 110915562.0, + "step": 35045 + }, + { + "entropy": 0.06420133570209145, + "epoch": 8.17018300501224, + "grad_norm": 0.466796875, + "learning_rate": 4.7489854790107104e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9996419370174408, + "num_tokens": 110942745.0, + "step": 35050 + }, + { + "entropy": 0.047397821210324764, + "epoch": 8.171348642032871, + "grad_norm": 1.265625, + "learning_rate": 4.7488937221116446e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9984205842018128, + "num_tokens": 110963960.0, + "step": 35055 + }, + { + "entropy": 0.0502863303758204, + "epoch": 8.172514279053503, + "grad_norm": 0.30078125, + "learning_rate": 4.748801950317571e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9994450092315674, + "num_tokens": 110988319.0, + "step": 35060 + }, + { + "entropy": 0.05009726490825415, + "epoch": 8.173679916074134, + "grad_norm": 1.703125, + "learning_rate": 4.748710163629858e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9994385838508606, + "num_tokens": 111018445.0, + "step": 35065 + }, + { + "entropy": 0.07948374161496759, + "epoch": 8.174845553094766, + "grad_norm": 0.09912109375, + "learning_rate": 4.748618362049875e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9990148901939392, + "num_tokens": 111044428.0, + "step": 35070 + }, + { + "entropy": 0.06243461025878787, + "epoch": 8.176011190115398, + "grad_norm": 1.8203125, + "learning_rate": 4.7485265455789894e-05, + "loss": 0.0065, + "mean_token_accuracy": 0.998746132850647, + "num_tokens": 111070780.0, + "step": 35075 + }, + { + "entropy": 0.061110925208777186, + "epoch": 8.17717682713603, + "grad_norm": 5.25, + "learning_rate": 4.748434714218571e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9985956192016602, + "num_tokens": 111088837.0, + "step": 35080 + }, + { + "entropy": 0.06326413620263338, + "epoch": 8.178342464156662, + "grad_norm": 1.7890625, + "learning_rate": 4.74834286796999e-05, + "loss": 0.0057, + "mean_token_accuracy": 0.9978280127048492, + "num_tokens": 111115422.0, + "step": 35085 + }, + { + "entropy": 0.05515581658110023, + "epoch": 8.179508101177293, + "grad_norm": 1.2890625, + "learning_rate": 4.7482510068346145e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9984103560447692, + "num_tokens": 111129777.0, + "step": 35090 + }, + { + "entropy": 0.06779031567275524, + "epoch": 8.180673738197925, + "grad_norm": 1.0546875, + "learning_rate": 4.748159130813816e-05, + "loss": 0.0117, + "mean_token_accuracy": 0.9983104467391968, + "num_tokens": 111145557.0, + "step": 35095 + }, + { + "entropy": 0.05313057005405426, + "epoch": 8.181839375218557, + "grad_norm": 0.0771484375, + "learning_rate": 4.748067239908963e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9984862327575683, + "num_tokens": 111169517.0, + "step": 35100 + }, + { + "entropy": 0.07514633145183325, + "epoch": 8.183005012239189, + "grad_norm": 0.90625, + "learning_rate": 4.747975334121426e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9984767377376557, + "num_tokens": 111182469.0, + "step": 35105 + }, + { + "entropy": 0.0630564677529037, + "epoch": 8.184170649259821, + "grad_norm": 0.1298828125, + "learning_rate": 4.7478834134525756e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9989075541496277, + "num_tokens": 111214410.0, + "step": 35110 + }, + { + "entropy": 0.058083107136189936, + "epoch": 8.185336286280453, + "grad_norm": 0.734375, + "learning_rate": 4.747791477903783e-05, + "loss": 0.0103, + "mean_token_accuracy": 0.9987756073474884, + "num_tokens": 111234767.0, + "step": 35115 + }, + { + "entropy": 0.04894097140058875, + "epoch": 8.186501923301083, + "grad_norm": 0.6328125, + "learning_rate": 4.7476995274764186e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9996597945690155, + "num_tokens": 111261638.0, + "step": 35120 + }, + { + "entropy": 0.06282207742333412, + "epoch": 8.187667560321715, + "grad_norm": 1.21875, + "learning_rate": 4.747607562171854e-05, + "loss": 0.0075, + "mean_token_accuracy": 0.9990973234176636, + "num_tokens": 111271401.0, + "step": 35125 + }, + { + "entropy": 0.07177103348076344, + "epoch": 8.188833197342348, + "grad_norm": 2.234375, + "learning_rate": 4.74751558199146e-05, + "loss": 0.005, + "mean_token_accuracy": 0.9983224630355835, + "num_tokens": 111280641.0, + "step": 35130 + }, + { + "entropy": 0.09346720837056637, + "epoch": 8.18999883436298, + "grad_norm": 0.0859375, + "learning_rate": 4.747423586936608e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9991428554058075, + "num_tokens": 111288976.0, + "step": 35135 + }, + { + "entropy": 0.06719689685851335, + "epoch": 8.191164471383612, + "grad_norm": 2.5625, + "learning_rate": 4.74733157700867e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9985256791114807, + "num_tokens": 111298730.0, + "step": 35140 + }, + { + "entropy": 0.08412168696522712, + "epoch": 8.192330108404242, + "grad_norm": 2.921875, + "learning_rate": 4.7472395522090186e-05, + "loss": 0.0323, + "mean_token_accuracy": 0.9948005080223083, + "num_tokens": 111317180.0, + "step": 35145 + }, + { + "entropy": 0.06580277308821678, + "epoch": 8.193495745424874, + "grad_norm": 0.7890625, + "learning_rate": 4.747147512539025e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9991651713848114, + "num_tokens": 111331434.0, + "step": 35150 + }, + { + "entropy": 0.041099204868078235, + "epoch": 8.194661382445506, + "grad_norm": 1.75, + "learning_rate": 4.747055458000063e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9992423474788665, + "num_tokens": 111369635.0, + "step": 35155 + }, + { + "entropy": 0.0656563414260745, + "epoch": 8.195827019466138, + "grad_norm": 0.3203125, + "learning_rate": 4.7469633885935037e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9993948400020599, + "num_tokens": 111391309.0, + "step": 35160 + }, + { + "entropy": 0.06426733545958996, + "epoch": 8.19699265648677, + "grad_norm": 1.3828125, + "learning_rate": 4.746871304320721e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.999400395154953, + "num_tokens": 111406533.0, + "step": 35165 + }, + { + "entropy": 0.06955592483282089, + "epoch": 8.198158293507403, + "grad_norm": 3.875, + "learning_rate": 4.746779205183088e-05, + "loss": 0.0143, + "mean_token_accuracy": 0.9961112380027771, + "num_tokens": 111418189.0, + "step": 35170 + }, + { + "entropy": 0.045057310909032824, + "epoch": 8.199323930528033, + "grad_norm": 0.328125, + "learning_rate": 4.746687091181977e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9985878467559814, + "num_tokens": 111449378.0, + "step": 35175 + }, + { + "entropy": 0.049578765965998174, + "epoch": 8.200489567548665, + "grad_norm": 0.40234375, + "learning_rate": 4.7465949623187635e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9987337410449981, + "num_tokens": 111474869.0, + "step": 35180 + }, + { + "entropy": 0.04917066413909197, + "epoch": 8.201655204569297, + "grad_norm": 0.173828125, + "learning_rate": 4.74650281859482e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9996753096580505, + "num_tokens": 111502113.0, + "step": 35185 + }, + { + "entropy": 0.07581342617049813, + "epoch": 8.20282084158993, + "grad_norm": 0.40234375, + "learning_rate": 4.74641066001152e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9993069410324097, + "num_tokens": 111519087.0, + "step": 35190 + }, + { + "entropy": 0.051918433699756864, + "epoch": 8.203986478610561, + "grad_norm": 2.921875, + "learning_rate": 4.7463184865702386e-05, + "loss": 0.0062, + "mean_token_accuracy": 0.9992042243480682, + "num_tokens": 111557289.0, + "step": 35195 + }, + { + "entropy": 0.06914200708270073, + "epoch": 8.205152115631192, + "grad_norm": 2.671875, + "learning_rate": 4.74622629827235e-05, + "loss": 0.0062, + "mean_token_accuracy": 0.9993867039680481, + "num_tokens": 111567208.0, + "step": 35200 + }, + { + "entropy": 0.07257007220759988, + "epoch": 8.206317752651824, + "grad_norm": 1.3125, + "learning_rate": 4.746134095119229e-05, + "loss": 0.0114, + "mean_token_accuracy": 0.9968818962574005, + "num_tokens": 111579305.0, + "step": 35205 + }, + { + "entropy": 0.0569312437903136, + "epoch": 8.207483389672456, + "grad_norm": 0.98046875, + "learning_rate": 4.7460418771122505e-05, + "loss": 0.0097, + "mean_token_accuracy": 0.9975073218345643, + "num_tokens": 111610993.0, + "step": 35210 + }, + { + "entropy": 0.06311609046533703, + "epoch": 8.208649026693088, + "grad_norm": 0.68359375, + "learning_rate": 4.7459496442527895e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9987077414989471, + "num_tokens": 111635088.0, + "step": 35215 + }, + { + "entropy": 0.05613302402198315, + "epoch": 8.20981466371372, + "grad_norm": 2.25, + "learning_rate": 4.7458573965422206e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.9986424922943116, + "num_tokens": 111648821.0, + "step": 35220 + }, + { + "entropy": 0.06687151882797479, + "epoch": 8.21098030073435, + "grad_norm": 0.224609375, + "learning_rate": 4.745765133981921e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9990479350090027, + "num_tokens": 111680389.0, + "step": 35225 + }, + { + "entropy": 0.05649174377322197, + "epoch": 8.212145937754983, + "grad_norm": 1.8125, + "learning_rate": 4.745672856573265e-05, + "loss": 0.0075, + "mean_token_accuracy": 0.9981228053569794, + "num_tokens": 111703894.0, + "step": 35230 + }, + { + "entropy": 0.054273920319974425, + "epoch": 8.213311574775615, + "grad_norm": 2.078125, + "learning_rate": 4.7455805643176295e-05, + "loss": 0.0074, + "mean_token_accuracy": 0.9993931233882904, + "num_tokens": 111726855.0, + "step": 35235 + }, + { + "entropy": 0.049174747243523595, + "epoch": 8.214477211796247, + "grad_norm": 0.2021484375, + "learning_rate": 4.74548825721639e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9997669696807862, + "num_tokens": 111771810.0, + "step": 35240 + }, + { + "entropy": 0.08168933633714914, + "epoch": 8.215642848816879, + "grad_norm": 0.203125, + "learning_rate": 4.745395935270923e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9992255866527557, + "num_tokens": 111795911.0, + "step": 35245 + }, + { + "entropy": 0.08296185713261366, + "epoch": 8.216808485837511, + "grad_norm": 0.103515625, + "learning_rate": 4.745303598482606e-05, + "loss": 0.0151, + "mean_token_accuracy": 0.995855188369751, + "num_tokens": 111826674.0, + "step": 35250 + }, + { + "entropy": 0.07323675518855452, + "epoch": 8.217974122858141, + "grad_norm": 0.97265625, + "learning_rate": 4.7452112468528156e-05, + "loss": 0.0149, + "mean_token_accuracy": 0.997208696603775, + "num_tokens": 111856193.0, + "step": 35255 + }, + { + "entropy": 0.059016761183738706, + "epoch": 8.219139759878773, + "grad_norm": 0.1572265625, + "learning_rate": 4.7451188803829284e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9998023688793183, + "num_tokens": 111865608.0, + "step": 35260 + }, + { + "entropy": 0.054262810340151194, + "epoch": 8.220305396899406, + "grad_norm": 0.765625, + "learning_rate": 4.745026499074322e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9982050716876983, + "num_tokens": 111886789.0, + "step": 35265 + }, + { + "entropy": 0.05554175898432732, + "epoch": 8.221471033920038, + "grad_norm": 0.515625, + "learning_rate": 4.744934102928373e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9989612340927124, + "num_tokens": 111906176.0, + "step": 35270 + }, + { + "entropy": 0.045644909329712394, + "epoch": 8.22263667094067, + "grad_norm": 0.203125, + "learning_rate": 4.7448416919464607e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9997182667255402, + "num_tokens": 111942352.0, + "step": 35275 + }, + { + "entropy": 0.06484599094837903, + "epoch": 8.2238023079613, + "grad_norm": 0.2470703125, + "learning_rate": 4.744749266129962e-05, + "loss": 0.0099, + "mean_token_accuracy": 0.9974075615406036, + "num_tokens": 111964101.0, + "step": 35280 + }, + { + "entropy": 0.05839154152199626, + "epoch": 8.224967944981932, + "grad_norm": 1.1875, + "learning_rate": 4.744656825480257e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9989475786685944, + "num_tokens": 111989065.0, + "step": 35285 + }, + { + "entropy": 0.07809292711317539, + "epoch": 8.226133582002564, + "grad_norm": 0.7265625, + "learning_rate": 4.744564369998721e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9986956357955933, + "num_tokens": 112000207.0, + "step": 35290 + }, + { + "entropy": 0.050143054034560916, + "epoch": 8.227299219023196, + "grad_norm": 1.203125, + "learning_rate": 4.7444718996867356e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9980408549308777, + "num_tokens": 112021884.0, + "step": 35295 + }, + { + "entropy": 0.05025668404996395, + "epoch": 8.228464856043828, + "grad_norm": 0.125, + "learning_rate": 4.744379414545678e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9998918652534485, + "num_tokens": 112042919.0, + "step": 35300 + }, + { + "entropy": 0.0611870632506907, + "epoch": 8.22963049306446, + "grad_norm": 0.02685546875, + "learning_rate": 4.744286914576927e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9994702398777008, + "num_tokens": 112065484.0, + "step": 35305 + }, + { + "entropy": 0.07436528988182545, + "epoch": 8.230796130085091, + "grad_norm": 0.1767578125, + "learning_rate": 4.744194399781863e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9982783615589141, + "num_tokens": 112078227.0, + "step": 35310 + }, + { + "entropy": 0.042807167023420335, + "epoch": 8.231961767105723, + "grad_norm": 0.416015625, + "learning_rate": 4.744101870161866e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9995752155780793, + "num_tokens": 112103737.0, + "step": 35315 + }, + { + "entropy": 0.05692212600260973, + "epoch": 8.233127404126355, + "grad_norm": 1.6484375, + "learning_rate": 4.744009325718314e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9979093134403229, + "num_tokens": 112123907.0, + "step": 35320 + }, + { + "entropy": 0.05411262274719775, + "epoch": 8.234293041146987, + "grad_norm": 2.875, + "learning_rate": 4.7439167664525876e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9989370942115784, + "num_tokens": 112154557.0, + "step": 35325 + }, + { + "entropy": 0.060514886677265164, + "epoch": 8.23545867816762, + "grad_norm": 0.7421875, + "learning_rate": 4.743824192366068e-05, + "loss": 0.0069, + "mean_token_accuracy": 0.9985116302967072, + "num_tokens": 112179242.0, + "step": 35330 + }, + { + "entropy": 0.05649018418043852, + "epoch": 8.23662431518825, + "grad_norm": 0.09814453125, + "learning_rate": 4.743731603460134e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9998935043811799, + "num_tokens": 112210248.0, + "step": 35335 + }, + { + "entropy": 0.06745369862765074, + "epoch": 8.237789952208882, + "grad_norm": 0.08642578125, + "learning_rate": 4.743638999736169e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9993011593818665, + "num_tokens": 112228105.0, + "step": 35340 + }, + { + "entropy": 0.05459690997377038, + "epoch": 8.238955589229514, + "grad_norm": 0.50390625, + "learning_rate": 4.74354638119555e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9975790500640869, + "num_tokens": 112243364.0, + "step": 35345 + }, + { + "entropy": 0.0657209831289947, + "epoch": 8.240121226250146, + "grad_norm": 2.484375, + "learning_rate": 4.743453747839661e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9995322525501251, + "num_tokens": 112274116.0, + "step": 35350 + }, + { + "entropy": 0.05723834093660116, + "epoch": 8.241286863270778, + "grad_norm": 0.255859375, + "learning_rate": 4.743361099669882e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9994523048400878, + "num_tokens": 112290556.0, + "step": 35355 + }, + { + "entropy": 0.059019094612449405, + "epoch": 8.242452500291408, + "grad_norm": 0.1796875, + "learning_rate": 4.743268436687595e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9999679744243621, + "num_tokens": 112312452.0, + "step": 35360 + }, + { + "entropy": 0.05994194708764553, + "epoch": 8.24361813731204, + "grad_norm": 0.416015625, + "learning_rate": 4.743175758894182e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.9986020386219024, + "num_tokens": 112324430.0, + "step": 35365 + }, + { + "entropy": 0.07612526267766953, + "epoch": 8.244783774332673, + "grad_norm": 0.458984375, + "learning_rate": 4.743083066291024e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9977519989013672, + "num_tokens": 112363036.0, + "step": 35370 + }, + { + "entropy": 0.05744869448244572, + "epoch": 8.245949411353305, + "grad_norm": 1.359375, + "learning_rate": 4.7429903588795044e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.9983863711357117, + "num_tokens": 112376745.0, + "step": 35375 + }, + { + "entropy": 0.07884750552475453, + "epoch": 8.247115048373937, + "grad_norm": 0.138671875, + "learning_rate": 4.742897636661005e-05, + "loss": 0.0098, + "mean_token_accuracy": 0.9982732713222504, + "num_tokens": 112395514.0, + "step": 35380 + }, + { + "entropy": 0.056860837060958146, + "epoch": 8.248280685394569, + "grad_norm": 0.19921875, + "learning_rate": 4.742804899636908e-05, + "loss": 0.0136, + "mean_token_accuracy": 0.9961659133434295, + "num_tokens": 112411532.0, + "step": 35385 + }, + { + "entropy": 0.13037989661097527, + "epoch": 8.2494463224152, + "grad_norm": 0.64453125, + "learning_rate": 4.742712147808597e-05, + "loss": 0.1419, + "mean_token_accuracy": 0.9767031192779541, + "num_tokens": 112433524.0, + "step": 35390 + }, + { + "entropy": 0.0718831043690443, + "epoch": 8.250611959435831, + "grad_norm": 0.69140625, + "learning_rate": 4.742619381177455e-05, + "loss": 0.0053, + "mean_token_accuracy": 0.9996219515800476, + "num_tokens": 112443322.0, + "step": 35395 + }, + { + "entropy": 0.04233479611575604, + "epoch": 8.251777596456463, + "grad_norm": 0.2470703125, + "learning_rate": 4.742526599744865e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9995525121688843, + "num_tokens": 112478937.0, + "step": 35400 + }, + { + "entropy": 0.07610894702374935, + "epoch": 8.252943233477096, + "grad_norm": 0.1650390625, + "learning_rate": 4.74243380351221e-05, + "loss": 0.0131, + "mean_token_accuracy": 0.9966908633708954, + "num_tokens": 112495918.0, + "step": 35405 + }, + { + "entropy": 0.08622381035238505, + "epoch": 8.254108870497728, + "grad_norm": 0.84375, + "learning_rate": 4.742340992480875e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9995142340660095, + "num_tokens": 112514007.0, + "step": 35410 + }, + { + "entropy": 0.06141715543344617, + "epoch": 8.255274507518358, + "grad_norm": 0.58984375, + "learning_rate": 4.7422481666522423e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9987404525279999, + "num_tokens": 112537499.0, + "step": 35415 + }, + { + "entropy": 0.08039052113890648, + "epoch": 8.25644014453899, + "grad_norm": 1.4921875, + "learning_rate": 4.7421553260276973e-05, + "loss": 0.0056, + "mean_token_accuracy": 0.9989081561565399, + "num_tokens": 112546351.0, + "step": 35420 + }, + { + "entropy": 0.056267809588462114, + "epoch": 8.257605781559622, + "grad_norm": 2.890625, + "learning_rate": 4.742062470608625e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9991233944892883, + "num_tokens": 112575017.0, + "step": 35425 + }, + { + "entropy": 0.05321586560457945, + "epoch": 8.258771418580254, + "grad_norm": 0.85546875, + "learning_rate": 4.741969600396408e-05, + "loss": 0.0066, + "mean_token_accuracy": 0.9979334831237793, + "num_tokens": 112604847.0, + "step": 35430 + }, + { + "entropy": 0.04675264293327928, + "epoch": 8.259937055600886, + "grad_norm": 0.20703125, + "learning_rate": 4.741876715392433e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9998992204666137, + "num_tokens": 112626161.0, + "step": 35435 + }, + { + "entropy": 0.08684096895158291, + "epoch": 8.261102692621517, + "grad_norm": 2.0, + "learning_rate": 4.7417838155980835e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9993700981140137, + "num_tokens": 112635620.0, + "step": 35440 + }, + { + "entropy": 0.06560825593769551, + "epoch": 8.262268329642149, + "grad_norm": 1.203125, + "learning_rate": 4.7416909010147456e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9992285370826721, + "num_tokens": 112661918.0, + "step": 35445 + }, + { + "entropy": 0.058400828018784524, + "epoch": 8.263433966662781, + "grad_norm": 0.51953125, + "learning_rate": 4.7415979716438055e-05, + "loss": 0.0088, + "mean_token_accuracy": 0.9980306982994079, + "num_tokens": 112681038.0, + "step": 35450 + }, + { + "entropy": 0.09542725849896669, + "epoch": 8.264599603683413, + "grad_norm": 3.40625, + "learning_rate": 4.7415050274866483e-05, + "loss": 0.0969, + "mean_token_accuracy": 0.9781103134155273, + "num_tokens": 112701804.0, + "step": 35455 + }, + { + "entropy": 0.05795524828135967, + "epoch": 8.265765240704045, + "grad_norm": 1.53125, + "learning_rate": 4.741412068544659e-05, + "loss": 0.005, + "mean_token_accuracy": 0.9989598095417023, + "num_tokens": 112719943.0, + "step": 35460 + }, + { + "entropy": 0.05817284304648638, + "epoch": 8.266930877724677, + "grad_norm": 0.23046875, + "learning_rate": 4.741319094819226e-05, + "loss": 0.0075, + "mean_token_accuracy": 0.9974438846111298, + "num_tokens": 112735325.0, + "step": 35465 + }, + { + "entropy": 0.06478349603712559, + "epoch": 8.268096514745308, + "grad_norm": 0.03857421875, + "learning_rate": 4.741226106311733e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9993962347507477, + "num_tokens": 112751179.0, + "step": 35470 + }, + { + "entropy": 0.04304729863069952, + "epoch": 8.26926215176594, + "grad_norm": 0.302734375, + "learning_rate": 4.7411331030235684e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9986756980419159, + "num_tokens": 112785380.0, + "step": 35475 + }, + { + "entropy": 0.039424076396971944, + "epoch": 8.270427788786572, + "grad_norm": 0.23828125, + "learning_rate": 4.741040084956118e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9992639780044555, + "num_tokens": 112809960.0, + "step": 35480 + }, + { + "entropy": 0.09679261557757854, + "epoch": 8.271593425807204, + "grad_norm": 0.333984375, + "learning_rate": 4.74094705211077e-05, + "loss": 0.0673, + "mean_token_accuracy": 0.9878584563732147, + "num_tokens": 112840958.0, + "step": 35485 + }, + { + "entropy": 0.059682253934443, + "epoch": 8.272759062827836, + "grad_norm": 0.33984375, + "learning_rate": 4.74085400448891e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9995380342006683, + "num_tokens": 112866924.0, + "step": 35490 + }, + { + "entropy": 0.05326059451326728, + "epoch": 8.273924699848466, + "grad_norm": 0.84765625, + "learning_rate": 4.7407609420919275e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9986121416091919, + "num_tokens": 112890112.0, + "step": 35495 + }, + { + "entropy": 0.07074670540168881, + "epoch": 8.275090336869098, + "grad_norm": 0.201171875, + "learning_rate": 4.740667864921209e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9993785321712494, + "num_tokens": 112917769.0, + "step": 35500 + }, + { + "entropy": 0.05106313647702336, + "epoch": 8.27625597388973, + "grad_norm": 0.765625, + "learning_rate": 4.7405747729781416e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9989130139350891, + "num_tokens": 112934554.0, + "step": 35505 + }, + { + "entropy": 0.0686172442510724, + "epoch": 8.277421610910363, + "grad_norm": 0.55859375, + "learning_rate": 4.740481666264115e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9985272109508514, + "num_tokens": 112956525.0, + "step": 35510 + }, + { + "entropy": 0.06048934049904346, + "epoch": 8.278587247930995, + "grad_norm": 0.283203125, + "learning_rate": 4.740388544780517e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9995243310928345, + "num_tokens": 112979928.0, + "step": 35515 + }, + { + "entropy": 0.09104947121813893, + "epoch": 8.279752884951627, + "grad_norm": 0.58203125, + "learning_rate": 4.740295408528737e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9983532786369324, + "num_tokens": 112999457.0, + "step": 35520 + }, + { + "entropy": 0.04913658211007714, + "epoch": 8.280918521972257, + "grad_norm": 0.279296875, + "learning_rate": 4.740202257510162e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9999531686306, + "num_tokens": 113026574.0, + "step": 35525 + }, + { + "entropy": 0.06502987816929817, + "epoch": 8.28208415899289, + "grad_norm": 0.2265625, + "learning_rate": 4.7401090917261826e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9996796190738678, + "num_tokens": 113046634.0, + "step": 35530 + }, + { + "entropy": 0.05888591511175036, + "epoch": 8.283249796013521, + "grad_norm": 0.4765625, + "learning_rate": 4.740015911178187e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9976631879806519, + "num_tokens": 113066170.0, + "step": 35535 + }, + { + "entropy": 0.08687619585543871, + "epoch": 8.284415433034154, + "grad_norm": 0.1650390625, + "learning_rate": 4.739922715867565e-05, + "loss": 0.0061, + "mean_token_accuracy": 0.9987577140331269, + "num_tokens": 113076695.0, + "step": 35540 + }, + { + "entropy": 0.06502792998217047, + "epoch": 8.285581070054786, + "grad_norm": 0.265625, + "learning_rate": 4.739829505795707e-05, + "loss": 0.0065, + "mean_token_accuracy": 0.9979210734367371, + "num_tokens": 113112146.0, + "step": 35545 + }, + { + "entropy": 0.060326622892171146, + "epoch": 8.286746707075416, + "grad_norm": 1.921875, + "learning_rate": 4.739736280964002e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9994000792503357, + "num_tokens": 113137726.0, + "step": 35550 + }, + { + "entropy": 0.0668079487979412, + "epoch": 8.287912344096048, + "grad_norm": 0.859375, + "learning_rate": 4.7396430413738394e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9990176737308503, + "num_tokens": 113163502.0, + "step": 35555 + }, + { + "entropy": 0.051899380423128604, + "epoch": 8.28907798111668, + "grad_norm": 0.26171875, + "learning_rate": 4.7395497870266115e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9989886045455932, + "num_tokens": 113182112.0, + "step": 35560 + }, + { + "entropy": 0.0725651178508997, + "epoch": 8.290243618137312, + "grad_norm": 0.166015625, + "learning_rate": 4.7394565179237084e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9993403315544128, + "num_tokens": 113203507.0, + "step": 35565 + }, + { + "entropy": 0.04745776057243347, + "epoch": 8.291409255157944, + "grad_norm": 0.1953125, + "learning_rate": 4.739363234066519e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9995150685310363, + "num_tokens": 113221559.0, + "step": 35570 + }, + { + "entropy": 0.051142162969335915, + "epoch": 8.292574892178575, + "grad_norm": 0.15625, + "learning_rate": 4.739269935456437e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9998042106628418, + "num_tokens": 113252839.0, + "step": 35575 + }, + { + "entropy": 0.06874603973701596, + "epoch": 8.293740529199207, + "grad_norm": 0.314453125, + "learning_rate": 4.739176622094852e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9991725265979767, + "num_tokens": 113265678.0, + "step": 35580 + }, + { + "entropy": 0.103728087246418, + "epoch": 8.294906166219839, + "grad_norm": 0.189453125, + "learning_rate": 4.7390832939831554e-05, + "loss": 0.0906, + "mean_token_accuracy": 0.9850226819515229, + "num_tokens": 113286175.0, + "step": 35585 + }, + { + "entropy": 0.05010626185685396, + "epoch": 8.296071803240471, + "grad_norm": 0.2431640625, + "learning_rate": 4.7389899511227395e-05, + "loss": 0.0062, + "mean_token_accuracy": 0.9987168490886689, + "num_tokens": 113303583.0, + "step": 35590 + }, + { + "entropy": 0.07682762825861573, + "epoch": 8.297237440261103, + "grad_norm": 0.671875, + "learning_rate": 4.7388965935149955e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9988443732261658, + "num_tokens": 113328321.0, + "step": 35595 + }, + { + "entropy": 0.06315315756946802, + "epoch": 8.298403077281735, + "grad_norm": 0.578125, + "learning_rate": 4.7388032211613166e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.9949438393115997, + "num_tokens": 113340699.0, + "step": 35600 + }, + { + "entropy": 0.08139186827465891, + "epoch": 8.299568714302366, + "grad_norm": 0.953125, + "learning_rate": 4.738709834063094e-05, + "loss": 0.0085, + "mean_token_accuracy": 0.9987073481082916, + "num_tokens": 113361554.0, + "step": 35605 + }, + { + "entropy": 0.06355111561715603, + "epoch": 8.300734351322998, + "grad_norm": 0.10595703125, + "learning_rate": 4.738616432221721e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9995387196540833, + "num_tokens": 113372946.0, + "step": 35610 + }, + { + "entropy": 0.054753214679658414, + "epoch": 8.30189998834363, + "grad_norm": 0.173828125, + "learning_rate": 4.7385230156385894e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9989207983016968, + "num_tokens": 113411917.0, + "step": 35615 + }, + { + "entropy": 0.08477168828248978, + "epoch": 8.303065625364262, + "grad_norm": 1.390625, + "learning_rate": 4.738429584315093e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9975354373455048, + "num_tokens": 113421543.0, + "step": 35620 + }, + { + "entropy": 0.03909264667890966, + "epoch": 8.304231262384894, + "grad_norm": 0.3515625, + "learning_rate": 4.738336138252625e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9994762778282166, + "num_tokens": 113453973.0, + "step": 35625 + }, + { + "entropy": 0.048014458548277614, + "epoch": 8.305396899405524, + "grad_norm": 0.890625, + "learning_rate": 4.738242677452578e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.9986834228038788, + "num_tokens": 113473666.0, + "step": 35630 + }, + { + "entropy": 0.07936519216746092, + "epoch": 8.306562536426156, + "grad_norm": 0.9765625, + "learning_rate": 4.7381492019163475e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9991873800754547, + "num_tokens": 113488056.0, + "step": 35635 + }, + { + "entropy": 0.05001498758792877, + "epoch": 8.307728173446788, + "grad_norm": 0.92578125, + "learning_rate": 4.7380557116453255e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9998580873012543, + "num_tokens": 113525080.0, + "step": 35640 + }, + { + "entropy": 0.07388973170891404, + "epoch": 8.30889381046742, + "grad_norm": 1.0234375, + "learning_rate": 4.737962206640907e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9986834764480591, + "num_tokens": 113538373.0, + "step": 35645 + }, + { + "entropy": 0.05318211195990443, + "epoch": 8.310059447488053, + "grad_norm": 0.353515625, + "learning_rate": 4.737868686904485e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9992851078510284, + "num_tokens": 113563745.0, + "step": 35650 + }, + { + "entropy": 0.04736358020454645, + "epoch": 8.311225084508685, + "grad_norm": 1.2734375, + "learning_rate": 4.737775152437456e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9990275919437408, + "num_tokens": 113594405.0, + "step": 35655 + }, + { + "entropy": 0.055001238361001015, + "epoch": 8.312390721529315, + "grad_norm": 1.4921875, + "learning_rate": 4.737681603241214e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9982485055923462, + "num_tokens": 113611583.0, + "step": 35660 + }, + { + "entropy": 0.059361847769469026, + "epoch": 8.313556358549947, + "grad_norm": 0.322265625, + "learning_rate": 4.737588039317153e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9991229712963104, + "num_tokens": 113638985.0, + "step": 35665 + }, + { + "entropy": 0.06605250053107739, + "epoch": 8.31472199557058, + "grad_norm": 0.38671875, + "learning_rate": 4.7374944606666694e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9987422049045562, + "num_tokens": 113649240.0, + "step": 35670 + }, + { + "entropy": 0.08314755260944366, + "epoch": 8.315887632591211, + "grad_norm": 1.8515625, + "learning_rate": 4.737400867291158e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9990105986595154, + "num_tokens": 113657025.0, + "step": 35675 + }, + { + "entropy": 0.051615030877292155, + "epoch": 8.317053269611844, + "grad_norm": 0.734375, + "learning_rate": 4.737307259192014e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9991748809814454, + "num_tokens": 113680681.0, + "step": 35680 + }, + { + "entropy": 0.06697340840473771, + "epoch": 8.318218906632474, + "grad_norm": 0.31640625, + "learning_rate": 4.737213636370635e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9999058306217193, + "num_tokens": 113696503.0, + "step": 35685 + }, + { + "entropy": 0.07072263630107045, + "epoch": 8.319384543653106, + "grad_norm": 0.1982421875, + "learning_rate": 4.737119998828415e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9999394178390503, + "num_tokens": 113711994.0, + "step": 35690 + }, + { + "entropy": 0.059878239221870896, + "epoch": 8.320550180673738, + "grad_norm": 2.4375, + "learning_rate": 4.737026346566751e-05, + "loss": 0.0153, + "mean_token_accuracy": 0.9969691872596741, + "num_tokens": 113730409.0, + "step": 35695 + }, + { + "entropy": 0.06803951859474182, + "epoch": 8.32171581769437, + "grad_norm": 1.4375, + "learning_rate": 4.7369326795870394e-05, + "loss": 0.0019, + "mean_token_accuracy": 1.0, + "num_tokens": 113739693.0, + "step": 35700 + }, + { + "entropy": 0.056852425914257765, + "epoch": 8.322881454715002, + "grad_norm": 0.2490234375, + "learning_rate": 4.736838997890678e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9997595429420472, + "num_tokens": 113772385.0, + "step": 35705 + }, + { + "entropy": 0.06514825280755758, + "epoch": 8.324047091735633, + "grad_norm": 0.494140625, + "learning_rate": 4.7367453014790627e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9989059031009674, + "num_tokens": 113785680.0, + "step": 35710 + }, + { + "entropy": 0.07101746341213584, + "epoch": 8.325212728756265, + "grad_norm": 1.3984375, + "learning_rate": 4.73665159035359e-05, + "loss": 0.0062, + "mean_token_accuracy": 0.9984484255313874, + "num_tokens": 113802287.0, + "step": 35715 + }, + { + "entropy": 0.06135486271232367, + "epoch": 8.326378365776897, + "grad_norm": 0.52734375, + "learning_rate": 4.736557864515658e-05, + "loss": 0.0069, + "mean_token_accuracy": 0.9988792181015015, + "num_tokens": 113826041.0, + "step": 35720 + }, + { + "entropy": 0.04972120532765985, + "epoch": 8.327544002797529, + "grad_norm": 0.265625, + "learning_rate": 4.7364641239666654e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9995426177978516, + "num_tokens": 113865421.0, + "step": 35725 + }, + { + "entropy": 0.07017171997576951, + "epoch": 8.328709639818161, + "grad_norm": 0.216796875, + "learning_rate": 4.736370368708008e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.999553245306015, + "num_tokens": 113885947.0, + "step": 35730 + }, + { + "entropy": 0.058057605568319556, + "epoch": 8.329875276838793, + "grad_norm": 0.2119140625, + "learning_rate": 4.736276598741086e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9994979918003082, + "num_tokens": 113913705.0, + "step": 35735 + }, + { + "entropy": 0.09026005379855633, + "epoch": 8.331040913859423, + "grad_norm": 2.78125, + "learning_rate": 4.7361828140672956e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9987558662891388, + "num_tokens": 113924075.0, + "step": 35740 + }, + { + "entropy": 0.08890927508473397, + "epoch": 8.332206550880056, + "grad_norm": 0.314453125, + "learning_rate": 4.736089014688037e-05, + "loss": 0.0074, + "mean_token_accuracy": 0.9987519264221192, + "num_tokens": 113944931.0, + "step": 35745 + }, + { + "entropy": 0.05832267887890339, + "epoch": 8.333372187900688, + "grad_norm": 0.1806640625, + "learning_rate": 4.735995200604707e-05, + "loss": 0.0075, + "mean_token_accuracy": 0.9988953292369842, + "num_tokens": 113968947.0, + "step": 35750 + }, + { + "entropy": 0.05671592140570283, + "epoch": 8.33453782492132, + "grad_norm": 0.2890625, + "learning_rate": 4.735901371818706e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.998402863740921, + "num_tokens": 113996374.0, + "step": 35755 + }, + { + "entropy": 0.05853022150695324, + "epoch": 8.335703461941952, + "grad_norm": 0.2197265625, + "learning_rate": 4.735807528331432e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.998811411857605, + "num_tokens": 114010996.0, + "step": 35760 + }, + { + "entropy": 0.06083117621019483, + "epoch": 8.336869098962582, + "grad_norm": 0.185546875, + "learning_rate": 4.7357136701442864e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9989865601062775, + "num_tokens": 114050055.0, + "step": 35765 + }, + { + "entropy": 0.053792219050228594, + "epoch": 8.338034735983214, + "grad_norm": 0.10400390625, + "learning_rate": 4.735619797258666e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9987802326679229, + "num_tokens": 114079476.0, + "step": 35770 + }, + { + "entropy": 0.07619495354592801, + "epoch": 8.339200373003846, + "grad_norm": 1.421875, + "learning_rate": 4.735525909675972e-05, + "loss": 0.0061, + "mean_token_accuracy": 0.9981731057167054, + "num_tokens": 114089238.0, + "step": 35775 + }, + { + "entropy": 0.06971075274050235, + "epoch": 8.340366010024479, + "grad_norm": 0.78125, + "learning_rate": 4.735432007397605e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9989834904670716, + "num_tokens": 114109164.0, + "step": 35780 + }, + { + "entropy": 0.046845695050433275, + "epoch": 8.34153164704511, + "grad_norm": 0.39453125, + "learning_rate": 4.735338090424965e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9987184941768646, + "num_tokens": 114152383.0, + "step": 35785 + }, + { + "entropy": 0.06434694388881326, + "epoch": 8.342697284065743, + "grad_norm": 0.328125, + "learning_rate": 4.735244158759452e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9994235217571259, + "num_tokens": 114170104.0, + "step": 35790 + }, + { + "entropy": 0.0462822025641799, + "epoch": 8.343862921086373, + "grad_norm": 0.267578125, + "learning_rate": 4.735150212402466e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9997472882270813, + "num_tokens": 114195725.0, + "step": 35795 + }, + { + "entropy": 0.06865362599492073, + "epoch": 8.345028558107005, + "grad_norm": 0.30859375, + "learning_rate": 4.735056251355409e-05, + "loss": 0.0081, + "mean_token_accuracy": 0.996992152929306, + "num_tokens": 114217589.0, + "step": 35800 + }, + { + "entropy": 0.0582847012206912, + "epoch": 8.346194195127637, + "grad_norm": 0.53515625, + "learning_rate": 4.734962275619681e-05, + "loss": 0.0065, + "mean_token_accuracy": 0.9987275958061218, + "num_tokens": 114232250.0, + "step": 35805 + }, + { + "entropy": 0.05756397508084774, + "epoch": 8.34735983214827, + "grad_norm": 0.1748046875, + "learning_rate": 4.734868285196685e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9999210953712463, + "num_tokens": 114250794.0, + "step": 35810 + }, + { + "entropy": 0.06038464680314064, + "epoch": 8.348525469168901, + "grad_norm": 0.74609375, + "learning_rate": 4.7347742800878206e-05, + "loss": 0.0078, + "mean_token_accuracy": 0.9986655592918396, + "num_tokens": 114265626.0, + "step": 35815 + }, + { + "entropy": 0.04220847636461258, + "epoch": 8.349691106189532, + "grad_norm": 2.3125, + "learning_rate": 4.734680260294491e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9986125826835632, + "num_tokens": 114297113.0, + "step": 35820 + }, + { + "entropy": 0.05641756169497967, + "epoch": 8.350856743210164, + "grad_norm": 1.4609375, + "learning_rate": 4.734586225818098e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9988239586353302, + "num_tokens": 114312306.0, + "step": 35825 + }, + { + "entropy": 0.05419354699552059, + "epoch": 8.352022380230796, + "grad_norm": 1.53125, + "learning_rate": 4.7344921766600425e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9972829222679138, + "num_tokens": 114338143.0, + "step": 35830 + }, + { + "entropy": 0.061890093795955184, + "epoch": 8.353188017251428, + "grad_norm": 0.1689453125, + "learning_rate": 4.734398112821728e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.9984643816947937, + "num_tokens": 114360916.0, + "step": 35835 + }, + { + "entropy": 0.0586331375874579, + "epoch": 8.35435365427206, + "grad_norm": 1.5546875, + "learning_rate": 4.7343040343045586e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9992785274982452, + "num_tokens": 114374324.0, + "step": 35840 + }, + { + "entropy": 0.09843996288254857, + "epoch": 8.35551929129269, + "grad_norm": 0.30078125, + "learning_rate": 4.7342099411099336e-05, + "loss": 0.0065, + "mean_token_accuracy": 0.99708451628685, + "num_tokens": 114392560.0, + "step": 35845 + }, + { + "entropy": 0.0483152624219656, + "epoch": 8.356684928313323, + "grad_norm": 0.396484375, + "learning_rate": 4.734115833239259e-05, + "loss": 0.0093, + "mean_token_accuracy": 0.9988516390323638, + "num_tokens": 114415426.0, + "step": 35850 + }, + { + "entropy": 0.061218463350087404, + "epoch": 8.357850565333955, + "grad_norm": 0.341796875, + "learning_rate": 4.734021710693938e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9996902167797088, + "num_tokens": 114435183.0, + "step": 35855 + }, + { + "entropy": 0.06922294609248639, + "epoch": 8.359016202354587, + "grad_norm": 2.484375, + "learning_rate": 4.7339275734753717e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.9989567220211029, + "num_tokens": 114469970.0, + "step": 35860 + }, + { + "entropy": 0.052051532082259655, + "epoch": 8.360181839375219, + "grad_norm": 0.1611328125, + "learning_rate": 4.7338334215849664e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9997234284877777, + "num_tokens": 114490221.0, + "step": 35865 + }, + { + "entropy": 0.0687698122113943, + "epoch": 8.361347476395851, + "grad_norm": 0.6953125, + "learning_rate": 4.7337392550241246e-05, + "loss": 0.0069, + "mean_token_accuracy": 0.999000883102417, + "num_tokens": 114509080.0, + "step": 35870 + }, + { + "entropy": 0.05277932183817029, + "epoch": 8.362513113416481, + "grad_norm": 0.09326171875, + "learning_rate": 4.73364507379425e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9998419165611268, + "num_tokens": 114527650.0, + "step": 35875 + }, + { + "entropy": 0.059004738088697196, + "epoch": 8.363678750437114, + "grad_norm": 1.171875, + "learning_rate": 4.73355087789675e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9990606307983398, + "num_tokens": 114549999.0, + "step": 35880 + }, + { + "entropy": 0.05168069005012512, + "epoch": 8.364844387457746, + "grad_norm": 0.47265625, + "learning_rate": 4.733456667333025e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9995020270347595, + "num_tokens": 114571806.0, + "step": 35885 + }, + { + "entropy": 0.053898480255156755, + "epoch": 8.366010024478378, + "grad_norm": 0.859375, + "learning_rate": 4.7333624421044834e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9992337822914124, + "num_tokens": 114591540.0, + "step": 35890 + }, + { + "entropy": 0.06580785913392902, + "epoch": 8.36717566149901, + "grad_norm": 0.447265625, + "learning_rate": 4.733268202212527e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9986721754074097, + "num_tokens": 114608012.0, + "step": 35895 + }, + { + "entropy": 0.07490675896406174, + "epoch": 8.36834129851964, + "grad_norm": 2.40625, + "learning_rate": 4.733173947658564e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9986124873161316, + "num_tokens": 114628929.0, + "step": 35900 + }, + { + "entropy": 0.0590736048296094, + "epoch": 8.369506935540272, + "grad_norm": 0.1337890625, + "learning_rate": 4.733079678443999e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9995599031448364, + "num_tokens": 114641901.0, + "step": 35905 + }, + { + "entropy": 0.08702043211087584, + "epoch": 8.370672572560904, + "grad_norm": 0.2080078125, + "learning_rate": 4.7329853945702366e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.9992570400238037, + "num_tokens": 114662085.0, + "step": 35910 + }, + { + "entropy": 0.06151698585599661, + "epoch": 8.371838209581536, + "grad_norm": 3.65625, + "learning_rate": 4.7328910960386834e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.9983277082443237, + "num_tokens": 114675531.0, + "step": 35915 + }, + { + "entropy": 0.045510869659483436, + "epoch": 8.373003846602169, + "grad_norm": 0.52734375, + "learning_rate": 4.732796782850746e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9992880523204803, + "num_tokens": 114698590.0, + "step": 35920 + }, + { + "entropy": 0.06477670334279537, + "epoch": 8.3741694836228, + "grad_norm": 3.53125, + "learning_rate": 4.73270245500783e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9987984895706177, + "num_tokens": 114710536.0, + "step": 35925 + }, + { + "entropy": 0.0671735213138163, + "epoch": 8.375335120643431, + "grad_norm": 0.5625, + "learning_rate": 4.732608112511343e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9994207143783569, + "num_tokens": 114738586.0, + "step": 35930 + }, + { + "entropy": 0.07747904891148209, + "epoch": 8.376500757664063, + "grad_norm": 0.1796875, + "learning_rate": 4.732513755362691e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9974416434764862, + "num_tokens": 114758164.0, + "step": 35935 + }, + { + "entropy": 0.041080075595527885, + "epoch": 8.377666394684695, + "grad_norm": 0.396484375, + "learning_rate": 4.73241938356328e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9994053184986115, + "num_tokens": 114787467.0, + "step": 35940 + }, + { + "entropy": 0.0725083589553833, + "epoch": 8.378832031705327, + "grad_norm": 4.25, + "learning_rate": 4.73232499711452e-05, + "loss": 0.0147, + "mean_token_accuracy": 0.997485488653183, + "num_tokens": 114798080.0, + "step": 35945 + }, + { + "entropy": 0.04100953293964267, + "epoch": 8.37999766872596, + "grad_norm": 0.2314453125, + "learning_rate": 4.732230596017816e-05, + "loss": 0.0096, + "mean_token_accuracy": 0.9985480844974518, + "num_tokens": 114834393.0, + "step": 35950 + }, + { + "entropy": 0.06174456924200058, + "epoch": 8.38116330574659, + "grad_norm": 0.11962890625, + "learning_rate": 4.732136180274576e-05, + "loss": 0.0082, + "mean_token_accuracy": 0.9990587592124939, + "num_tokens": 114847723.0, + "step": 35955 + }, + { + "entropy": 0.072079146374017, + "epoch": 8.382328942767222, + "grad_norm": 1.375, + "learning_rate": 4.732041749886209e-05, + "loss": 0.0065, + "mean_token_accuracy": 0.9978023052215577, + "num_tokens": 114861770.0, + "step": 35960 + }, + { + "entropy": 0.05037323208525777, + "epoch": 8.383494579787854, + "grad_norm": 0.361328125, + "learning_rate": 4.731947304854122e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9989248156547547, + "num_tokens": 114887553.0, + "step": 35965 + }, + { + "entropy": 0.07071936894208193, + "epoch": 8.384660216808486, + "grad_norm": 1.9765625, + "learning_rate": 4.731852845179724e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9990245401859283, + "num_tokens": 114910384.0, + "step": 35970 + }, + { + "entropy": 0.04616665868088603, + "epoch": 8.385825853829118, + "grad_norm": 0.1845703125, + "learning_rate": 4.731758370864423e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9997251272201538, + "num_tokens": 114954754.0, + "step": 35975 + }, + { + "entropy": 0.07068393416702748, + "epoch": 8.386991490849748, + "grad_norm": 0.09814453125, + "learning_rate": 4.731663881909628e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9998482525348663, + "num_tokens": 114964733.0, + "step": 35980 + }, + { + "entropy": 0.0473877920769155, + "epoch": 8.38815712787038, + "grad_norm": 0.228515625, + "learning_rate": 4.731569378316749e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.999399745464325, + "num_tokens": 114980569.0, + "step": 35985 + }, + { + "entropy": 0.051515743136405945, + "epoch": 8.389322764891013, + "grad_norm": 0.23046875, + "learning_rate": 4.731474860087193e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9992481112480164, + "num_tokens": 115022823.0, + "step": 35990 + }, + { + "entropy": 0.04221371039748192, + "epoch": 8.390488401911645, + "grad_norm": 0.154296875, + "learning_rate": 4.731380327222371e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9989139676094055, + "num_tokens": 115044356.0, + "step": 35995 + }, + { + "entropy": 0.061717442143708466, + "epoch": 8.391654038932277, + "grad_norm": 1.1875, + "learning_rate": 4.7312857797236925e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9987213313579559, + "num_tokens": 115065703.0, + "step": 36000 + }, + { + "entropy": 0.07057226775214076, + "epoch": 8.392819675952909, + "grad_norm": 2.0625, + "learning_rate": 4.731191217592567e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9992031335830689, + "num_tokens": 115092467.0, + "step": 36005 + }, + { + "entropy": 0.08439392279833555, + "epoch": 8.39398531297354, + "grad_norm": 0.265625, + "learning_rate": 4.731096640830405e-05, + "loss": 0.0336, + "mean_token_accuracy": 0.9949727892875672, + "num_tokens": 115130036.0, + "step": 36010 + }, + { + "entropy": 0.04997500581666827, + "epoch": 8.395150949994171, + "grad_norm": 1.109375, + "learning_rate": 4.7310020494386156e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9988471925258636, + "num_tokens": 115150815.0, + "step": 36015 + }, + { + "entropy": 0.07026242911815643, + "epoch": 8.396316587014804, + "grad_norm": 0.1328125, + "learning_rate": 4.730907443418611e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9996152281761169, + "num_tokens": 115160160.0, + "step": 36020 + }, + { + "entropy": 0.05926717910915613, + "epoch": 8.397482224035436, + "grad_norm": 0.3203125, + "learning_rate": 4.730812822771801e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9973244667053223, + "num_tokens": 115177388.0, + "step": 36025 + }, + { + "entropy": 0.06058362293988466, + "epoch": 8.398647861056068, + "grad_norm": 0.1904296875, + "learning_rate": 4.730718187499595e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9996544599533081, + "num_tokens": 115193357.0, + "step": 36030 + }, + { + "entropy": 0.08812159774824976, + "epoch": 8.399813498076698, + "grad_norm": 2.484375, + "learning_rate": 4.730623537603408e-05, + "loss": 0.0112, + "mean_token_accuracy": 0.9962131261825562, + "num_tokens": 115209695.0, + "step": 36035 + }, + { + "entropy": 0.0533585537225008, + "epoch": 8.40097913509733, + "grad_norm": 0.84375, + "learning_rate": 4.730528873084648e-05, + "loss": 0.0082, + "mean_token_accuracy": 0.9977275252342224, + "num_tokens": 115233579.0, + "step": 36040 + }, + { + "entropy": 0.04185967352241278, + "epoch": 8.402144772117962, + "grad_norm": 0.6953125, + "learning_rate": 4.730434193944727e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9987737834453583, + "num_tokens": 115254772.0, + "step": 36045 + }, + { + "entropy": 0.06632612012326718, + "epoch": 8.403310409138594, + "grad_norm": 0.84375, + "learning_rate": 4.730339500185059e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9984843134880066, + "num_tokens": 115266893.0, + "step": 36050 + }, + { + "entropy": 0.0712279126048088, + "epoch": 8.404476046159226, + "grad_norm": 3.84375, + "learning_rate": 4.7302447918070536e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9988064527511596, + "num_tokens": 115292056.0, + "step": 36055 + }, + { + "entropy": 0.05562030803412199, + "epoch": 8.405641683179859, + "grad_norm": 0.5625, + "learning_rate": 4.730150068812124e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.9988888800144196, + "num_tokens": 115312811.0, + "step": 36060 + }, + { + "entropy": 0.06178484875708819, + "epoch": 8.406807320200489, + "grad_norm": 0.076171875, + "learning_rate": 4.730055331201683e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9996735990047455, + "num_tokens": 115328524.0, + "step": 36065 + }, + { + "entropy": 0.04668997749686241, + "epoch": 8.407972957221121, + "grad_norm": 0.5234375, + "learning_rate": 4.729960578977143e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9984108448028565, + "num_tokens": 115349273.0, + "step": 36070 + }, + { + "entropy": 0.06319479513913392, + "epoch": 8.409138594241753, + "grad_norm": 0.369140625, + "learning_rate": 4.729865812139916e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9999687492847442, + "num_tokens": 115369544.0, + "step": 36075 + }, + { + "entropy": 0.05766276512295008, + "epoch": 8.410304231262385, + "grad_norm": 0.10595703125, + "learning_rate": 4.729771030691417e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9991033136844635, + "num_tokens": 115383256.0, + "step": 36080 + }, + { + "entropy": 0.04213843066245317, + "epoch": 8.411469868283017, + "grad_norm": 0.91015625, + "learning_rate": 4.7296762346330576e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.998986440896988, + "num_tokens": 115415891.0, + "step": 36085 + }, + { + "entropy": 0.07678450532257557, + "epoch": 8.412635505303648, + "grad_norm": 1.203125, + "learning_rate": 4.7295814239662525e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9996045112609864, + "num_tokens": 115426929.0, + "step": 36090 + }, + { + "entropy": 0.05705181276425719, + "epoch": 8.41380114232428, + "grad_norm": 0.84765625, + "learning_rate": 4.729486598692414e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9984883546829224, + "num_tokens": 115451739.0, + "step": 36095 + }, + { + "entropy": 0.05112549578770995, + "epoch": 8.414966779344912, + "grad_norm": 0.2265625, + "learning_rate": 4.729391758812958e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9999086081981658, + "num_tokens": 115480556.0, + "step": 36100 + }, + { + "entropy": 0.04785581501200795, + "epoch": 8.416132416365544, + "grad_norm": 0.48046875, + "learning_rate": 4.729296904329298e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9994737505912781, + "num_tokens": 115497081.0, + "step": 36105 + }, + { + "entropy": 0.06632670313119889, + "epoch": 8.417298053386176, + "grad_norm": 0.76953125, + "learning_rate": 4.7292020352428477e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9995335042476654, + "num_tokens": 115506729.0, + "step": 36110 + }, + { + "entropy": 0.059855165053159, + "epoch": 8.418463690406806, + "grad_norm": 0.05908203125, + "learning_rate": 4.729107151555022e-05, + "loss": 0.008, + "mean_token_accuracy": 0.9979570627212524, + "num_tokens": 115520456.0, + "step": 36115 + }, + { + "entropy": 0.0672213020734489, + "epoch": 8.419629327427439, + "grad_norm": 0.2041015625, + "learning_rate": 4.7290122532672366e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9995971262454987, + "num_tokens": 115537752.0, + "step": 36120 + }, + { + "entropy": 0.05041234586387873, + "epoch": 8.42079496444807, + "grad_norm": 0.95703125, + "learning_rate": 4.728917340380905e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9997573018074035, + "num_tokens": 115563759.0, + "step": 36125 + }, + { + "entropy": 0.05824168622493744, + "epoch": 8.421960601468703, + "grad_norm": 0.125, + "learning_rate": 4.7288224128974445e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.998965859413147, + "num_tokens": 115577018.0, + "step": 36130 + }, + { + "entropy": 0.06886055655777454, + "epoch": 8.423126238489335, + "grad_norm": 0.3046875, + "learning_rate": 4.728727470818269e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9999520361423493, + "num_tokens": 115588761.0, + "step": 36135 + }, + { + "entropy": 0.06945041492581368, + "epoch": 8.424291875509967, + "grad_norm": 0.173828125, + "learning_rate": 4.728632514144796e-05, + "loss": 0.0076, + "mean_token_accuracy": 0.9981557667255402, + "num_tokens": 115611607.0, + "step": 36140 + }, + { + "entropy": 0.07045771069824695, + "epoch": 8.425457512530597, + "grad_norm": 0.279296875, + "learning_rate": 4.728537542878439e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9992258012294769, + "num_tokens": 115628332.0, + "step": 36145 + }, + { + "entropy": 0.052793185226619244, + "epoch": 8.42662314955123, + "grad_norm": 0.26953125, + "learning_rate": 4.728442557020616e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9986692845821381, + "num_tokens": 115658797.0, + "step": 36150 + }, + { + "entropy": 0.05012588379904628, + "epoch": 8.427788786571861, + "grad_norm": 1.9609375, + "learning_rate": 4.7283475565727424e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.999320387840271, + "num_tokens": 115687971.0, + "step": 36155 + }, + { + "entropy": 0.07577522285282612, + "epoch": 8.428954423592494, + "grad_norm": 0.52734375, + "learning_rate": 4.7282525415362354e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9989284574985504, + "num_tokens": 115701010.0, + "step": 36160 + }, + { + "entropy": 0.0740378656424582, + "epoch": 8.430120060613126, + "grad_norm": 1.859375, + "learning_rate": 4.7281575119125124e-05, + "loss": 0.0062, + "mean_token_accuracy": 0.9979305446147919, + "num_tokens": 115718645.0, + "step": 36165 + }, + { + "entropy": 0.06981781832873821, + "epoch": 8.431285697633756, + "grad_norm": 0.203125, + "learning_rate": 4.7280624677029886e-05, + "loss": 0.0078, + "mean_token_accuracy": 0.9985656261444091, + "num_tokens": 115739479.0, + "step": 36170 + }, + { + "entropy": 0.07181575652211905, + "epoch": 8.432451334654388, + "grad_norm": 0.419921875, + "learning_rate": 4.7279674089090833e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.999373197555542, + "num_tokens": 115760286.0, + "step": 36175 + }, + { + "entropy": 0.07071037925779819, + "epoch": 8.43361697167502, + "grad_norm": 1.5859375, + "learning_rate": 4.727872335532212e-05, + "loss": 0.0061, + "mean_token_accuracy": 0.9976220965385437, + "num_tokens": 115769961.0, + "step": 36180 + }, + { + "entropy": 0.09611797733232379, + "epoch": 8.434782608695652, + "grad_norm": 5.28125, + "learning_rate": 4.727777247573794e-05, + "loss": 0.0296, + "mean_token_accuracy": 0.9970374286174775, + "num_tokens": 115795424.0, + "step": 36185 + }, + { + "entropy": 0.04846240486949682, + "epoch": 8.435948245716284, + "grad_norm": 0.94140625, + "learning_rate": 4.727682145035246e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9988361775875092, + "num_tokens": 115831652.0, + "step": 36190 + }, + { + "entropy": 0.057739018369466065, + "epoch": 8.437113882736917, + "grad_norm": 0.1474609375, + "learning_rate": 4.727587027917987e-05, + "loss": 0.0083, + "mean_token_accuracy": 0.9980798184871673, + "num_tokens": 115858353.0, + "step": 36195 + }, + { + "entropy": 0.06929563917219639, + "epoch": 8.438279519757547, + "grad_norm": 1.90625, + "learning_rate": 4.727491896223435e-05, + "loss": 0.0136, + "mean_token_accuracy": 0.9970984637737275, + "num_tokens": 115887873.0, + "step": 36200 + }, + { + "entropy": 0.05829632971435785, + "epoch": 8.439445156778179, + "grad_norm": 0.2080078125, + "learning_rate": 4.727396749953009e-05, + "loss": 0.005, + "mean_token_accuracy": 0.9983768701553345, + "num_tokens": 115904952.0, + "step": 36205 + }, + { + "entropy": 0.061249536275863645, + "epoch": 8.440610793798811, + "grad_norm": 0.98828125, + "learning_rate": 4.727301589108127e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9995396554470062, + "num_tokens": 115916917.0, + "step": 36210 + }, + { + "entropy": 0.05658513549715281, + "epoch": 8.441776430819443, + "grad_norm": 2.78125, + "learning_rate": 4.7272064136902085e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.998557734489441, + "num_tokens": 115934530.0, + "step": 36215 + }, + { + "entropy": 0.049229751247912644, + "epoch": 8.442942067840075, + "grad_norm": 0.83984375, + "learning_rate": 4.727111223700672e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9983462572097779, + "num_tokens": 115974940.0, + "step": 36220 + }, + { + "entropy": 0.06888475380837918, + "epoch": 8.444107704860706, + "grad_norm": 0.6015625, + "learning_rate": 4.727016019140938e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9989739239215851, + "num_tokens": 115997818.0, + "step": 36225 + }, + { + "entropy": 0.06635741349309683, + "epoch": 8.445273341881338, + "grad_norm": 1.6875, + "learning_rate": 4.7269208000124256e-05, + "loss": 0.0167, + "mean_token_accuracy": 0.9960039258003235, + "num_tokens": 116007111.0, + "step": 36230 + }, + { + "entropy": 0.06352852582931519, + "epoch": 8.44643897890197, + "grad_norm": 0.671875, + "learning_rate": 4.726825566316555e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9989105463027954, + "num_tokens": 116019964.0, + "step": 36235 + }, + { + "entropy": 0.06089718565344811, + "epoch": 8.447604615922602, + "grad_norm": 0.369140625, + "learning_rate": 4.726730318054745e-05, + "loss": 0.0099, + "mean_token_accuracy": 0.9988169968128204, + "num_tokens": 116045247.0, + "step": 36240 + }, + { + "entropy": 0.044029067549854514, + "epoch": 8.448770252943234, + "grad_norm": 0.75390625, + "learning_rate": 4.726635055228418e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9987780928611756, + "num_tokens": 116067328.0, + "step": 36245 + }, + { + "entropy": 0.050043806247413156, + "epoch": 8.449935889963864, + "grad_norm": 0.2099609375, + "learning_rate": 4.726539777838993e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9993367373943329, + "num_tokens": 116086931.0, + "step": 36250 + }, + { + "entropy": 0.060326000954955813, + "epoch": 8.451101526984496, + "grad_norm": 0.408203125, + "learning_rate": 4.726444485887891e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9995485246181488, + "num_tokens": 116112104.0, + "step": 36255 + }, + { + "entropy": 0.06028610188513994, + "epoch": 8.452267164005129, + "grad_norm": 2.734375, + "learning_rate": 4.726349179376533e-05, + "loss": 0.0055, + "mean_token_accuracy": 0.9986335098743438, + "num_tokens": 116128103.0, + "step": 36260 + }, + { + "entropy": 0.07007607705891132, + "epoch": 8.45343280102576, + "grad_norm": 0.765625, + "learning_rate": 4.7262538583063404e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9987308144569397, + "num_tokens": 116147244.0, + "step": 36265 + }, + { + "entropy": 0.0728133057244122, + "epoch": 8.454598438046393, + "grad_norm": 1.046875, + "learning_rate": 4.726158522678734e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9991988897323608, + "num_tokens": 116165431.0, + "step": 36270 + }, + { + "entropy": 0.0499640004709363, + "epoch": 8.455764075067025, + "grad_norm": 1.6171875, + "learning_rate": 4.726063172495137e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.9992352545261383, + "num_tokens": 116184587.0, + "step": 36275 + }, + { + "entropy": 0.06305303145200014, + "epoch": 8.456929712087655, + "grad_norm": 0.058349609375, + "learning_rate": 4.725967807756969e-05, + "loss": 0.0059, + "mean_token_accuracy": 0.9978918075561524, + "num_tokens": 116197306.0, + "step": 36280 + }, + { + "entropy": 0.06247878428548574, + "epoch": 8.458095349108287, + "grad_norm": 0.2001953125, + "learning_rate": 4.725872428465653e-05, + "loss": 0.0078, + "mean_token_accuracy": 0.9982529759407044, + "num_tokens": 116215388.0, + "step": 36285 + }, + { + "entropy": 0.05539757264778018, + "epoch": 8.45926098612892, + "grad_norm": 0.3515625, + "learning_rate": 4.725777034622611e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9996000409126282, + "num_tokens": 116242407.0, + "step": 36290 + }, + { + "entropy": 0.05653154449537397, + "epoch": 8.460426623149552, + "grad_norm": 0.2294921875, + "learning_rate": 4.7256816262292665e-05, + "loss": 0.0085, + "mean_token_accuracy": 0.9972741007804871, + "num_tokens": 116268238.0, + "step": 36295 + }, + { + "entropy": 0.077280986122787, + "epoch": 8.461592260170184, + "grad_norm": 0.037109375, + "learning_rate": 4.725586203287041e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9990314662456512, + "num_tokens": 116282961.0, + "step": 36300 + }, + { + "entropy": 0.05508853131905198, + "epoch": 8.462757897190814, + "grad_norm": 0.380859375, + "learning_rate": 4.725490765797358e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9997860372066498, + "num_tokens": 116311225.0, + "step": 36305 + }, + { + "entropy": 0.058704734221100806, + "epoch": 8.463923534211446, + "grad_norm": 0.16015625, + "learning_rate": 4.725395313761641e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9997250258922576, + "num_tokens": 116325286.0, + "step": 36310 + }, + { + "entropy": 0.05704579222947359, + "epoch": 8.465089171232078, + "grad_norm": 0.2890625, + "learning_rate": 4.725299847181312e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9993097305297851, + "num_tokens": 116338229.0, + "step": 36315 + }, + { + "entropy": 0.05245425468310714, + "epoch": 8.46625480825271, + "grad_norm": 0.095703125, + "learning_rate": 4.725204366057796e-05, + "loss": 0.0115, + "mean_token_accuracy": 0.9979504764080047, + "num_tokens": 116356198.0, + "step": 36320 + }, + { + "entropy": 0.05498585319146514, + "epoch": 8.467420445273342, + "grad_norm": 0.2119140625, + "learning_rate": 4.725108870392516e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9978257954120636, + "num_tokens": 116381242.0, + "step": 36325 + }, + { + "entropy": 0.08065539970993996, + "epoch": 8.468586082293974, + "grad_norm": 0.51171875, + "learning_rate": 4.725013360186895e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9993955552577972, + "num_tokens": 116394168.0, + "step": 36330 + }, + { + "entropy": 0.07359606567770242, + "epoch": 8.469751719314605, + "grad_norm": 2.1875, + "learning_rate": 4.72491783544236e-05, + "loss": 0.0133, + "mean_token_accuracy": 0.9973003745079041, + "num_tokens": 116428556.0, + "step": 36335 + }, + { + "entropy": 0.061815372481942174, + "epoch": 8.470917356335237, + "grad_norm": 0.59375, + "learning_rate": 4.724822296160332e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9987725913524628, + "num_tokens": 116443891.0, + "step": 36340 + }, + { + "entropy": 0.0534939656034112, + "epoch": 8.472082993355869, + "grad_norm": 0.146484375, + "learning_rate": 4.7247267423422386e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9995495617389679, + "num_tokens": 116468462.0, + "step": 36345 + }, + { + "entropy": 0.058882415667176245, + "epoch": 8.473248630376501, + "grad_norm": 0.12890625, + "learning_rate": 4.7246311739895035e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9995469570159912, + "num_tokens": 116485331.0, + "step": 36350 + }, + { + "entropy": 0.06126846736297011, + "epoch": 8.474414267397133, + "grad_norm": 0.185546875, + "learning_rate": 4.7245355911035506e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9989485740661621, + "num_tokens": 116499399.0, + "step": 36355 + }, + { + "entropy": 0.06335963495075703, + "epoch": 8.475579904417764, + "grad_norm": 0.9921875, + "learning_rate": 4.724439993685807e-05, + "loss": 0.0053, + "mean_token_accuracy": 0.9982250690460205, + "num_tokens": 116514649.0, + "step": 36360 + }, + { + "entropy": 0.06798462234437466, + "epoch": 8.476745541438396, + "grad_norm": 2.21875, + "learning_rate": 4.724344381737697e-05, + "loss": 0.0097, + "mean_token_accuracy": 0.9951762199401856, + "num_tokens": 116524731.0, + "step": 36365 + }, + { + "entropy": 0.047466957662254575, + "epoch": 8.477911178459028, + "grad_norm": 0.2890625, + "learning_rate": 4.7242487552606475e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9994238436222076, + "num_tokens": 116555864.0, + "step": 36370 + }, + { + "entropy": 0.055692866630852225, + "epoch": 8.47907681547966, + "grad_norm": 0.4765625, + "learning_rate": 4.724153114256083e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9994192600250245, + "num_tokens": 116575010.0, + "step": 36375 + }, + { + "entropy": 0.06099151102825999, + "epoch": 8.480242452500292, + "grad_norm": 0.1552734375, + "learning_rate": 4.7240574587254304e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9995291829109192, + "num_tokens": 116592779.0, + "step": 36380 + }, + { + "entropy": 0.06979598198086023, + "epoch": 8.481408089520922, + "grad_norm": 0.91015625, + "learning_rate": 4.723961788670117e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9991238117218018, + "num_tokens": 116612943.0, + "step": 36385 + }, + { + "entropy": 0.054224473610520366, + "epoch": 8.482573726541554, + "grad_norm": 0.484375, + "learning_rate": 4.7238661040915675e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9984688460826874, + "num_tokens": 116636565.0, + "step": 36390 + }, + { + "entropy": 0.054101691022515296, + "epoch": 8.483739363562186, + "grad_norm": 0.546875, + "learning_rate": 4.7237704049912095e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9988676548004151, + "num_tokens": 116657239.0, + "step": 36395 + }, + { + "entropy": 0.06508222203701734, + "epoch": 8.484905000582819, + "grad_norm": 0.59765625, + "learning_rate": 4.723674691370471e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9983641326427459, + "num_tokens": 116670991.0, + "step": 36400 + }, + { + "entropy": 0.060155941732227804, + "epoch": 8.48607063760345, + "grad_norm": 0.73046875, + "learning_rate": 4.723578963230777e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9993346929550171, + "num_tokens": 116684413.0, + "step": 36405 + }, + { + "entropy": 0.06790580712258816, + "epoch": 8.487236274624083, + "grad_norm": 1.3125, + "learning_rate": 4.723483220573557e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.9989775061607361, + "num_tokens": 116697951.0, + "step": 36410 + }, + { + "entropy": 0.09916951693594456, + "epoch": 8.488401911644713, + "grad_norm": 2.015625, + "learning_rate": 4.723387463400238e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9989169955253601, + "num_tokens": 116705777.0, + "step": 36415 + }, + { + "entropy": 0.05130702285096049, + "epoch": 8.489567548665345, + "grad_norm": 1.75, + "learning_rate": 4.723291691712248e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9996195018291474, + "num_tokens": 116730161.0, + "step": 36420 + }, + { + "entropy": 0.04962422214448452, + "epoch": 8.490733185685977, + "grad_norm": 0.2451171875, + "learning_rate": 4.723195905511015e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9991441249847413, + "num_tokens": 116748972.0, + "step": 36425 + }, + { + "entropy": 0.04863788112998009, + "epoch": 8.49189882270661, + "grad_norm": 0.2373046875, + "learning_rate": 4.723100104797967e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9993934810161591, + "num_tokens": 116778203.0, + "step": 36430 + }, + { + "entropy": 0.04388864999637008, + "epoch": 8.493064459727242, + "grad_norm": 0.3203125, + "learning_rate": 4.723004289574533e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9993597865104675, + "num_tokens": 116802243.0, + "step": 36435 + }, + { + "entropy": 0.06913177091628313, + "epoch": 8.494230096747872, + "grad_norm": 0.51953125, + "learning_rate": 4.722908459842141e-05, + "loss": 0.005, + "mean_token_accuracy": 0.998873645067215, + "num_tokens": 116827393.0, + "step": 36440 + }, + { + "entropy": 0.07518032267689705, + "epoch": 8.495395733768504, + "grad_norm": 3.625, + "learning_rate": 4.7228126156022216e-05, + "loss": 0.0068, + "mean_token_accuracy": 0.9983530879020691, + "num_tokens": 116838490.0, + "step": 36445 + }, + { + "entropy": 0.07093150406144559, + "epoch": 8.496561370789136, + "grad_norm": 0.76171875, + "learning_rate": 4.722716756856203e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9985154151916504, + "num_tokens": 116855780.0, + "step": 36450 + }, + { + "entropy": 0.06148874796926975, + "epoch": 8.497727007809768, + "grad_norm": 0.271484375, + "learning_rate": 4.7226208836055136e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9992707073688507, + "num_tokens": 116873275.0, + "step": 36455 + }, + { + "entropy": 0.05633387309499085, + "epoch": 8.4988926448304, + "grad_norm": 0.357421875, + "learning_rate": 4.7225249958515836e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9990444958209992, + "num_tokens": 116897277.0, + "step": 36460 + }, + { + "entropy": 0.06432565553113818, + "epoch": 8.500058281851032, + "grad_norm": 2.546875, + "learning_rate": 4.7224290935958444e-05, + "loss": 0.0059, + "mean_token_accuracy": 0.9976148426532745, + "num_tokens": 116919333.0, + "step": 36465 + }, + { + "entropy": 0.05420015938580036, + "epoch": 8.501223918871663, + "grad_norm": 0.1484375, + "learning_rate": 4.722333176839724e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9995829820632934, + "num_tokens": 116933248.0, + "step": 36470 + }, + { + "entropy": 0.055879263393580916, + "epoch": 8.502389555892295, + "grad_norm": 0.2392578125, + "learning_rate": 4.7222372455846536e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.9986678183078765, + "num_tokens": 116946484.0, + "step": 36475 + }, + { + "entropy": 0.07837851643562317, + "epoch": 8.503555192912927, + "grad_norm": 0.58203125, + "learning_rate": 4.7221412998320636e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9984042167663574, + "num_tokens": 116956517.0, + "step": 36480 + }, + { + "entropy": 0.07617838717997075, + "epoch": 8.504720829933559, + "grad_norm": 2.390625, + "learning_rate": 4.722045339583385e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.9988938868045807, + "num_tokens": 116977417.0, + "step": 36485 + }, + { + "entropy": 0.04583414709195495, + "epoch": 8.505886466954191, + "grad_norm": 0.75390625, + "learning_rate": 4.7219493648400474e-05, + "loss": 0.0071, + "mean_token_accuracy": 0.9980126678943634, + "num_tokens": 117002359.0, + "step": 36490 + }, + { + "entropy": 0.041270645707845686, + "epoch": 8.507052103974821, + "grad_norm": 0.193359375, + "learning_rate": 4.7218533756034835e-05, + "loss": 0.0068, + "mean_token_accuracy": 0.9986809670925141, + "num_tokens": 117035657.0, + "step": 36495 + }, + { + "entropy": 0.05645858906209469, + "epoch": 8.508217740995454, + "grad_norm": 1.6171875, + "learning_rate": 4.7217573718751243e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.999339473247528, + "num_tokens": 117052075.0, + "step": 36500 + }, + { + "entropy": 0.0792703942861408, + "epoch": 8.509383378016086, + "grad_norm": 0.099609375, + "learning_rate": 4.7216613536564005e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9995384693145752, + "num_tokens": 117070170.0, + "step": 36505 + }, + { + "entropy": 0.058775024861097334, + "epoch": 8.510549015036718, + "grad_norm": 0.1650390625, + "learning_rate": 4.7215653209487444e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9987652480602265, + "num_tokens": 117088783.0, + "step": 36510 + }, + { + "entropy": 0.052767443284392354, + "epoch": 8.51171465205735, + "grad_norm": 1.125, + "learning_rate": 4.721469273753588e-05, + "loss": 0.0071, + "mean_token_accuracy": 0.9984904825687408, + "num_tokens": 117114387.0, + "step": 36515 + }, + { + "entropy": 0.05728981187567115, + "epoch": 8.51288028907798, + "grad_norm": 0.2890625, + "learning_rate": 4.721373212072364e-05, + "loss": 0.0049, + "mean_token_accuracy": 0.9978182494640351, + "num_tokens": 117135319.0, + "step": 36520 + }, + { + "entropy": 0.07348077250644565, + "epoch": 8.514045926098612, + "grad_norm": 2.03125, + "learning_rate": 4.721277135906504e-05, + "loss": 0.0077, + "mean_token_accuracy": 0.9993664264678955, + "num_tokens": 117158108.0, + "step": 36525 + }, + { + "entropy": 0.04832636620849371, + "epoch": 8.515211563119244, + "grad_norm": 0.439453125, + "learning_rate": 4.7211810452574415e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.997581273317337, + "num_tokens": 117186605.0, + "step": 36530 + }, + { + "entropy": 0.06315313652157784, + "epoch": 8.516377200139877, + "grad_norm": 0.171875, + "learning_rate": 4.721084940126609e-05, + "loss": 0.0011, + "mean_token_accuracy": 1.0, + "num_tokens": 117197974.0, + "step": 36535 + }, + { + "entropy": 0.06376685537397861, + "epoch": 8.517542837160509, + "grad_norm": 1.6015625, + "learning_rate": 4.720988820515439e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9981851637363434, + "num_tokens": 117208905.0, + "step": 36540 + }, + { + "entropy": 0.05537942638620734, + "epoch": 8.51870847418114, + "grad_norm": 0.296875, + "learning_rate": 4.7208926864253655e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9994171023368835, + "num_tokens": 117232817.0, + "step": 36545 + }, + { + "entropy": 0.05915795974433422, + "epoch": 8.519874111201771, + "grad_norm": 0.2255859375, + "learning_rate": 4.7207965378578215e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9996034562587738, + "num_tokens": 117245430.0, + "step": 36550 + }, + { + "entropy": 0.05175903402268887, + "epoch": 8.521039748222403, + "grad_norm": 0.18359375, + "learning_rate": 4.720700374814241e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.9987463653087616, + "num_tokens": 117270560.0, + "step": 36555 + }, + { + "entropy": 0.06156531311571598, + "epoch": 8.522205385243035, + "grad_norm": 0.6484375, + "learning_rate": 4.720604197296058e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9983774304389954, + "num_tokens": 117280633.0, + "step": 36560 + }, + { + "entropy": 0.0584191894158721, + "epoch": 8.523371022263667, + "grad_norm": 0.421875, + "learning_rate": 4.720508005304706e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9979436278343201, + "num_tokens": 117301478.0, + "step": 36565 + }, + { + "entropy": 0.05222648493945599, + "epoch": 8.5245366592843, + "grad_norm": 0.1904296875, + "learning_rate": 4.720411798841621e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9989825844764709, + "num_tokens": 117313296.0, + "step": 36570 + }, + { + "entropy": 0.056239666417241096, + "epoch": 8.52570229630493, + "grad_norm": 2.21875, + "learning_rate": 4.720315577908236e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9994934022426605, + "num_tokens": 117336909.0, + "step": 36575 + }, + { + "entropy": 0.07502004820853472, + "epoch": 8.526867933325562, + "grad_norm": 1.9609375, + "learning_rate": 4.720219342505986e-05, + "loss": 0.016, + "mean_token_accuracy": 0.9961349189281463, + "num_tokens": 117354325.0, + "step": 36580 + }, + { + "entropy": 0.08849140480160714, + "epoch": 8.528033570346194, + "grad_norm": 3.296875, + "learning_rate": 4.7201230926363065e-05, + "loss": 0.0082, + "mean_token_accuracy": 0.9983762741088867, + "num_tokens": 117364550.0, + "step": 36585 + }, + { + "entropy": 0.05067680682986975, + "epoch": 8.529199207366826, + "grad_norm": 0.69140625, + "learning_rate": 4.720026828300632e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9997664391994476, + "num_tokens": 117384001.0, + "step": 36590 + }, + { + "entropy": 0.07895645070821047, + "epoch": 8.530364844387458, + "grad_norm": 0.337890625, + "learning_rate": 4.7199305495003995e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.998946338891983, + "num_tokens": 117397445.0, + "step": 36595 + }, + { + "entropy": 0.049645941238850355, + "epoch": 8.53153048140809, + "grad_norm": 3.4375, + "learning_rate": 4.7198342562370436e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9991376340389252, + "num_tokens": 117420151.0, + "step": 36600 + }, + { + "entropy": 0.06408117078244686, + "epoch": 8.53269611842872, + "grad_norm": 0.408203125, + "learning_rate": 4.719737948512e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9996393918991089, + "num_tokens": 117434708.0, + "step": 36605 + }, + { + "entropy": 0.06821114458143711, + "epoch": 8.533861755449353, + "grad_norm": 1.5234375, + "learning_rate": 4.719641626326704e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9991869747638702, + "num_tokens": 117445971.0, + "step": 36610 + }, + { + "entropy": 0.05026884274557233, + "epoch": 8.535027392469985, + "grad_norm": 0.2177734375, + "learning_rate": 4.719545289682594e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.998915296792984, + "num_tokens": 117478835.0, + "step": 36615 + }, + { + "entropy": 0.05111234989017248, + "epoch": 8.536193029490617, + "grad_norm": 0.07666015625, + "learning_rate": 4.719448938581105e-05, + "loss": 0.0073, + "mean_token_accuracy": 0.998233848810196, + "num_tokens": 117510505.0, + "step": 36620 + }, + { + "entropy": 0.056202891655266284, + "epoch": 8.537358666511249, + "grad_norm": 0.3671875, + "learning_rate": 4.719352573023674e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9993539452552795, + "num_tokens": 117535677.0, + "step": 36625 + }, + { + "entropy": 0.05016994327306747, + "epoch": 8.53852430353188, + "grad_norm": 0.353515625, + "learning_rate": 4.719256193011739e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9991106808185577, + "num_tokens": 117561357.0, + "step": 36630 + }, + { + "entropy": 0.04156949240714312, + "epoch": 8.539689940552512, + "grad_norm": 0.2060546875, + "learning_rate": 4.719159798546736e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9992599308490753, + "num_tokens": 117594891.0, + "step": 36635 + }, + { + "entropy": 0.04975649286061525, + "epoch": 8.540855577573144, + "grad_norm": 0.25390625, + "learning_rate": 4.719063389630103e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9999047815799713, + "num_tokens": 117627847.0, + "step": 36640 + }, + { + "entropy": 0.060001560300588605, + "epoch": 8.542021214593776, + "grad_norm": 1.1328125, + "learning_rate": 4.7189669662632765e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9989645898342132, + "num_tokens": 117641464.0, + "step": 36645 + }, + { + "entropy": 0.07360632680356502, + "epoch": 8.543186851614408, + "grad_norm": 0.68359375, + "learning_rate": 4.7188705284476955e-05, + "loss": 0.004, + "mean_token_accuracy": 0.998823744058609, + "num_tokens": 117650419.0, + "step": 36650 + }, + { + "entropy": 0.07439129706472158, + "epoch": 8.544352488635038, + "grad_norm": 0.5234375, + "learning_rate": 4.7187740761847974e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9991745054721832, + "num_tokens": 117666198.0, + "step": 36655 + }, + { + "entropy": 0.07077626138925552, + "epoch": 8.54551812565567, + "grad_norm": 0.4140625, + "learning_rate": 4.718677609476021e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9988212764263154, + "num_tokens": 117685003.0, + "step": 36660 + }, + { + "entropy": 0.06650689067319035, + "epoch": 8.546683762676302, + "grad_norm": 0.2490234375, + "learning_rate": 4.7185811283228046e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9989586055278779, + "num_tokens": 117708501.0, + "step": 36665 + }, + { + "entropy": 0.05205532372929156, + "epoch": 8.547849399696934, + "grad_norm": 3.265625, + "learning_rate": 4.7184846327265865e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9995387613773346, + "num_tokens": 117729794.0, + "step": 36670 + }, + { + "entropy": 0.05996727123856545, + "epoch": 8.549015036717567, + "grad_norm": 0.296875, + "learning_rate": 4.718388122688806e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9982151925563812, + "num_tokens": 117759396.0, + "step": 36675 + }, + { + "entropy": 0.08831366430968046, + "epoch": 8.550180673738199, + "grad_norm": 0.8984375, + "learning_rate": 4.718291598210902e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9998996675014495, + "num_tokens": 117771998.0, + "step": 36680 + }, + { + "entropy": 0.049416807992383835, + "epoch": 8.551346310758829, + "grad_norm": 0.859375, + "learning_rate": 4.7181950592943134e-05, + "loss": 0.0084, + "mean_token_accuracy": 0.997665536403656, + "num_tokens": 117817626.0, + "step": 36685 + }, + { + "entropy": 0.07952840253710747, + "epoch": 8.552511947779461, + "grad_norm": 0.34375, + "learning_rate": 4.718098505940481e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.999513429403305, + "num_tokens": 117831508.0, + "step": 36690 + }, + { + "entropy": 0.06541987545788289, + "epoch": 8.553677584800093, + "grad_norm": 0.421875, + "learning_rate": 4.7180019381508435e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9980966448783875, + "num_tokens": 117859078.0, + "step": 36695 + }, + { + "entropy": 0.06116861402988434, + "epoch": 8.554843221820725, + "grad_norm": 0.74609375, + "learning_rate": 4.717905355926841e-05, + "loss": 0.0155, + "mean_token_accuracy": 0.9986841559410096, + "num_tokens": 117871420.0, + "step": 36700 + }, + { + "entropy": 0.0664717435836792, + "epoch": 8.556008858841357, + "grad_norm": 0.265625, + "learning_rate": 4.717808759269914e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9988701105117798, + "num_tokens": 117882255.0, + "step": 36705 + }, + { + "entropy": 0.0731908904388547, + "epoch": 8.557174495861988, + "grad_norm": 2.828125, + "learning_rate": 4.717712148181503e-05, + "loss": 0.0065, + "mean_token_accuracy": 0.9986431419849395, + "num_tokens": 117899097.0, + "step": 36710 + }, + { + "entropy": 0.05931155104190111, + "epoch": 8.55834013288262, + "grad_norm": 1.34375, + "learning_rate": 4.7176155226630476e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9985710024833679, + "num_tokens": 117919256.0, + "step": 36715 + }, + { + "entropy": 0.05108649954199791, + "epoch": 8.559505769903252, + "grad_norm": 0.2177734375, + "learning_rate": 4.7175188827159897e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9996058583259583, + "num_tokens": 117940881.0, + "step": 36720 + }, + { + "entropy": 0.05063903266564011, + "epoch": 8.560671406923884, + "grad_norm": 2.296875, + "learning_rate": 4.71742222834177e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9989172637462616, + "num_tokens": 117958438.0, + "step": 36725 + }, + { + "entropy": 0.04010082418099046, + "epoch": 8.561837043944516, + "grad_norm": 0.2353515625, + "learning_rate": 4.71732555954183e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9995945632457733, + "num_tokens": 117985829.0, + "step": 36730 + }, + { + "entropy": 0.03602319527417421, + "epoch": 8.563002680965148, + "grad_norm": 0.169921875, + "learning_rate": 4.717228876317611e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9995147943496704, + "num_tokens": 118015283.0, + "step": 36735 + }, + { + "entropy": 0.041568309720605615, + "epoch": 8.564168317985779, + "grad_norm": 0.435546875, + "learning_rate": 4.7171321786705544e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9983395159244537, + "num_tokens": 118055251.0, + "step": 36740 + }, + { + "entropy": 0.09163246341049672, + "epoch": 8.56533395500641, + "grad_norm": 0.11572265625, + "learning_rate": 4.717035466602103e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9996281206607819, + "num_tokens": 118073912.0, + "step": 36745 + }, + { + "entropy": 0.050749783497303726, + "epoch": 8.566499592027043, + "grad_norm": 0.703125, + "learning_rate": 4.7169387401136976e-05, + "loss": 0.005, + "mean_token_accuracy": 0.9987952232360839, + "num_tokens": 118095700.0, + "step": 36750 + }, + { + "entropy": 0.06472921185195446, + "epoch": 8.567665229047675, + "grad_norm": 0.369140625, + "learning_rate": 4.7168419992067816e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.999310165643692, + "num_tokens": 118106780.0, + "step": 36755 + }, + { + "entropy": 0.0620553707703948, + "epoch": 8.568830866068307, + "grad_norm": 0.435546875, + "learning_rate": 4.716745243882797e-05, + "loss": 0.005, + "mean_token_accuracy": 0.9971357882022858, + "num_tokens": 118127361.0, + "step": 36760 + }, + { + "entropy": 0.0793293721973896, + "epoch": 8.569996503088937, + "grad_norm": 1.9140625, + "learning_rate": 4.7166484741431865e-05, + "loss": 0.0073, + "mean_token_accuracy": 0.9988360643386841, + "num_tokens": 118142418.0, + "step": 36765 + }, + { + "entropy": 0.05423343572765589, + "epoch": 8.57116214010957, + "grad_norm": 0.7890625, + "learning_rate": 4.7165516899893934e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9994962751865387, + "num_tokens": 118154318.0, + "step": 36770 + }, + { + "entropy": 0.060041078738868235, + "epoch": 8.572327777130202, + "grad_norm": 0.2255859375, + "learning_rate": 4.716454891422861e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9993019044399262, + "num_tokens": 118176358.0, + "step": 36775 + }, + { + "entropy": 0.06099687637761235, + "epoch": 8.573493414150834, + "grad_norm": 1.5234375, + "learning_rate": 4.716358078445033e-05, + "loss": 0.0062, + "mean_token_accuracy": 0.9983492612838745, + "num_tokens": 118195838.0, + "step": 36780 + }, + { + "entropy": 0.06963885650038719, + "epoch": 8.574659051171466, + "grad_norm": 2.875, + "learning_rate": 4.716261251057352e-05, + "loss": 0.0069, + "mean_token_accuracy": 0.9993766665458679, + "num_tokens": 118206136.0, + "step": 36785 + }, + { + "entropy": 0.052096801623702046, + "epoch": 8.575824688192096, + "grad_norm": 0.27734375, + "learning_rate": 4.7161644092612624e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9995839893817902, + "num_tokens": 118227387.0, + "step": 36790 + }, + { + "entropy": 0.04954950464889407, + "epoch": 8.576990325212728, + "grad_norm": 0.29296875, + "learning_rate": 4.7160675530582084e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9993766367435455, + "num_tokens": 118253458.0, + "step": 36795 + }, + { + "entropy": 0.0848664847202599, + "epoch": 8.57815596223336, + "grad_norm": 0.7421875, + "learning_rate": 4.715970682449634e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9990188479423523, + "num_tokens": 118277832.0, + "step": 36800 + }, + { + "entropy": 0.05834921356290579, + "epoch": 8.579321599253992, + "grad_norm": 2.234375, + "learning_rate": 4.715873797436984e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9983950853347778, + "num_tokens": 118291990.0, + "step": 36805 + }, + { + "entropy": 0.0686840882524848, + "epoch": 8.580487236274625, + "grad_norm": 0.349609375, + "learning_rate": 4.715776898021702e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9983816087245941, + "num_tokens": 118316014.0, + "step": 36810 + }, + { + "entropy": 0.07611836092546582, + "epoch": 8.581652873295257, + "grad_norm": 1.21875, + "learning_rate": 4.715679984205236e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9976979672908783, + "num_tokens": 118333823.0, + "step": 36815 + }, + { + "entropy": 0.07102014757692814, + "epoch": 8.582818510315887, + "grad_norm": 0.87109375, + "learning_rate": 4.715583055989027e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9995433807373046, + "num_tokens": 118343033.0, + "step": 36820 + }, + { + "entropy": 0.049099778011441234, + "epoch": 8.583984147336519, + "grad_norm": 0.53515625, + "learning_rate": 4.715486113374523e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9993889331817627, + "num_tokens": 118362082.0, + "step": 36825 + }, + { + "entropy": 0.05649897027760744, + "epoch": 8.585149784357151, + "grad_norm": 0.283203125, + "learning_rate": 4.715389156363169e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9999586164951324, + "num_tokens": 118376154.0, + "step": 36830 + }, + { + "entropy": 0.050150804314762355, + "epoch": 8.586315421377783, + "grad_norm": 0.1259765625, + "learning_rate": 4.71529218495641e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9996409952640534, + "num_tokens": 118407431.0, + "step": 36835 + }, + { + "entropy": 0.07235877737402915, + "epoch": 8.587481058398415, + "grad_norm": 5.21875, + "learning_rate": 4.715195199155694e-05, + "loss": 0.0092, + "mean_token_accuracy": 0.9982260286808013, + "num_tokens": 118426403.0, + "step": 36840 + }, + { + "entropy": 0.05380323426797986, + "epoch": 8.588646695419046, + "grad_norm": 1.1171875, + "learning_rate": 4.7150981989624646e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9996935486793518, + "num_tokens": 118440910.0, + "step": 36845 + }, + { + "entropy": 0.06442772559821605, + "epoch": 8.589812332439678, + "grad_norm": 0.74609375, + "learning_rate": 4.71500118437817e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9995631098747253, + "num_tokens": 118454440.0, + "step": 36850 + }, + { + "entropy": 0.06424436261877417, + "epoch": 8.59097796946031, + "grad_norm": 0.5234375, + "learning_rate": 4.714904155404256e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9995743334293365, + "num_tokens": 118477017.0, + "step": 36855 + }, + { + "entropy": 0.06407613288611173, + "epoch": 8.592143606480942, + "grad_norm": 0.384765625, + "learning_rate": 4.71480711204217e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9990883469581604, + "num_tokens": 118498748.0, + "step": 36860 + }, + { + "entropy": 0.06708194902166724, + "epoch": 8.593309243501574, + "grad_norm": 0.2490234375, + "learning_rate": 4.7147100542933585e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9988902091979981, + "num_tokens": 118511220.0, + "step": 36865 + }, + { + "entropy": 0.05490873893722892, + "epoch": 8.594474880522206, + "grad_norm": 0.19140625, + "learning_rate": 4.714612982159269e-05, + "loss": 0.0099, + "mean_token_accuracy": 0.9985246181488037, + "num_tokens": 118531407.0, + "step": 36870 + }, + { + "entropy": 0.05289556067436933, + "epoch": 8.595640517542837, + "grad_norm": 0.4609375, + "learning_rate": 4.71451589564135e-05, + "loss": 0.0076, + "mean_token_accuracy": 0.9989979445934296, + "num_tokens": 118553472.0, + "step": 36875 + }, + { + "entropy": 0.056383601669222114, + "epoch": 8.596806154563469, + "grad_norm": 1.4453125, + "learning_rate": 4.714418794741048e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.999051034450531, + "num_tokens": 118571136.0, + "step": 36880 + }, + { + "entropy": 0.07303898371756076, + "epoch": 8.5979717915841, + "grad_norm": 1.1015625, + "learning_rate": 4.714321679459811e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9996551752090455, + "num_tokens": 118580619.0, + "step": 36885 + }, + { + "entropy": 0.06205555312335491, + "epoch": 8.599137428604733, + "grad_norm": 0.2294921875, + "learning_rate": 4.7142245497990863e-05, + "loss": 0.0108, + "mean_token_accuracy": 0.9981782078742981, + "num_tokens": 118602080.0, + "step": 36890 + }, + { + "entropy": 0.05046365726739168, + "epoch": 8.600303065625365, + "grad_norm": 0.1552734375, + "learning_rate": 4.7141274057603246e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9998971283435821, + "num_tokens": 118632524.0, + "step": 36895 + }, + { + "entropy": 0.058700887486338615, + "epoch": 8.601468702645995, + "grad_norm": 1.828125, + "learning_rate": 4.714030247344973e-05, + "loss": 0.0067, + "mean_token_accuracy": 0.9982360363006592, + "num_tokens": 118644958.0, + "step": 36900 + }, + { + "entropy": 0.061605300009250644, + "epoch": 8.602634339666627, + "grad_norm": 2.953125, + "learning_rate": 4.713933074554479e-05, + "loss": 0.0121, + "mean_token_accuracy": 0.9974478423595429, + "num_tokens": 118668839.0, + "step": 36905 + }, + { + "entropy": 0.0644808927550912, + "epoch": 8.60379997668726, + "grad_norm": 2.796875, + "learning_rate": 4.713835887390295e-05, + "loss": 0.004, + "mean_token_accuracy": 0.998391056060791, + "num_tokens": 118690493.0, + "step": 36910 + }, + { + "entropy": 0.06565331649035215, + "epoch": 8.604965613707892, + "grad_norm": 0.9921875, + "learning_rate": 4.713738685853867e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9990341901779175, + "num_tokens": 118713103.0, + "step": 36915 + }, + { + "entropy": 0.056130803655833006, + "epoch": 8.606131250728524, + "grad_norm": 0.345703125, + "learning_rate": 4.713641469946646e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9994325637817383, + "num_tokens": 118739582.0, + "step": 36920 + }, + { + "entropy": 0.04501139735803008, + "epoch": 8.607296887749154, + "grad_norm": 0.2392578125, + "learning_rate": 4.713544239670082e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9992862045764923, + "num_tokens": 118767796.0, + "step": 36925 + }, + { + "entropy": 0.061400901339948175, + "epoch": 8.608462524769786, + "grad_norm": 1.3515625, + "learning_rate": 4.7134469950256234e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9987517178058625, + "num_tokens": 118783178.0, + "step": 36930 + }, + { + "entropy": 0.08621167857199907, + "epoch": 8.609628161790418, + "grad_norm": 0.470703125, + "learning_rate": 4.713349736014721e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9996699929237366, + "num_tokens": 118798519.0, + "step": 36935 + }, + { + "entropy": 0.07519685104489326, + "epoch": 8.61079379881105, + "grad_norm": 0.154296875, + "learning_rate": 4.713252462638825e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9999246597290039, + "num_tokens": 118812634.0, + "step": 36940 + }, + { + "entropy": 0.07892098985612392, + "epoch": 8.611959435831682, + "grad_norm": 0.5390625, + "learning_rate": 4.7131551748993865e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9985935688018799, + "num_tokens": 118823025.0, + "step": 36945 + }, + { + "entropy": 0.06437665317207575, + "epoch": 8.613125072852315, + "grad_norm": 0.3046875, + "learning_rate": 4.7130578727978555e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.999948388338089, + "num_tokens": 118845104.0, + "step": 36950 + }, + { + "entropy": 0.06747263586148619, + "epoch": 8.614290709872945, + "grad_norm": 0.44921875, + "learning_rate": 4.7129605563356826e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9994589805603027, + "num_tokens": 118891752.0, + "step": 36955 + }, + { + "entropy": 0.05360950659960508, + "epoch": 8.615456346893577, + "grad_norm": 0.62890625, + "learning_rate": 4.71286322551432e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.998487788438797, + "num_tokens": 118911281.0, + "step": 36960 + }, + { + "entropy": 0.06426534093916417, + "epoch": 8.616621983914209, + "grad_norm": 0.8203125, + "learning_rate": 4.712765880335218e-05, + "loss": 0.0082, + "mean_token_accuracy": 0.997918164730072, + "num_tokens": 118921605.0, + "step": 36965 + }, + { + "entropy": 0.055563436821103096, + "epoch": 8.617787620934841, + "grad_norm": 1.65625, + "learning_rate": 4.712668520799829e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.99852534532547, + "num_tokens": 118934730.0, + "step": 36970 + }, + { + "entropy": 0.06757491566240788, + "epoch": 8.618953257955473, + "grad_norm": 3.203125, + "learning_rate": 4.712571146909604e-05, + "loss": 0.0055, + "mean_token_accuracy": 0.9983723163604736, + "num_tokens": 118946537.0, + "step": 36975 + }, + { + "entropy": 0.05890776924788952, + "epoch": 8.620118894976104, + "grad_norm": 0.984375, + "learning_rate": 4.712473758665995e-05, + "loss": 0.0049, + "mean_token_accuracy": 0.9985151171684266, + "num_tokens": 118957927.0, + "step": 36980 + }, + { + "entropy": 0.05017476119101048, + "epoch": 8.621284531996736, + "grad_norm": 1.3671875, + "learning_rate": 4.712376356070456e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9995585024356842, + "num_tokens": 118980899.0, + "step": 36985 + }, + { + "entropy": 0.06962464861571789, + "epoch": 8.622450169017368, + "grad_norm": 1.0625, + "learning_rate": 4.712278939124437e-05, + "loss": 0.0069, + "mean_token_accuracy": 0.998376590013504, + "num_tokens": 118992807.0, + "step": 36990 + }, + { + "entropy": 0.07402192037552595, + "epoch": 8.623615806038, + "grad_norm": 0.388671875, + "learning_rate": 4.7121815078293926e-05, + "loss": 0.0074, + "mean_token_accuracy": 0.9978413581848145, + "num_tokens": 119012183.0, + "step": 36995 + }, + { + "entropy": 0.06642863359302283, + "epoch": 8.624781443058632, + "grad_norm": 2.78125, + "learning_rate": 4.712084062186774e-05, + "loss": 0.0095, + "mean_token_accuracy": 0.997755628824234, + "num_tokens": 119025096.0, + "step": 37000 + }, + { + "entropy": 0.05094387661665678, + "epoch": 8.625947080079264, + "grad_norm": 0.373046875, + "learning_rate": 4.711986602198035e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9983302295207978, + "num_tokens": 119056220.0, + "step": 37005 + }, + { + "entropy": 0.04212229116819799, + "epoch": 8.627112717099894, + "grad_norm": 0.8671875, + "learning_rate": 4.711889127864629e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9991898596286773, + "num_tokens": 119081884.0, + "step": 37010 + }, + { + "entropy": 0.07550401668995618, + "epoch": 8.628278354120527, + "grad_norm": 0.322265625, + "learning_rate": 4.711791639188009e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.9969983458518982, + "num_tokens": 119098151.0, + "step": 37015 + }, + { + "entropy": 0.0732094880193472, + "epoch": 8.629443991141159, + "grad_norm": 2.921875, + "learning_rate": 4.71169413616963e-05, + "loss": 0.0079, + "mean_token_accuracy": 0.9970404207706451, + "num_tokens": 119116875.0, + "step": 37020 + }, + { + "entropy": 0.058289825543761255, + "epoch": 8.63060962816179, + "grad_norm": 1.0859375, + "learning_rate": 4.711596618810944e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9994800806045532, + "num_tokens": 119139337.0, + "step": 37025 + }, + { + "entropy": 0.05543985888361931, + "epoch": 8.631775265182423, + "grad_norm": 1.015625, + "learning_rate": 4.711499087113406e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.998934018611908, + "num_tokens": 119152298.0, + "step": 37030 + }, + { + "entropy": 0.0805392375215888, + "epoch": 8.632940902203053, + "grad_norm": 0.11328125, + "learning_rate": 4.7114015410784705e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9997101426124573, + "num_tokens": 119162999.0, + "step": 37035 + }, + { + "entropy": 0.05598073918372393, + "epoch": 8.634106539223685, + "grad_norm": 0.275390625, + "learning_rate": 4.711303980707593e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9981026768684387, + "num_tokens": 119190278.0, + "step": 37040 + }, + { + "entropy": 0.05474949460476637, + "epoch": 8.635272176244317, + "grad_norm": 0.26953125, + "learning_rate": 4.7112064060022266e-05, + "loss": 0.0068, + "mean_token_accuracy": 0.998596829175949, + "num_tokens": 119212347.0, + "step": 37045 + }, + { + "entropy": 0.05179757541045547, + "epoch": 8.63643781326495, + "grad_norm": 0.162109375, + "learning_rate": 4.711108816963827e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9997380375862122, + "num_tokens": 119237205.0, + "step": 37050 + }, + { + "entropy": 0.05135552119463682, + "epoch": 8.637603450285582, + "grad_norm": 0.3203125, + "learning_rate": 4.711011213593849e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9992295384407044, + "num_tokens": 119260123.0, + "step": 37055 + }, + { + "entropy": 0.07269342504441738, + "epoch": 8.638769087306212, + "grad_norm": 2.734375, + "learning_rate": 4.710913595893749e-05, + "loss": 0.0092, + "mean_token_accuracy": 0.9974887371063232, + "num_tokens": 119269978.0, + "step": 37060 + }, + { + "entropy": 0.06440305057913065, + "epoch": 8.639934724326844, + "grad_norm": 0.310546875, + "learning_rate": 4.710815963864981e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9989653289318084, + "num_tokens": 119290205.0, + "step": 37065 + }, + { + "entropy": 0.0616750443354249, + "epoch": 8.641100361347476, + "grad_norm": 0.46484375, + "learning_rate": 4.7107183175090034e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9990861892700196, + "num_tokens": 119308798.0, + "step": 37070 + }, + { + "entropy": 0.052800285443663594, + "epoch": 8.642265998368108, + "grad_norm": 0.1826171875, + "learning_rate": 4.710620656827269e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9986510217189789, + "num_tokens": 119335956.0, + "step": 37075 + }, + { + "entropy": 0.04528741203248501, + "epoch": 8.64343163538874, + "grad_norm": 0.40234375, + "learning_rate": 4.7105229818212363e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9998533010482789, + "num_tokens": 119369936.0, + "step": 37080 + }, + { + "entropy": 0.05757056567817927, + "epoch": 8.644597272409372, + "grad_norm": 0.44921875, + "learning_rate": 4.710425292492362e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.9993304073810577, + "num_tokens": 119385045.0, + "step": 37085 + }, + { + "entropy": 0.07393027395009995, + "epoch": 8.645762909430003, + "grad_norm": 2.21875, + "learning_rate": 4.710327588842101e-05, + "loss": 0.0058, + "mean_token_accuracy": 0.9989072501659393, + "num_tokens": 119403772.0, + "step": 37090 + }, + { + "entropy": 0.06416831258684397, + "epoch": 8.646928546450635, + "grad_norm": 0.06640625, + "learning_rate": 4.7102298708719114e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9994474589824677, + "num_tokens": 119425992.0, + "step": 37095 + }, + { + "entropy": 0.05758734289556742, + "epoch": 8.648094183471267, + "grad_norm": 0.310546875, + "learning_rate": 4.7101321385832506e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9988933801651001, + "num_tokens": 119458267.0, + "step": 37100 + }, + { + "entropy": 0.059611622150987385, + "epoch": 8.6492598204919, + "grad_norm": 1.0078125, + "learning_rate": 4.7100343919775755e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.9986943900585175, + "num_tokens": 119477394.0, + "step": 37105 + }, + { + "entropy": 0.05624700449407101, + "epoch": 8.650425457512531, + "grad_norm": 0.357421875, + "learning_rate": 4.7099366310563436e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9987177789211273, + "num_tokens": 119494246.0, + "step": 37110 + }, + { + "entropy": 0.04718580381013453, + "epoch": 8.651591094533162, + "grad_norm": 0.34765625, + "learning_rate": 4.7098388558210125e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9990927994251251, + "num_tokens": 119537066.0, + "step": 37115 + }, + { + "entropy": 0.05400928994640708, + "epoch": 8.652756731553794, + "grad_norm": 0.466796875, + "learning_rate": 4.7097410662730404e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9985824465751648, + "num_tokens": 119557231.0, + "step": 37120 + }, + { + "entropy": 0.0821190900169313, + "epoch": 8.653922368574426, + "grad_norm": 2.21875, + "learning_rate": 4.7096432624138856e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9990611732006073, + "num_tokens": 119579195.0, + "step": 37125 + }, + { + "entropy": 0.0718055371195078, + "epoch": 8.655088005595058, + "grad_norm": 1.546875, + "learning_rate": 4.709545444245006e-05, + "loss": 0.0074, + "mean_token_accuracy": 0.9984009981155395, + "num_tokens": 119596227.0, + "step": 37130 + }, + { + "entropy": 0.0754847377538681, + "epoch": 8.65625364261569, + "grad_norm": 1.6171875, + "learning_rate": 4.7094476117678615e-05, + "loss": 0.0091, + "mean_token_accuracy": 0.9982321262359619, + "num_tokens": 119605099.0, + "step": 37135 + }, + { + "entropy": 0.05754865976050496, + "epoch": 8.657419279636322, + "grad_norm": 0.11083984375, + "learning_rate": 4.7093497649839094e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9994746029376984, + "num_tokens": 119621905.0, + "step": 37140 + }, + { + "entropy": 0.07236482677981257, + "epoch": 8.658584916656952, + "grad_norm": 0.216796875, + "learning_rate": 4.709251903894609e-05, + "loss": 0.0084, + "mean_token_accuracy": 0.9981110274791718, + "num_tokens": 119634260.0, + "step": 37145 + }, + { + "entropy": 0.044534440897405145, + "epoch": 8.659750553677585, + "grad_norm": 1.015625, + "learning_rate": 4.709154028501421e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9993156492710114, + "num_tokens": 119671143.0, + "step": 37150 + }, + { + "entropy": 0.04072251198813319, + "epoch": 8.660916190698217, + "grad_norm": 1.453125, + "learning_rate": 4.709056138805803e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9978930294513703, + "num_tokens": 119708077.0, + "step": 37155 + }, + { + "entropy": 0.060729991924017665, + "epoch": 8.662081827718849, + "grad_norm": 0.38671875, + "learning_rate": 4.7089582348092155e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9985782206058502, + "num_tokens": 119725703.0, + "step": 37160 + }, + { + "entropy": 0.06289136074483395, + "epoch": 8.66324746473948, + "grad_norm": 1.0859375, + "learning_rate": 4.7088603165131184e-05, + "loss": 0.0058, + "mean_token_accuracy": 0.9981587767601013, + "num_tokens": 119735588.0, + "step": 37165 + }, + { + "entropy": 0.0584659150801599, + "epoch": 8.664413101760111, + "grad_norm": 0.62890625, + "learning_rate": 4.7087623839189716e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9993330657482147, + "num_tokens": 119757051.0, + "step": 37170 + }, + { + "entropy": 0.06662820726633072, + "epoch": 8.665578738780743, + "grad_norm": 1.953125, + "learning_rate": 4.7086644370282355e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9991186201572418, + "num_tokens": 119768192.0, + "step": 37175 + }, + { + "entropy": 0.0852966820821166, + "epoch": 8.666744375801375, + "grad_norm": 0.431640625, + "learning_rate": 4.708566475842371e-05, + "loss": 0.0515, + "mean_token_accuracy": 0.9915950953960418, + "num_tokens": 119789693.0, + "step": 37180 + }, + { + "entropy": 0.05510229915380478, + "epoch": 8.667910012822007, + "grad_norm": 0.322265625, + "learning_rate": 4.708468500362839e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9975094437599182, + "num_tokens": 119820800.0, + "step": 37185 + }, + { + "entropy": 0.05542775820940733, + "epoch": 8.66907564984264, + "grad_norm": 0.30078125, + "learning_rate": 4.708370510591099e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9986540079116821, + "num_tokens": 119839424.0, + "step": 37190 + }, + { + "entropy": 0.05367962615564466, + "epoch": 8.67024128686327, + "grad_norm": 0.91015625, + "learning_rate": 4.7082725065286146e-05, + "loss": 0.005, + "mean_token_accuracy": 0.9973668694496155, + "num_tokens": 119854949.0, + "step": 37195 + }, + { + "entropy": 0.0600742656737566, + "epoch": 8.671406923883902, + "grad_norm": 0.859375, + "learning_rate": 4.708174488176845e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.999224054813385, + "num_tokens": 119870736.0, + "step": 37200 + }, + { + "entropy": 0.054541374929249285, + "epoch": 8.672572560904534, + "grad_norm": 2.34375, + "learning_rate": 4.708076455537253e-05, + "loss": 0.0102, + "mean_token_accuracy": 0.9980546355247497, + "num_tokens": 119889971.0, + "step": 37205 + }, + { + "entropy": 0.05923604611307383, + "epoch": 8.673738197925166, + "grad_norm": 0.58984375, + "learning_rate": 4.707978408611299e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9991122007369995, + "num_tokens": 119905624.0, + "step": 37210 + }, + { + "entropy": 0.04516512975096702, + "epoch": 8.674903834945798, + "grad_norm": 0.341796875, + "learning_rate": 4.707880347400447e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9992085576057435, + "num_tokens": 119939204.0, + "step": 37215 + }, + { + "entropy": 0.07773965373635291, + "epoch": 8.67606947196643, + "grad_norm": 1.5546875, + "learning_rate": 4.707782271906158e-05, + "loss": 0.0109, + "mean_token_accuracy": 0.9983750343322754, + "num_tokens": 119949517.0, + "step": 37220 + }, + { + "entropy": 0.054637028463184835, + "epoch": 8.67723510898706, + "grad_norm": 1.296875, + "learning_rate": 4.707684182129894e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9994326770305634, + "num_tokens": 119971182.0, + "step": 37225 + }, + { + "entropy": 0.047064932715147736, + "epoch": 8.678400746007693, + "grad_norm": 0.22265625, + "learning_rate": 4.70758607807312e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.99907346367836, + "num_tokens": 119996422.0, + "step": 37230 + }, + { + "entropy": 0.07008444052189589, + "epoch": 8.679566383028325, + "grad_norm": 0.49609375, + "learning_rate": 4.707487959737296e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9998964786529541, + "num_tokens": 120006560.0, + "step": 37235 + }, + { + "entropy": 0.04979623667895794, + "epoch": 8.680732020048957, + "grad_norm": 1.015625, + "learning_rate": 4.707389827123887e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.999424010515213, + "num_tokens": 120033004.0, + "step": 37240 + }, + { + "entropy": 0.04261988895013928, + "epoch": 8.68189765706959, + "grad_norm": 0.251953125, + "learning_rate": 4.707291680234356e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.998600310087204, + "num_tokens": 120065940.0, + "step": 37245 + }, + { + "entropy": 0.058392337942495945, + "epoch": 8.68306329409022, + "grad_norm": 0.07958984375, + "learning_rate": 4.7071935190701657e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9992400527000427, + "num_tokens": 120088547.0, + "step": 37250 + }, + { + "entropy": 0.0629306823015213, + "epoch": 8.684228931110852, + "grad_norm": 0.035888671875, + "learning_rate": 4.70709534363278e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9986060202121735, + "num_tokens": 120101167.0, + "step": 37255 + }, + { + "entropy": 0.06437505278736352, + "epoch": 8.685394568131484, + "grad_norm": 1.0234375, + "learning_rate": 4.706997153923663e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9984506249427796, + "num_tokens": 120113468.0, + "step": 37260 + }, + { + "entropy": 0.06895528947934508, + "epoch": 8.686560205152116, + "grad_norm": 0.2890625, + "learning_rate": 4.70689894994428e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9981598138809205, + "num_tokens": 120132392.0, + "step": 37265 + }, + { + "entropy": 0.07193691097199917, + "epoch": 8.687725842172748, + "grad_norm": 0.984375, + "learning_rate": 4.706800731696094e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9991972386837006, + "num_tokens": 120144937.0, + "step": 37270 + }, + { + "entropy": 0.06414989028126002, + "epoch": 8.68889147919338, + "grad_norm": 1.0390625, + "learning_rate": 4.70670249918057e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9996701180934906, + "num_tokens": 120154910.0, + "step": 37275 + }, + { + "entropy": 0.04639833634719252, + "epoch": 8.69005711621401, + "grad_norm": 0.1298828125, + "learning_rate": 4.7066042523991726e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9986519098281861, + "num_tokens": 120189822.0, + "step": 37280 + }, + { + "entropy": 0.05604259353131056, + "epoch": 8.691222753234642, + "grad_norm": 0.1318359375, + "learning_rate": 4.706505991353367e-05, + "loss": 0.0073, + "mean_token_accuracy": 0.9979808628559113, + "num_tokens": 120210678.0, + "step": 37285 + }, + { + "entropy": 0.06102119609713554, + "epoch": 8.692388390255275, + "grad_norm": 0.66015625, + "learning_rate": 4.7064077160446186e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9990385472774506, + "num_tokens": 120226506.0, + "step": 37290 + }, + { + "entropy": 0.06351532833650708, + "epoch": 8.693554027275907, + "grad_norm": 0.462890625, + "learning_rate": 4.7063094264743926e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9987837851047516, + "num_tokens": 120247799.0, + "step": 37295 + }, + { + "entropy": 0.061118031945079566, + "epoch": 8.694719664296539, + "grad_norm": 1.234375, + "learning_rate": 4.706211122644155e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9995421290397644, + "num_tokens": 120271327.0, + "step": 37300 + }, + { + "entropy": 0.05696291178464889, + "epoch": 8.695885301317169, + "grad_norm": 0.50390625, + "learning_rate": 4.70611280455537e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9981352150440216, + "num_tokens": 120285859.0, + "step": 37305 + }, + { + "entropy": 0.05032131155021489, + "epoch": 8.697050938337801, + "grad_norm": 0.3671875, + "learning_rate": 4.706014472209506e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.999362564086914, + "num_tokens": 120315442.0, + "step": 37310 + }, + { + "entropy": 0.0457731731235981, + "epoch": 8.698216575358433, + "grad_norm": 1.109375, + "learning_rate": 4.7059161256080284e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9987881302833557, + "num_tokens": 120341149.0, + "step": 37315 + }, + { + "entropy": 0.06630498059093952, + "epoch": 8.699382212379065, + "grad_norm": 1.046875, + "learning_rate": 4.705817764752404e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9983902394771575, + "num_tokens": 120360094.0, + "step": 37320 + }, + { + "entropy": 0.052767217718064786, + "epoch": 8.700547849399697, + "grad_norm": 0.201171875, + "learning_rate": 4.7057193896440984e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9992530107498169, + "num_tokens": 120405568.0, + "step": 37325 + }, + { + "entropy": 0.07418045364320278, + "epoch": 8.701713486420328, + "grad_norm": 2.140625, + "learning_rate": 4.705621000284579e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9987151980400085, + "num_tokens": 120415566.0, + "step": 37330 + }, + { + "entropy": 0.06375032095238567, + "epoch": 8.70287912344096, + "grad_norm": 0.7265625, + "learning_rate": 4.705522596675314e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9989543080329895, + "num_tokens": 120431073.0, + "step": 37335 + }, + { + "entropy": 0.060994432866573335, + "epoch": 8.704044760461592, + "grad_norm": 0.28125, + "learning_rate": 4.705424178817769e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9991658449172973, + "num_tokens": 120447912.0, + "step": 37340 + }, + { + "entropy": 0.05534893814474344, + "epoch": 8.705210397482224, + "grad_norm": 0.7734375, + "learning_rate": 4.7053257467134125e-05, + "loss": 0.0049, + "mean_token_accuracy": 0.9992211759090424, + "num_tokens": 120480904.0, + "step": 37345 + }, + { + "entropy": 0.07157540544867516, + "epoch": 8.706376034502856, + "grad_norm": 0.64453125, + "learning_rate": 4.705227300363713e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9997665584087372, + "num_tokens": 120500577.0, + "step": 37350 + }, + { + "entropy": 0.07752133421599865, + "epoch": 8.707541671523488, + "grad_norm": 0.4375, + "learning_rate": 4.705128839770137e-05, + "loss": 0.0075, + "mean_token_accuracy": 0.9980410099029541, + "num_tokens": 120513824.0, + "step": 37355 + }, + { + "entropy": 0.0577980768866837, + "epoch": 8.708707308544119, + "grad_norm": 0.52734375, + "learning_rate": 4.705030364934154e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9989452242851258, + "num_tokens": 120532064.0, + "step": 37360 + }, + { + "entropy": 0.06213938985019922, + "epoch": 8.70987294556475, + "grad_norm": 2.078125, + "learning_rate": 4.7049318758572316e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9985716700553894, + "num_tokens": 120555377.0, + "step": 37365 + }, + { + "entropy": 0.04146665600128472, + "epoch": 8.711038582585383, + "grad_norm": 0.267578125, + "learning_rate": 4.7048333725408386e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9992313742637634, + "num_tokens": 120588649.0, + "step": 37370 + }, + { + "entropy": 0.08321618214249611, + "epoch": 8.712204219606015, + "grad_norm": 0.234375, + "learning_rate": 4.704734854986443e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9995436012744904, + "num_tokens": 120600085.0, + "step": 37375 + }, + { + "entropy": 0.045752164581790565, + "epoch": 8.713369856626647, + "grad_norm": 0.3203125, + "learning_rate": 4.704636323195516e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9973626852035522, + "num_tokens": 120634174.0, + "step": 37380 + }, + { + "entropy": 0.05586595851927996, + "epoch": 8.714535493647277, + "grad_norm": 0.2109375, + "learning_rate": 4.7045377771695254e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9989406883716583, + "num_tokens": 120648975.0, + "step": 37385 + }, + { + "entropy": 0.07887803390622139, + "epoch": 8.71570113066791, + "grad_norm": 0.59765625, + "learning_rate": 4.7044392169099406e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9995689630508423, + "num_tokens": 120662651.0, + "step": 37390 + }, + { + "entropy": 0.04403102770447731, + "epoch": 8.716866767688542, + "grad_norm": 0.451171875, + "learning_rate": 4.704340642418231e-05, + "loss": 0.0058, + "mean_token_accuracy": 0.9982415914535523, + "num_tokens": 120697029.0, + "step": 37395 + }, + { + "entropy": 0.07757324762642384, + "epoch": 8.718032404709174, + "grad_norm": 0.15234375, + "learning_rate": 4.704242053695868e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9992213308811188, + "num_tokens": 120706590.0, + "step": 37400 + }, + { + "entropy": 0.048875637259334324, + "epoch": 8.719198041729806, + "grad_norm": 1.6796875, + "learning_rate": 4.7041434507443195e-05, + "loss": 0.0057, + "mean_token_accuracy": 0.9986173272132873, + "num_tokens": 120730767.0, + "step": 37405 + }, + { + "entropy": 0.07898346781730652, + "epoch": 8.720363678750438, + "grad_norm": 1.125, + "learning_rate": 4.704044833565058e-05, + "loss": 0.0049, + "mean_token_accuracy": 0.9988845109939575, + "num_tokens": 120742874.0, + "step": 37410 + }, + { + "entropy": 0.06409270605072379, + "epoch": 8.721529315771068, + "grad_norm": 0.031005859375, + "learning_rate": 4.7039462021595524e-05, + "loss": 0.0079, + "mean_token_accuracy": 0.9972718060016632, + "num_tokens": 120765821.0, + "step": 37415 + }, + { + "entropy": 0.064057532325387, + "epoch": 8.7226949527917, + "grad_norm": 0.7890625, + "learning_rate": 4.703847556529275e-05, + "loss": 0.0086, + "mean_token_accuracy": 0.9980102241039276, + "num_tokens": 120775932.0, + "step": 37420 + }, + { + "entropy": 0.052418787498027086, + "epoch": 8.723860589812332, + "grad_norm": 0.5234375, + "learning_rate": 4.7037488966756947e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9994726002216339, + "num_tokens": 120812212.0, + "step": 37425 + }, + { + "entropy": 0.07726996615529061, + "epoch": 8.725026226832965, + "grad_norm": 0.78515625, + "learning_rate": 4.7036502226002846e-05, + "loss": 0.0135, + "mean_token_accuracy": 0.9986808180809021, + "num_tokens": 120835862.0, + "step": 37430 + }, + { + "entropy": 0.0491664957255125, + "epoch": 8.726191863853597, + "grad_norm": 0.40234375, + "learning_rate": 4.7035515343045154e-05, + "loss": 0.006, + "mean_token_accuracy": 0.9971634745597839, + "num_tokens": 120849769.0, + "step": 37435 + }, + { + "entropy": 0.07634511133655905, + "epoch": 8.727357500874227, + "grad_norm": 0.61328125, + "learning_rate": 4.703452831789858e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9992483615875244, + "num_tokens": 120871055.0, + "step": 37440 + }, + { + "entropy": 0.062133604381233457, + "epoch": 8.72852313789486, + "grad_norm": 0.265625, + "learning_rate": 4.7033541150577855e-05, + "loss": 0.0057, + "mean_token_accuracy": 0.9979267299175263, + "num_tokens": 120896972.0, + "step": 37445 + }, + { + "entropy": 0.050842320267111066, + "epoch": 8.729688774915491, + "grad_norm": 0.87890625, + "learning_rate": 4.7032553841097685e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9996079862117767, + "num_tokens": 120920337.0, + "step": 37450 + }, + { + "entropy": 0.06864991504698992, + "epoch": 8.730854411936123, + "grad_norm": 0.21875, + "learning_rate": 4.703156638947281e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9985345125198364, + "num_tokens": 120929393.0, + "step": 37455 + }, + { + "entropy": 0.06609613662585616, + "epoch": 8.732020048956755, + "grad_norm": 1.546875, + "learning_rate": 4.703057879571793e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9994767427444458, + "num_tokens": 120942647.0, + "step": 37460 + }, + { + "entropy": 0.08063123216852545, + "epoch": 8.733185685977386, + "grad_norm": 0.44921875, + "learning_rate": 4.70295910598478e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9994502007961273, + "num_tokens": 120957571.0, + "step": 37465 + }, + { + "entropy": 0.04949803929775953, + "epoch": 8.734351322998018, + "grad_norm": 0.365234375, + "learning_rate": 4.7028603181877124e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.999658590555191, + "num_tokens": 120988159.0, + "step": 37470 + }, + { + "entropy": 0.0553031301125884, + "epoch": 8.73551696001865, + "grad_norm": 0.30859375, + "learning_rate": 4.702761516182065e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9988710165023804, + "num_tokens": 121009241.0, + "step": 37475 + }, + { + "entropy": 0.05864779045805335, + "epoch": 8.736682597039282, + "grad_norm": 0.7265625, + "learning_rate": 4.70266269996931e-05, + "loss": 0.0079, + "mean_token_accuracy": 0.9978500843048096, + "num_tokens": 121026722.0, + "step": 37480 + }, + { + "entropy": 0.07253898419439793, + "epoch": 8.737848234059914, + "grad_norm": 1.90625, + "learning_rate": 4.7025638695509205e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.9983980894088745, + "num_tokens": 121042962.0, + "step": 37485 + }, + { + "entropy": 0.051535771181806925, + "epoch": 8.739013871080546, + "grad_norm": 0.1982421875, + "learning_rate": 4.702465024928372e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9981653451919555, + "num_tokens": 121080611.0, + "step": 37490 + }, + { + "entropy": 0.05589711694046855, + "epoch": 8.740179508101177, + "grad_norm": 0.51953125, + "learning_rate": 4.702366166103137e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9993067443370819, + "num_tokens": 121105123.0, + "step": 37495 + }, + { + "entropy": 0.05385690657421947, + "epoch": 8.741345145121809, + "grad_norm": 0.412109375, + "learning_rate": 4.70226729307669e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.999626749753952, + "num_tokens": 121129332.0, + "step": 37500 + }, + { + "entropy": 0.062184521462768316, + "epoch": 8.74251078214244, + "grad_norm": 0.322265625, + "learning_rate": 4.7021684058505054e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9991961359977722, + "num_tokens": 121145454.0, + "step": 37505 + }, + { + "entropy": 0.060806962009519336, + "epoch": 8.743676419163073, + "grad_norm": 0.1689453125, + "learning_rate": 4.702069504426058e-05, + "loss": 0.012, + "mean_token_accuracy": 0.9983984470367432, + "num_tokens": 121164100.0, + "step": 37510 + }, + { + "entropy": 0.05776356812566519, + "epoch": 8.744842056183705, + "grad_norm": 2.734375, + "learning_rate": 4.7019705888048214e-05, + "loss": 0.0055, + "mean_token_accuracy": 0.998740166425705, + "num_tokens": 121178031.0, + "step": 37515 + }, + { + "entropy": 0.07295375410467386, + "epoch": 8.746007693204335, + "grad_norm": 0.349609375, + "learning_rate": 4.7018716589882724e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9994298398494721, + "num_tokens": 121195358.0, + "step": 37520 + }, + { + "entropy": 0.05422674883157015, + "epoch": 8.747173330224967, + "grad_norm": 2.109375, + "learning_rate": 4.701772714977885e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9994776785373688, + "num_tokens": 121231081.0, + "step": 37525 + }, + { + "entropy": 0.06482381774112582, + "epoch": 8.7483389672456, + "grad_norm": 0.24609375, + "learning_rate": 4.7016737567751346e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9999204099178314, + "num_tokens": 121257291.0, + "step": 37530 + }, + { + "entropy": 0.05208018375560641, + "epoch": 8.749504604266232, + "grad_norm": 0.5078125, + "learning_rate": 4.7015747843814974e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9984355688095092, + "num_tokens": 121284555.0, + "step": 37535 + }, + { + "entropy": 0.046364251803606746, + "epoch": 8.750670241286864, + "grad_norm": 0.453125, + "learning_rate": 4.701475797798449e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9997131705284119, + "num_tokens": 121319114.0, + "step": 37540 + }, + { + "entropy": 0.04175833626650274, + "epoch": 8.751835878307496, + "grad_norm": 0.37890625, + "learning_rate": 4.701376797027465e-05, + "loss": 0.0099, + "mean_token_accuracy": 0.9990463435649872, + "num_tokens": 121359675.0, + "step": 37545 + }, + { + "entropy": 0.061986826453357936, + "epoch": 8.753001515328126, + "grad_norm": 3.03125, + "learning_rate": 4.7012777820700226e-05, + "loss": 0.0096, + "mean_token_accuracy": 0.9976721167564392, + "num_tokens": 121371584.0, + "step": 37550 + }, + { + "entropy": 0.046063092350959775, + "epoch": 8.754167152348758, + "grad_norm": 0.3671875, + "learning_rate": 4.701178752927598e-05, + "loss": 0.0057, + "mean_token_accuracy": 0.9985677540302277, + "num_tokens": 121412011.0, + "step": 37555 + }, + { + "entropy": 0.08060178887099027, + "epoch": 8.75533278936939, + "grad_norm": 2.046875, + "learning_rate": 4.701079709601666e-05, + "loss": 0.0342, + "mean_token_accuracy": 0.9890652418136596, + "num_tokens": 121452091.0, + "step": 37560 + }, + { + "entropy": 0.06487112455070018, + "epoch": 8.756498426390023, + "grad_norm": 2.078125, + "learning_rate": 4.700980652093706e-05, + "loss": 0.0066, + "mean_token_accuracy": 0.9989998698234558, + "num_tokens": 121463790.0, + "step": 37565 + }, + { + "entropy": 0.07172239758074284, + "epoch": 8.757664063410655, + "grad_norm": 2.6875, + "learning_rate": 4.700881580405194e-05, + "loss": 0.0076, + "mean_token_accuracy": 0.9977281510829925, + "num_tokens": 121477631.0, + "step": 37570 + }, + { + "entropy": 0.05877719409763813, + "epoch": 8.758829700431285, + "grad_norm": 0.345703125, + "learning_rate": 4.7007824945376074e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9990646779537201, + "num_tokens": 121502532.0, + "step": 37575 + }, + { + "entropy": 0.041250808723270894, + "epoch": 8.759995337451917, + "grad_norm": 0.34375, + "learning_rate": 4.7006833944924236e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9991530656814576, + "num_tokens": 121539955.0, + "step": 37580 + }, + { + "entropy": 0.06379560967907309, + "epoch": 8.76116097447255, + "grad_norm": 0.546875, + "learning_rate": 4.70058428027112e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9996963918209076, + "num_tokens": 121558100.0, + "step": 37585 + }, + { + "entropy": 0.06465062042698264, + "epoch": 8.762326611493181, + "grad_norm": 1.4921875, + "learning_rate": 4.700485151875176e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.99923455119133, + "num_tokens": 121583763.0, + "step": 37590 + }, + { + "entropy": 0.06468153018504381, + "epoch": 8.763492248513813, + "grad_norm": 0.4296875, + "learning_rate": 4.700386009306069e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9996022522449494, + "num_tokens": 121604954.0, + "step": 37595 + }, + { + "entropy": 0.06871114894747735, + "epoch": 8.764657885534444, + "grad_norm": 0.98828125, + "learning_rate": 4.700286852565276e-05, + "loss": 0.01, + "mean_token_accuracy": 0.997947508096695, + "num_tokens": 121614592.0, + "step": 37600 + }, + { + "entropy": 0.04772069398313761, + "epoch": 8.765823522555076, + "grad_norm": 0.345703125, + "learning_rate": 4.700187681654277e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9992965459823608, + "num_tokens": 121638138.0, + "step": 37605 + }, + { + "entropy": 0.06294498881325125, + "epoch": 8.766989159575708, + "grad_norm": 0.400390625, + "learning_rate": 4.700088496574551e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9988208770751953, + "num_tokens": 121656429.0, + "step": 37610 + }, + { + "entropy": 0.07124684005975723, + "epoch": 8.76815479659634, + "grad_norm": 0.158203125, + "learning_rate": 4.699989297327577e-05, + "loss": 0.002, + "mean_token_accuracy": 0.999472838640213, + "num_tokens": 121667496.0, + "step": 37615 + }, + { + "entropy": 0.049650049302726984, + "epoch": 8.769320433616972, + "grad_norm": 0.181640625, + "learning_rate": 4.6998900839148326e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9990351021289825, + "num_tokens": 121693895.0, + "step": 37620 + }, + { + "entropy": 0.04391482761129737, + "epoch": 8.770486070637604, + "grad_norm": 0.12890625, + "learning_rate": 4.6997908563377986e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9973374605178833, + "num_tokens": 121741181.0, + "step": 37625 + }, + { + "entropy": 0.06794363204389811, + "epoch": 8.771651707658235, + "grad_norm": 0.96875, + "learning_rate": 4.699691614597955e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9987189173698425, + "num_tokens": 121751435.0, + "step": 37630 + }, + { + "entropy": 0.06100701736286283, + "epoch": 8.772817344678867, + "grad_norm": 0.259765625, + "learning_rate": 4.6995923586967796e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9995702564716339, + "num_tokens": 121772227.0, + "step": 37635 + }, + { + "entropy": 0.06428485959768296, + "epoch": 8.773982981699499, + "grad_norm": 3.421875, + "learning_rate": 4.699493088635755e-05, + "loss": 0.0101, + "mean_token_accuracy": 0.9976272761821747, + "num_tokens": 121793079.0, + "step": 37640 + }, + { + "entropy": 0.07052382789552211, + "epoch": 8.77514861872013, + "grad_norm": 1.53125, + "learning_rate": 4.69939380441636e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9983150899410248, + "num_tokens": 121803848.0, + "step": 37645 + }, + { + "entropy": 0.0693971360102296, + "epoch": 8.776314255740763, + "grad_norm": 1.34375, + "learning_rate": 4.699294506040076e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9960627436637879, + "num_tokens": 121818194.0, + "step": 37650 + }, + { + "entropy": 0.09031069064512849, + "epoch": 8.777479892761393, + "grad_norm": 1.2734375, + "learning_rate": 4.6991951935083824e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9992822110652924, + "num_tokens": 121833034.0, + "step": 37655 + }, + { + "entropy": 0.0589916012249887, + "epoch": 8.778645529782025, + "grad_norm": 2.390625, + "learning_rate": 4.6990958668227615e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9991851389408112, + "num_tokens": 121847066.0, + "step": 37660 + }, + { + "entropy": 0.06416790802031755, + "epoch": 8.779811166802657, + "grad_norm": 1.71875, + "learning_rate": 4.6989965259846926e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9991792678833008, + "num_tokens": 121857992.0, + "step": 37665 + }, + { + "entropy": 0.055831207521259786, + "epoch": 8.78097680382329, + "grad_norm": 0.15625, + "learning_rate": 4.698897170995658e-05, + "loss": 0.0209, + "mean_token_accuracy": 0.9955951690673828, + "num_tokens": 121904833.0, + "step": 37670 + }, + { + "entropy": 0.0680293409153819, + "epoch": 8.782142440843922, + "grad_norm": 0.890625, + "learning_rate": 4.698797801857141e-05, + "loss": 0.0091, + "mean_token_accuracy": 0.9971361041069031, + "num_tokens": 121914517.0, + "step": 37675 + }, + { + "entropy": 0.04976560343056917, + "epoch": 8.783308077864554, + "grad_norm": 0.70703125, + "learning_rate": 4.6986984185706206e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9991954684257507, + "num_tokens": 121934035.0, + "step": 37680 + }, + { + "entropy": 0.08044440317898989, + "epoch": 8.784473714885184, + "grad_norm": 0.193359375, + "learning_rate": 4.6985990211375805e-05, + "loss": 0.0332, + "mean_token_accuracy": 0.9900187373161315, + "num_tokens": 121958662.0, + "step": 37685 + }, + { + "entropy": 0.06931205466389656, + "epoch": 8.785639351905816, + "grad_norm": 0.2158203125, + "learning_rate": 4.6984996095595014e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9993624806404113, + "num_tokens": 121972128.0, + "step": 37690 + }, + { + "entropy": 0.04132307339459658, + "epoch": 8.786804988926448, + "grad_norm": 0.2294921875, + "learning_rate": 4.698400183837867e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9990397334098816, + "num_tokens": 122011044.0, + "step": 37695 + }, + { + "entropy": 0.061664972454309464, + "epoch": 8.78797062594708, + "grad_norm": 0.69921875, + "learning_rate": 4.6983007439741586e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9992005169391632, + "num_tokens": 122021209.0, + "step": 37700 + }, + { + "entropy": 0.05934088248759508, + "epoch": 8.789136262967713, + "grad_norm": 1.859375, + "learning_rate": 4.698201289969861e-05, + "loss": 0.0056, + "mean_token_accuracy": 0.9981985509395599, + "num_tokens": 122052417.0, + "step": 37705 + }, + { + "entropy": 0.05349223613739014, + "epoch": 8.790301899988343, + "grad_norm": 1.0078125, + "learning_rate": 4.698101821826455e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9989639759063721, + "num_tokens": 122074536.0, + "step": 37710 + }, + { + "entropy": 0.0589401114732027, + "epoch": 8.791467537008975, + "grad_norm": 1.3828125, + "learning_rate": 4.6980023395454256e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9988614022731781, + "num_tokens": 122089899.0, + "step": 37715 + }, + { + "entropy": 0.06001086411997676, + "epoch": 8.792633174029607, + "grad_norm": 2.296875, + "learning_rate": 4.697902843128255e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9989833116531373, + "num_tokens": 122112483.0, + "step": 37720 + }, + { + "entropy": 0.05685397181659937, + "epoch": 8.79379881105024, + "grad_norm": 0.197265625, + "learning_rate": 4.697803332576428e-05, + "loss": 0.0119, + "mean_token_accuracy": 0.9974867701530457, + "num_tokens": 122126367.0, + "step": 37725 + }, + { + "entropy": 0.07201154017820954, + "epoch": 8.794964448070871, + "grad_norm": 0.73046875, + "learning_rate": 4.697703807891426e-05, + "loss": 0.003, + "mean_token_accuracy": 0.999059921503067, + "num_tokens": 122145494.0, + "step": 37730 + }, + { + "entropy": 0.047375439945608375, + "epoch": 8.796130085091502, + "grad_norm": 0.1416015625, + "learning_rate": 4.6976042690747366e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9999362409114838, + "num_tokens": 122165067.0, + "step": 37735 + }, + { + "entropy": 0.07775508984923363, + "epoch": 8.797295722112134, + "grad_norm": 0.36328125, + "learning_rate": 4.697504716127842e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9980196058750153, + "num_tokens": 122175557.0, + "step": 37740 + }, + { + "entropy": 0.066469413228333, + "epoch": 8.798461359132766, + "grad_norm": 0.10693359375, + "learning_rate": 4.6974051490522256e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9987132906913757, + "num_tokens": 122188226.0, + "step": 37745 + }, + { + "entropy": 0.061857113242149354, + "epoch": 8.799626996153398, + "grad_norm": 0.99609375, + "learning_rate": 4.697305567849375e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9985567629337311, + "num_tokens": 122198763.0, + "step": 37750 + }, + { + "entropy": 0.07967776516452432, + "epoch": 8.80079263317403, + "grad_norm": 1.59375, + "learning_rate": 4.697205972520773e-05, + "loss": 0.0049, + "mean_token_accuracy": 0.9984486401081085, + "num_tokens": 122216833.0, + "step": 37755 + }, + { + "entropy": 0.05474145282059908, + "epoch": 8.801958270194662, + "grad_norm": 1.890625, + "learning_rate": 4.697106363067905e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9994183659553528, + "num_tokens": 122235649.0, + "step": 37760 + }, + { + "entropy": 0.0869576308876276, + "epoch": 8.803123907215292, + "grad_norm": 0.3828125, + "learning_rate": 4.697006739492257e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9988507032394409, + "num_tokens": 122249504.0, + "step": 37765 + }, + { + "entropy": 0.05836290316656232, + "epoch": 8.804289544235925, + "grad_norm": 0.328125, + "learning_rate": 4.696907101795314e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.998987078666687, + "num_tokens": 122269940.0, + "step": 37770 + }, + { + "entropy": 0.06279167141765356, + "epoch": 8.805455181256557, + "grad_norm": 2.15625, + "learning_rate": 4.6968074499785615e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9981760680675507, + "num_tokens": 122287840.0, + "step": 37775 + }, + { + "entropy": 0.07444556690752506, + "epoch": 8.806620818277189, + "grad_norm": 0.921875, + "learning_rate": 4.696707784043486e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9971623122692108, + "num_tokens": 122296145.0, + "step": 37780 + }, + { + "entropy": 0.052605046331882475, + "epoch": 8.807786455297821, + "grad_norm": 0.81640625, + "learning_rate": 4.6966081039915735e-05, + "loss": 0.0114, + "mean_token_accuracy": 0.9969050943851471, + "num_tokens": 122336505.0, + "step": 37785 + }, + { + "entropy": 0.0693147087469697, + "epoch": 8.808952092318451, + "grad_norm": 2.09375, + "learning_rate": 4.696508409824311e-05, + "loss": 0.006, + "mean_token_accuracy": 0.9984557271003723, + "num_tokens": 122349602.0, + "step": 37790 + }, + { + "entropy": 0.07108534360304475, + "epoch": 8.810117729339083, + "grad_norm": 2.5, + "learning_rate": 4.696408701543184e-05, + "loss": 0.0111, + "mean_token_accuracy": 0.9988550305366516, + "num_tokens": 122385144.0, + "step": 37795 + }, + { + "entropy": 0.07313933782279491, + "epoch": 8.811283366359715, + "grad_norm": 0.58984375, + "learning_rate": 4.696308979149679e-05, + "loss": 0.0089, + "mean_token_accuracy": 0.9986230254173278, + "num_tokens": 122400947.0, + "step": 37800 + }, + { + "entropy": 0.05324373729526997, + "epoch": 8.812449003380348, + "grad_norm": 0.255859375, + "learning_rate": 4.696209242645285e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9986553013324737, + "num_tokens": 122420296.0, + "step": 37805 + }, + { + "entropy": 0.06737412475049495, + "epoch": 8.81361464040098, + "grad_norm": 2.046875, + "learning_rate": 4.696109492031488e-05, + "loss": 0.0113, + "mean_token_accuracy": 0.9984551131725311, + "num_tokens": 122453040.0, + "step": 37810 + }, + { + "entropy": 0.05885831089690328, + "epoch": 8.814780277421612, + "grad_norm": 0.08935546875, + "learning_rate": 4.696009727309775e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.999238908290863, + "num_tokens": 122478826.0, + "step": 37815 + }, + { + "entropy": 0.06913261096924543, + "epoch": 8.815945914442242, + "grad_norm": 0.234375, + "learning_rate": 4.6959099484816336e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9983415126800537, + "num_tokens": 122495715.0, + "step": 37820 + }, + { + "entropy": 0.05994777157902718, + "epoch": 8.817111551462874, + "grad_norm": 1.6015625, + "learning_rate": 4.695810155548553e-05, + "loss": 0.0171, + "mean_token_accuracy": 0.9962732553482055, + "num_tokens": 122521314.0, + "step": 37825 + }, + { + "entropy": 0.07015992011874914, + "epoch": 8.818277188483506, + "grad_norm": 0.25, + "learning_rate": 4.6957103485120204e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.999011218547821, + "num_tokens": 122541171.0, + "step": 37830 + }, + { + "entropy": 0.06579969152808189, + "epoch": 8.819442825504138, + "grad_norm": 1.53125, + "learning_rate": 4.6956105273735234e-05, + "loss": 0.0086, + "mean_token_accuracy": 0.998229706287384, + "num_tokens": 122553154.0, + "step": 37835 + }, + { + "entropy": 0.07723803166300058, + "epoch": 8.82060846252477, + "grad_norm": 0.59375, + "learning_rate": 4.6955106921345516e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9987864375114441, + "num_tokens": 122575622.0, + "step": 37840 + }, + { + "entropy": 0.05942539321258664, + "epoch": 8.8217740995454, + "grad_norm": 0.75390625, + "learning_rate": 4.695410842796594e-05, + "loss": 0.0061, + "mean_token_accuracy": 0.9983376860618591, + "num_tokens": 122600863.0, + "step": 37845 + }, + { + "entropy": 0.04993642652407289, + "epoch": 8.822939736566033, + "grad_norm": 0.50390625, + "learning_rate": 4.695310979361137e-05, + "loss": 0.0061, + "mean_token_accuracy": 0.9980162382125854, + "num_tokens": 122624134.0, + "step": 37850 + }, + { + "entropy": 0.07489926908165216, + "epoch": 8.824105373586665, + "grad_norm": 4.21875, + "learning_rate": 4.695211101829673e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9985229194164276, + "num_tokens": 122638432.0, + "step": 37855 + }, + { + "entropy": 0.06193410158157349, + "epoch": 8.825271010607297, + "grad_norm": 0.08837890625, + "learning_rate": 4.695111210203689e-05, + "loss": 0.006, + "mean_token_accuracy": 0.9991264760494232, + "num_tokens": 122655695.0, + "step": 37860 + }, + { + "entropy": 0.04280728902667761, + "epoch": 8.82643664762793, + "grad_norm": 0.70703125, + "learning_rate": 4.6950113044846754e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9995523571968079, + "num_tokens": 122675293.0, + "step": 37865 + }, + { + "entropy": 0.03909464376047254, + "epoch": 8.82760228464856, + "grad_norm": 0.2080078125, + "learning_rate": 4.694911384674122e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9993949353694915, + "num_tokens": 122703106.0, + "step": 37870 + }, + { + "entropy": 0.06501968447118997, + "epoch": 8.828767921669192, + "grad_norm": 0.26171875, + "learning_rate": 4.694811450773519e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.999257355928421, + "num_tokens": 122718218.0, + "step": 37875 + }, + { + "entropy": 0.05192201929166913, + "epoch": 8.829933558689824, + "grad_norm": 1.28125, + "learning_rate": 4.6947115027843556e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.9991044104099274, + "num_tokens": 122743671.0, + "step": 37880 + }, + { + "entropy": 0.049569260654971005, + "epoch": 8.831099195710456, + "grad_norm": 0.1220703125, + "learning_rate": 4.694611540708123e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9995582580566407, + "num_tokens": 122767291.0, + "step": 37885 + }, + { + "entropy": 0.050462238024920225, + "epoch": 8.832264832731088, + "grad_norm": 1.15625, + "learning_rate": 4.6945115645463114e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9993569672107696, + "num_tokens": 122790047.0, + "step": 37890 + }, + { + "entropy": 0.05963395088911057, + "epoch": 8.83343046975172, + "grad_norm": 0.046875, + "learning_rate": 4.694411574300412e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9995575189590454, + "num_tokens": 122802120.0, + "step": 37895 + }, + { + "entropy": 0.0568438459187746, + "epoch": 8.83459610677235, + "grad_norm": 0.3359375, + "learning_rate": 4.6943115699719155e-05, + "loss": 0.0053, + "mean_token_accuracy": 0.9985347628593445, + "num_tokens": 122840584.0, + "step": 37900 + }, + { + "entropy": 0.08491461314260959, + "epoch": 8.835761743792983, + "grad_norm": 0.55078125, + "learning_rate": 4.694211551562313e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9972289741039276, + "num_tokens": 122853087.0, + "step": 37905 + }, + { + "entropy": 0.056212511658668515, + "epoch": 8.836927380813615, + "grad_norm": 0.076171875, + "learning_rate": 4.694111519073096e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9995512783527374, + "num_tokens": 122863495.0, + "step": 37910 + }, + { + "entropy": 0.05507175326347351, + "epoch": 8.838093017834247, + "grad_norm": 1.859375, + "learning_rate": 4.6940114725057567e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9987462341785431, + "num_tokens": 122880791.0, + "step": 37915 + }, + { + "entropy": 0.05416063591837883, + "epoch": 8.839258654854879, + "grad_norm": 1.4296875, + "learning_rate": 4.693911411861786e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.998863023519516, + "num_tokens": 122899547.0, + "step": 37920 + }, + { + "entropy": 0.05549648702144623, + "epoch": 8.84042429187551, + "grad_norm": 1.3125, + "learning_rate": 4.6938113371426766e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9996604323387146, + "num_tokens": 122931372.0, + "step": 37925 + }, + { + "entropy": 0.03992574992589652, + "epoch": 8.841589928896141, + "grad_norm": 0.6171875, + "learning_rate": 4.693711248349921e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9993171095848083, + "num_tokens": 122956032.0, + "step": 37930 + }, + { + "entropy": 0.05518424436450005, + "epoch": 8.842755565916773, + "grad_norm": 2.578125, + "learning_rate": 4.693611145485011e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9986071944236755, + "num_tokens": 122967343.0, + "step": 37935 + }, + { + "entropy": 0.04317314065992832, + "epoch": 8.843921202937405, + "grad_norm": 0.349609375, + "learning_rate": 4.693511028549439e-05, + "loss": 0.006, + "mean_token_accuracy": 0.998054838180542, + "num_tokens": 122996226.0, + "step": 37940 + }, + { + "entropy": 0.05350646786391735, + "epoch": 8.845086839958038, + "grad_norm": 0.2353515625, + "learning_rate": 4.693410897544699e-05, + "loss": 0.0056, + "mean_token_accuracy": 0.9985206425189972, + "num_tokens": 123027698.0, + "step": 37945 + }, + { + "entropy": 0.05209854450076819, + "epoch": 8.84625247697867, + "grad_norm": 0.39453125, + "learning_rate": 4.6933107524722835e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9998648226261139, + "num_tokens": 123061500.0, + "step": 37950 + }, + { + "entropy": 0.050198511127382515, + "epoch": 8.8474181139993, + "grad_norm": 0.240234375, + "learning_rate": 4.6932105933336854e-05, + "loss": 0.0058, + "mean_token_accuracy": 0.9992323398590088, + "num_tokens": 123076939.0, + "step": 37955 + }, + { + "entropy": 0.06413730578497052, + "epoch": 8.848583751019932, + "grad_norm": 0.59765625, + "learning_rate": 4.693110420130399e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9995835602283478, + "num_tokens": 123093348.0, + "step": 37960 + }, + { + "entropy": 0.04941152567043901, + "epoch": 8.849749388040564, + "grad_norm": 0.416015625, + "learning_rate": 4.693010232863918e-05, + "loss": 0.0106, + "mean_token_accuracy": 0.9988395273685455, + "num_tokens": 123113623.0, + "step": 37965 + }, + { + "entropy": 0.06690901890397072, + "epoch": 8.850915025061196, + "grad_norm": 3.828125, + "learning_rate": 4.6929100315357364e-05, + "loss": 0.035, + "mean_token_accuracy": 0.9943624317646027, + "num_tokens": 123142491.0, + "step": 37970 + }, + { + "entropy": 0.06515284590423107, + "epoch": 8.852080662081828, + "grad_norm": 1.359375, + "learning_rate": 4.692809816147347e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9986517250537872, + "num_tokens": 123156097.0, + "step": 37975 + }, + { + "entropy": 0.045817646011710164, + "epoch": 8.853246299102459, + "grad_norm": 2.421875, + "learning_rate": 4.692709586700246e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9982234835624695, + "num_tokens": 123182011.0, + "step": 37980 + }, + { + "entropy": 0.05612170891836286, + "epoch": 8.85441193612309, + "grad_norm": 0.265625, + "learning_rate": 4.692609343195926e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.999575287103653, + "num_tokens": 123201193.0, + "step": 37985 + }, + { + "entropy": 0.07323649059981108, + "epoch": 8.855577573143723, + "grad_norm": 0.1806640625, + "learning_rate": 4.692509085635884e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9975827276706696, + "num_tokens": 123242077.0, + "step": 37990 + }, + { + "entropy": 0.04696584166958928, + "epoch": 8.856743210164355, + "grad_norm": 0.5703125, + "learning_rate": 4.692408814021614e-05, + "loss": 0.0059, + "mean_token_accuracy": 0.9984638512134552, + "num_tokens": 123265675.0, + "step": 37995 + }, + { + "entropy": 0.06086919633671641, + "epoch": 8.857908847184987, + "grad_norm": 0.298828125, + "learning_rate": 4.6923085283546106e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.999669861793518, + "num_tokens": 123286453.0, + "step": 38000 + }, + { + "entropy": 0.0598510229960084, + "epoch": 8.859074484205617, + "grad_norm": 3.15625, + "learning_rate": 4.69220822863637e-05, + "loss": 0.0084, + "mean_token_accuracy": 0.9975327312946319, + "num_tokens": 123298214.0, + "step": 38005 + }, + { + "entropy": 0.05113176926970482, + "epoch": 8.86024012122625, + "grad_norm": 1.1484375, + "learning_rate": 4.692107914868387e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9995282173156739, + "num_tokens": 123313774.0, + "step": 38010 + }, + { + "entropy": 0.06497044824063777, + "epoch": 8.861405758246882, + "grad_norm": 1.34375, + "learning_rate": 4.692007587052159e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.9993992626667023, + "num_tokens": 123323123.0, + "step": 38015 + }, + { + "entropy": 0.07354457750916481, + "epoch": 8.862571395267514, + "grad_norm": 0.189453125, + "learning_rate": 4.69190724518918e-05, + "loss": 0.0116, + "mean_token_accuracy": 0.9962685942649842, + "num_tokens": 123342630.0, + "step": 38020 + }, + { + "entropy": 0.053354914858937265, + "epoch": 8.863737032288146, + "grad_norm": 0.62109375, + "learning_rate": 4.691806889280948e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9992679178714752, + "num_tokens": 123365283.0, + "step": 38025 + }, + { + "entropy": 0.0487318092957139, + "epoch": 8.864902669308778, + "grad_norm": 0.453125, + "learning_rate": 4.691706519328958e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9992010533809662, + "num_tokens": 123406472.0, + "step": 38030 + }, + { + "entropy": 0.05246655810624361, + "epoch": 8.866068306329408, + "grad_norm": 1.765625, + "learning_rate": 4.691606135334708e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9991471171379089, + "num_tokens": 123434661.0, + "step": 38035 + }, + { + "entropy": 0.060769391059875486, + "epoch": 8.86723394335004, + "grad_norm": 0.3671875, + "learning_rate": 4.691505737299694e-05, + "loss": 0.0055, + "mean_token_accuracy": 0.9982271492481232, + "num_tokens": 123451385.0, + "step": 38040 + }, + { + "entropy": 0.08358767367899418, + "epoch": 8.868399580370673, + "grad_norm": 0.1591796875, + "learning_rate": 4.691405325225413e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9988424837589264, + "num_tokens": 123465541.0, + "step": 38045 + }, + { + "entropy": 0.04848737036809325, + "epoch": 8.869565217391305, + "grad_norm": 1.7734375, + "learning_rate": 4.6913048991133636e-05, + "loss": 0.0068, + "mean_token_accuracy": 0.996968537569046, + "num_tokens": 123486840.0, + "step": 38050 + }, + { + "entropy": 0.0603315188549459, + "epoch": 8.870730854411937, + "grad_norm": 3.234375, + "learning_rate": 4.6912044589650414e-05, + "loss": 0.009, + "mean_token_accuracy": 0.997438782453537, + "num_tokens": 123512872.0, + "step": 38055 + }, + { + "entropy": 0.05510821873322129, + "epoch": 8.871896491432567, + "grad_norm": 0.59765625, + "learning_rate": 4.691104004781946e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9990927755832673, + "num_tokens": 123538521.0, + "step": 38060 + }, + { + "entropy": 0.055203709099441764, + "epoch": 8.8730621284532, + "grad_norm": 0.45703125, + "learning_rate": 4.691003536565574e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9989315450191498, + "num_tokens": 123566004.0, + "step": 38065 + }, + { + "entropy": 0.053858717624098065, + "epoch": 8.874227765473831, + "grad_norm": 1.0234375, + "learning_rate": 4.690903054317424e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9992274165153503, + "num_tokens": 123587755.0, + "step": 38070 + }, + { + "entropy": 0.04106435338035226, + "epoch": 8.875393402494463, + "grad_norm": 0.447265625, + "learning_rate": 4.690802558038994e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9998286664485931, + "num_tokens": 123608853.0, + "step": 38075 + }, + { + "entropy": 0.055756100453436375, + "epoch": 8.876559039515096, + "grad_norm": 0.328125, + "learning_rate": 4.690702047731782e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.9991030275821686, + "num_tokens": 123624845.0, + "step": 38080 + }, + { + "entropy": 0.05964929591864347, + "epoch": 8.877724676535728, + "grad_norm": 0.1396484375, + "learning_rate": 4.690601523397289e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9988237857818604, + "num_tokens": 123634948.0, + "step": 38085 + }, + { + "entropy": 0.06275531104765833, + "epoch": 8.878890313556358, + "grad_norm": 0.19921875, + "learning_rate": 4.690500985037012e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9983062148094177, + "num_tokens": 123661599.0, + "step": 38090 + }, + { + "entropy": 0.05262847691774368, + "epoch": 8.88005595057699, + "grad_norm": 0.2197265625, + "learning_rate": 4.690400432652451e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9996294438838959, + "num_tokens": 123686160.0, + "step": 38095 + }, + { + "entropy": 0.057931592734530565, + "epoch": 8.881221587597622, + "grad_norm": 0.32421875, + "learning_rate": 4.690299866245105e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.999240392446518, + "num_tokens": 123706409.0, + "step": 38100 + }, + { + "entropy": 0.06777220210060478, + "epoch": 8.882387224618254, + "grad_norm": 0.6484375, + "learning_rate": 4.690199285816473e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.998669707775116, + "num_tokens": 123718962.0, + "step": 38105 + }, + { + "entropy": 0.05980415008962155, + "epoch": 8.883552861638886, + "grad_norm": 0.0625, + "learning_rate": 4.690098691368056e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9993426620960235, + "num_tokens": 123743234.0, + "step": 38110 + }, + { + "entropy": 0.07145977187901735, + "epoch": 8.884718498659517, + "grad_norm": 0.1953125, + "learning_rate": 4.6899980829013534e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9988697230815887, + "num_tokens": 123771453.0, + "step": 38115 + }, + { + "entropy": 0.06307173445820809, + "epoch": 8.885884135680149, + "grad_norm": 0.09130859375, + "learning_rate": 4.6898974604178655e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9999496757984161, + "num_tokens": 123788361.0, + "step": 38120 + }, + { + "entropy": 0.06684031132608652, + "epoch": 8.887049772700781, + "grad_norm": 1.890625, + "learning_rate": 4.689796823919094e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9992721199989318, + "num_tokens": 123799970.0, + "step": 38125 + }, + { + "entropy": 0.04606306701898575, + "epoch": 8.888215409721413, + "grad_norm": 0.42578125, + "learning_rate": 4.689696173406537e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9999196231365204, + "num_tokens": 123826915.0, + "step": 38130 + }, + { + "entropy": 0.06843077950179577, + "epoch": 8.889381046742045, + "grad_norm": 1.5234375, + "learning_rate": 4.6895955088816964e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9991783380508423, + "num_tokens": 123836264.0, + "step": 38135 + }, + { + "entropy": 0.057823483273386954, + "epoch": 8.890546683762675, + "grad_norm": 1.1015625, + "learning_rate": 4.689494830346074e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9989975571632386, + "num_tokens": 123849316.0, + "step": 38140 + }, + { + "entropy": 0.05925882076844573, + "epoch": 8.891712320783308, + "grad_norm": 0.0498046875, + "learning_rate": 4.6893941378011716e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9984163522720337, + "num_tokens": 123887079.0, + "step": 38145 + }, + { + "entropy": 0.06463732328265906, + "epoch": 8.89287795780394, + "grad_norm": 0.357421875, + "learning_rate": 4.689293431248488e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9998124241828918, + "num_tokens": 123910484.0, + "step": 38150 + }, + { + "entropy": 0.04970733867958188, + "epoch": 8.894043594824572, + "grad_norm": 0.275390625, + "learning_rate": 4.689192710689528e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9995087504386901, + "num_tokens": 123933985.0, + "step": 38155 + }, + { + "entropy": 0.040503044705837966, + "epoch": 8.895209231845204, + "grad_norm": 0.1494140625, + "learning_rate": 4.689091976125791e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9990995824337006, + "num_tokens": 123980387.0, + "step": 38160 + }, + { + "entropy": 0.04687797641381621, + "epoch": 8.896374868865836, + "grad_norm": 0.07177734375, + "learning_rate": 4.688991227558781e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9996799647808075, + "num_tokens": 124006601.0, + "step": 38165 + }, + { + "entropy": 0.05270485226064921, + "epoch": 8.897540505886466, + "grad_norm": 0.1845703125, + "learning_rate": 4.688890464989999e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9976952373981476, + "num_tokens": 124045129.0, + "step": 38170 + }, + { + "entropy": 0.049232575856149194, + "epoch": 8.898706142907098, + "grad_norm": 0.3515625, + "learning_rate": 4.688789688420948e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.998599624633789, + "num_tokens": 124077367.0, + "step": 38175 + }, + { + "entropy": 0.07646393990144133, + "epoch": 8.89987177992773, + "grad_norm": 1.0703125, + "learning_rate": 4.688688897853131e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.999968695640564, + "num_tokens": 124095346.0, + "step": 38180 + }, + { + "entropy": 0.05587264345958829, + "epoch": 8.901037416948363, + "grad_norm": 0.140625, + "learning_rate": 4.68858809328805e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.998901629447937, + "num_tokens": 124124179.0, + "step": 38185 + }, + { + "entropy": 0.06073369812220335, + "epoch": 8.902203053968995, + "grad_norm": 1.65625, + "learning_rate": 4.688487274727209e-05, + "loss": 0.0055, + "mean_token_accuracy": 0.9984927356243134, + "num_tokens": 124145302.0, + "step": 38190 + }, + { + "entropy": 0.04628473650664091, + "epoch": 8.903368690989625, + "grad_norm": 1.2734375, + "learning_rate": 4.688386442172111e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.998195481300354, + "num_tokens": 124179163.0, + "step": 38195 + }, + { + "entropy": 0.04713690942153335, + "epoch": 8.904534328010257, + "grad_norm": 0.58203125, + "learning_rate": 4.688285595624261e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9986393749713898, + "num_tokens": 124209097.0, + "step": 38200 + }, + { + "entropy": 0.056856155022978784, + "epoch": 8.90569996503089, + "grad_norm": 0.205078125, + "learning_rate": 4.68818473508516e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9994652688503265, + "num_tokens": 124239155.0, + "step": 38205 + }, + { + "entropy": 0.06115551386028528, + "epoch": 8.906865602051521, + "grad_norm": 0.11279296875, + "learning_rate": 4.688083860556315e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9995314121246338, + "num_tokens": 124253044.0, + "step": 38210 + }, + { + "entropy": 0.07185219712555409, + "epoch": 8.908031239072153, + "grad_norm": 0.3984375, + "learning_rate": 4.6879829720392276e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9995608389377594, + "num_tokens": 124263476.0, + "step": 38215 + }, + { + "entropy": 0.06399134192615748, + "epoch": 8.909196876092786, + "grad_norm": 0.2255859375, + "learning_rate": 4.687882069535403e-05, + "loss": 0.0079, + "mean_token_accuracy": 0.9987414300441741, + "num_tokens": 124284712.0, + "step": 38220 + }, + { + "entropy": 0.04301735432818532, + "epoch": 8.910362513113416, + "grad_norm": 0.369140625, + "learning_rate": 4.687781153046347e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9982926070690155, + "num_tokens": 124315576.0, + "step": 38225 + }, + { + "entropy": 0.048022180516272786, + "epoch": 8.911528150134048, + "grad_norm": 1.5390625, + "learning_rate": 4.6876802225735626e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9999112784862518, + "num_tokens": 124345674.0, + "step": 38230 + }, + { + "entropy": 0.0699318254366517, + "epoch": 8.91269378715468, + "grad_norm": 2.609375, + "learning_rate": 4.687579278118557e-05, + "loss": 0.0056, + "mean_token_accuracy": 0.9987849235534668, + "num_tokens": 124357459.0, + "step": 38235 + }, + { + "entropy": 0.10346001256257295, + "epoch": 8.913859424175312, + "grad_norm": 1.2734375, + "learning_rate": 4.687478319682833e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9995729744434356, + "num_tokens": 124369169.0, + "step": 38240 + }, + { + "entropy": 0.08160105086863041, + "epoch": 8.915025061195944, + "grad_norm": 3.0625, + "learning_rate": 4.6873773472678975e-05, + "loss": 0.0087, + "mean_token_accuracy": 0.9984677612781525, + "num_tokens": 124377276.0, + "step": 38245 + }, + { + "entropy": 0.049109653756022456, + "epoch": 8.916190698216575, + "grad_norm": 0.291015625, + "learning_rate": 4.6872763608752566e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9981947541236877, + "num_tokens": 124402590.0, + "step": 38250 + }, + { + "entropy": 0.08780342619866133, + "epoch": 8.917356335237207, + "grad_norm": 1.6953125, + "learning_rate": 4.687175360506415e-05, + "loss": 0.0055, + "mean_token_accuracy": 0.9984165310859681, + "num_tokens": 124416878.0, + "step": 38255 + }, + { + "entropy": 0.0503659694455564, + "epoch": 8.918521972257839, + "grad_norm": 0.37109375, + "learning_rate": 4.6870743461628785e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9995361506938935, + "num_tokens": 124447276.0, + "step": 38260 + }, + { + "entropy": 0.048782548774033783, + "epoch": 8.919687609278471, + "grad_norm": 0.37890625, + "learning_rate": 4.686973317846155e-05, + "loss": 0.0049, + "mean_token_accuracy": 0.9984687030315399, + "num_tokens": 124481843.0, + "step": 38265 + }, + { + "entropy": 0.0999077981337905, + "epoch": 8.920853246299103, + "grad_norm": 1.5234375, + "learning_rate": 4.6868722755577486e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9997912287712097, + "num_tokens": 124492998.0, + "step": 38270 + }, + { + "entropy": 0.055358598567545415, + "epoch": 8.922018883319733, + "grad_norm": 0.1435546875, + "learning_rate": 4.6867712192991685e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9990233540534973, + "num_tokens": 124517996.0, + "step": 38275 + }, + { + "entropy": 0.04316076897084713, + "epoch": 8.923184520340365, + "grad_norm": 1.21875, + "learning_rate": 4.6866701490719206e-05, + "loss": 0.0066, + "mean_token_accuracy": 0.9968756437301636, + "num_tokens": 124541992.0, + "step": 38280 + }, + { + "entropy": 0.060758284851908685, + "epoch": 8.924350157360998, + "grad_norm": 0.578125, + "learning_rate": 4.6865690648775115e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9993073165416717, + "num_tokens": 124555858.0, + "step": 38285 + }, + { + "entropy": 0.048283427767455575, + "epoch": 8.92551579438163, + "grad_norm": 0.66796875, + "learning_rate": 4.6864679667174494e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.999762213230133, + "num_tokens": 124577980.0, + "step": 38290 + }, + { + "entropy": 0.05046857642009854, + "epoch": 8.926681431402262, + "grad_norm": 0.58984375, + "learning_rate": 4.6863668545932415e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9984736323356629, + "num_tokens": 124600566.0, + "step": 38295 + }, + { + "entropy": 0.0775529894977808, + "epoch": 8.927847068422894, + "grad_norm": 0.76953125, + "learning_rate": 4.686265728506395e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9988628149032592, + "num_tokens": 124612207.0, + "step": 38300 + }, + { + "entropy": 0.06363149667158723, + "epoch": 8.929012705443524, + "grad_norm": 1.578125, + "learning_rate": 4.686164588458418e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9986072659492493, + "num_tokens": 124627386.0, + "step": 38305 + }, + { + "entropy": 0.054681507777422665, + "epoch": 8.930178342464156, + "grad_norm": 1.0625, + "learning_rate": 4.68606343445082e-05, + "loss": 0.018, + "mean_token_accuracy": 0.9967020988464356, + "num_tokens": 124653093.0, + "step": 38310 + }, + { + "entropy": 0.06164302993565798, + "epoch": 8.931343979484788, + "grad_norm": 0.455078125, + "learning_rate": 4.685962266485107e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9991594314575195, + "num_tokens": 124666686.0, + "step": 38315 + }, + { + "entropy": 0.06907703513279558, + "epoch": 8.93250961650542, + "grad_norm": 0.1484375, + "learning_rate": 4.6858610845627895e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9997636675834656, + "num_tokens": 124685401.0, + "step": 38320 + }, + { + "entropy": 0.07067847475409508, + "epoch": 8.933675253526053, + "grad_norm": 0.390625, + "learning_rate": 4.685759888685376e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9993339359760285, + "num_tokens": 124696379.0, + "step": 38325 + }, + { + "entropy": 0.06941550485789776, + "epoch": 8.934840890546683, + "grad_norm": 0.43359375, + "learning_rate": 4.6856586788543746e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9992957353591919, + "num_tokens": 124711477.0, + "step": 38330 + }, + { + "entropy": 0.04978791456669569, + "epoch": 8.936006527567315, + "grad_norm": 0.376953125, + "learning_rate": 4.685557455071295e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9994521677494049, + "num_tokens": 124726622.0, + "step": 38335 + }, + { + "entropy": 0.04930603364482522, + "epoch": 8.937172164587947, + "grad_norm": 0.1748046875, + "learning_rate": 4.685456217337646e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9980187237262725, + "num_tokens": 124754390.0, + "step": 38340 + }, + { + "entropy": 0.05293769268319011, + "epoch": 8.93833780160858, + "grad_norm": 0.333984375, + "learning_rate": 4.685354965654939e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.9984555840492249, + "num_tokens": 124776615.0, + "step": 38345 + }, + { + "entropy": 0.05897071985527873, + "epoch": 8.939503438629211, + "grad_norm": 0.314453125, + "learning_rate": 4.685253700024682e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9994755685329437, + "num_tokens": 124813214.0, + "step": 38350 + }, + { + "entropy": 0.07651661131531, + "epoch": 8.940669075649843, + "grad_norm": 0.609375, + "learning_rate": 4.685152420448386e-05, + "loss": 0.0066, + "mean_token_accuracy": 0.9984458088874817, + "num_tokens": 124822339.0, + "step": 38355 + }, + { + "entropy": 0.0461973468773067, + "epoch": 8.941834712670474, + "grad_norm": 0.375, + "learning_rate": 4.685051126927561e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9989622116088868, + "num_tokens": 124848166.0, + "step": 38360 + }, + { + "entropy": 0.17166866697371005, + "epoch": 8.943000349691106, + "grad_norm": 0.28515625, + "learning_rate": 4.684949819463717e-05, + "loss": 0.2022, + "mean_token_accuracy": 0.9561330854892731, + "num_tokens": 124867564.0, + "step": 38365 + }, + { + "entropy": 0.06029870919883251, + "epoch": 8.944165986711738, + "grad_norm": 0.2470703125, + "learning_rate": 4.684848498058364e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9992516517639161, + "num_tokens": 124889029.0, + "step": 38370 + }, + { + "entropy": 0.058437543269246814, + "epoch": 8.94533162373237, + "grad_norm": 0.1552734375, + "learning_rate": 4.6847471627130145e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9987386465072632, + "num_tokens": 124903062.0, + "step": 38375 + }, + { + "entropy": 0.057839416153728965, + "epoch": 8.946497260753002, + "grad_norm": 2.34375, + "learning_rate": 4.684645813429179e-05, + "loss": 0.0148, + "mean_token_accuracy": 0.9975367724895478, + "num_tokens": 124925603.0, + "step": 38380 + }, + { + "entropy": 0.07765648029744625, + "epoch": 8.947662897773633, + "grad_norm": 0.37890625, + "learning_rate": 4.684544450208368e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9997811794281006, + "num_tokens": 124936529.0, + "step": 38385 + }, + { + "entropy": 0.05938598429784179, + "epoch": 8.948828534794265, + "grad_norm": 0.123046875, + "learning_rate": 4.684443073052095e-05, + "loss": 0.0079, + "mean_token_accuracy": 0.9987027943134308, + "num_tokens": 124957318.0, + "step": 38390 + }, + { + "entropy": 0.12599324379116297, + "epoch": 8.949994171814897, + "grad_norm": 0.404296875, + "learning_rate": 4.684341681961869e-05, + "loss": 0.0921, + "mean_token_accuracy": 0.9860056221485138, + "num_tokens": 124980658.0, + "step": 38395 + }, + { + "entropy": 0.06396199259907007, + "epoch": 8.951159808835529, + "grad_norm": 0.88671875, + "learning_rate": 4.684240276939204e-05, + "loss": 0.007, + "mean_token_accuracy": 0.9991120994091034, + "num_tokens": 125005084.0, + "step": 38400 + }, + { + "entropy": 0.05854538474231959, + "epoch": 8.952325445856161, + "grad_norm": 0.4921875, + "learning_rate": 4.684138857985611e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9997228562831879, + "num_tokens": 125033670.0, + "step": 38405 + }, + { + "entropy": 0.05626933202147484, + "epoch": 8.953491082876791, + "grad_norm": 1.3046875, + "learning_rate": 4.684037425102603e-05, + "loss": 0.0099, + "mean_token_accuracy": 0.9971358001232147, + "num_tokens": 125065295.0, + "step": 38410 + }, + { + "entropy": 0.06237728837877512, + "epoch": 8.954656719897423, + "grad_norm": 0.314453125, + "learning_rate": 4.6839359782916916e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9991385698318481, + "num_tokens": 125093192.0, + "step": 38415 + }, + { + "entropy": 0.050046677514910695, + "epoch": 8.955822356918056, + "grad_norm": 2.53125, + "learning_rate": 4.68383451755439e-05, + "loss": 0.0117, + "mean_token_accuracy": 0.9974133551120759, + "num_tokens": 125108996.0, + "step": 38420 + }, + { + "entropy": 0.054030220536515114, + "epoch": 8.956987993938688, + "grad_norm": 0.1572265625, + "learning_rate": 4.683733042892211e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9980381071567536, + "num_tokens": 125130684.0, + "step": 38425 + }, + { + "entropy": 0.06457688459195196, + "epoch": 8.95815363095932, + "grad_norm": 0.384765625, + "learning_rate": 4.683631554306668e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9989829778671264, + "num_tokens": 125158330.0, + "step": 38430 + }, + { + "entropy": 0.07588558178395033, + "epoch": 8.959319267979952, + "grad_norm": 2.59375, + "learning_rate": 4.6835300517992755e-05, + "loss": 0.0069, + "mean_token_accuracy": 0.998191100358963, + "num_tokens": 125182823.0, + "step": 38435 + }, + { + "entropy": 0.055414431262761354, + "epoch": 8.960484905000582, + "grad_norm": 0.64453125, + "learning_rate": 4.683428535371544e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9980771243572235, + "num_tokens": 125201575.0, + "step": 38440 + }, + { + "entropy": 0.05777478665113449, + "epoch": 8.961650542021214, + "grad_norm": 2.234375, + "learning_rate": 4.68332700502499e-05, + "loss": 0.0049, + "mean_token_accuracy": 0.9983273804187774, + "num_tokens": 125212087.0, + "step": 38445 + }, + { + "entropy": 0.06871578618884086, + "epoch": 8.962816179041846, + "grad_norm": 0.72265625, + "learning_rate": 4.683225460761126e-05, + "loss": 0.0088, + "mean_token_accuracy": 0.9972965836524963, + "num_tokens": 125223395.0, + "step": 38450 + }, + { + "entropy": 0.05714446417987347, + "epoch": 8.963981816062478, + "grad_norm": 0.349609375, + "learning_rate": 4.683123902581468e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9998061537742615, + "num_tokens": 125249399.0, + "step": 38455 + }, + { + "entropy": 0.06488155499100685, + "epoch": 8.96514745308311, + "grad_norm": 0.2255859375, + "learning_rate": 4.683022330487528e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9994613707065583, + "num_tokens": 125278360.0, + "step": 38460 + }, + { + "entropy": 0.06843510447070003, + "epoch": 8.966313090103741, + "grad_norm": 0.87890625, + "learning_rate": 4.682920744480822e-05, + "loss": 0.0098, + "mean_token_accuracy": 0.9972912669181824, + "num_tokens": 125294562.0, + "step": 38465 + }, + { + "entropy": 0.05685213636606932, + "epoch": 8.967478727124373, + "grad_norm": 2.234375, + "learning_rate": 4.6828191445628643e-05, + "loss": 0.004, + "mean_token_accuracy": 0.998846048116684, + "num_tokens": 125323621.0, + "step": 38470 + }, + { + "entropy": 0.05628944206982851, + "epoch": 8.968644364145005, + "grad_norm": 1.203125, + "learning_rate": 4.682717530735171e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.998843890428543, + "num_tokens": 125337092.0, + "step": 38475 + }, + { + "entropy": 0.06336890961974859, + "epoch": 8.969810001165637, + "grad_norm": 0.25, + "learning_rate": 4.682615902999255e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9988363981246948, + "num_tokens": 125355100.0, + "step": 38480 + }, + { + "entropy": 0.049503239383921024, + "epoch": 8.97097563818627, + "grad_norm": 0.9140625, + "learning_rate": 4.682514261356634e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9989173531532287, + "num_tokens": 125372821.0, + "step": 38485 + }, + { + "entropy": 0.06290744915604592, + "epoch": 8.972141275206901, + "grad_norm": 2.265625, + "learning_rate": 4.682412605808823e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9985226809978485, + "num_tokens": 125386583.0, + "step": 38490 + }, + { + "entropy": 0.06669617369771004, + "epoch": 8.973306912227532, + "grad_norm": 0.5078125, + "learning_rate": 4.6823109363573375e-05, + "loss": 0.0091, + "mean_token_accuracy": 0.9965139627456665, + "num_tokens": 125398934.0, + "step": 38495 + }, + { + "entropy": 0.05491267712786794, + "epoch": 8.974472549248164, + "grad_norm": 1.59375, + "learning_rate": 4.682209253003693e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.998519778251648, + "num_tokens": 125419781.0, + "step": 38500 + }, + { + "entropy": 0.06229007430374622, + "epoch": 8.975638186268796, + "grad_norm": 1.125, + "learning_rate": 4.682107555749408e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9979374289512635, + "num_tokens": 125436392.0, + "step": 38505 + }, + { + "entropy": 0.052480381168425086, + "epoch": 8.976803823289428, + "grad_norm": 0.255859375, + "learning_rate": 4.682005844595996e-05, + "loss": 0.0058, + "mean_token_accuracy": 0.9982479214668274, + "num_tokens": 125467487.0, + "step": 38510 + }, + { + "entropy": 0.07493066936731338, + "epoch": 8.97796946031006, + "grad_norm": 0.1796875, + "learning_rate": 4.6819041195449755e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9987802803516388, + "num_tokens": 125477378.0, + "step": 38515 + }, + { + "entropy": 0.06157904057763517, + "epoch": 8.97913509733069, + "grad_norm": 0.171875, + "learning_rate": 4.681802380597862e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9989335715770722, + "num_tokens": 125498711.0, + "step": 38520 + }, + { + "entropy": 0.039980428479611876, + "epoch": 8.980300734351323, + "grad_norm": 0.220703125, + "learning_rate": 4.6817006277561745e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9981152474880218, + "num_tokens": 125543250.0, + "step": 38525 + }, + { + "entropy": 0.06129704499617219, + "epoch": 8.981466371371955, + "grad_norm": 0.30859375, + "learning_rate": 4.681598861021429e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9974672019481658, + "num_tokens": 125568173.0, + "step": 38530 + }, + { + "entropy": 0.07135277204215526, + "epoch": 8.982632008392587, + "grad_norm": 2.703125, + "learning_rate": 4.681497080395143e-05, + "loss": 0.0081, + "mean_token_accuracy": 0.997850650548935, + "num_tokens": 125578244.0, + "step": 38535 + }, + { + "entropy": 0.06666518365964294, + "epoch": 8.983797645413219, + "grad_norm": 0.953125, + "learning_rate": 4.681395285878835e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9992936909198761, + "num_tokens": 125589548.0, + "step": 38540 + }, + { + "entropy": 0.05618604850023985, + "epoch": 8.98496328243385, + "grad_norm": 0.54296875, + "learning_rate": 4.6812934774740223e-05, + "loss": 0.0059, + "mean_token_accuracy": 0.9985435128211975, + "num_tokens": 125615502.0, + "step": 38545 + }, + { + "entropy": 0.04700936172157526, + "epoch": 8.986128919454481, + "grad_norm": 1.1171875, + "learning_rate": 4.6811916551822235e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9995196282863616, + "num_tokens": 125642561.0, + "step": 38550 + }, + { + "entropy": 0.06371036674827338, + "epoch": 8.987294556475113, + "grad_norm": 0.3984375, + "learning_rate": 4.681089819004956e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9984392046928405, + "num_tokens": 125665212.0, + "step": 38555 + }, + { + "entropy": 0.0539166197180748, + "epoch": 8.988460193495746, + "grad_norm": 0.26953125, + "learning_rate": 4.680987968943739e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.999776154756546, + "num_tokens": 125701838.0, + "step": 38560 + }, + { + "entropy": 0.07485632486641407, + "epoch": 8.989625830516378, + "grad_norm": 1.0625, + "learning_rate": 4.68088610500009e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9985475897789001, + "num_tokens": 125716351.0, + "step": 38565 + }, + { + "entropy": 0.06469077356159687, + "epoch": 8.99079146753701, + "grad_norm": 0.443359375, + "learning_rate": 4.6807842271755306e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9997498154640198, + "num_tokens": 125731984.0, + "step": 38570 + }, + { + "entropy": 0.05720685347914696, + "epoch": 8.99195710455764, + "grad_norm": 1.5546875, + "learning_rate": 4.680682335471577e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9984744906425476, + "num_tokens": 125746456.0, + "step": 38575 + }, + { + "entropy": 0.04150606356561184, + "epoch": 8.993122741578272, + "grad_norm": 0.072265625, + "learning_rate": 4.680580429889751e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9995882928371429, + "num_tokens": 125770312.0, + "step": 38580 + }, + { + "entropy": 0.06257959809154272, + "epoch": 8.994288378598904, + "grad_norm": 0.07421875, + "learning_rate": 4.6804785104315714e-05, + "loss": 0.0065, + "mean_token_accuracy": 0.9992784261703491, + "num_tokens": 125789938.0, + "step": 38585 + }, + { + "entropy": 0.10159647967666388, + "epoch": 8.995454015619536, + "grad_norm": 1.625, + "learning_rate": 4.680376577098557e-05, + "loss": 0.0816, + "mean_token_accuracy": 0.9853849947452545, + "num_tokens": 125809595.0, + "step": 38590 + }, + { + "entropy": 0.056752304825931785, + "epoch": 8.996619652640168, + "grad_norm": 1.515625, + "learning_rate": 4.680274629892228e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9988771855831147, + "num_tokens": 125824181.0, + "step": 38595 + }, + { + "entropy": 0.051522306725382806, + "epoch": 8.997785289660799, + "grad_norm": 1.078125, + "learning_rate": 4.680172668814106e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9990641057491303, + "num_tokens": 125840965.0, + "step": 38600 + }, + { + "entropy": 0.06658210419118404, + "epoch": 8.998950926681431, + "grad_norm": 0.1044921875, + "learning_rate": 4.68007069386571e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.9983263134956359, + "num_tokens": 125854797.0, + "step": 38605 + }, + { + "entropy": 0.06707908130354351, + "epoch": 9.0, + "grad_norm": 0.5703125, + "learning_rate": 4.679968705048562e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9990188876787821, + "num_tokens": 125865250.0, + "step": 38610 + }, + { + "entropy": 0.04199001295492053, + "epoch": 9.001165637020632, + "grad_norm": 0.1748046875, + "learning_rate": 4.679866702364181e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999545395374299, + "num_tokens": 125890465.0, + "step": 38615 + }, + { + "entropy": 0.19603591002523899, + "epoch": 9.002331274041264, + "grad_norm": 5.21875, + "learning_rate": 4.679764685814089e-05, + "loss": 0.2893, + "mean_token_accuracy": 0.9659708976745606, + "num_tokens": 125923536.0, + "step": 38620 + }, + { + "entropy": 0.04350639209151268, + "epoch": 9.003496911061895, + "grad_norm": 0.1865234375, + "learning_rate": 4.679662655399806e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9994899988174438, + "num_tokens": 125965804.0, + "step": 38625 + }, + { + "entropy": 0.06196370590478182, + "epoch": 9.004662548082527, + "grad_norm": 0.1298828125, + "learning_rate": 4.6795606111228565e-05, + "loss": 0.0007, + "mean_token_accuracy": 1.0, + "num_tokens": 125983321.0, + "step": 38630 + }, + { + "entropy": 0.07306013368070126, + "epoch": 9.005828185103159, + "grad_norm": 0.2333984375, + "learning_rate": 4.679458552984759e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9990762650966645, + "num_tokens": 125994538.0, + "step": 38635 + }, + { + "entropy": 0.052279172092676164, + "epoch": 9.00699382212379, + "grad_norm": 0.05517578125, + "learning_rate": 4.679356480987036e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9994321703910828, + "num_tokens": 126019599.0, + "step": 38640 + }, + { + "entropy": 0.06070118434727192, + "epoch": 9.008159459144423, + "grad_norm": 0.037109375, + "learning_rate": 4.679254395131211e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9997816622257233, + "num_tokens": 126031811.0, + "step": 38645 + }, + { + "entropy": 0.057560394145548345, + "epoch": 9.009325096165055, + "grad_norm": 0.016357421875, + "learning_rate": 4.679152295418805e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9993645131587983, + "num_tokens": 126048597.0, + "step": 38650 + }, + { + "entropy": 0.0726345956325531, + "epoch": 9.010490733185685, + "grad_norm": 0.111328125, + "learning_rate": 4.679050181851341e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9994578123092651, + "num_tokens": 126060395.0, + "step": 38655 + }, + { + "entropy": 0.05440345862880349, + "epoch": 9.011656370206317, + "grad_norm": 0.63671875, + "learning_rate": 4.678948054430341e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.9978732764720917, + "num_tokens": 126079809.0, + "step": 38660 + }, + { + "entropy": 0.06478796168230475, + "epoch": 9.01282200722695, + "grad_norm": 0.138671875, + "learning_rate": 4.678845913157328e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9992167770862579, + "num_tokens": 126108676.0, + "step": 38665 + }, + { + "entropy": 0.04735125498846173, + "epoch": 9.013987644247582, + "grad_norm": 1.9140625, + "learning_rate": 4.678743758033826e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9993732571601868, + "num_tokens": 126131661.0, + "step": 38670 + }, + { + "entropy": 0.07930247653275728, + "epoch": 9.015153281268214, + "grad_norm": 0.3046875, + "learning_rate": 4.6786415890613576e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 126142115.0, + "step": 38675 + }, + { + "entropy": 0.07448353804647923, + "epoch": 9.016318918288844, + "grad_norm": 0.09912109375, + "learning_rate": 4.678539406241446e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9995809614658355, + "num_tokens": 126170355.0, + "step": 38680 + }, + { + "entropy": 0.06288553746417165, + "epoch": 9.017484555309476, + "grad_norm": 0.099609375, + "learning_rate": 4.6784372095756155e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999343454837799, + "num_tokens": 126200626.0, + "step": 38685 + }, + { + "entropy": 0.06611028239130974, + "epoch": 9.018650192330108, + "grad_norm": 0.1455078125, + "learning_rate": 4.67833499906539e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9992486298084259, + "num_tokens": 126211632.0, + "step": 38690 + }, + { + "entropy": 0.058050422370433806, + "epoch": 9.01981582935074, + "grad_norm": 0.1953125, + "learning_rate": 4.678232774712293e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9996413052082062, + "num_tokens": 126233280.0, + "step": 38695 + }, + { + "entropy": 0.0486367829144001, + "epoch": 9.020981466371373, + "grad_norm": 0.07568359375, + "learning_rate": 4.6781305365178495e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999536752700806, + "num_tokens": 126255103.0, + "step": 38700 + }, + { + "entropy": 0.05647154543548823, + "epoch": 9.022147103392003, + "grad_norm": 0.2060546875, + "learning_rate": 4.678028284483583e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999893784523011, + "num_tokens": 126283378.0, + "step": 38705 + }, + { + "entropy": 0.05234082993119955, + "epoch": 9.023312740412635, + "grad_norm": 0.2177734375, + "learning_rate": 4.67792601861102e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9999635398387909, + "num_tokens": 126312391.0, + "step": 38710 + }, + { + "entropy": 0.07205567061901093, + "epoch": 9.024478377433267, + "grad_norm": 0.01806640625, + "learning_rate": 4.6778237389016835e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 126327468.0, + "step": 38715 + }, + { + "entropy": 0.07139207748696208, + "epoch": 9.0256440144539, + "grad_norm": 0.1083984375, + "learning_rate": 4.6777214453571e-05, + "loss": 0.0007, + "mean_token_accuracy": 1.0, + "num_tokens": 126341775.0, + "step": 38720 + }, + { + "entropy": 0.0707708889618516, + "epoch": 9.026809651474531, + "grad_norm": 0.10498046875, + "learning_rate": 4.677619137978794e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999778509140015, + "num_tokens": 126361618.0, + "step": 38725 + }, + { + "entropy": 0.0603253073990345, + "epoch": 9.027975288495163, + "grad_norm": 0.06494140625, + "learning_rate": 4.677516816768292e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999888360500335, + "num_tokens": 126383105.0, + "step": 38730 + }, + { + "entropy": 0.07485587485134601, + "epoch": 9.029140925515794, + "grad_norm": 0.28125, + "learning_rate": 4.6774144817271195e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9997031033039093, + "num_tokens": 126403286.0, + "step": 38735 + }, + { + "entropy": 0.06576101407408715, + "epoch": 9.030306562536426, + "grad_norm": 1.234375, + "learning_rate": 4.677312132856801e-05, + "loss": 0.0052, + "mean_token_accuracy": 0.9995265305042267, + "num_tokens": 126415973.0, + "step": 38740 + }, + { + "entropy": 0.08720704466104508, + "epoch": 9.031472199557058, + "grad_norm": 0.26953125, + "learning_rate": 4.6772097701588646e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9993067860603333, + "num_tokens": 126428004.0, + "step": 38745 + }, + { + "entropy": 0.06274453792721033, + "epoch": 9.03263783657769, + "grad_norm": 0.173828125, + "learning_rate": 4.677107393634835e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 126442681.0, + "step": 38750 + }, + { + "entropy": 0.04070065952837467, + "epoch": 9.033803473598322, + "grad_norm": 0.609375, + "learning_rate": 4.677005003286241e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9995027482509613, + "num_tokens": 126467633.0, + "step": 38755 + }, + { + "entropy": 0.08652214827015996, + "epoch": 9.034969110618952, + "grad_norm": 0.0751953125, + "learning_rate": 4.6769025991146076e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9998975336551666, + "num_tokens": 126502224.0, + "step": 38760 + }, + { + "entropy": 0.05523251341655851, + "epoch": 9.036134747639585, + "grad_norm": 0.1298828125, + "learning_rate": 4.676800181121462e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 126522879.0, + "step": 38765 + }, + { + "entropy": 0.06810875101946294, + "epoch": 9.037300384660217, + "grad_norm": 0.0849609375, + "learning_rate": 4.676697749308332e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.999539029598236, + "num_tokens": 126537183.0, + "step": 38770 + }, + { + "entropy": 0.06015290655195713, + "epoch": 9.038466021680849, + "grad_norm": 0.33203125, + "learning_rate": 4.676595303676745e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999792695045471, + "num_tokens": 126552182.0, + "step": 38775 + }, + { + "entropy": 0.06348831951618195, + "epoch": 9.03963165870148, + "grad_norm": 0.07861328125, + "learning_rate": 4.676492844228227e-05, + "loss": 0.0009, + "mean_token_accuracy": 1.0, + "num_tokens": 126561944.0, + "step": 38780 + }, + { + "entropy": 0.06204386427998543, + "epoch": 9.040797295722113, + "grad_norm": 0.546875, + "learning_rate": 4.676390370964309e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9997840166091919, + "num_tokens": 126573591.0, + "step": 38785 + }, + { + "entropy": 0.06764180241152644, + "epoch": 9.041962932742743, + "grad_norm": 0.345703125, + "learning_rate": 4.676287883886516e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9997762858867645, + "num_tokens": 126585784.0, + "step": 38790 + }, + { + "entropy": 0.05596046075224877, + "epoch": 9.043128569763375, + "grad_norm": 0.23828125, + "learning_rate": 4.676185382996378e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9999856591224671, + "num_tokens": 126601959.0, + "step": 38795 + }, + { + "entropy": 0.05647138254716992, + "epoch": 9.044294206784008, + "grad_norm": 0.1416015625, + "learning_rate": 4.676082868295423e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9988505721092225, + "num_tokens": 126620139.0, + "step": 38800 + }, + { + "entropy": 0.050310919806361196, + "epoch": 9.04545984380464, + "grad_norm": 0.1474609375, + "learning_rate": 4.675980339785179e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.999627536535263, + "num_tokens": 126645308.0, + "step": 38805 + }, + { + "entropy": 0.05990322157740593, + "epoch": 9.046625480825272, + "grad_norm": 0.72265625, + "learning_rate": 4.675877797467176e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9993937849998474, + "num_tokens": 126657453.0, + "step": 38810 + }, + { + "entropy": 0.04789342461153865, + "epoch": 9.047791117845902, + "grad_norm": 0.1611328125, + "learning_rate": 4.675775241342942e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999779224395752, + "num_tokens": 126690452.0, + "step": 38815 + }, + { + "entropy": 0.07110001184046269, + "epoch": 9.048956754866534, + "grad_norm": 0.046630859375, + "learning_rate": 4.675672671414006e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 126705011.0, + "step": 38820 + }, + { + "entropy": 0.0685180657543242, + "epoch": 9.050122391887166, + "grad_norm": 0.09814453125, + "learning_rate": 4.6755700876819e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9996334731578826, + "num_tokens": 126725173.0, + "step": 38825 + }, + { + "entropy": 0.061684256233274934, + "epoch": 9.051288028907798, + "grad_norm": 0.06396484375, + "learning_rate": 4.67546749014815e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9999591529369354, + "num_tokens": 126750446.0, + "step": 38830 + }, + { + "entropy": 0.07004572823643684, + "epoch": 9.05245366592843, + "grad_norm": 0.03173828125, + "learning_rate": 4.675364878814289e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9995744705200196, + "num_tokens": 126762479.0, + "step": 38835 + }, + { + "entropy": 0.08703018184751272, + "epoch": 9.05361930294906, + "grad_norm": 0.06298828125, + "learning_rate": 4.675262253681845e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999328851699829, + "num_tokens": 126775726.0, + "step": 38840 + }, + { + "entropy": 0.05530893374234438, + "epoch": 9.054784939969693, + "grad_norm": 0.07666015625, + "learning_rate": 4.67515961475235e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9998360633850097, + "num_tokens": 126799072.0, + "step": 38845 + }, + { + "entropy": 0.05625711902976036, + "epoch": 9.055950576990325, + "grad_norm": 0.04638671875, + "learning_rate": 4.6750569620273324e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9993377506732941, + "num_tokens": 126813234.0, + "step": 38850 + }, + { + "entropy": 0.08031964246183634, + "epoch": 9.057116214010957, + "grad_norm": 0.064453125, + "learning_rate": 4.6749542955083253e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 126840453.0, + "step": 38855 + }, + { + "entropy": 0.05963180642575026, + "epoch": 9.05828185103159, + "grad_norm": 0.039306640625, + "learning_rate": 4.674851615196858e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 126850720.0, + "step": 38860 + }, + { + "entropy": 0.07368194349110127, + "epoch": 9.059447488052221, + "grad_norm": 0.61328125, + "learning_rate": 4.674748921094462e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9993014574050904, + "num_tokens": 126865529.0, + "step": 38865 + }, + { + "entropy": 0.06885956060141325, + "epoch": 9.060613125072852, + "grad_norm": 0.0908203125, + "learning_rate": 4.6746462132026686e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9986264824867248, + "num_tokens": 126884470.0, + "step": 38870 + }, + { + "entropy": 0.06841649003326893, + "epoch": 9.061778762093484, + "grad_norm": 0.06640625, + "learning_rate": 4.67454349152301e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 126895503.0, + "step": 38875 + }, + { + "entropy": 0.06709451526403427, + "epoch": 9.062944399114116, + "grad_norm": 0.78515625, + "learning_rate": 4.674440756057017e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9993994951248169, + "num_tokens": 126908015.0, + "step": 38880 + }, + { + "entropy": 0.05293879248201847, + "epoch": 9.064110036134748, + "grad_norm": 0.1376953125, + "learning_rate": 4.674338006806222e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9995798289775848, + "num_tokens": 126928878.0, + "step": 38885 + }, + { + "entropy": 0.07902012132108212, + "epoch": 9.06527567315538, + "grad_norm": 0.439453125, + "learning_rate": 4.674235243772156e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999373495578766, + "num_tokens": 126950751.0, + "step": 38890 + }, + { + "entropy": 0.06013939660042524, + "epoch": 9.06644131017601, + "grad_norm": 0.2119140625, + "learning_rate": 4.674132466956354e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9996572375297547, + "num_tokens": 126962951.0, + "step": 38895 + }, + { + "entropy": 0.04780583553947508, + "epoch": 9.067606947196643, + "grad_norm": 0.19140625, + "learning_rate": 4.674029676360346e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9996591031551361, + "num_tokens": 126992608.0, + "step": 38900 + }, + { + "entropy": 0.06537991110235453, + "epoch": 9.068772584217275, + "grad_norm": 0.0615234375, + "learning_rate": 4.6739268719856657e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999897062778473, + "num_tokens": 127019628.0, + "step": 38905 + }, + { + "entropy": 0.04793858341872692, + "epoch": 9.069938221237907, + "grad_norm": 0.67578125, + "learning_rate": 4.673824053833846e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9994303047657013, + "num_tokens": 127039633.0, + "step": 38910 + }, + { + "entropy": 0.042522894544526936, + "epoch": 9.071103858258539, + "grad_norm": 0.21484375, + "learning_rate": 4.6737212219064204e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9996679306030274, + "num_tokens": 127070798.0, + "step": 38915 + }, + { + "entropy": 0.037787226028740406, + "epoch": 9.072269495279171, + "grad_norm": 0.0240478515625, + "learning_rate": 4.673618376204922e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999767899513244, + "num_tokens": 127100035.0, + "step": 38920 + }, + { + "entropy": 0.04557211743667722, + "epoch": 9.073435132299801, + "grad_norm": 0.049072265625, + "learning_rate": 4.6735155167308844e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9997222900390625, + "num_tokens": 127130459.0, + "step": 38925 + }, + { + "entropy": 0.05762959066778421, + "epoch": 9.074600769320433, + "grad_norm": 0.030029296875, + "learning_rate": 4.6734126434858416e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999673962593079, + "num_tokens": 127143328.0, + "step": 38930 + }, + { + "entropy": 0.039792079012840986, + "epoch": 9.075766406341065, + "grad_norm": 0.1298828125, + "learning_rate": 4.673309756471327e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9997629404067994, + "num_tokens": 127172420.0, + "step": 38935 + }, + { + "entropy": 0.0473951231688261, + "epoch": 9.076932043361698, + "grad_norm": 0.0986328125, + "learning_rate": 4.6732068556888755e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9999626576900482, + "num_tokens": 127203870.0, + "step": 38940 + }, + { + "entropy": 0.05528845563530922, + "epoch": 9.07809768038233, + "grad_norm": 0.1748046875, + "learning_rate": 4.673103941140021e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999382495880127, + "num_tokens": 127228127.0, + "step": 38945 + }, + { + "entropy": 0.046199146658182144, + "epoch": 9.07926331740296, + "grad_norm": 0.8515625, + "learning_rate": 4.673001012826298e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9992288112640381, + "num_tokens": 127256065.0, + "step": 38950 + }, + { + "entropy": 0.052226276509463786, + "epoch": 9.080428954423592, + "grad_norm": 0.2431640625, + "learning_rate": 4.6728980707492426e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.999572080373764, + "num_tokens": 127273454.0, + "step": 38955 + }, + { + "entropy": 0.048444395791739224, + "epoch": 9.081594591444224, + "grad_norm": 0.138671875, + "learning_rate": 4.6727951149103884e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999888062477111, + "num_tokens": 127294859.0, + "step": 38960 + }, + { + "entropy": 0.06333124712109565, + "epoch": 9.082760228464856, + "grad_norm": 0.033447265625, + "learning_rate": 4.672692145311271e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9994173049926758, + "num_tokens": 127311275.0, + "step": 38965 + }, + { + "entropy": 0.04639546973630786, + "epoch": 9.083925865485488, + "grad_norm": 0.056640625, + "learning_rate": 4.672589161953426e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 127328407.0, + "step": 38970 + }, + { + "entropy": 0.050291641149669886, + "epoch": 9.085091502506119, + "grad_norm": 0.34375, + "learning_rate": 4.672486164838389e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999376535415649, + "num_tokens": 127355314.0, + "step": 38975 + }, + { + "entropy": 0.05147938821464777, + "epoch": 9.08625713952675, + "grad_norm": 2.1875, + "learning_rate": 4.672383153967695e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.999790358543396, + "num_tokens": 127382802.0, + "step": 38980 + }, + { + "entropy": 0.045143406558781865, + "epoch": 9.087422776547383, + "grad_norm": 0.087890625, + "learning_rate": 4.672280129342882e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999670028686524, + "num_tokens": 127418533.0, + "step": 38985 + }, + { + "entropy": 0.06548787970095873, + "epoch": 9.088588413568015, + "grad_norm": 0.3984375, + "learning_rate": 4.672177090965484e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9999308168888092, + "num_tokens": 127429872.0, + "step": 38990 + }, + { + "entropy": 0.0844784826040268, + "epoch": 9.089754050588647, + "grad_norm": 0.29296875, + "learning_rate": 4.672074038837039e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 127441571.0, + "step": 38995 + }, + { + "entropy": 0.05351873962208629, + "epoch": 9.09091968760928, + "grad_norm": 0.1865234375, + "learning_rate": 4.671970972959083e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.999194449186325, + "num_tokens": 127470617.0, + "step": 39000 + }, + { + "entropy": 0.0646541254594922, + "epoch": 9.09208532462991, + "grad_norm": 0.375, + "learning_rate": 4.671867893333154e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 127486521.0, + "step": 39005 + }, + { + "entropy": 0.06300574182532728, + "epoch": 9.093250961650542, + "grad_norm": 0.11328125, + "learning_rate": 4.671764799960787e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9999026715755462, + "num_tokens": 127515347.0, + "step": 39010 + }, + { + "entropy": 0.07608828879892826, + "epoch": 9.094416598671174, + "grad_norm": 0.34765625, + "learning_rate": 4.6716616928435215e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999895751476288, + "num_tokens": 127537060.0, + "step": 39015 + }, + { + "entropy": 0.09740983368828893, + "epoch": 9.095582235691806, + "grad_norm": 0.0849609375, + "learning_rate": 4.671558571982893e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 127550040.0, + "step": 39020 + }, + { + "entropy": 0.15485297152772545, + "epoch": 9.096747872712438, + "grad_norm": 0.07080078125, + "learning_rate": 4.6714554373804404e-05, + "loss": 0.2097, + "mean_token_accuracy": 0.9618768632411957, + "num_tokens": 127577627.0, + "step": 39025 + }, + { + "entropy": 0.0448100233450532, + "epoch": 9.097913509733068, + "grad_norm": 0.71484375, + "learning_rate": 4.671352289037701e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9998813092708587, + "num_tokens": 127601452.0, + "step": 39030 + }, + { + "entropy": 0.051290947990491983, + "epoch": 9.0990791467537, + "grad_norm": 0.1455078125, + "learning_rate": 4.671249126956214e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999408841133117, + "num_tokens": 127637347.0, + "step": 39035 + }, + { + "entropy": 0.07046590894460678, + "epoch": 9.100244783774333, + "grad_norm": 0.0322265625, + "learning_rate": 4.671145951137516e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999824047088623, + "num_tokens": 127652235.0, + "step": 39040 + }, + { + "entropy": 0.07944189198315144, + "epoch": 9.101410420794965, + "grad_norm": 0.5703125, + "learning_rate": 4.671042761583147e-05, + "loss": 0.0187, + "mean_token_accuracy": 0.9976053655147552, + "num_tokens": 127671703.0, + "step": 39045 + }, + { + "entropy": 0.06307450551539659, + "epoch": 9.102576057815597, + "grad_norm": 0.05419921875, + "learning_rate": 4.670939558294645e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9993710696697236, + "num_tokens": 127685689.0, + "step": 39050 + }, + { + "entropy": 0.08569299336522818, + "epoch": 9.103741694836229, + "grad_norm": 0.94921875, + "learning_rate": 4.6708363412735486e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9993679821491241, + "num_tokens": 127711429.0, + "step": 39055 + }, + { + "entropy": 0.05434844773262739, + "epoch": 9.10490733185686, + "grad_norm": 0.26953125, + "learning_rate": 4.670733110521398e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9994161069393158, + "num_tokens": 127736568.0, + "step": 39060 + }, + { + "entropy": 0.07459601685404778, + "epoch": 9.106072968877491, + "grad_norm": 0.07763671875, + "learning_rate": 4.6706298660397306e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 127755668.0, + "step": 39065 + }, + { + "entropy": 0.060200719721615316, + "epoch": 9.107238605898123, + "grad_norm": 0.154296875, + "learning_rate": 4.6705266078300886e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 127768457.0, + "step": 39070 + }, + { + "entropy": 0.0584429781883955, + "epoch": 9.108404242918756, + "grad_norm": 0.68359375, + "learning_rate": 4.6704233358940094e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9996918320655823, + "num_tokens": 127778860.0, + "step": 39075 + }, + { + "entropy": 0.060449579171836375, + "epoch": 9.109569879939388, + "grad_norm": 0.08642578125, + "learning_rate": 4.6703200502330345e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9997960925102234, + "num_tokens": 127810046.0, + "step": 39080 + }, + { + "entropy": 0.03814925597980619, + "epoch": 9.110735516960018, + "grad_norm": 0.1728515625, + "learning_rate": 4.670216750848703e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999839961528778, + "num_tokens": 127843080.0, + "step": 39085 + }, + { + "entropy": 0.06509601455181838, + "epoch": 9.11190115398065, + "grad_norm": 0.0311279296875, + "learning_rate": 4.670113437742556e-05, + "loss": 0.0009, + "mean_token_accuracy": 1.0, + "num_tokens": 127867744.0, + "step": 39090 + }, + { + "entropy": 0.05478460993617773, + "epoch": 9.113066791001282, + "grad_norm": 0.94140625, + "learning_rate": 4.670010110916133e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9996774196624756, + "num_tokens": 127895035.0, + "step": 39095 + }, + { + "entropy": 0.060867223888635635, + "epoch": 9.114232428021914, + "grad_norm": 0.302734375, + "learning_rate": 4.6699067703709766e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.99984050989151, + "num_tokens": 127907214.0, + "step": 39100 + }, + { + "entropy": 0.09498673398047686, + "epoch": 9.115398065042546, + "grad_norm": 0.20703125, + "learning_rate": 4.669803416108626e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9991422772407532, + "num_tokens": 127932050.0, + "step": 39105 + }, + { + "entropy": 0.05927578574046492, + "epoch": 9.116563702063177, + "grad_norm": 0.0390625, + "learning_rate": 4.669700048130622e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 127952338.0, + "step": 39110 + }, + { + "entropy": 0.06774403676390647, + "epoch": 9.117729339083809, + "grad_norm": 0.9921875, + "learning_rate": 4.6695966664385087e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.999355947971344, + "num_tokens": 127963600.0, + "step": 39115 + }, + { + "entropy": 0.05179953342303634, + "epoch": 9.11889497610444, + "grad_norm": 1.3515625, + "learning_rate": 4.669493271033825e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9997416198253631, + "num_tokens": 127990422.0, + "step": 39120 + }, + { + "entropy": 0.06711933370679617, + "epoch": 9.120060613125073, + "grad_norm": 0.2177734375, + "learning_rate": 4.6693898619181144e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9995348811149597, + "num_tokens": 128001948.0, + "step": 39125 + }, + { + "entropy": 0.0793570352718234, + "epoch": 9.121226250145705, + "grad_norm": 0.08837890625, + "learning_rate": 4.669286439092917e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9998459160327912, + "num_tokens": 128019219.0, + "step": 39130 + }, + { + "entropy": 0.061221508868038656, + "epoch": 9.122391887166337, + "grad_norm": 0.045166015625, + "learning_rate": 4.669183002559777e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 128036619.0, + "step": 39135 + }, + { + "entropy": 0.06286525307223201, + "epoch": 9.123557524186968, + "grad_norm": 0.04638671875, + "learning_rate": 4.6690795523202355e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 128048687.0, + "step": 39140 + }, + { + "entropy": 0.03850018824450672, + "epoch": 9.1247231612076, + "grad_norm": 0.2275390625, + "learning_rate": 4.668976088375836e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9997575998306274, + "num_tokens": 128072663.0, + "step": 39145 + }, + { + "entropy": 0.0629619574174285, + "epoch": 9.125888798228232, + "grad_norm": 0.02099609375, + "learning_rate": 4.66887261072812e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9994776368141174, + "num_tokens": 128084948.0, + "step": 39150 + }, + { + "entropy": 0.06467896215617656, + "epoch": 9.127054435248864, + "grad_norm": 0.189453125, + "learning_rate": 4.668769119378632e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 128094427.0, + "step": 39155 + }, + { + "entropy": 0.053331601060926916, + "epoch": 9.128220072269496, + "grad_norm": 0.0238037109375, + "learning_rate": 4.668665614328914e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9996485114097595, + "num_tokens": 128118089.0, + "step": 39160 + }, + { + "entropy": 0.048440984450280666, + "epoch": 9.129385709290126, + "grad_norm": 0.173828125, + "learning_rate": 4.6685620955805104e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999478876590728, + "num_tokens": 128133298.0, + "step": 39165 + }, + { + "entropy": 0.056567294523119926, + "epoch": 9.130551346310758, + "grad_norm": 0.10107421875, + "learning_rate": 4.6684585631349644e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9995827078819275, + "num_tokens": 128152849.0, + "step": 39170 + }, + { + "entropy": 0.05280892346054315, + "epoch": 9.13171698333139, + "grad_norm": 0.076171875, + "learning_rate": 4.668355016993819e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 128166039.0, + "step": 39175 + }, + { + "entropy": 0.06629458190873265, + "epoch": 9.132882620352023, + "grad_norm": 0.1669921875, + "learning_rate": 4.66825145715862e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999472379684449, + "num_tokens": 128183728.0, + "step": 39180 + }, + { + "entropy": 0.04763224720954895, + "epoch": 9.134048257372655, + "grad_norm": 0.11572265625, + "learning_rate": 4.66814788363091e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 128199350.0, + "step": 39185 + }, + { + "entropy": 0.07102838419377804, + "epoch": 9.135213894393287, + "grad_norm": 0.1953125, + "learning_rate": 4.668044296412234e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9999483764171601, + "num_tokens": 128220784.0, + "step": 39190 + }, + { + "entropy": 0.0709263451397419, + "epoch": 9.136379531413917, + "grad_norm": 1.140625, + "learning_rate": 4.667940695504137e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9998858451843262, + "num_tokens": 128229335.0, + "step": 39195 + }, + { + "entropy": 0.059687384590506556, + "epoch": 9.13754516843455, + "grad_norm": 0.0986328125, + "learning_rate": 4.667837080908164e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.999447512626648, + "num_tokens": 128246055.0, + "step": 39200 + }, + { + "entropy": 0.054660573275759815, + "epoch": 9.138710805455181, + "grad_norm": 0.05029296875, + "learning_rate": 4.667733452625859e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9999875128269196, + "num_tokens": 128267602.0, + "step": 39205 + }, + { + "entropy": 0.05184516375884414, + "epoch": 9.139876442475813, + "grad_norm": 0.08740234375, + "learning_rate": 4.667629810658768e-05, + "loss": 0.0051, + "mean_token_accuracy": 0.9993105053901672, + "num_tokens": 128291298.0, + "step": 39210 + }, + { + "entropy": 0.04663265328854323, + "epoch": 9.141042079496446, + "grad_norm": 0.08447265625, + "learning_rate": 4.667526155008436e-05, + "loss": 0.001, + "mean_token_accuracy": 0.999936830997467, + "num_tokens": 128306082.0, + "step": 39215 + }, + { + "entropy": 0.06637532748281956, + "epoch": 9.142207716517076, + "grad_norm": 0.1240234375, + "learning_rate": 4.6674224856764096e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9994987487792969, + "num_tokens": 128316316.0, + "step": 39220 + }, + { + "entropy": 0.06515855994075537, + "epoch": 9.143373353537708, + "grad_norm": 0.0322265625, + "learning_rate": 4.667318802664234e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9996699690818787, + "num_tokens": 128329256.0, + "step": 39225 + }, + { + "entropy": 0.06536714136600494, + "epoch": 9.14453899055834, + "grad_norm": 0.1357421875, + "learning_rate": 4.6672151059734555e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999700427055359, + "num_tokens": 128352667.0, + "step": 39230 + }, + { + "entropy": 0.07236762633547186, + "epoch": 9.145704627578972, + "grad_norm": 1.75, + "learning_rate": 4.6671113956056194e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9995726466178894, + "num_tokens": 128371166.0, + "step": 39235 + }, + { + "entropy": 0.048507886566221715, + "epoch": 9.146870264599604, + "grad_norm": 0.267578125, + "learning_rate": 4.667007671562274e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9994894981384277, + "num_tokens": 128410651.0, + "step": 39240 + }, + { + "entropy": 0.058967319689691065, + "epoch": 9.148035901620235, + "grad_norm": 0.5234375, + "learning_rate": 4.6669039338449636e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9997023820877076, + "num_tokens": 128423915.0, + "step": 39245 + }, + { + "entropy": 0.0643064547330141, + "epoch": 9.149201538640867, + "grad_norm": 0.1025390625, + "learning_rate": 4.666800182455238e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9993151903152466, + "num_tokens": 128436003.0, + "step": 39250 + }, + { + "entropy": 0.05355655811727047, + "epoch": 9.150367175661499, + "grad_norm": 1.0390625, + "learning_rate": 4.6666964173946415e-05, + "loss": 0.0066, + "mean_token_accuracy": 0.9981539249420166, + "num_tokens": 128466715.0, + "step": 39255 + }, + { + "entropy": 0.0643747929483652, + "epoch": 9.151532812682131, + "grad_norm": 0.095703125, + "learning_rate": 4.666592638664724e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9996855318546295, + "num_tokens": 128480809.0, + "step": 39260 + }, + { + "entropy": 0.07963294740766287, + "epoch": 9.152698449702763, + "grad_norm": 0.091796875, + "learning_rate": 4.6664888462670295e-05, + "loss": 0.0175, + "mean_token_accuracy": 0.9961723029613495, + "num_tokens": 128520211.0, + "step": 39265 + }, + { + "entropy": 0.0579895157366991, + "epoch": 9.153864086723395, + "grad_norm": 0.017822265625, + "learning_rate": 4.666385040203109e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 128536577.0, + "step": 39270 + }, + { + "entropy": 0.053957913815975186, + "epoch": 9.155029723744025, + "grad_norm": 0.036865234375, + "learning_rate": 4.66628122047451e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999778687953949, + "num_tokens": 128555849.0, + "step": 39275 + }, + { + "entropy": 0.05544408448040485, + "epoch": 9.156195360764658, + "grad_norm": 3.296875, + "learning_rate": 4.666177387082779e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9991379320621491, + "num_tokens": 128569598.0, + "step": 39280 + }, + { + "entropy": 0.05070274667814374, + "epoch": 9.15736099778529, + "grad_norm": 0.0380859375, + "learning_rate": 4.666073540029465e-05, + "loss": 0.0008, + "mean_token_accuracy": 1.0, + "num_tokens": 128586309.0, + "step": 39285 + }, + { + "entropy": 0.0538034837692976, + "epoch": 9.158526634805922, + "grad_norm": 0.05615234375, + "learning_rate": 4.665969679316117e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9999155402183533, + "num_tokens": 128617861.0, + "step": 39290 + }, + { + "entropy": 0.04511388307437301, + "epoch": 9.159692271826554, + "grad_norm": 0.0556640625, + "learning_rate": 4.665865804944284e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9997424304485321, + "num_tokens": 128647469.0, + "step": 39295 + }, + { + "entropy": 0.04719185484573245, + "epoch": 9.160857908847184, + "grad_norm": 0.044677734375, + "learning_rate": 4.665761916915513e-05, + "loss": 0.012, + "mean_token_accuracy": 0.9976949393749237, + "num_tokens": 128666573.0, + "step": 39300 + }, + { + "entropy": 0.047588223777711394, + "epoch": 9.162023545867816, + "grad_norm": 0.146484375, + "learning_rate": 4.6656580152313554e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.999963355064392, + "num_tokens": 128694201.0, + "step": 39305 + }, + { + "entropy": 0.049427392426878214, + "epoch": 9.163189182888448, + "grad_norm": 0.96875, + "learning_rate": 4.665554099893359e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9998815894126892, + "num_tokens": 128722755.0, + "step": 39310 + }, + { + "entropy": 0.047885213326662776, + "epoch": 9.16435481990908, + "grad_norm": 0.21875, + "learning_rate": 4.665450170903074e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9990951359272003, + "num_tokens": 128751245.0, + "step": 39315 + }, + { + "entropy": 0.06532157673500479, + "epoch": 9.165520456929713, + "grad_norm": 0.111328125, + "learning_rate": 4.6653462282620504e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.999978369474411, + "num_tokens": 128777772.0, + "step": 39320 + }, + { + "entropy": 0.06396664790809155, + "epoch": 9.166686093950345, + "grad_norm": 0.14453125, + "learning_rate": 4.6652422719718374e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9997727274894714, + "num_tokens": 128790482.0, + "step": 39325 + }, + { + "entropy": 0.06470691915601492, + "epoch": 9.167851730970975, + "grad_norm": 0.036376953125, + "learning_rate": 4.6651383020339855e-05, + "loss": 0.0015, + "mean_token_accuracy": 1.0, + "num_tokens": 128801350.0, + "step": 39330 + }, + { + "entropy": 0.08019221909344196, + "epoch": 9.169017367991607, + "grad_norm": 0.2451171875, + "learning_rate": 4.665034318450045e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.999977171421051, + "num_tokens": 128819447.0, + "step": 39335 + }, + { + "entropy": 0.06017429428175092, + "epoch": 9.17018300501224, + "grad_norm": 0.0203857421875, + "learning_rate": 4.664930321221567e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999876737594604, + "num_tokens": 128847434.0, + "step": 39340 + }, + { + "entropy": 0.05513544147834182, + "epoch": 9.171348642032871, + "grad_norm": 0.333984375, + "learning_rate": 4.664826310350102e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9994580030441285, + "num_tokens": 128878814.0, + "step": 39345 + }, + { + "entropy": 0.06086507327854633, + "epoch": 9.172514279053503, + "grad_norm": 0.08203125, + "learning_rate": 4.6647222858372004e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 128897376.0, + "step": 39350 + }, + { + "entropy": 0.05507046952843666, + "epoch": 9.173679916074134, + "grad_norm": 0.0615234375, + "learning_rate": 4.6646182476844135e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9995168089866638, + "num_tokens": 128912835.0, + "step": 39355 + }, + { + "entropy": 0.05077353697270155, + "epoch": 9.174845553094766, + "grad_norm": 0.34375, + "learning_rate": 4.664514195893293e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9998065769672394, + "num_tokens": 128930582.0, + "step": 39360 + }, + { + "entropy": 0.044340075273066756, + "epoch": 9.176011190115398, + "grad_norm": 0.341796875, + "learning_rate": 4.6644101304653904e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9997381329536438, + "num_tokens": 128956920.0, + "step": 39365 + }, + { + "entropy": 0.05113975182175636, + "epoch": 9.17717682713603, + "grad_norm": 0.2119140625, + "learning_rate": 4.664306051402257e-05, + "loss": 0.003, + "mean_token_accuracy": 0.998814046382904, + "num_tokens": 128978761.0, + "step": 39370 + }, + { + "entropy": 0.0629701149649918, + "epoch": 9.178342464156662, + "grad_norm": 0.12451171875, + "learning_rate": 4.664201958705445e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9992149472236633, + "num_tokens": 129001845.0, + "step": 39375 + }, + { + "entropy": 0.05342851486057043, + "epoch": 9.179508101177293, + "grad_norm": 0.058349609375, + "learning_rate": 4.6640978523765075e-05, + "loss": 0.0008, + "mean_token_accuracy": 1.0, + "num_tokens": 129016277.0, + "step": 39380 + }, + { + "entropy": 0.07416110690683127, + "epoch": 9.180673738197925, + "grad_norm": 0.07421875, + "learning_rate": 4.6639937324169966e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9997787594795227, + "num_tokens": 129027129.0, + "step": 39385 + }, + { + "entropy": 0.0515309227630496, + "epoch": 9.181839375218557, + "grad_norm": 0.62109375, + "learning_rate": 4.6638895988284634e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.99997638463974, + "num_tokens": 129042418.0, + "step": 39390 + }, + { + "entropy": 0.0686886103823781, + "epoch": 9.183005012239189, + "grad_norm": 0.328125, + "learning_rate": 4.6637854516124616e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9993710696697236, + "num_tokens": 129059775.0, + "step": 39395 + }, + { + "entropy": 0.05704756639897823, + "epoch": 9.184170649259821, + "grad_norm": 0.0556640625, + "learning_rate": 4.663681290770545e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 129079156.0, + "step": 39400 + }, + { + "entropy": 0.04882788760587573, + "epoch": 9.185336286280453, + "grad_norm": 0.2099609375, + "learning_rate": 4.663577116304266e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9994452178478241, + "num_tokens": 129104080.0, + "step": 39405 + }, + { + "entropy": 0.051801460422575475, + "epoch": 9.186501923301083, + "grad_norm": 0.1806640625, + "learning_rate": 4.663472928215178e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 129123118.0, + "step": 39410 + }, + { + "entropy": 0.07059294497594237, + "epoch": 9.187667560321715, + "grad_norm": 0.134765625, + "learning_rate": 4.6633687265048344e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999624252319336, + "num_tokens": 129142115.0, + "step": 39415 + }, + { + "entropy": 0.04961256729438901, + "epoch": 9.188833197342348, + "grad_norm": 0.232421875, + "learning_rate": 4.66326451117479e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9996217608451843, + "num_tokens": 129171901.0, + "step": 39420 + }, + { + "entropy": 0.04771387707442045, + "epoch": 9.18999883436298, + "grad_norm": 0.5390625, + "learning_rate": 4.663160282226597e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9998019516468049, + "num_tokens": 129212825.0, + "step": 39425 + }, + { + "entropy": 0.062344396207481625, + "epoch": 9.191164471383612, + "grad_norm": 0.8515625, + "learning_rate": 4.663056039661811e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9993386328220367, + "num_tokens": 129228522.0, + "step": 39430 + }, + { + "entropy": 0.06107481122016907, + "epoch": 9.192330108404242, + "grad_norm": 0.193359375, + "learning_rate": 4.662951783481987e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 129242723.0, + "step": 39435 + }, + { + "entropy": 0.06025108881294727, + "epoch": 9.193495745424874, + "grad_norm": 0.52734375, + "learning_rate": 4.662847513688678e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9987414538860321, + "num_tokens": 129267902.0, + "step": 39440 + }, + { + "entropy": 0.0548030543141067, + "epoch": 9.194661382445506, + "grad_norm": 0.039794921875, + "learning_rate": 4.66274323028344e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9995507836341858, + "num_tokens": 129301913.0, + "step": 39445 + }, + { + "entropy": 0.05716030802577734, + "epoch": 9.195827019466138, + "grad_norm": 1.7578125, + "learning_rate": 4.662638933267827e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9993688941001893, + "num_tokens": 129321794.0, + "step": 39450 + }, + { + "entropy": 0.04664466297253966, + "epoch": 9.19699265648677, + "grad_norm": 0.0849609375, + "learning_rate": 4.662534622643395e-05, + "loss": 0.0119, + "mean_token_accuracy": 0.9973366379737854, + "num_tokens": 129355666.0, + "step": 39455 + }, + { + "entropy": 0.058199040777981284, + "epoch": 9.198158293507403, + "grad_norm": 0.14453125, + "learning_rate": 4.6624302984116996e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9995925843715667, + "num_tokens": 129391276.0, + "step": 39460 + }, + { + "entropy": 0.064020661637187, + "epoch": 9.199323930528033, + "grad_norm": 0.07080078125, + "learning_rate": 4.662325960574297e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 129408426.0, + "step": 39465 + }, + { + "entropy": 0.04356107474304736, + "epoch": 9.200489567548665, + "grad_norm": 0.09619140625, + "learning_rate": 4.6622216091327403e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999892115592957, + "num_tokens": 129435871.0, + "step": 39470 + }, + { + "entropy": 0.06568758022040129, + "epoch": 9.201655204569297, + "grad_norm": 0.055419921875, + "learning_rate": 4.662117244088588e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9998577475547791, + "num_tokens": 129453203.0, + "step": 39475 + }, + { + "entropy": 0.05361274089664221, + "epoch": 9.20282084158993, + "grad_norm": 0.04345703125, + "learning_rate": 4.662012865443396e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999482154846191, + "num_tokens": 129470797.0, + "step": 39480 + }, + { + "entropy": 0.050313870795071124, + "epoch": 9.203986478610561, + "grad_norm": 0.07861328125, + "learning_rate": 4.66190847319872e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9993630588054657, + "num_tokens": 129495570.0, + "step": 39485 + }, + { + "entropy": 0.041808879002928734, + "epoch": 9.205152115631192, + "grad_norm": 0.1298828125, + "learning_rate": 4.661804067356118e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9996637821197509, + "num_tokens": 129530054.0, + "step": 39490 + }, + { + "entropy": 0.06288732271641492, + "epoch": 9.206317752651824, + "grad_norm": 0.0238037109375, + "learning_rate": 4.661699647917145e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 129544100.0, + "step": 39495 + }, + { + "entropy": 0.043396432511508466, + "epoch": 9.207483389672456, + "grad_norm": 1.0234375, + "learning_rate": 4.6615952148833587e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9995144903659821, + "num_tokens": 129582973.0, + "step": 39500 + }, + { + "entropy": 0.045847236458212134, + "epoch": 9.208649026693088, + "grad_norm": 0.1396484375, + "learning_rate": 4.6614907682563177e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 129609091.0, + "step": 39505 + }, + { + "entropy": 0.07774993143975735, + "epoch": 9.20981466371372, + "grad_norm": 0.265625, + "learning_rate": 4.661386308037577e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 129619771.0, + "step": 39510 + }, + { + "entropy": 0.07045667059719563, + "epoch": 9.21098030073435, + "grad_norm": 0.034423828125, + "learning_rate": 4.661281834228697e-05, + "loss": 0.0012, + "mean_token_accuracy": 1.0, + "num_tokens": 129629617.0, + "step": 39515 + }, + { + "entropy": 0.05546746281906963, + "epoch": 9.212145937754983, + "grad_norm": 0.85546875, + "learning_rate": 4.661177346831234e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9997250974178314, + "num_tokens": 129652976.0, + "step": 39520 + }, + { + "entropy": 0.04471162809059024, + "epoch": 9.213311574775615, + "grad_norm": 0.234375, + "learning_rate": 4.661072845846746e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 129678217.0, + "step": 39525 + }, + { + "entropy": 0.040480018593370914, + "epoch": 9.214477211796247, + "grad_norm": 0.10986328125, + "learning_rate": 4.660968331276791e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999004542827606, + "num_tokens": 129720188.0, + "step": 39530 + }, + { + "entropy": 0.060479015298187735, + "epoch": 9.215642848816879, + "grad_norm": 0.06591796875, + "learning_rate": 4.660863803122928e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.9987586557865142, + "num_tokens": 129731917.0, + "step": 39535 + }, + { + "entropy": 0.06452511474490166, + "epoch": 9.216808485837511, + "grad_norm": 0.029296875, + "learning_rate": 4.660759261386717e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9995082080364227, + "num_tokens": 129743041.0, + "step": 39540 + }, + { + "entropy": 0.048442641738802195, + "epoch": 9.217974122858141, + "grad_norm": 0.134765625, + "learning_rate": 4.6606547060697145e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9992859959602356, + "num_tokens": 129772704.0, + "step": 39545 + }, + { + "entropy": 0.06779967844486237, + "epoch": 9.219139759878773, + "grad_norm": 0.05810546875, + "learning_rate": 4.6605501371734804e-05, + "loss": 0.0041, + "mean_token_accuracy": 0.9993433296680451, + "num_tokens": 129782439.0, + "step": 39550 + }, + { + "entropy": 0.04868889665231109, + "epoch": 9.220305396899406, + "grad_norm": 0.12451171875, + "learning_rate": 4.660445554699575e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9996962010860443, + "num_tokens": 129817338.0, + "step": 39555 + }, + { + "entropy": 0.047591369785368445, + "epoch": 9.221471033920038, + "grad_norm": 0.380859375, + "learning_rate": 4.660340958649557e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9998111546039581, + "num_tokens": 129848844.0, + "step": 39560 + }, + { + "entropy": 0.06939591914415359, + "epoch": 9.22263667094067, + "grad_norm": 0.23828125, + "learning_rate": 4.660236349024985e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9993773400783539, + "num_tokens": 129871471.0, + "step": 39565 + }, + { + "entropy": 0.05595292616635561, + "epoch": 9.2238023079613, + "grad_norm": 0.05908203125, + "learning_rate": 4.6601317258274214e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 129885167.0, + "step": 39570 + }, + { + "entropy": 0.06748554203659296, + "epoch": 9.224967944981932, + "grad_norm": 2.265625, + "learning_rate": 4.660027089058424e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9995475113391876, + "num_tokens": 129894753.0, + "step": 39575 + }, + { + "entropy": 0.08345902096480132, + "epoch": 9.226133582002564, + "grad_norm": 0.060302734375, + "learning_rate": 4.6599224387195537e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 129907997.0, + "step": 39580 + }, + { + "entropy": 0.0697068564593792, + "epoch": 9.227299219023196, + "grad_norm": 0.330078125, + "learning_rate": 4.659817774812372e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999857664108276, + "num_tokens": 129926056.0, + "step": 39585 + }, + { + "entropy": 0.045826638396829364, + "epoch": 9.228464856043828, + "grad_norm": 0.0986328125, + "learning_rate": 4.659713097338438e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999352037906647, + "num_tokens": 129956239.0, + "step": 39590 + }, + { + "entropy": 0.05160574847832322, + "epoch": 9.22963049306446, + "grad_norm": 0.01611328125, + "learning_rate": 4.659608406299314e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999892354011536, + "num_tokens": 129980573.0, + "step": 39595 + }, + { + "entropy": 0.049797141645103696, + "epoch": 9.230796130085091, + "grad_norm": 0.197265625, + "learning_rate": 4.65950370169656e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9998175919055938, + "num_tokens": 130000508.0, + "step": 39600 + }, + { + "entropy": 0.06450515007600188, + "epoch": 9.231961767105723, + "grad_norm": 0.1669921875, + "learning_rate": 4.659398983531739e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999364972114563, + "num_tokens": 130031099.0, + "step": 39605 + }, + { + "entropy": 0.05456445217132568, + "epoch": 9.233127404126355, + "grad_norm": 0.051025390625, + "learning_rate": 4.65929425180641e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999895572662354, + "num_tokens": 130061066.0, + "step": 39610 + }, + { + "entropy": 0.05030291229486465, + "epoch": 9.234293041146987, + "grad_norm": 0.1748046875, + "learning_rate": 4.659189506522137e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999889194965362, + "num_tokens": 130087255.0, + "step": 39615 + }, + { + "entropy": 0.05618452290073037, + "epoch": 9.23545867816762, + "grad_norm": 0.083984375, + "learning_rate": 4.6590847476804803e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9997907936573028, + "num_tokens": 130104918.0, + "step": 39620 + }, + { + "entropy": 0.07257263027131558, + "epoch": 9.23662431518825, + "grad_norm": 0.0849609375, + "learning_rate": 4.658979975283003e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 130117627.0, + "step": 39625 + }, + { + "entropy": 0.05578606994822621, + "epoch": 9.237789952208882, + "grad_norm": 0.19140625, + "learning_rate": 4.6588751893312676e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9997557580471039, + "num_tokens": 130141689.0, + "step": 39630 + }, + { + "entropy": 0.05225554727949202, + "epoch": 9.238955589229514, + "grad_norm": 0.248046875, + "learning_rate": 4.6587703898268353e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9997475385665894, + "num_tokens": 130167512.0, + "step": 39635 + }, + { + "entropy": 0.0557454289868474, + "epoch": 9.240121226250146, + "grad_norm": 0.0162353515625, + "learning_rate": 4.65866557677127e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 130180103.0, + "step": 39640 + }, + { + "entropy": 0.06349661573767662, + "epoch": 9.241286863270778, + "grad_norm": 0.220703125, + "learning_rate": 4.658560750166134e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9994892954826355, + "num_tokens": 130197775.0, + "step": 39645 + }, + { + "entropy": 0.048552720621228215, + "epoch": 9.242452500291408, + "grad_norm": 0.05615234375, + "learning_rate": 4.658455910012991e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9997536957263946, + "num_tokens": 130217603.0, + "step": 39650 + }, + { + "entropy": 0.050778523366898295, + "epoch": 9.24361813731204, + "grad_norm": 1.265625, + "learning_rate": 4.658351056313404e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9992936372756958, + "num_tokens": 130249221.0, + "step": 39655 + }, + { + "entropy": 0.059298649057745935, + "epoch": 9.244783774332673, + "grad_norm": 0.1015625, + "learning_rate": 4.6582461890689354e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9999761939048767, + "num_tokens": 130267756.0, + "step": 39660 + }, + { + "entropy": 0.05675745829939842, + "epoch": 9.245949411353305, + "grad_norm": 0.1533203125, + "learning_rate": 4.6581413082811514e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9996394336223602, + "num_tokens": 130287265.0, + "step": 39665 + }, + { + "entropy": 0.07041055001318455, + "epoch": 9.247115048373937, + "grad_norm": 2.921875, + "learning_rate": 4.658036413951614e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9992876887321472, + "num_tokens": 130297803.0, + "step": 39670 + }, + { + "entropy": 0.056393209099769595, + "epoch": 9.248280685394569, + "grad_norm": 0.875, + "learning_rate": 4.657931506081889e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9997631192207337, + "num_tokens": 130314472.0, + "step": 39675 + }, + { + "entropy": 0.060738119576126334, + "epoch": 9.2494463224152, + "grad_norm": 0.1259765625, + "learning_rate": 4.657826584673538e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9999781250953674, + "num_tokens": 130328923.0, + "step": 39680 + }, + { + "entropy": 0.0554684535600245, + "epoch": 9.250611959435831, + "grad_norm": 0.107421875, + "learning_rate": 4.6577216497281275e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999745726585388, + "num_tokens": 130344597.0, + "step": 39685 + }, + { + "entropy": 0.05555789954960346, + "epoch": 9.251777596456463, + "grad_norm": 0.0966796875, + "learning_rate": 4.657616701247222e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9996960461139679, + "num_tokens": 130365321.0, + "step": 39690 + }, + { + "entropy": 0.056109121069312096, + "epoch": 9.252943233477096, + "grad_norm": 0.1630859375, + "learning_rate": 4.657511739232387e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9996614396572113, + "num_tokens": 130385100.0, + "step": 39695 + }, + { + "entropy": 0.06600268706679344, + "epoch": 9.254108870497728, + "grad_norm": 0.29296875, + "learning_rate": 4.657406763685187e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9989592075347901, + "num_tokens": 130397765.0, + "step": 39700 + }, + { + "entropy": 0.059470337629318235, + "epoch": 9.255274507518358, + "grad_norm": 0.1259765625, + "learning_rate": 4.657301774607187e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 130408436.0, + "step": 39705 + }, + { + "entropy": 0.04764430820941925, + "epoch": 9.25644014453899, + "grad_norm": 0.2294921875, + "learning_rate": 4.6571967719999524e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999611496925354, + "num_tokens": 130428273.0, + "step": 39710 + }, + { + "entropy": 0.04987523974850774, + "epoch": 9.257605781559622, + "grad_norm": 0.126953125, + "learning_rate": 4.6570917558650495e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9997695863246918, + "num_tokens": 130441734.0, + "step": 39715 + }, + { + "entropy": 0.03264869209378958, + "epoch": 9.258771418580254, + "grad_norm": 0.1611328125, + "learning_rate": 4.656986726204044e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999175131320953, + "num_tokens": 130487344.0, + "step": 39720 + }, + { + "entropy": 0.05826353104785085, + "epoch": 9.259937055600886, + "grad_norm": 0.2109375, + "learning_rate": 4.656881683018503e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9995794057846069, + "num_tokens": 130507878.0, + "step": 39725 + }, + { + "entropy": 0.06025405712425709, + "epoch": 9.261102692621517, + "grad_norm": 0.1376953125, + "learning_rate": 4.6567766263099925e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9991832613945008, + "num_tokens": 130527697.0, + "step": 39730 + }, + { + "entropy": 0.07366060577332974, + "epoch": 9.262268329642149, + "grad_norm": 0.1142578125, + "learning_rate": 4.656671556080078e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9995670974254608, + "num_tokens": 130538675.0, + "step": 39735 + }, + { + "entropy": 0.07155503174290061, + "epoch": 9.263433966662781, + "grad_norm": 0.08984375, + "learning_rate": 4.656566472330326e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9996869444847107, + "num_tokens": 130563391.0, + "step": 39740 + }, + { + "entropy": 0.037785251764580606, + "epoch": 9.264599603683413, + "grad_norm": 0.1484375, + "learning_rate": 4.6564613750623054e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9999007999897003, + "num_tokens": 130594860.0, + "step": 39745 + }, + { + "entropy": 0.07801759839057923, + "epoch": 9.265765240704045, + "grad_norm": 0.0257568359375, + "learning_rate": 4.656356264277582e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9996275663375854, + "num_tokens": 130616197.0, + "step": 39750 + }, + { + "entropy": 0.05990429036319256, + "epoch": 9.266930877724677, + "grad_norm": 0.0184326171875, + "learning_rate": 4.656251139977724e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999756574630737, + "num_tokens": 130640769.0, + "step": 39755 + }, + { + "entropy": 0.066241804510355, + "epoch": 9.268096514745308, + "grad_norm": 0.43359375, + "learning_rate": 4.6561460021642974e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9997572839260102, + "num_tokens": 130656744.0, + "step": 39760 + }, + { + "entropy": 0.06499183923006058, + "epoch": 9.26926215176594, + "grad_norm": 0.1767578125, + "learning_rate": 4.656040850838872e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999786257743836, + "num_tokens": 130675036.0, + "step": 39765 + }, + { + "entropy": 0.0647053737193346, + "epoch": 9.270427788786572, + "grad_norm": 0.026123046875, + "learning_rate": 4.6559356860030146e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9986259639263153, + "num_tokens": 130687371.0, + "step": 39770 + }, + { + "entropy": 0.06163849774748087, + "epoch": 9.271593425807204, + "grad_norm": 0.59375, + "learning_rate": 4.655830507658293e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.999989140033722, + "num_tokens": 130708757.0, + "step": 39775 + }, + { + "entropy": 0.078371412307024, + "epoch": 9.272759062827836, + "grad_norm": 0.043212890625, + "learning_rate": 4.6557253158062765e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9997084558010101, + "num_tokens": 130721840.0, + "step": 39780 + }, + { + "entropy": 0.05995170101523399, + "epoch": 9.273924699848466, + "grad_norm": 1.4921875, + "learning_rate": 4.655620110448533e-05, + "loss": 0.0015, + "mean_token_accuracy": 1.0, + "num_tokens": 130738473.0, + "step": 39785 + }, + { + "entropy": 0.05594620313495398, + "epoch": 9.275090336869098, + "grad_norm": 0.1279296875, + "learning_rate": 4.6555148915866316e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999889075756073, + "num_tokens": 130764506.0, + "step": 39790 + }, + { + "entropy": 0.05183468423783779, + "epoch": 9.27625597388973, + "grad_norm": 0.267578125, + "learning_rate": 4.6554096592221406e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9997442483901977, + "num_tokens": 130781308.0, + "step": 39795 + }, + { + "entropy": 0.04706506533548236, + "epoch": 9.277421610910363, + "grad_norm": 0.09814453125, + "learning_rate": 4.655304413356631e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9997573494911194, + "num_tokens": 130815566.0, + "step": 39800 + }, + { + "entropy": 0.07166340351104736, + "epoch": 9.278587247930995, + "grad_norm": 0.048095703125, + "learning_rate": 4.65519915399167e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 130826851.0, + "step": 39805 + }, + { + "entropy": 0.06749443728476763, + "epoch": 9.279752884951627, + "grad_norm": 0.5859375, + "learning_rate": 4.6550938811288285e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9997927844524384, + "num_tokens": 130854821.0, + "step": 39810 + }, + { + "entropy": 0.04661665195599198, + "epoch": 9.280918521972257, + "grad_norm": 0.2294921875, + "learning_rate": 4.654988594769676e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9997952282428741, + "num_tokens": 130874687.0, + "step": 39815 + }, + { + "entropy": 0.08724109530448913, + "epoch": 9.28208415899289, + "grad_norm": 0.053955078125, + "learning_rate": 4.654883294915782e-05, + "loss": 0.0181, + "mean_token_accuracy": 0.9981146395206452, + "num_tokens": 130901068.0, + "step": 39820 + }, + { + "entropy": 0.08578491769731045, + "epoch": 9.283249796013521, + "grad_norm": 0.1259765625, + "learning_rate": 4.654777981568717e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9997267782688141, + "num_tokens": 130911620.0, + "step": 39825 + }, + { + "entropy": 0.040522088576108214, + "epoch": 9.284415433034154, + "grad_norm": 0.1826171875, + "learning_rate": 4.654672654730052e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999558866024018, + "num_tokens": 130942124.0, + "step": 39830 + }, + { + "entropy": 0.06131479572504759, + "epoch": 9.285581070054786, + "grad_norm": 0.10693359375, + "learning_rate": 4.654567314401356e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9995066106319428, + "num_tokens": 130975937.0, + "step": 39835 + }, + { + "entropy": 0.05530798006802797, + "epoch": 9.286746707075416, + "grad_norm": 0.052734375, + "learning_rate": 4.654461960584202e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9994952142238617, + "num_tokens": 130988252.0, + "step": 39840 + }, + { + "entropy": 0.054783723689615725, + "epoch": 9.287912344096048, + "grad_norm": 0.1787109375, + "learning_rate": 4.654356593280159e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9993468701839447, + "num_tokens": 131023807.0, + "step": 39845 + }, + { + "entropy": 0.05474893264472484, + "epoch": 9.28907798111668, + "grad_norm": 0.1357421875, + "learning_rate": 4.6542512124907995e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9996736764907836, + "num_tokens": 131050692.0, + "step": 39850 + }, + { + "entropy": 0.05652236472815275, + "epoch": 9.290243618137312, + "grad_norm": 0.453125, + "learning_rate": 4.654145818217694e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9995680093765259, + "num_tokens": 131067738.0, + "step": 39855 + }, + { + "entropy": 0.06417035115882755, + "epoch": 9.291409255157944, + "grad_norm": 0.05126953125, + "learning_rate": 4.6540404104624144e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999670863151551, + "num_tokens": 131093753.0, + "step": 39860 + }, + { + "entropy": 0.06767412256449461, + "epoch": 9.292574892178575, + "grad_norm": 0.20703125, + "learning_rate": 4.6539349892265324e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9997669279575347, + "num_tokens": 131106916.0, + "step": 39865 + }, + { + "entropy": 0.052419050503522156, + "epoch": 9.293740529199207, + "grad_norm": 0.0301513671875, + "learning_rate": 4.6538295545116206e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999543309211731, + "num_tokens": 131135751.0, + "step": 39870 + }, + { + "entropy": 0.04931356767192483, + "epoch": 9.294906166219839, + "grad_norm": 0.2265625, + "learning_rate": 4.6537241063192504e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999555230140686, + "num_tokens": 131159416.0, + "step": 39875 + }, + { + "entropy": 0.055171745270490645, + "epoch": 9.296071803240471, + "grad_norm": 0.04931640625, + "learning_rate": 4.653618644650995e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 131172872.0, + "step": 39880 + }, + { + "entropy": 0.05928640691563487, + "epoch": 9.297237440261103, + "grad_norm": 0.037353515625, + "learning_rate": 4.6535131695084255e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9997416019439698, + "num_tokens": 131184609.0, + "step": 39885 + }, + { + "entropy": 0.04858807288110256, + "epoch": 9.298403077281735, + "grad_norm": 0.09326171875, + "learning_rate": 4.653407680893116e-05, + "loss": 0.0054, + "mean_token_accuracy": 0.9997319042682647, + "num_tokens": 131217075.0, + "step": 39890 + }, + { + "entropy": 0.04870633902028203, + "epoch": 9.299568714302366, + "grad_norm": 0.06884765625, + "learning_rate": 4.65330217880664e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.999982088804245, + "num_tokens": 131236468.0, + "step": 39895 + }, + { + "entropy": 0.06770975720137358, + "epoch": 9.300734351322998, + "grad_norm": 0.109375, + "learning_rate": 4.653196663250569e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9991342008113862, + "num_tokens": 131254232.0, + "step": 39900 + }, + { + "entropy": 0.051382277719676496, + "epoch": 9.30189998834363, + "grad_norm": 0.05810546875, + "learning_rate": 4.653091134226478e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999669253826141, + "num_tokens": 131279555.0, + "step": 39905 + }, + { + "entropy": 0.05385262481868267, + "epoch": 9.303065625364262, + "grad_norm": 0.1787109375, + "learning_rate": 4.652985591735939e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.99990394115448, + "num_tokens": 131290450.0, + "step": 39910 + }, + { + "entropy": 0.05229444522410631, + "epoch": 9.304231262384894, + "grad_norm": 1.8671875, + "learning_rate": 4.652880035780527e-05, + "loss": 0.0045, + "mean_token_accuracy": 0.9993044972419739, + "num_tokens": 131314618.0, + "step": 39915 + }, + { + "entropy": 0.06551222130656242, + "epoch": 9.305396899405524, + "grad_norm": 0.302734375, + "learning_rate": 4.652774466361815e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9998004019260407, + "num_tokens": 131325817.0, + "step": 39920 + }, + { + "entropy": 0.07701365770772099, + "epoch": 9.306562536426156, + "grad_norm": 0.04833984375, + "learning_rate": 4.652668883481379e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999864161014557, + "num_tokens": 131345787.0, + "step": 39925 + }, + { + "entropy": 0.06038803607225418, + "epoch": 9.307728173446788, + "grad_norm": 0.205078125, + "learning_rate": 4.652563287140792e-05, + "loss": 0.0063, + "mean_token_accuracy": 0.9997166514396667, + "num_tokens": 131368437.0, + "step": 39930 + }, + { + "entropy": 0.0589005762245506, + "epoch": 9.30889381046742, + "grad_norm": 0.232421875, + "learning_rate": 4.652457677341629e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9997618794441223, + "num_tokens": 131393813.0, + "step": 39935 + }, + { + "entropy": 0.06715304385870695, + "epoch": 9.310059447488053, + "grad_norm": 0.279296875, + "learning_rate": 4.6523520540854644e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9997555017471313, + "num_tokens": 131406815.0, + "step": 39940 + }, + { + "entropy": 0.05549956224858761, + "epoch": 9.311225084508685, + "grad_norm": 0.043212890625, + "learning_rate": 4.652246417373873e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9997076034545899, + "num_tokens": 131418869.0, + "step": 39945 + }, + { + "entropy": 0.05067304102703929, + "epoch": 9.312390721529315, + "grad_norm": 0.2021484375, + "learning_rate": 4.652140767208431e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999592304229736, + "num_tokens": 131441594.0, + "step": 39950 + }, + { + "entropy": 0.06731152012944222, + "epoch": 9.313556358549947, + "grad_norm": 0.0302734375, + "learning_rate": 4.652035103590713e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9997109830379486, + "num_tokens": 131451038.0, + "step": 39955 + }, + { + "entropy": 0.05239680912345648, + "epoch": 9.31472199557058, + "grad_norm": 0.0927734375, + "learning_rate": 4.6519294265222954e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 131473175.0, + "step": 39960 + }, + { + "entropy": 0.05735105858184397, + "epoch": 9.315887632591211, + "grad_norm": 0.0216064453125, + "learning_rate": 4.651823736004753e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999798119068146, + "num_tokens": 131489315.0, + "step": 39965 + }, + { + "entropy": 0.04591607404872775, + "epoch": 9.317053269611844, + "grad_norm": 0.388671875, + "learning_rate": 4.6517180320396624e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9985618889331818, + "num_tokens": 131515106.0, + "step": 39970 + }, + { + "entropy": 0.059040891379117964, + "epoch": 9.318218906632474, + "grad_norm": 0.07373046875, + "learning_rate": 4.6516123146285995e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 131536732.0, + "step": 39975 + }, + { + "entropy": 0.05244419164955616, + "epoch": 9.319384543653106, + "grad_norm": 0.1201171875, + "learning_rate": 4.651506583773141e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9999791860580445, + "num_tokens": 131563292.0, + "step": 39980 + }, + { + "entropy": 0.06199465803802014, + "epoch": 9.320550180673738, + "grad_norm": 0.041259765625, + "learning_rate": 4.651400839474863e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 131572957.0, + "step": 39985 + }, + { + "entropy": 0.04819637825712562, + "epoch": 9.32171581769437, + "grad_norm": 0.04345703125, + "learning_rate": 4.651295081735344e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 131592367.0, + "step": 39990 + }, + { + "entropy": 0.05749725298956036, + "epoch": 9.322881454715002, + "grad_norm": 1.6953125, + "learning_rate": 4.651189310556158e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9995614051818847, + "num_tokens": 131619144.0, + "step": 39995 + }, + { + "entropy": 0.04768976755440235, + "epoch": 9.324047091735633, + "grad_norm": 0.1337890625, + "learning_rate": 4.651083525938885e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9995601296424865, + "num_tokens": 131639167.0, + "step": 40000 + }, + { + "entropy": 0.049379723891615865, + "epoch": 9.325212728756265, + "grad_norm": 0.130859375, + "learning_rate": 4.6509777278851015e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9997111022472381, + "num_tokens": 131664826.0, + "step": 40005 + }, + { + "entropy": 0.07187463045120239, + "epoch": 9.326378365776897, + "grad_norm": 0.09765625, + "learning_rate": 4.650871916396384e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 131673814.0, + "step": 40010 + }, + { + "entropy": 0.053065793495625256, + "epoch": 9.327544002797529, + "grad_norm": 1.6875, + "learning_rate": 4.650766091474312e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9996896207332611, + "num_tokens": 131695962.0, + "step": 40015 + }, + { + "entropy": 0.06435949224978685, + "epoch": 9.328709639818161, + "grad_norm": 0.11474609375, + "learning_rate": 4.650660253120462e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999523639678956, + "num_tokens": 131723799.0, + "step": 40020 + }, + { + "entropy": 0.04089836934581399, + "epoch": 9.329875276838793, + "grad_norm": 0.5546875, + "learning_rate": 4.650554401336414e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9998673915863037, + "num_tokens": 131748144.0, + "step": 40025 + }, + { + "entropy": 0.06390482522547244, + "epoch": 9.331040913859423, + "grad_norm": 0.5234375, + "learning_rate": 4.650448536123745e-05, + "loss": 0.0061, + "mean_token_accuracy": 0.9995572030544281, + "num_tokens": 131767343.0, + "step": 40030 + }, + { + "entropy": 0.0792075976729393, + "epoch": 9.332206550880056, + "grad_norm": 0.1826171875, + "learning_rate": 4.6503426574840337e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9991525411605835, + "num_tokens": 131777276.0, + "step": 40035 + }, + { + "entropy": 0.04253325518220663, + "epoch": 9.333372187900688, + "grad_norm": 0.01611328125, + "learning_rate": 4.650236765418859e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999195039272308, + "num_tokens": 131827608.0, + "step": 40040 + }, + { + "entropy": 0.0662536833435297, + "epoch": 9.33453782492132, + "grad_norm": 0.1416015625, + "learning_rate": 4.6501308599298e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 131838436.0, + "step": 40045 + }, + { + "entropy": 0.04661760358139873, + "epoch": 9.335703461941952, + "grad_norm": 0.0225830078125, + "learning_rate": 4.6500249410184365e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999865055084228, + "num_tokens": 131860393.0, + "step": 40050 + }, + { + "entropy": 0.05046624289825559, + "epoch": 9.336869098962582, + "grad_norm": 0.1865234375, + "learning_rate": 4.6499190086863476e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9986062943935394, + "num_tokens": 131896517.0, + "step": 40055 + }, + { + "entropy": 0.043066446855664256, + "epoch": 9.338034735983214, + "grad_norm": 0.06396484375, + "learning_rate": 4.649813062935112e-05, + "loss": 0.0049, + "mean_token_accuracy": 0.9996003627777099, + "num_tokens": 131927175.0, + "step": 40060 + }, + { + "entropy": 0.0638367710635066, + "epoch": 9.339200373003846, + "grad_norm": 0.043701171875, + "learning_rate": 4.649707103766311e-05, + "loss": 0.003, + "mean_token_accuracy": 0.999473923444748, + "num_tokens": 131941489.0, + "step": 40065 + }, + { + "entropy": 0.052022173814475534, + "epoch": 9.340366010024479, + "grad_norm": 0.1455078125, + "learning_rate": 4.649601131181524e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9986912429332733, + "num_tokens": 131977336.0, + "step": 40070 + }, + { + "entropy": 0.04792749881744385, + "epoch": 9.34153164704511, + "grad_norm": 0.0927734375, + "learning_rate": 4.64949514518233e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9998324334621429, + "num_tokens": 132000748.0, + "step": 40075 + }, + { + "entropy": 0.04526071464642882, + "epoch": 9.342697284065743, + "grad_norm": 0.06591796875, + "learning_rate": 4.649389145770311e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 132020054.0, + "step": 40080 + }, + { + "entropy": 0.1261118996888399, + "epoch": 9.343862921086373, + "grad_norm": 0.056396484375, + "learning_rate": 4.649283132947047e-05, + "loss": 0.1482, + "mean_token_accuracy": 0.9802892565727234, + "num_tokens": 132041911.0, + "step": 40085 + }, + { + "entropy": 0.08652307861484587, + "epoch": 9.345028558107005, + "grad_norm": 0.1513671875, + "learning_rate": 4.6491771067141186e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999862551689148, + "num_tokens": 132058753.0, + "step": 40090 + }, + { + "entropy": 0.05955226179212332, + "epoch": 9.346194195127637, + "grad_norm": 0.134765625, + "learning_rate": 4.6490710670731075e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.999969094991684, + "num_tokens": 132080898.0, + "step": 40095 + }, + { + "entropy": 0.05750239389017224, + "epoch": 9.34735983214827, + "grad_norm": 0.05322265625, + "learning_rate": 4.648965014025595e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9995156407356263, + "num_tokens": 132106290.0, + "step": 40100 + }, + { + "entropy": 0.05300535634160042, + "epoch": 9.348525469168901, + "grad_norm": 0.19140625, + "learning_rate": 4.6488589475731614e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9998235583305359, + "num_tokens": 132129402.0, + "step": 40105 + }, + { + "entropy": 0.06626793816685676, + "epoch": 9.349691106189532, + "grad_norm": 0.5390625, + "learning_rate": 4.648752867717389e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9997787594795227, + "num_tokens": 132146370.0, + "step": 40110 + }, + { + "entropy": 0.06416756641119718, + "epoch": 9.350856743210164, + "grad_norm": 2.078125, + "learning_rate": 4.6486467744598596e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.99963099360466, + "num_tokens": 132157773.0, + "step": 40115 + }, + { + "entropy": 0.051140221767127514, + "epoch": 9.352022380230796, + "grad_norm": 1.1875, + "learning_rate": 4.6485406678021546e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9996812820434571, + "num_tokens": 132182802.0, + "step": 40120 + }, + { + "entropy": 0.05248261038213968, + "epoch": 9.353188017251428, + "grad_norm": 0.185546875, + "learning_rate": 4.648434547745858e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9997892618179322, + "num_tokens": 132204779.0, + "step": 40125 + }, + { + "entropy": 0.06364800855517387, + "epoch": 9.35435365427206, + "grad_norm": 0.0302734375, + "learning_rate": 4.64832841429255e-05, + "loss": 0.002, + "mean_token_accuracy": 0.999296373128891, + "num_tokens": 132216288.0, + "step": 40130 + }, + { + "entropy": 0.061640280019491914, + "epoch": 9.35551929129269, + "grad_norm": 0.08154296875, + "learning_rate": 4.6482222674438147e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 132229424.0, + "step": 40135 + }, + { + "entropy": 0.04232308492064476, + "epoch": 9.356684928313323, + "grad_norm": 0.90625, + "learning_rate": 4.6481161072012334e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9995788276195526, + "num_tokens": 132247241.0, + "step": 40140 + }, + { + "entropy": 0.06577129755169153, + "epoch": 9.357850565333955, + "grad_norm": 0.050048828125, + "learning_rate": 4.648009933566391e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9996784567832947, + "num_tokens": 132261612.0, + "step": 40145 + }, + { + "entropy": 0.055283906683325765, + "epoch": 9.359016202354587, + "grad_norm": 0.10498046875, + "learning_rate": 4.64790374654087e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9997281730175018, + "num_tokens": 132279653.0, + "step": 40150 + }, + { + "entropy": 0.057182838954031465, + "epoch": 9.360181839375219, + "grad_norm": 0.1591796875, + "learning_rate": 4.6477975461262535e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9997178077697754, + "num_tokens": 132308947.0, + "step": 40155 + }, + { + "entropy": 0.05807437375187874, + "epoch": 9.361347476395851, + "grad_norm": 0.8671875, + "learning_rate": 4.647691332324124e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9994499266147614, + "num_tokens": 132319937.0, + "step": 40160 + }, + { + "entropy": 0.05894612278789282, + "epoch": 9.362513113416481, + "grad_norm": 0.11572265625, + "learning_rate": 4.647585105136068e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9993421077728272, + "num_tokens": 132329099.0, + "step": 40165 + }, + { + "entropy": 0.060213756002485755, + "epoch": 9.363678750437114, + "grad_norm": 0.083984375, + "learning_rate": 4.647478864563668e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9995322227478027, + "num_tokens": 132367396.0, + "step": 40170 + }, + { + "entropy": 0.054358705133199695, + "epoch": 9.364844387457746, + "grad_norm": 0.0908203125, + "learning_rate": 4.6473726106085076e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9993378758430481, + "num_tokens": 132398650.0, + "step": 40175 + }, + { + "entropy": 0.06580995731055736, + "epoch": 9.366010024478378, + "grad_norm": 0.0576171875, + "learning_rate": 4.647266343272172e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9998428761959076, + "num_tokens": 132418663.0, + "step": 40180 + }, + { + "entropy": 0.09032328445464373, + "epoch": 9.36717566149901, + "grad_norm": 0.47265625, + "learning_rate": 4.647160062556246e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.999989515542984, + "num_tokens": 132436254.0, + "step": 40185 + }, + { + "entropy": 0.04208869868889451, + "epoch": 9.36834129851964, + "grad_norm": 0.373046875, + "learning_rate": 4.647053768462313e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999783515930176, + "num_tokens": 132467998.0, + "step": 40190 + }, + { + "entropy": 0.061600181832909585, + "epoch": 9.369506935540272, + "grad_norm": 0.478515625, + "learning_rate": 4.646947460991961e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.999874085187912, + "num_tokens": 132491206.0, + "step": 40195 + }, + { + "entropy": 0.059306449443101886, + "epoch": 9.370672572560904, + "grad_norm": 0.0986328125, + "learning_rate": 4.646841140146771e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 132511274.0, + "step": 40200 + }, + { + "entropy": 0.04950930029153824, + "epoch": 9.371838209581536, + "grad_norm": 0.047119140625, + "learning_rate": 4.646734805928332e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9993799746036529, + "num_tokens": 132531833.0, + "step": 40205 + }, + { + "entropy": 0.07071809405460953, + "epoch": 9.373003846602169, + "grad_norm": 0.1640625, + "learning_rate": 4.646628458338228e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9997894406318665, + "num_tokens": 132556517.0, + "step": 40210 + }, + { + "entropy": 0.05042863581329584, + "epoch": 9.3741694836228, + "grad_norm": 0.384765625, + "learning_rate": 4.6465220973780455e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 132571158.0, + "step": 40215 + }, + { + "entropy": 0.057017339020967485, + "epoch": 9.375335120643431, + "grad_norm": 0.271484375, + "learning_rate": 4.64641572304937e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 132582965.0, + "step": 40220 + }, + { + "entropy": 0.06408351408317685, + "epoch": 9.376500757664063, + "grad_norm": 0.09716796875, + "learning_rate": 4.646309335353787e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999407470226288, + "num_tokens": 132609368.0, + "step": 40225 + }, + { + "entropy": 0.055050110910087824, + "epoch": 9.377666394684695, + "grad_norm": 0.11474609375, + "learning_rate": 4.646202934292884e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9998552739620209, + "num_tokens": 132634237.0, + "step": 40230 + }, + { + "entropy": 0.0654405765235424, + "epoch": 9.378832031705327, + "grad_norm": 1.15625, + "learning_rate": 4.646096519868248e-05, + "loss": 0.0104, + "mean_token_accuracy": 0.9984454095363617, + "num_tokens": 132655238.0, + "step": 40235 + }, + { + "entropy": 0.05522510912269354, + "epoch": 9.37999766872596, + "grad_norm": 0.6953125, + "learning_rate": 4.6459900920814645e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9997252762317658, + "num_tokens": 132666985.0, + "step": 40240 + }, + { + "entropy": 0.06411749050021172, + "epoch": 9.38116330574659, + "grad_norm": 0.28515625, + "learning_rate": 4.645883650934121e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9997101426124573, + "num_tokens": 132676241.0, + "step": 40245 + }, + { + "entropy": 0.04640081143006682, + "epoch": 9.382328942767222, + "grad_norm": 0.130859375, + "learning_rate": 4.645777196427805e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999883055686951, + "num_tokens": 132696465.0, + "step": 40250 + }, + { + "entropy": 0.06816220059990882, + "epoch": 9.383494579787854, + "grad_norm": 0.10009765625, + "learning_rate": 4.6456707285641036e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.999319726228714, + "num_tokens": 132707483.0, + "step": 40255 + }, + { + "entropy": 0.055040829442441465, + "epoch": 9.384660216808486, + "grad_norm": 0.1337890625, + "learning_rate": 4.645564247344604e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9996013402938843, + "num_tokens": 132728906.0, + "step": 40260 + }, + { + "entropy": 0.06292527839541436, + "epoch": 9.385825853829118, + "grad_norm": 0.75, + "learning_rate": 4.645457752770895e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9999794006347656, + "num_tokens": 132756625.0, + "step": 40265 + }, + { + "entropy": 0.05493353232741356, + "epoch": 9.386991490849748, + "grad_norm": 0.1337890625, + "learning_rate": 4.645351244844565e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 132769179.0, + "step": 40270 + }, + { + "entropy": 0.06623713504523039, + "epoch": 9.38815712787038, + "grad_norm": 0.89453125, + "learning_rate": 4.6452447235672e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9996913552284241, + "num_tokens": 132781794.0, + "step": 40275 + }, + { + "entropy": 0.04920391724444926, + "epoch": 9.389322764891013, + "grad_norm": 0.294921875, + "learning_rate": 4.6451381889403895e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999530673027038, + "num_tokens": 132810819.0, + "step": 40280 + }, + { + "entropy": 0.05621719462797046, + "epoch": 9.390488401911645, + "grad_norm": 0.1572265625, + "learning_rate": 4.645031640965722e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.999968808889389, + "num_tokens": 132836542.0, + "step": 40285 + }, + { + "entropy": 0.06389943808317185, + "epoch": 9.391654038932277, + "grad_norm": 0.65234375, + "learning_rate": 4.644925079644788e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999892890453339, + "num_tokens": 132855480.0, + "step": 40290 + }, + { + "entropy": 0.059335741028189656, + "epoch": 9.392819675952909, + "grad_norm": 0.058837890625, + "learning_rate": 4.644818504979174e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9992248058319092, + "num_tokens": 132865545.0, + "step": 40295 + }, + { + "entropy": 0.04476952906697988, + "epoch": 9.39398531297354, + "grad_norm": 0.038818359375, + "learning_rate": 4.644711916970471e-05, + "loss": 0.0009, + "mean_token_accuracy": 1.0, + "num_tokens": 132882310.0, + "step": 40300 + }, + { + "entropy": 0.05240940302610397, + "epoch": 9.395150949994171, + "grad_norm": 0.058349609375, + "learning_rate": 4.644605315620266e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.999613881111145, + "num_tokens": 132911227.0, + "step": 40305 + }, + { + "entropy": 0.03990960521623492, + "epoch": 9.396316587014804, + "grad_norm": 0.06298828125, + "learning_rate": 4.644498700930152e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999681174755096, + "num_tokens": 132937528.0, + "step": 40310 + }, + { + "entropy": 0.0688298974186182, + "epoch": 9.397482224035436, + "grad_norm": 0.039794921875, + "learning_rate": 4.6443920729017154e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 132947749.0, + "step": 40315 + }, + { + "entropy": 0.07430526856333017, + "epoch": 9.398647861056068, + "grad_norm": 0.201171875, + "learning_rate": 4.644285431536549e-05, + "loss": 0.0149, + "mean_token_accuracy": 0.9950378179550171, + "num_tokens": 132976251.0, + "step": 40320 + }, + { + "entropy": 0.04790102792903781, + "epoch": 9.399813498076698, + "grad_norm": 0.103515625, + "learning_rate": 4.644178776836241e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9993688344955445, + "num_tokens": 133016802.0, + "step": 40325 + }, + { + "entropy": 0.07281846143305301, + "epoch": 9.40097913509733, + "grad_norm": 0.08740234375, + "learning_rate": 4.644072108802383e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9998487114906311, + "num_tokens": 133033477.0, + "step": 40330 + }, + { + "entropy": 0.04617578536272049, + "epoch": 9.402144772117962, + "grad_norm": 0.1357421875, + "learning_rate": 4.643965427436564e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999640166759491, + "num_tokens": 133065597.0, + "step": 40335 + }, + { + "entropy": 0.05054793646559119, + "epoch": 9.403310409138594, + "grad_norm": 0.21875, + "learning_rate": 4.643858732740377e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9994157433509827, + "num_tokens": 133090321.0, + "step": 40340 + }, + { + "entropy": 0.0676423467695713, + "epoch": 9.404476046159226, + "grad_norm": 0.103515625, + "learning_rate": 4.643752024715412e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9997844815254211, + "num_tokens": 133101643.0, + "step": 40345 + }, + { + "entropy": 0.05842981785535813, + "epoch": 9.405641683179859, + "grad_norm": 0.02490234375, + "learning_rate": 4.64364530336326e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999783515930176, + "num_tokens": 133121728.0, + "step": 40350 + }, + { + "entropy": 0.04886232684366405, + "epoch": 9.406807320200489, + "grad_norm": 0.23046875, + "learning_rate": 4.643538568685511e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9995041966438294, + "num_tokens": 133153226.0, + "step": 40355 + }, + { + "entropy": 0.050247212126851085, + "epoch": 9.407972957221121, + "grad_norm": 0.047119140625, + "learning_rate": 4.64343182068376e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9998877048492432, + "num_tokens": 133180024.0, + "step": 40360 + }, + { + "entropy": 0.04787283539772034, + "epoch": 9.409138594241753, + "grad_norm": 0.03173828125, + "learning_rate": 4.643325059359596e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.999958747625351, + "num_tokens": 133203070.0, + "step": 40365 + }, + { + "entropy": 0.05310492250137031, + "epoch": 9.410304231262385, + "grad_norm": 0.13671875, + "learning_rate": 4.643218284714611e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999827980995178, + "num_tokens": 133221629.0, + "step": 40370 + }, + { + "entropy": 0.08067540870979428, + "epoch": 9.411469868283017, + "grad_norm": 0.08349609375, + "learning_rate": 4.6431114967503985e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9996642708778382, + "num_tokens": 133249735.0, + "step": 40375 + }, + { + "entropy": 0.06417739875614643, + "epoch": 9.412635505303648, + "grad_norm": 0.04150390625, + "learning_rate": 4.64300469546855e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 133261895.0, + "step": 40380 + }, + { + "entropy": 0.059690887294709685, + "epoch": 9.41380114232428, + "grad_norm": 0.1259765625, + "learning_rate": 4.642897880870658e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9997668981552124, + "num_tokens": 133276430.0, + "step": 40385 + }, + { + "entropy": 0.05419606175273657, + "epoch": 9.414966779344912, + "grad_norm": 0.025146484375, + "learning_rate": 4.642791052958316e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.999883234500885, + "num_tokens": 133303994.0, + "step": 40390 + }, + { + "entropy": 0.06044567935168743, + "epoch": 9.416132416365544, + "grad_norm": 0.07177734375, + "learning_rate": 4.642684211733117e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 133314143.0, + "step": 40395 + }, + { + "entropy": 0.04812805755063891, + "epoch": 9.417298053386176, + "grad_norm": 0.2080078125, + "learning_rate": 4.642577357196654e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999467551708221, + "num_tokens": 133338336.0, + "step": 40400 + }, + { + "entropy": 0.0419490784406662, + "epoch": 9.418463690406806, + "grad_norm": 0.054931640625, + "learning_rate": 4.642470489350519e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999777257442475, + "num_tokens": 133378960.0, + "step": 40405 + }, + { + "entropy": 0.06326623265631497, + "epoch": 9.419629327427439, + "grad_norm": 0.1181640625, + "learning_rate": 4.6423636081963074e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999671995639801, + "num_tokens": 133406067.0, + "step": 40410 + }, + { + "entropy": 0.05259804669767618, + "epoch": 9.42079496444807, + "grad_norm": 0.125, + "learning_rate": 4.642256713735612e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9996414184570312, + "num_tokens": 133424898.0, + "step": 40415 + }, + { + "entropy": 0.09150759177282453, + "epoch": 9.421960601468703, + "grad_norm": 0.021728515625, + "learning_rate": 4.642149805970026e-05, + "loss": 0.074, + "mean_token_accuracy": 0.9797709345817566, + "num_tokens": 133446510.0, + "step": 40420 + }, + { + "entropy": 0.05307275112718344, + "epoch": 9.423126238489335, + "grad_norm": 0.07080078125, + "learning_rate": 4.642042884901146e-05, + "loss": 0.0072, + "mean_token_accuracy": 0.9985364437103271, + "num_tokens": 133469814.0, + "step": 40425 + }, + { + "entropy": 0.07435050662606954, + "epoch": 9.424291875509967, + "grad_norm": 0.080078125, + "learning_rate": 4.641935950530564e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 133488449.0, + "step": 40430 + }, + { + "entropy": 0.0618276858702302, + "epoch": 9.425457512530597, + "grad_norm": 0.69921875, + "learning_rate": 4.641829002859876e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9993707299232483, + "num_tokens": 133506729.0, + "step": 40435 + }, + { + "entropy": 0.045803864300251004, + "epoch": 9.42662314955123, + "grad_norm": 0.09228515625, + "learning_rate": 4.641722041890676e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999457180500031, + "num_tokens": 133535231.0, + "step": 40440 + }, + { + "entropy": 0.06190860979259014, + "epoch": 9.427788786571861, + "grad_norm": 0.035400390625, + "learning_rate": 4.641615067624559e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 133546578.0, + "step": 40445 + }, + { + "entropy": 0.06924508688971401, + "epoch": 9.428954423592494, + "grad_norm": 0.0751953125, + "learning_rate": 4.64150808006312e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999842643737793, + "num_tokens": 133562598.0, + "step": 40450 + }, + { + "entropy": 0.05628445390611887, + "epoch": 9.430120060613126, + "grad_norm": 0.08154296875, + "learning_rate": 4.641401079207955e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 133576149.0, + "step": 40455 + }, + { + "entropy": 0.05225107911974192, + "epoch": 9.431285697633756, + "grad_norm": 0.376953125, + "learning_rate": 4.6412940650606597e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.999372273683548, + "num_tokens": 133597360.0, + "step": 40460 + }, + { + "entropy": 0.05286563150584698, + "epoch": 9.432451334654388, + "grad_norm": 0.07568359375, + "learning_rate": 4.6411870376228284e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 133608823.0, + "step": 40465 + }, + { + "entropy": 0.06937075927853584, + "epoch": 9.43361697167502, + "grad_norm": 1.0546875, + "learning_rate": 4.6410799968960586e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9996330261230468, + "num_tokens": 133619534.0, + "step": 40470 + }, + { + "entropy": 0.04530189856886864, + "epoch": 9.434782608695652, + "grad_norm": 0.17578125, + "learning_rate": 4.640972942881945e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999788701534271, + "num_tokens": 133643820.0, + "step": 40475 + }, + { + "entropy": 0.06963036190718412, + "epoch": 9.435948245716284, + "grad_norm": 0.008056640625, + "learning_rate": 4.6408658755820855e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9994134902954102, + "num_tokens": 133655961.0, + "step": 40480 + }, + { + "entropy": 0.05203675013035536, + "epoch": 9.437113882736917, + "grad_norm": 0.047119140625, + "learning_rate": 4.640758794998076e-05, + "loss": 0.0034, + "mean_token_accuracy": 0.9996129751205445, + "num_tokens": 133679471.0, + "step": 40485 + }, + { + "entropy": 0.07076594727113843, + "epoch": 9.438279519757547, + "grad_norm": 2.484375, + "learning_rate": 4.6406517011315124e-05, + "loss": 0.0165, + "mean_token_accuracy": 0.9988285422325134, + "num_tokens": 133696136.0, + "step": 40490 + }, + { + "entropy": 0.03876218590885401, + "epoch": 9.439445156778179, + "grad_norm": 0.14453125, + "learning_rate": 4.640544593983992e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9997322976589202, + "num_tokens": 133736117.0, + "step": 40495 + }, + { + "entropy": 0.057139042066410184, + "epoch": 9.440610793798811, + "grad_norm": 0.05810546875, + "learning_rate": 4.640437473557113e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9994039118289948, + "num_tokens": 133755791.0, + "step": 40500 + }, + { + "entropy": 0.054068625625222924, + "epoch": 9.441776430819443, + "grad_norm": 0.10107421875, + "learning_rate": 4.640330339852472e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999571919441224, + "num_tokens": 133787308.0, + "step": 40505 + }, + { + "entropy": 0.05481251748278737, + "epoch": 9.442942067840075, + "grad_norm": 0.12109375, + "learning_rate": 4.6402231928716664e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9997755348682403, + "num_tokens": 133800285.0, + "step": 40510 + }, + { + "entropy": 0.06906182300299406, + "epoch": 9.444107704860706, + "grad_norm": 0.024658203125, + "learning_rate": 4.6401160326162934e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9996212124824524, + "num_tokens": 133819035.0, + "step": 40515 + }, + { + "entropy": 0.04636044520884752, + "epoch": 9.445273341881338, + "grad_norm": 0.046142578125, + "learning_rate": 4.640008859087952e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 133853991.0, + "step": 40520 + }, + { + "entropy": 0.05268848827108741, + "epoch": 9.44643897890197, + "grad_norm": 0.091796875, + "learning_rate": 4.6399016722882404e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9995454549789429, + "num_tokens": 133868111.0, + "step": 40525 + }, + { + "entropy": 0.07035997025668621, + "epoch": 9.447604615922602, + "grad_norm": 0.345703125, + "learning_rate": 4.639794472218756e-05, + "loss": 0.0047, + "mean_token_accuracy": 0.9991369843482971, + "num_tokens": 133878376.0, + "step": 40530 + }, + { + "entropy": 0.05296846874989569, + "epoch": 9.448770252943234, + "grad_norm": 0.11865234375, + "learning_rate": 4.639687258881097e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999543309211731, + "num_tokens": 133906468.0, + "step": 40535 + }, + { + "entropy": 0.056552316341549155, + "epoch": 9.449935889963864, + "grad_norm": 0.08935546875, + "learning_rate": 4.6395800322768634e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 133925080.0, + "step": 40540 + }, + { + "entropy": 0.06830793656408787, + "epoch": 9.451101526984496, + "grad_norm": 0.2216796875, + "learning_rate": 4.6394727924076535e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 133937381.0, + "step": 40545 + }, + { + "entropy": 0.061395585630089046, + "epoch": 9.452267164005129, + "grad_norm": 0.1416015625, + "learning_rate": 4.639365539275067e-05, + "loss": 0.0148, + "mean_token_accuracy": 0.998397821187973, + "num_tokens": 133963585.0, + "step": 40550 + }, + { + "entropy": 0.048311513382941484, + "epoch": 9.45343280102576, + "grad_norm": 0.1630859375, + "learning_rate": 4.6392582728807014e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9994284033775329, + "num_tokens": 133989337.0, + "step": 40555 + }, + { + "entropy": 0.04738085251301527, + "epoch": 9.454598438046393, + "grad_norm": 0.291015625, + "learning_rate": 4.639150993226158e-05, + "loss": 0.0007, + "mean_token_accuracy": 1.0, + "num_tokens": 134015143.0, + "step": 40560 + }, + { + "entropy": 0.07007132191210985, + "epoch": 9.455764075067025, + "grad_norm": 0.201171875, + "learning_rate": 4.6390437003130366e-05, + "loss": 0.031, + "mean_token_accuracy": 0.9956527829170227, + "num_tokens": 134042838.0, + "step": 40565 + }, + { + "entropy": 0.049748735316097734, + "epoch": 9.456929712087655, + "grad_norm": 0.04833984375, + "learning_rate": 4.638936394142936e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9994413435459137, + "num_tokens": 134060084.0, + "step": 40570 + }, + { + "entropy": 0.06348553579300642, + "epoch": 9.458095349108287, + "grad_norm": 0.1865234375, + "learning_rate": 4.638829074717456e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9996629536151886, + "num_tokens": 134082574.0, + "step": 40575 + }, + { + "entropy": 0.054871161840856074, + "epoch": 9.45926098612892, + "grad_norm": 0.14453125, + "learning_rate": 4.6387217420381986e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 134096670.0, + "step": 40580 + }, + { + "entropy": 0.054104173369705674, + "epoch": 9.460426623149552, + "grad_norm": 0.76953125, + "learning_rate": 4.638614396106763e-05, + "loss": 0.0049, + "mean_token_accuracy": 0.9995225071907043, + "num_tokens": 134124932.0, + "step": 40585 + }, + { + "entropy": 0.06244059270247817, + "epoch": 9.461592260170184, + "grad_norm": 0.302734375, + "learning_rate": 4.6385070369247495e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9998060464859009, + "num_tokens": 134145979.0, + "step": 40590 + }, + { + "entropy": 0.07192586399614811, + "epoch": 9.462757897190814, + "grad_norm": 0.92578125, + "learning_rate": 4.6383996644937606e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9992907822132111, + "num_tokens": 134156439.0, + "step": 40595 + }, + { + "entropy": 0.07092061564326287, + "epoch": 9.463923534211446, + "grad_norm": 0.341796875, + "learning_rate": 4.638292278815396e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9998226940631867, + "num_tokens": 134171371.0, + "step": 40600 + }, + { + "entropy": 0.0636997226625681, + "epoch": 9.465089171232078, + "grad_norm": 2.234375, + "learning_rate": 4.638184879891258e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.999472576379776, + "num_tokens": 134191017.0, + "step": 40605 + }, + { + "entropy": 0.05375488083809614, + "epoch": 9.46625480825271, + "grad_norm": 1.5078125, + "learning_rate": 4.638077467722947e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9997506201267242, + "num_tokens": 134210202.0, + "step": 40610 + }, + { + "entropy": 0.05368066322989762, + "epoch": 9.467420445273342, + "grad_norm": 0.1728515625, + "learning_rate": 4.637970042312065e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9999517619609832, + "num_tokens": 134236096.0, + "step": 40615 + }, + { + "entropy": 0.07572010587900876, + "epoch": 9.468586082293974, + "grad_norm": 0.07373046875, + "learning_rate": 4.6378626036602145e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 134247013.0, + "step": 40620 + }, + { + "entropy": 0.04615337757859379, + "epoch": 9.469751719314605, + "grad_norm": 1.796875, + "learning_rate": 4.637755151768998e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9996350347995758, + "num_tokens": 134269849.0, + "step": 40625 + }, + { + "entropy": 0.07043313197791576, + "epoch": 9.470917356335237, + "grad_norm": 0.03662109375, + "learning_rate": 4.637647686640015e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 134281666.0, + "step": 40630 + }, + { + "entropy": 0.04748814105987549, + "epoch": 9.472082993355869, + "grad_norm": 0.1318359375, + "learning_rate": 4.637540208274872e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9997235715389252, + "num_tokens": 134305770.0, + "step": 40635 + }, + { + "entropy": 0.07811860628426075, + "epoch": 9.473248630376501, + "grad_norm": 1.4296875, + "learning_rate": 4.637432716675168e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9995238184928894, + "num_tokens": 134314655.0, + "step": 40640 + }, + { + "entropy": 0.05494981547817588, + "epoch": 9.474414267397133, + "grad_norm": 0.03759765625, + "learning_rate": 4.637325211842508e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 134337243.0, + "step": 40645 + }, + { + "entropy": 0.05441898424178362, + "epoch": 9.475579904417764, + "grad_norm": 1.4140625, + "learning_rate": 4.637217693778494e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9998080611228943, + "num_tokens": 134351430.0, + "step": 40650 + }, + { + "entropy": 0.07939355503767728, + "epoch": 9.476745541438396, + "grad_norm": 2.4375, + "learning_rate": 4.6371101624847305e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9993265986442565, + "num_tokens": 134364615.0, + "step": 40655 + }, + { + "entropy": 0.05908539802767336, + "epoch": 9.477911178459028, + "grad_norm": 0.162109375, + "learning_rate": 4.63700261796282e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9996601998806, + "num_tokens": 134387056.0, + "step": 40660 + }, + { + "entropy": 0.046356960525736214, + "epoch": 9.47907681547966, + "grad_norm": 0.05322265625, + "learning_rate": 4.636895060214366e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999344050884247, + "num_tokens": 134425884.0, + "step": 40665 + }, + { + "entropy": 0.062171243224292995, + "epoch": 9.480242452500292, + "grad_norm": 0.040283203125, + "learning_rate": 4.636787489240973e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9997522413730622, + "num_tokens": 134448675.0, + "step": 40670 + }, + { + "entropy": 0.06885421648621559, + "epoch": 9.481408089520922, + "grad_norm": 0.8984375, + "learning_rate": 4.636679905044245e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 134459310.0, + "step": 40675 + }, + { + "entropy": 0.06276169000193477, + "epoch": 9.482573726541554, + "grad_norm": 0.060791015625, + "learning_rate": 4.636572307625785e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999372661113739, + "num_tokens": 134475261.0, + "step": 40680 + }, + { + "entropy": 0.06280481340363622, + "epoch": 9.483739363562186, + "grad_norm": 0.86328125, + "learning_rate": 4.636464696987199e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9996323525905609, + "num_tokens": 134500387.0, + "step": 40685 + }, + { + "entropy": 0.08110145181417465, + "epoch": 9.484905000582819, + "grad_norm": 0.09716796875, + "learning_rate": 4.636357073130091e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 134514011.0, + "step": 40690 + }, + { + "entropy": 0.041450155060738327, + "epoch": 9.48607063760345, + "grad_norm": 0.1767578125, + "learning_rate": 4.636249436056066e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9995937526226044, + "num_tokens": 134542359.0, + "step": 40695 + }, + { + "entropy": 0.05160027435049415, + "epoch": 9.487236274624083, + "grad_norm": 0.0810546875, + "learning_rate": 4.636141785766729e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 134561048.0, + "step": 40700 + }, + { + "entropy": 0.07677444498986005, + "epoch": 9.488401911644713, + "grad_norm": 1.1875, + "learning_rate": 4.6360341222636846e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9994491636753082, + "num_tokens": 134576808.0, + "step": 40705 + }, + { + "entropy": 0.07004327522590756, + "epoch": 9.489567548665345, + "grad_norm": 0.09375, + "learning_rate": 4.635926445548539e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 134588413.0, + "step": 40710 + }, + { + "entropy": 0.0489866410382092, + "epoch": 9.490733185685977, + "grad_norm": 0.0849609375, + "learning_rate": 4.635818755622898e-05, + "loss": 0.0037, + "mean_token_accuracy": 0.9992427587509155, + "num_tokens": 134605793.0, + "step": 40715 + }, + { + "entropy": 0.04987419592216611, + "epoch": 9.49189882270661, + "grad_norm": 0.0810546875, + "learning_rate": 4.635711052488367e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9993611574172974, + "num_tokens": 134631089.0, + "step": 40720 + }, + { + "entropy": 0.059205600060522556, + "epoch": 9.493064459727242, + "grad_norm": 0.07861328125, + "learning_rate": 4.6356033361465515e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999550521373749, + "num_tokens": 134656109.0, + "step": 40725 + }, + { + "entropy": 0.06425941651687025, + "epoch": 9.494230096747872, + "grad_norm": 0.2109375, + "learning_rate": 4.635495606599059e-05, + "loss": 0.0125, + "mean_token_accuracy": 0.997954660654068, + "num_tokens": 134687865.0, + "step": 40730 + }, + { + "entropy": 0.056310771498829126, + "epoch": 9.495395733768504, + "grad_norm": 0.185546875, + "learning_rate": 4.635387863847494e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.999619048833847, + "num_tokens": 134699790.0, + "step": 40735 + }, + { + "entropy": 0.04697222877293825, + "epoch": 9.496561370789136, + "grad_norm": 0.0308837890625, + "learning_rate": 4.635280107893465e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9995614051818847, + "num_tokens": 134715296.0, + "step": 40740 + }, + { + "entropy": 0.06893054358661174, + "epoch": 9.497727007809768, + "grad_norm": 0.11962890625, + "learning_rate": 4.6351723387385784e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 134734337.0, + "step": 40745 + }, + { + "entropy": 0.04734471701085567, + "epoch": 9.4988926448304, + "grad_norm": 0.033935546875, + "learning_rate": 4.6350645563844394e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 134753683.0, + "step": 40750 + }, + { + "entropy": 0.0547360529191792, + "epoch": 9.500058281851032, + "grad_norm": 0.01385498046875, + "learning_rate": 4.6349567608326585e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999753832817078, + "num_tokens": 134778207.0, + "step": 40755 + }, + { + "entropy": 0.059074167534708975, + "epoch": 9.501223918871663, + "grad_norm": 0.162109375, + "learning_rate": 4.63484895208484e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 134794641.0, + "step": 40760 + }, + { + "entropy": 0.05270886849611998, + "epoch": 9.502389555892295, + "grad_norm": 0.12255859375, + "learning_rate": 4.634741130142593e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999666035175323, + "num_tokens": 134821573.0, + "step": 40765 + }, + { + "entropy": 0.056115038227289915, + "epoch": 9.503555192912927, + "grad_norm": 0.034912109375, + "learning_rate": 4.634633295007525e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9993288278579712, + "num_tokens": 134839327.0, + "step": 40770 + }, + { + "entropy": 0.03874876936897635, + "epoch": 9.504720829933559, + "grad_norm": 0.1455078125, + "learning_rate": 4.634525446681245e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999327003955841, + "num_tokens": 134882343.0, + "step": 40775 + }, + { + "entropy": 0.05923546832054853, + "epoch": 9.505886466954191, + "grad_norm": 0.115234375, + "learning_rate": 4.634417585165359e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.999529767036438, + "num_tokens": 134897391.0, + "step": 40780 + }, + { + "entropy": 0.05450958535075188, + "epoch": 9.507052103974821, + "grad_norm": 0.1357421875, + "learning_rate": 4.634309710461476e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9994285702705383, + "num_tokens": 134920605.0, + "step": 40785 + }, + { + "entropy": 0.03137501548044384, + "epoch": 9.508217740995454, + "grad_norm": 0.0262451171875, + "learning_rate": 4.634201822571207e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.999960595369339, + "num_tokens": 134959815.0, + "step": 40790 + }, + { + "entropy": 0.058038387820124625, + "epoch": 9.509383378016086, + "grad_norm": 0.041259765625, + "learning_rate": 4.6340939214961575e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.999402391910553, + "num_tokens": 134972976.0, + "step": 40795 + }, + { + "entropy": 0.04890700094401836, + "epoch": 9.510549015036718, + "grad_norm": 0.2373046875, + "learning_rate": 4.633986007237939e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.999665379524231, + "num_tokens": 135004943.0, + "step": 40800 + }, + { + "entropy": 0.04844941468909383, + "epoch": 9.51171465205735, + "grad_norm": 0.037109375, + "learning_rate": 4.6338780797981584e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9996458530426026, + "num_tokens": 135029697.0, + "step": 40805 + }, + { + "entropy": 0.06579445358365774, + "epoch": 9.51288028907798, + "grad_norm": 0.10888671875, + "learning_rate": 4.6337701391784266e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9995369851589203, + "num_tokens": 135043437.0, + "step": 40810 + }, + { + "entropy": 0.06171910166740417, + "epoch": 9.514045926098612, + "grad_norm": 0.025390625, + "learning_rate": 4.633662185380353e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999885797500611, + "num_tokens": 135063057.0, + "step": 40815 + }, + { + "entropy": 0.06348031088709831, + "epoch": 9.515211563119244, + "grad_norm": 0.09326171875, + "learning_rate": 4.633554218405547e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9990782618522644, + "num_tokens": 135073222.0, + "step": 40820 + }, + { + "entropy": 0.05706033930182457, + "epoch": 9.516377200139877, + "grad_norm": 0.0869140625, + "learning_rate": 4.633446238255619e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 135085320.0, + "step": 40825 + }, + { + "entropy": 0.05939996847882867, + "epoch": 9.517542837160509, + "grad_norm": 3.109375, + "learning_rate": 4.6333382449321776e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9996251404285431, + "num_tokens": 135100091.0, + "step": 40830 + }, + { + "entropy": 0.04892335725016892, + "epoch": 9.51870847418114, + "grad_norm": 1.046875, + "learning_rate": 4.6332302384368355e-05, + "loss": 0.0088, + "mean_token_accuracy": 0.9980179607868195, + "num_tokens": 135116547.0, + "step": 40835 + }, + { + "entropy": 0.06294927066192031, + "epoch": 9.519874111201771, + "grad_norm": 0.9453125, + "learning_rate": 4.633122218771202e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9986855506896972, + "num_tokens": 135135187.0, + "step": 40840 + }, + { + "entropy": 0.054868426825851205, + "epoch": 9.521039748222403, + "grad_norm": 0.138671875, + "learning_rate": 4.6330141859368875e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999448001384735, + "num_tokens": 135155106.0, + "step": 40845 + }, + { + "entropy": 0.0764088025316596, + "epoch": 9.522205385243035, + "grad_norm": 0.40625, + "learning_rate": 4.632906139935503e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999555706977844, + "num_tokens": 135174411.0, + "step": 40850 + }, + { + "entropy": 0.0597866203635931, + "epoch": 9.523371022263667, + "grad_norm": 0.138671875, + "learning_rate": 4.632798080768661e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9996915280818939, + "num_tokens": 135195708.0, + "step": 40855 + }, + { + "entropy": 0.04914709161967039, + "epoch": 9.5245366592843, + "grad_norm": 0.41796875, + "learning_rate": 4.632690008437971e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999254047870636, + "num_tokens": 135217479.0, + "step": 40860 + }, + { + "entropy": 0.052582056075334546, + "epoch": 9.52570229630493, + "grad_norm": 0.119140625, + "learning_rate": 4.632581922945046e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 135243405.0, + "step": 40865 + }, + { + "entropy": 0.07694977000355721, + "epoch": 9.526867933325562, + "grad_norm": 0.208984375, + "learning_rate": 4.6324738242914966e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999651193618775, + "num_tokens": 135260625.0, + "step": 40870 + }, + { + "entropy": 0.041239157784730196, + "epoch": 9.528033570346194, + "grad_norm": 0.1875, + "learning_rate": 4.632365712478935e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 135291113.0, + "step": 40875 + }, + { + "entropy": 0.052362867165356874, + "epoch": 9.529199207366826, + "grad_norm": 0.064453125, + "learning_rate": 4.632257587508975e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.999953818321228, + "num_tokens": 135308792.0, + "step": 40880 + }, + { + "entropy": 0.06618333477526903, + "epoch": 9.530364844387458, + "grad_norm": 0.271484375, + "learning_rate": 4.632149449383226e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.999617338180542, + "num_tokens": 135327668.0, + "step": 40885 + }, + { + "entropy": 0.07062805388122798, + "epoch": 9.53153048140809, + "grad_norm": 0.16796875, + "learning_rate": 4.6320412981033015e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9999792397022247, + "num_tokens": 135347719.0, + "step": 40890 + }, + { + "entropy": 0.06933510228991509, + "epoch": 9.53269611842872, + "grad_norm": 0.0849609375, + "learning_rate": 4.631933133670815e-05, + "loss": 0.0009, + "mean_token_accuracy": 1.0, + "num_tokens": 135358370.0, + "step": 40895 + }, + { + "entropy": 0.046752724703401326, + "epoch": 9.533861755449353, + "grad_norm": 0.054443359375, + "learning_rate": 4.63182495608738e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999893844127655, + "num_tokens": 135380333.0, + "step": 40900 + }, + { + "entropy": 0.05584818883799016, + "epoch": 9.535027392469985, + "grad_norm": 0.07861328125, + "learning_rate": 4.631716765354607e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.999989253282547, + "num_tokens": 135401439.0, + "step": 40905 + }, + { + "entropy": 0.06718082525767385, + "epoch": 9.536193029490617, + "grad_norm": 0.77734375, + "learning_rate": 4.6316085614741115e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9996930599212647, + "num_tokens": 135423014.0, + "step": 40910 + }, + { + "entropy": 0.06601903475821018, + "epoch": 9.537358666511249, + "grad_norm": 0.0269775390625, + "learning_rate": 4.6315003444475065e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9995391726493835, + "num_tokens": 135438969.0, + "step": 40915 + }, + { + "entropy": 0.06503906883299351, + "epoch": 9.53852430353188, + "grad_norm": 2.140625, + "learning_rate": 4.631392114276406e-05, + "loss": 0.001, + "mean_token_accuracy": 0.999578058719635, + "num_tokens": 135452833.0, + "step": 40920 + }, + { + "entropy": 0.06475102892145515, + "epoch": 9.539689940552512, + "grad_norm": 0.08935546875, + "learning_rate": 4.6312838709624226e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 135471217.0, + "step": 40925 + }, + { + "entropy": 0.03920008554123342, + "epoch": 9.540855577573144, + "grad_norm": 0.1962890625, + "learning_rate": 4.6311756145071705e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999875962734223, + "num_tokens": 135500601.0, + "step": 40930 + }, + { + "entropy": 0.03399303937330842, + "epoch": 9.542021214593776, + "grad_norm": 0.0185546875, + "learning_rate": 4.631067344912266e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999543249607086, + "num_tokens": 135542941.0, + "step": 40935 + }, + { + "entropy": 0.055815706960856915, + "epoch": 9.543186851614408, + "grad_norm": 1.5859375, + "learning_rate": 4.630959062179321e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9992071092128754, + "num_tokens": 135557143.0, + "step": 40940 + }, + { + "entropy": 0.07005295492708682, + "epoch": 9.544352488635038, + "grad_norm": 0.06787109375, + "learning_rate": 4.6308507663099524e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 135568449.0, + "step": 40945 + }, + { + "entropy": 0.05475952555425465, + "epoch": 9.54551812565567, + "grad_norm": 0.06298828125, + "learning_rate": 4.630742457305774e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9997912287712097, + "num_tokens": 135591304.0, + "step": 40950 + }, + { + "entropy": 0.050777725968509914, + "epoch": 9.546683762676302, + "grad_norm": 0.68359375, + "learning_rate": 4.6306341351684e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9998898327350616, + "num_tokens": 135612736.0, + "step": 40955 + }, + { + "entropy": 0.0599704397842288, + "epoch": 9.547849399696934, + "grad_norm": 0.1416015625, + "learning_rate": 4.630525799899447e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 135629357.0, + "step": 40960 + }, + { + "entropy": 0.041774158645421264, + "epoch": 9.549015036717567, + "grad_norm": 0.04248046875, + "learning_rate": 4.6304174515005294e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9995239317417145, + "num_tokens": 135651757.0, + "step": 40965 + }, + { + "entropy": 0.04553243769332767, + "epoch": 9.550180673738199, + "grad_norm": 0.029296875, + "learning_rate": 4.630309089973264e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9994342923164368, + "num_tokens": 135676830.0, + "step": 40970 + }, + { + "entropy": 0.05818598102778196, + "epoch": 9.551346310758829, + "grad_norm": 0.09130859375, + "learning_rate": 4.630200715319266e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9994813144207001, + "num_tokens": 135705468.0, + "step": 40975 + }, + { + "entropy": 0.0411489917896688, + "epoch": 9.552511947779461, + "grad_norm": 0.0927734375, + "learning_rate": 4.6300923275401506e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999789834022522, + "num_tokens": 135731968.0, + "step": 40980 + }, + { + "entropy": 0.045835569687187674, + "epoch": 9.553677584800093, + "grad_norm": 0.01300048828125, + "learning_rate": 4.629983926637536e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999888777732849, + "num_tokens": 135752618.0, + "step": 40985 + }, + { + "entropy": 0.05435104519128799, + "epoch": 9.554843221820725, + "grad_norm": 0.193359375, + "learning_rate": 4.629875512613037e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9998708009719849, + "num_tokens": 135771384.0, + "step": 40990 + }, + { + "entropy": 0.06642589755356312, + "epoch": 9.556008858841357, + "grad_norm": 0.6015625, + "learning_rate": 4.6297670854682705e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9997084558010101, + "num_tokens": 135783644.0, + "step": 40995 + }, + { + "entropy": 0.056930082757025956, + "epoch": 9.557174495861988, + "grad_norm": 0.055419921875, + "learning_rate": 4.629658645204854e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9992791295051575, + "num_tokens": 135812769.0, + "step": 41000 + }, + { + "entropy": 0.05946479961276054, + "epoch": 9.55834013288262, + "grad_norm": 0.09619140625, + "learning_rate": 4.629550191824404e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 135825284.0, + "step": 41005 + }, + { + "entropy": 0.04301239885389805, + "epoch": 9.559505769903252, + "grad_norm": 0.040771484375, + "learning_rate": 4.629441725328537e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999394893646241, + "num_tokens": 135847689.0, + "step": 41010 + }, + { + "entropy": 0.049624948669224976, + "epoch": 9.560671406923884, + "grad_norm": 0.080078125, + "learning_rate": 4.6293332457188724e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9996513247489929, + "num_tokens": 135863808.0, + "step": 41015 + }, + { + "entropy": 0.037809535302221774, + "epoch": 9.561837043944516, + "grad_norm": 0.19140625, + "learning_rate": 4.629224752997026e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9994710803031921, + "num_tokens": 135904205.0, + "step": 41020 + }, + { + "entropy": 0.0786101894453168, + "epoch": 9.563002680965148, + "grad_norm": 0.37109375, + "learning_rate": 4.629116247164616e-05, + "loss": 0.0194, + "mean_token_accuracy": 0.9954461216926574, + "num_tokens": 135932346.0, + "step": 41025 + }, + { + "entropy": 0.05533070582896471, + "epoch": 9.564168317985779, + "grad_norm": 0.46484375, + "learning_rate": 4.6290077282232594e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9999791443347931, + "num_tokens": 135957361.0, + "step": 41030 + }, + { + "entropy": 0.059603410121053454, + "epoch": 9.56533395500641, + "grad_norm": 1.0234375, + "learning_rate": 4.6288991961745764e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.999586033821106, + "num_tokens": 135982860.0, + "step": 41035 + }, + { + "entropy": 0.0578294038772583, + "epoch": 9.566499592027043, + "grad_norm": 0.140625, + "learning_rate": 4.628790651020184e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 136001350.0, + "step": 41040 + }, + { + "entropy": 0.06318863546475768, + "epoch": 9.567665229047675, + "grad_norm": 0.051025390625, + "learning_rate": 4.628682092761702e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999873399734497, + "num_tokens": 136017372.0, + "step": 41045 + }, + { + "entropy": 0.0619500283151865, + "epoch": 9.568830866068307, + "grad_norm": 0.11279296875, + "learning_rate": 4.628573521400748e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999892354011536, + "num_tokens": 136040839.0, + "step": 41050 + }, + { + "entropy": 0.05457418002188206, + "epoch": 9.569996503088937, + "grad_norm": 0.0159912109375, + "learning_rate": 4.6284649369389416e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999674022197723, + "num_tokens": 136069858.0, + "step": 41055 + }, + { + "entropy": 0.057254419988021256, + "epoch": 9.57116214010957, + "grad_norm": 0.12353515625, + "learning_rate": 4.6283563393779014e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.999933785200119, + "num_tokens": 136101309.0, + "step": 41060 + }, + { + "entropy": 0.07205391302704811, + "epoch": 9.572327777130202, + "grad_norm": 0.07763671875, + "learning_rate": 4.628247728719247e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9997916638851165, + "num_tokens": 136115648.0, + "step": 41065 + }, + { + "entropy": 0.06587801650166511, + "epoch": 9.573493414150834, + "grad_norm": 0.376953125, + "learning_rate": 4.6281391049645986e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9996513187885284, + "num_tokens": 136136716.0, + "step": 41070 + }, + { + "entropy": 0.09622649177908897, + "epoch": 9.574659051171466, + "grad_norm": 0.09521484375, + "learning_rate": 4.628030468115574e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 136147763.0, + "step": 41075 + }, + { + "entropy": 0.05813981788232923, + "epoch": 9.575824688192096, + "grad_norm": 0.0380859375, + "learning_rate": 4.627921818173796e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9996845960617066, + "num_tokens": 136166803.0, + "step": 41080 + }, + { + "entropy": 0.06320654163137078, + "epoch": 9.576990325212728, + "grad_norm": 0.1328125, + "learning_rate": 4.627813155140882e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 136191064.0, + "step": 41085 + }, + { + "entropy": 0.041704765520989895, + "epoch": 9.57815596223336, + "grad_norm": 0.04150390625, + "learning_rate": 4.6277044790184546e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 136205964.0, + "step": 41090 + }, + { + "entropy": 0.056818116828799246, + "epoch": 9.579321599253992, + "grad_norm": 0.373046875, + "learning_rate": 4.627595789808133e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9994535505771637, + "num_tokens": 136227356.0, + "step": 41095 + }, + { + "entropy": 0.09946035724133254, + "epoch": 9.580487236274625, + "grad_norm": 0.1796875, + "learning_rate": 4.627487087511538e-05, + "loss": 0.0561, + "mean_token_accuracy": 0.992105895280838, + "num_tokens": 136263847.0, + "step": 41100 + }, + { + "entropy": 0.07794980946928262, + "epoch": 9.581652873295257, + "grad_norm": 0.05322265625, + "learning_rate": 4.6273783721302904e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 136274021.0, + "step": 41105 + }, + { + "entropy": 0.06599203050136566, + "epoch": 9.582818510315887, + "grad_norm": 0.39453125, + "learning_rate": 4.6272696436660125e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9998664915561676, + "num_tokens": 136294119.0, + "step": 41110 + }, + { + "entropy": 0.057835323922336104, + "epoch": 9.583984147336519, + "grad_norm": 1.6796875, + "learning_rate": 4.627160902120324e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9999575018882751, + "num_tokens": 136318752.0, + "step": 41115 + }, + { + "entropy": 0.04515928709879517, + "epoch": 9.585149784357151, + "grad_norm": 0.65234375, + "learning_rate": 4.627052147494847e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9992202460765839, + "num_tokens": 136333968.0, + "step": 41120 + }, + { + "entropy": 0.06753161698579788, + "epoch": 9.586315421377783, + "grad_norm": 0.048583984375, + "learning_rate": 4.6269433797912034e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999408602714539, + "num_tokens": 136352206.0, + "step": 41125 + }, + { + "entropy": 0.06529767028987407, + "epoch": 9.587481058398415, + "grad_norm": 0.03369140625, + "learning_rate": 4.6268345990110156e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9998778998851776, + "num_tokens": 136361336.0, + "step": 41130 + }, + { + "entropy": 0.05374781196005642, + "epoch": 9.588646695419046, + "grad_norm": 0.0286865234375, + "learning_rate": 4.6267258051559046e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9996373355388641, + "num_tokens": 136387286.0, + "step": 41135 + }, + { + "entropy": 0.06637289561331272, + "epoch": 9.589812332439678, + "grad_norm": 0.040283203125, + "learning_rate": 4.626616998227493e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999774336814881, + "num_tokens": 136407787.0, + "step": 41140 + }, + { + "entropy": 0.052790251560509205, + "epoch": 9.59097796946031, + "grad_norm": 0.10009765625, + "learning_rate": 4.6265081782274046e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 136431041.0, + "step": 41145 + }, + { + "entropy": 0.06665988322347402, + "epoch": 9.592143606480942, + "grad_norm": 0.140625, + "learning_rate": 4.6263993451572594e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9994082868099212, + "num_tokens": 136441999.0, + "step": 41150 + }, + { + "entropy": 0.05983887910842896, + "epoch": 9.593309243501574, + "grad_norm": 0.1552734375, + "learning_rate": 4.6262904990186826e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999689280986785, + "num_tokens": 136465318.0, + "step": 41155 + }, + { + "entropy": 0.061148155480623245, + "epoch": 9.594474880522206, + "grad_norm": 0.427734375, + "learning_rate": 4.6261816398132965e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 136478002.0, + "step": 41160 + }, + { + "entropy": 0.05825490411370993, + "epoch": 9.595640517542837, + "grad_norm": 0.1416015625, + "learning_rate": 4.626072767542724e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9997474730014801, + "num_tokens": 136492488.0, + "step": 41165 + }, + { + "entropy": 0.09653316643089056, + "epoch": 9.596806154563469, + "grad_norm": 0.1103515625, + "learning_rate": 4.625963882208589e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 136507177.0, + "step": 41170 + }, + { + "entropy": 0.03887947797775269, + "epoch": 9.5979717915841, + "grad_norm": 0.07421875, + "learning_rate": 4.625854983812515e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9998583555221557, + "num_tokens": 136535310.0, + "step": 41175 + }, + { + "entropy": 0.047138193342834714, + "epoch": 9.599137428604733, + "grad_norm": 0.478515625, + "learning_rate": 4.625746072356126e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 136551768.0, + "step": 41180 + }, + { + "entropy": 0.07058442085981369, + "epoch": 9.600303065625365, + "grad_norm": 0.1298828125, + "learning_rate": 4.625637147841046e-05, + "loss": 0.0046, + "mean_token_accuracy": 0.9993247389793396, + "num_tokens": 136562332.0, + "step": 41185 + }, + { + "entropy": 0.05666069518774748, + "epoch": 9.601468702645995, + "grad_norm": 0.0517578125, + "learning_rate": 4.625528210268898e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9998417735099793, + "num_tokens": 136575910.0, + "step": 41190 + }, + { + "entropy": 0.06339784916490317, + "epoch": 9.602634339666627, + "grad_norm": 0.041259765625, + "learning_rate": 4.625419259641308e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999178290367127, + "num_tokens": 136586032.0, + "step": 41195 + }, + { + "entropy": 0.03752074474468827, + "epoch": 9.60379997668726, + "grad_norm": 0.162109375, + "learning_rate": 4.625310295959901e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9998230099678039, + "num_tokens": 136607533.0, + "step": 41200 + }, + { + "entropy": 0.07200399097055196, + "epoch": 9.604965613707892, + "grad_norm": 0.10693359375, + "learning_rate": 4.6252013192263e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 136626736.0, + "step": 41205 + }, + { + "entropy": 0.07442393903620541, + "epoch": 9.606131250728524, + "grad_norm": 2.859375, + "learning_rate": 4.625092329442131e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9989759445190429, + "num_tokens": 136647055.0, + "step": 41210 + }, + { + "entropy": 0.04741760222241283, + "epoch": 9.607296887749154, + "grad_norm": 0.1396484375, + "learning_rate": 4.62498332660902e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9996415734291076, + "num_tokens": 136680106.0, + "step": 41215 + }, + { + "entropy": 0.07332726744934917, + "epoch": 9.608462524769786, + "grad_norm": 3.328125, + "learning_rate": 4.6248743107285905e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9996515691280365, + "num_tokens": 136699318.0, + "step": 41220 + }, + { + "entropy": 0.05996815362013876, + "epoch": 9.609628161790418, + "grad_norm": 0.029541015625, + "learning_rate": 4.62476528180247e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.999965512752533, + "num_tokens": 136719358.0, + "step": 41225 + }, + { + "entropy": 0.06359810242429376, + "epoch": 9.61079379881105, + "grad_norm": 2.921875, + "learning_rate": 4.624656239832283e-05, + "loss": 0.0036, + "mean_token_accuracy": 0.9995666563510894, + "num_tokens": 136748699.0, + "step": 41230 + }, + { + "entropy": 0.054721962008625266, + "epoch": 9.611959435831682, + "grad_norm": 0.0361328125, + "learning_rate": 4.624547184819656e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999768316745759, + "num_tokens": 136767794.0, + "step": 41235 + }, + { + "entropy": 0.04269056161865592, + "epoch": 9.613125072852315, + "grad_norm": 0.1767578125, + "learning_rate": 4.624438116766215e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9996411383152009, + "num_tokens": 136797142.0, + "step": 41240 + }, + { + "entropy": 0.055638442654162644, + "epoch": 9.614290709872945, + "grad_norm": 0.05712890625, + "learning_rate": 4.6243290356735865e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 136809958.0, + "step": 41245 + }, + { + "entropy": 0.048937355354428294, + "epoch": 9.615456346893577, + "grad_norm": 0.12890625, + "learning_rate": 4.624219941543397e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.999721109867096, + "num_tokens": 136835763.0, + "step": 41250 + }, + { + "entropy": 0.07546586729586124, + "epoch": 9.616621983914209, + "grad_norm": 2.28125, + "learning_rate": 4.6241108343772735e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9991138696670532, + "num_tokens": 136846408.0, + "step": 41255 + }, + { + "entropy": 0.04572682408615947, + "epoch": 9.617787620934841, + "grad_norm": 0.10009765625, + "learning_rate": 4.6240017141768424e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999783515930176, + "num_tokens": 136889137.0, + "step": 41260 + }, + { + "entropy": 0.06971332840621472, + "epoch": 9.618953257955473, + "grad_norm": 0.11474609375, + "learning_rate": 4.623892580943731e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9996296286582946, + "num_tokens": 136898647.0, + "step": 41265 + }, + { + "entropy": 0.05883047040551901, + "epoch": 9.620118894976104, + "grad_norm": 0.09716796875, + "learning_rate": 4.623783434679567e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9993464052677155, + "num_tokens": 136924038.0, + "step": 41270 + }, + { + "entropy": 0.052692413609474895, + "epoch": 9.621284531996736, + "grad_norm": 1.546875, + "learning_rate": 4.623674275385978e-05, + "loss": 0.0057, + "mean_token_accuracy": 0.9981497287750244, + "num_tokens": 136944410.0, + "step": 41275 + }, + { + "entropy": 0.06412827260792256, + "epoch": 9.622450169017368, + "grad_norm": 0.0400390625, + "learning_rate": 4.623565103064591e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999616265296936, + "num_tokens": 136956566.0, + "step": 41280 + }, + { + "entropy": 0.06835590191185474, + "epoch": 9.623615806038, + "grad_norm": 0.07958984375, + "learning_rate": 4.623455917717035e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9986554086208344, + "num_tokens": 136987616.0, + "step": 41285 + }, + { + "entropy": 0.06827231720089913, + "epoch": 9.624781443058632, + "grad_norm": 0.2119140625, + "learning_rate": 4.623346719344937e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9994838953018188, + "num_tokens": 137009985.0, + "step": 41290 + }, + { + "entropy": 0.06732688648626209, + "epoch": 9.625947080079264, + "grad_norm": 0.1376953125, + "learning_rate": 4.623237507949926e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9995437383651733, + "num_tokens": 137033747.0, + "step": 41295 + }, + { + "entropy": 0.04815381010994315, + "epoch": 9.627112717099894, + "grad_norm": 0.06591796875, + "learning_rate": 4.6231282835336306e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.999988979101181, + "num_tokens": 137057779.0, + "step": 41300 + }, + { + "entropy": 0.05480205034837127, + "epoch": 9.628278354120527, + "grad_norm": 0.65234375, + "learning_rate": 4.623019046097679e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9996123731136322, + "num_tokens": 137074912.0, + "step": 41305 + }, + { + "entropy": 0.06899555269628763, + "epoch": 9.629443991141159, + "grad_norm": 1.2109375, + "learning_rate": 4.6229097956437e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9994629144668579, + "num_tokens": 137094135.0, + "step": 41310 + }, + { + "entropy": 0.06584795317612588, + "epoch": 9.63060962816179, + "grad_norm": 0.05859375, + "learning_rate": 4.622800532173324e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 137116123.0, + "step": 41315 + }, + { + "entropy": 0.04917632364667952, + "epoch": 9.631775265182423, + "grad_norm": 0.546875, + "learning_rate": 4.6226912556881786e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9999045550823211, + "num_tokens": 137144216.0, + "step": 41320 + }, + { + "entropy": 0.05311643872410059, + "epoch": 9.632940902203053, + "grad_norm": 0.0174560546875, + "learning_rate": 4.622581966189894e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 137159915.0, + "step": 41325 + }, + { + "entropy": 0.041334444051608445, + "epoch": 9.634106539223685, + "grad_norm": 0.10791015625, + "learning_rate": 4.6224726636801e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9996349096298218, + "num_tokens": 137189497.0, + "step": 41330 + }, + { + "entropy": 0.05849685426801443, + "epoch": 9.635272176244317, + "grad_norm": 1.078125, + "learning_rate": 4.622363348160426e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.999852204322815, + "num_tokens": 137213906.0, + "step": 41335 + }, + { + "entropy": 0.06110354913398623, + "epoch": 9.63643781326495, + "grad_norm": 0.52734375, + "learning_rate": 4.622254019632503e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9995953798294067, + "num_tokens": 137242822.0, + "step": 41340 + }, + { + "entropy": 0.045990268047899005, + "epoch": 9.637603450285582, + "grad_norm": 0.1455078125, + "learning_rate": 4.6221446780979594e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999152421951294, + "num_tokens": 137260574.0, + "step": 41345 + }, + { + "entropy": 0.04493403248488903, + "epoch": 9.638769087306212, + "grad_norm": 1.1953125, + "learning_rate": 4.622035323558428e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9997753381729126, + "num_tokens": 137283109.0, + "step": 41350 + }, + { + "entropy": 0.06357506215572357, + "epoch": 9.639934724326844, + "grad_norm": 0.033935546875, + "learning_rate": 4.621925956015538e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 137297805.0, + "step": 41355 + }, + { + "entropy": 0.057191804703325036, + "epoch": 9.641100361347476, + "grad_norm": 0.251953125, + "learning_rate": 4.6218165754709204e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 137321489.0, + "step": 41360 + }, + { + "entropy": 0.05335645116865635, + "epoch": 9.642265998368108, + "grad_norm": 0.212890625, + "learning_rate": 4.621707181926206e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9995932877063751, + "num_tokens": 137338002.0, + "step": 41365 + }, + { + "entropy": 0.05435511860996485, + "epoch": 9.64343163538874, + "grad_norm": 0.1044921875, + "learning_rate": 4.621597775383027e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.999989116191864, + "num_tokens": 137357160.0, + "step": 41370 + }, + { + "entropy": 0.06837129667401314, + "epoch": 9.644597272409372, + "grad_norm": 1.703125, + "learning_rate": 4.6214883558430144e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9996031939983367, + "num_tokens": 137367199.0, + "step": 41375 + }, + { + "entropy": 0.05769171183928847, + "epoch": 9.645762909430003, + "grad_norm": 0.0693359375, + "learning_rate": 4.621378923307799e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 137383857.0, + "step": 41380 + }, + { + "entropy": 0.0587625206913799, + "epoch": 9.646928546450635, + "grad_norm": 0.10986328125, + "learning_rate": 4.6212694777790134e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999234795570373, + "num_tokens": 137402085.0, + "step": 41385 + }, + { + "entropy": 0.06259220317006112, + "epoch": 9.648094183471267, + "grad_norm": 0.02880859375, + "learning_rate": 4.62116001925829e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9997849702835083, + "num_tokens": 137423375.0, + "step": 41390 + }, + { + "entropy": 0.08494818285107612, + "epoch": 9.6492598204919, + "grad_norm": 1.3828125, + "learning_rate": 4.6210505477472596e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999876201152802, + "num_tokens": 137440289.0, + "step": 41395 + }, + { + "entropy": 0.049837575666606425, + "epoch": 9.650425457512531, + "grad_norm": 0.01239013671875, + "learning_rate": 4.620941063247556e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 137453733.0, + "step": 41400 + }, + { + "entropy": 0.054591873474419114, + "epoch": 9.651591094533162, + "grad_norm": 0.13671875, + "learning_rate": 4.620831565760811e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9996463716030121, + "num_tokens": 137480363.0, + "step": 41405 + }, + { + "entropy": 0.05170064996927977, + "epoch": 9.652756731553794, + "grad_norm": 0.76171875, + "learning_rate": 4.6207220552886575e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9996154427528381, + "num_tokens": 137502929.0, + "step": 41410 + }, + { + "entropy": 0.05048687579110265, + "epoch": 9.653922368574426, + "grad_norm": 0.3203125, + "learning_rate": 4.620612531832728e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9997367084026336, + "num_tokens": 137530077.0, + "step": 41415 + }, + { + "entropy": 0.06301561882719398, + "epoch": 9.655088005595058, + "grad_norm": 0.1796875, + "learning_rate": 4.620502995394657e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9993149757385253, + "num_tokens": 137560535.0, + "step": 41420 + }, + { + "entropy": 0.047367416135966775, + "epoch": 9.65625364261569, + "grad_norm": 0.050537109375, + "learning_rate": 4.620393445976077e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9998161792755127, + "num_tokens": 137574452.0, + "step": 41425 + }, + { + "entropy": 0.04114743801765144, + "epoch": 9.657419279636322, + "grad_norm": 0.337890625, + "learning_rate": 4.620283883578621e-05, + "loss": 0.0026, + "mean_token_accuracy": 0.9996399223804474, + "num_tokens": 137607358.0, + "step": 41430 + }, + { + "entropy": 0.05030833915807307, + "epoch": 9.658584916656952, + "grad_norm": 0.134765625, + "learning_rate": 4.620174308203924e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999332249164581, + "num_tokens": 137640131.0, + "step": 41435 + }, + { + "entropy": 0.04978873711079359, + "epoch": 9.659750553677585, + "grad_norm": 0.1708984375, + "learning_rate": 4.620064719853618e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9998363316059112, + "num_tokens": 137665585.0, + "step": 41440 + }, + { + "entropy": 0.09417327288538217, + "epoch": 9.660916190698217, + "grad_norm": 0.0498046875, + "learning_rate": 4.61995511852934e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 137680253.0, + "step": 41445 + }, + { + "entropy": 0.06099643521010876, + "epoch": 9.662081827718849, + "grad_norm": 1.75, + "learning_rate": 4.619845504232722e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 137689510.0, + "step": 41450 + }, + { + "entropy": 0.05705720148980618, + "epoch": 9.66324746473948, + "grad_norm": 0.111328125, + "learning_rate": 4.6197358769653985e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9993055582046508, + "num_tokens": 137703980.0, + "step": 41455 + }, + { + "entropy": 0.06064219465479255, + "epoch": 9.664413101760111, + "grad_norm": 0.201171875, + "learning_rate": 4.619626236729006e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999850571155549, + "num_tokens": 137720798.0, + "step": 41460 + }, + { + "entropy": 0.057933222688734534, + "epoch": 9.665578738780743, + "grad_norm": 0.083984375, + "learning_rate": 4.6195165835251775e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 137734242.0, + "step": 41465 + }, + { + "entropy": 0.10937680806964636, + "epoch": 9.666744375801375, + "grad_norm": 0.09130859375, + "learning_rate": 4.61940691735555e-05, + "loss": 0.0729, + "mean_token_accuracy": 0.990872037410736, + "num_tokens": 137757622.0, + "step": 41470 + }, + { + "entropy": 0.08084865333512425, + "epoch": 9.667910012822007, + "grad_norm": 0.09130859375, + "learning_rate": 4.6192972382217566e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9995614051818847, + "num_tokens": 137778970.0, + "step": 41475 + }, + { + "entropy": 0.0488979984074831, + "epoch": 9.66907564984264, + "grad_norm": 0.06494140625, + "learning_rate": 4.619187546125434e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999493420124054, + "num_tokens": 137806820.0, + "step": 41480 + }, + { + "entropy": 0.05168530084192753, + "epoch": 9.67024128686327, + "grad_norm": 0.91015625, + "learning_rate": 4.619077841068218e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9999542713165284, + "num_tokens": 137834902.0, + "step": 41485 + }, + { + "entropy": 0.07210210133343935, + "epoch": 9.671406923883902, + "grad_norm": 0.0849609375, + "learning_rate": 4.6189681230517446e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 137855260.0, + "step": 41490 + }, + { + "entropy": 0.08489122875034809, + "epoch": 9.672572560904534, + "grad_norm": 0.10595703125, + "learning_rate": 4.618858392077649e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9996070742607117, + "num_tokens": 137865330.0, + "step": 41495 + }, + { + "entropy": 0.046634336933493614, + "epoch": 9.673738197925166, + "grad_norm": 0.056396484375, + "learning_rate": 4.618748648147568e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 137879045.0, + "step": 41500 + }, + { + "entropy": 0.05105196684598923, + "epoch": 9.674903834945798, + "grad_norm": 0.154296875, + "learning_rate": 4.6186388912631376e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999715268611908, + "num_tokens": 137902675.0, + "step": 41505 + }, + { + "entropy": 0.05192754324525595, + "epoch": 9.67606947196643, + "grad_norm": 0.380859375, + "learning_rate": 4.6185291214259954e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999767959117889, + "num_tokens": 137931187.0, + "step": 41510 + }, + { + "entropy": 0.0527059281244874, + "epoch": 9.67723510898706, + "grad_norm": 0.1005859375, + "learning_rate": 4.618419338637778e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9996753036975861, + "num_tokens": 137955839.0, + "step": 41515 + }, + { + "entropy": 0.06725910743698478, + "epoch": 9.678400746007693, + "grad_norm": 2.859375, + "learning_rate": 4.618309542900122e-05, + "loss": 0.0075, + "mean_token_accuracy": 0.9991827428340911, + "num_tokens": 137973813.0, + "step": 41520 + }, + { + "entropy": 0.06493516713380813, + "epoch": 9.679566383028325, + "grad_norm": 0.0625, + "learning_rate": 4.618199734214664e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9998061001300812, + "num_tokens": 137992578.0, + "step": 41525 + }, + { + "entropy": 0.07467560321092606, + "epoch": 9.680732020048957, + "grad_norm": 0.041748046875, + "learning_rate": 4.6180899125830425e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9997429311275482, + "num_tokens": 138002434.0, + "step": 41530 + }, + { + "entropy": 0.04612938100472093, + "epoch": 9.68189765706959, + "grad_norm": 0.04150390625, + "learning_rate": 4.617980078006894e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 138022320.0, + "step": 41535 + }, + { + "entropy": 0.05380245940759778, + "epoch": 9.68306329409022, + "grad_norm": 0.03515625, + "learning_rate": 4.617870230487858e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9998850584030151, + "num_tokens": 138042476.0, + "step": 41540 + }, + { + "entropy": 0.04941651728004217, + "epoch": 9.684228931110852, + "grad_norm": 0.337890625, + "learning_rate": 4.617760370027571e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9998548626899719, + "num_tokens": 138066165.0, + "step": 41545 + }, + { + "entropy": 0.07211229205131531, + "epoch": 9.685394568131484, + "grad_norm": 0.1162109375, + "learning_rate": 4.6176504966276726e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 138075102.0, + "step": 41550 + }, + { + "entropy": 0.048767372686415913, + "epoch": 9.686560205152116, + "grad_norm": 0.06982421875, + "learning_rate": 4.617540610289799e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 138090963.0, + "step": 41555 + }, + { + "entropy": 0.07153227487578988, + "epoch": 9.687725842172748, + "grad_norm": 0.390625, + "learning_rate": 4.6174307110155905e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9998355269432068, + "num_tokens": 138108155.0, + "step": 41560 + }, + { + "entropy": 0.04240287458524108, + "epoch": 9.68889147919338, + "grad_norm": 0.1123046875, + "learning_rate": 4.617320798806686e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9999687135219574, + "num_tokens": 138144713.0, + "step": 41565 + }, + { + "entropy": 0.0623874238692224, + "epoch": 9.69005711621401, + "grad_norm": 0.04833984375, + "learning_rate": 4.6172108736647234e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9997009575366974, + "num_tokens": 138164675.0, + "step": 41570 + }, + { + "entropy": 0.055486177653074266, + "epoch": 9.691222753234642, + "grad_norm": 0.0286865234375, + "learning_rate": 4.617100935591342e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9997545659542084, + "num_tokens": 138188695.0, + "step": 41575 + }, + { + "entropy": 0.045479421876370905, + "epoch": 9.692388390255275, + "grad_norm": 0.10107421875, + "learning_rate": 4.616990984588182e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 138203879.0, + "step": 41580 + }, + { + "entropy": 0.053182793455198406, + "epoch": 9.693554027275907, + "grad_norm": 0.26171875, + "learning_rate": 4.6168810206568816e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 138227419.0, + "step": 41585 + }, + { + "entropy": 0.04395967880263925, + "epoch": 9.694719664296539, + "grad_norm": 0.2392578125, + "learning_rate": 4.6167710437990826e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999681532382965, + "num_tokens": 138253694.0, + "step": 41590 + }, + { + "entropy": 0.05410387422889471, + "epoch": 9.695885301317169, + "grad_norm": 0.181640625, + "learning_rate": 4.616661054016422e-05, + "loss": 0.001, + "mean_token_accuracy": 0.999914425611496, + "num_tokens": 138290391.0, + "step": 41595 + }, + { + "entropy": 0.059342716634273526, + "epoch": 9.697050938337801, + "grad_norm": 0.06396484375, + "learning_rate": 4.616551051310543e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 138303049.0, + "step": 41600 + }, + { + "entropy": 0.05551241189241409, + "epoch": 9.698216575358433, + "grad_norm": 0.298828125, + "learning_rate": 4.616441035683084e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9993800580501556, + "num_tokens": 138331547.0, + "step": 41605 + }, + { + "entropy": 0.07022674959152937, + "epoch": 9.699382212379065, + "grad_norm": 0.1748046875, + "learning_rate": 4.616331007135686e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9995691180229187, + "num_tokens": 138352702.0, + "step": 41610 + }, + { + "entropy": 0.07191054821014405, + "epoch": 9.700547849399697, + "grad_norm": 0.236328125, + "learning_rate": 4.6162209656699895e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9997867822647095, + "num_tokens": 138362706.0, + "step": 41615 + }, + { + "entropy": 0.06880695289000868, + "epoch": 9.701713486420328, + "grad_norm": 0.134765625, + "learning_rate": 4.6161109112876355e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999779343605042, + "num_tokens": 138381870.0, + "step": 41620 + }, + { + "entropy": 0.05174048515036702, + "epoch": 9.70287912344096, + "grad_norm": 0.1416015625, + "learning_rate": 4.616000843990265e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999647498130798, + "num_tokens": 138413997.0, + "step": 41625 + }, + { + "entropy": 0.04192606620490551, + "epoch": 9.704044760461592, + "grad_norm": 0.0260009765625, + "learning_rate": 4.61589076377952e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9997602283954621, + "num_tokens": 138450577.0, + "step": 41630 + }, + { + "entropy": 0.06087205857038498, + "epoch": 9.705210397482224, + "grad_norm": 0.275390625, + "learning_rate": 4.6157806706570406e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9993548393249512, + "num_tokens": 138462445.0, + "step": 41635 + }, + { + "entropy": 0.06275629969313741, + "epoch": 9.706376034502856, + "grad_norm": 0.177734375, + "learning_rate": 4.6156705646244694e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999878585338593, + "num_tokens": 138480334.0, + "step": 41640 + }, + { + "entropy": 0.0454404903575778, + "epoch": 9.707541671523488, + "grad_norm": 2.1875, + "learning_rate": 4.615560445683448e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9993854582309722, + "num_tokens": 138503994.0, + "step": 41645 + }, + { + "entropy": 0.06599656357429921, + "epoch": 9.708707308544119, + "grad_norm": 0.0308837890625, + "learning_rate": 4.6154503138356186e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999789416790008, + "num_tokens": 138523903.0, + "step": 41650 + }, + { + "entropy": 0.047876712214201686, + "epoch": 9.70987294556475, + "grad_norm": 0.185546875, + "learning_rate": 4.6153401690826235e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999779760837555, + "num_tokens": 138545309.0, + "step": 41655 + }, + { + "entropy": 0.05285617532208562, + "epoch": 9.711038582585383, + "grad_norm": 0.224609375, + "learning_rate": 4.615230011426105e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999444663524628, + "num_tokens": 138568014.0, + "step": 41660 + }, + { + "entropy": 0.06128771533258259, + "epoch": 9.712204219606015, + "grad_norm": 0.125, + "learning_rate": 4.615119840867705e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9996610164642334, + "num_tokens": 138587152.0, + "step": 41665 + }, + { + "entropy": 0.05335593800991774, + "epoch": 9.713369856626647, + "grad_norm": 0.058837890625, + "learning_rate": 4.6150096574090674e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999667584896088, + "num_tokens": 138621828.0, + "step": 41670 + }, + { + "entropy": 0.05973550733178854, + "epoch": 9.714535493647277, + "grad_norm": 1.7890625, + "learning_rate": 4.614899461051835e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9995299935340881, + "num_tokens": 138639928.0, + "step": 41675 + }, + { + "entropy": 0.06601793747395276, + "epoch": 9.71570113066791, + "grad_norm": 2.21875, + "learning_rate": 4.614789251797651e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9994257688522339, + "num_tokens": 138660350.0, + "step": 41680 + }, + { + "entropy": 0.04857506472617388, + "epoch": 9.716866767688542, + "grad_norm": 0.07080078125, + "learning_rate": 4.614679029648157e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9991885244846344, + "num_tokens": 138687417.0, + "step": 41685 + }, + { + "entropy": 0.04925138596445322, + "epoch": 9.718032404709174, + "grad_norm": 0.0198974609375, + "learning_rate": 4.614568794604999e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 138712767.0, + "step": 41690 + }, + { + "entropy": 0.049935361836105585, + "epoch": 9.719198041729806, + "grad_norm": 0.2099609375, + "learning_rate": 4.6144585466698204e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9995476484298706, + "num_tokens": 138739256.0, + "step": 41695 + }, + { + "entropy": 0.06216288320720196, + "epoch": 9.720363678750438, + "grad_norm": 0.04150390625, + "learning_rate": 4.6143482858442644e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9996389865875244, + "num_tokens": 138755595.0, + "step": 41700 + }, + { + "entropy": 0.03968005385249853, + "epoch": 9.721529315771068, + "grad_norm": 0.193359375, + "learning_rate": 4.614238012129975e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9995426118373871, + "num_tokens": 138792739.0, + "step": 41705 + }, + { + "entropy": 0.06131018102169037, + "epoch": 9.7226949527917, + "grad_norm": 0.10595703125, + "learning_rate": 4.6141277255285974e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 138803123.0, + "step": 41710 + }, + { + "entropy": 0.05428974824026227, + "epoch": 9.723860589812332, + "grad_norm": 0.953125, + "learning_rate": 4.614017426041776e-05, + "loss": 0.0009, + "mean_token_accuracy": 1.0, + "num_tokens": 138818913.0, + "step": 41715 + }, + { + "entropy": 0.07367751612327993, + "epoch": 9.725026226832965, + "grad_norm": 0.099609375, + "learning_rate": 4.6139071136711545e-05, + "loss": 0.0241, + "mean_token_accuracy": 0.9963607609272003, + "num_tokens": 138849574.0, + "step": 41720 + }, + { + "entropy": 0.05248088203370571, + "epoch": 9.726191863853597, + "grad_norm": 0.07177734375, + "learning_rate": 4.6137967884183786e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999784231185913, + "num_tokens": 138869838.0, + "step": 41725 + }, + { + "entropy": 0.052217611204832794, + "epoch": 9.727357500874227, + "grad_norm": 0.029296875, + "learning_rate": 4.613686450285094e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999537229537964, + "num_tokens": 138887125.0, + "step": 41730 + }, + { + "entropy": 0.058260629698634145, + "epoch": 9.72852313789486, + "grad_norm": 0.08544921875, + "learning_rate": 4.613576099272944e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9992043793201446, + "num_tokens": 138909178.0, + "step": 41735 + }, + { + "entropy": 0.07376976646482944, + "epoch": 9.729688774915491, + "grad_norm": 0.06787109375, + "learning_rate": 4.613465735383576e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9981070458889008, + "num_tokens": 138917466.0, + "step": 41740 + }, + { + "entropy": 0.07478579832240939, + "epoch": 9.730854411936123, + "grad_norm": 0.022216796875, + "learning_rate": 4.613355358618636e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 138934280.0, + "step": 41745 + }, + { + "entropy": 0.04984639883041382, + "epoch": 9.732020048956755, + "grad_norm": 0.0966796875, + "learning_rate": 4.6132449689797686e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999508142471314, + "num_tokens": 138965992.0, + "step": 41750 + }, + { + "entropy": 0.05700819352641702, + "epoch": 9.733185685977386, + "grad_norm": 0.76171875, + "learning_rate": 4.61313456646862e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9998159170150757, + "num_tokens": 138982514.0, + "step": 41755 + }, + { + "entropy": 0.05794349061325192, + "epoch": 9.734351322998018, + "grad_norm": 0.72265625, + "learning_rate": 4.613024151086838e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9998278856277466, + "num_tokens": 138999275.0, + "step": 41760 + }, + { + "entropy": 0.06126205716282129, + "epoch": 9.73551696001865, + "grad_norm": 0.06298828125, + "learning_rate": 4.6129137228360665e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9998766958713532, + "num_tokens": 139012755.0, + "step": 41765 + }, + { + "entropy": 0.06369129400700331, + "epoch": 9.736682597039282, + "grad_norm": 0.0400390625, + "learning_rate": 4.612803281717954e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9996539771556854, + "num_tokens": 139023890.0, + "step": 41770 + }, + { + "entropy": 0.055011258088052274, + "epoch": 9.737848234059914, + "grad_norm": 0.08642578125, + "learning_rate": 4.612692827734146e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 139036019.0, + "step": 41775 + }, + { + "entropy": 0.06937271989881992, + "epoch": 9.739013871080546, + "grad_norm": 0.240234375, + "learning_rate": 4.6125823608862916e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999895572662354, + "num_tokens": 139055200.0, + "step": 41780 + }, + { + "entropy": 0.051238779630512, + "epoch": 9.740179508101177, + "grad_norm": 0.08154296875, + "learning_rate": 4.6124718811760366e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9998046875, + "num_tokens": 139076963.0, + "step": 41785 + }, + { + "entropy": 0.050241039227694274, + "epoch": 9.741345145121809, + "grad_norm": 0.115234375, + "learning_rate": 4.612361388605028e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9995464980602264, + "num_tokens": 139102393.0, + "step": 41790 + }, + { + "entropy": 0.050885317660868165, + "epoch": 9.74251078214244, + "grad_norm": 0.1328125, + "learning_rate": 4.6122508831749145e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9997395157814026, + "num_tokens": 139122773.0, + "step": 41795 + }, + { + "entropy": 0.04559088721871376, + "epoch": 9.743676419163073, + "grad_norm": 0.482421875, + "learning_rate": 4.6121403648873435e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999362051486969, + "num_tokens": 139164920.0, + "step": 41800 + }, + { + "entropy": 0.058302114717662334, + "epoch": 9.744842056183705, + "grad_norm": 0.48828125, + "learning_rate": 4.6120298337439624e-05, + "loss": 0.0007, + "mean_token_accuracy": 1.0, + "num_tokens": 139181052.0, + "step": 41805 + }, + { + "entropy": 0.0644199687987566, + "epoch": 9.746007693204335, + "grad_norm": 0.1591796875, + "learning_rate": 4.61191928974642e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9990922808647156, + "num_tokens": 139202201.0, + "step": 41810 + }, + { + "entropy": 0.04844171958975494, + "epoch": 9.747173330224967, + "grad_norm": 0.072265625, + "learning_rate": 4.6118087328963655e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9993967711925507, + "num_tokens": 139233615.0, + "step": 41815 + }, + { + "entropy": 0.04493777519091964, + "epoch": 9.7483389672456, + "grad_norm": 0.061767578125, + "learning_rate": 4.611698163195446e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9998775362968445, + "num_tokens": 139256815.0, + "step": 41820 + }, + { + "entropy": 0.05382460365071893, + "epoch": 9.749504604266232, + "grad_norm": 0.11767578125, + "learning_rate": 4.6115875806453105e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999469757080078, + "num_tokens": 139277285.0, + "step": 41825 + }, + { + "entropy": 0.045704776979982854, + "epoch": 9.750670241286864, + "grad_norm": 0.10009765625, + "learning_rate": 4.6114769852476086e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9996791303157806, + "num_tokens": 139309332.0, + "step": 41830 + }, + { + "entropy": 0.04757586750201881, + "epoch": 9.751835878307496, + "grad_norm": 0.10009765625, + "learning_rate": 4.6113663770039886e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999895751476288, + "num_tokens": 139333860.0, + "step": 41835 + }, + { + "entropy": 0.053784340433776376, + "epoch": 9.753001515328126, + "grad_norm": 0.1279296875, + "learning_rate": 4.611255755916101e-05, + "loss": 0.009, + "mean_token_accuracy": 0.9990944027900696, + "num_tokens": 139357575.0, + "step": 41840 + }, + { + "entropy": 0.0499279048293829, + "epoch": 9.754167152348758, + "grad_norm": 0.0211181640625, + "learning_rate": 4.6111451219855946e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999744951725006, + "num_tokens": 139371869.0, + "step": 41845 + }, + { + "entropy": 0.06379685923457146, + "epoch": 9.75533278936939, + "grad_norm": 0.2197265625, + "learning_rate": 4.611034475214119e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9996087431907654, + "num_tokens": 139391127.0, + "step": 41850 + }, + { + "entropy": 0.04493435020558536, + "epoch": 9.756498426390023, + "grad_norm": 0.0224609375, + "learning_rate": 4.6109238156033247e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999648272991181, + "num_tokens": 139419874.0, + "step": 41855 + }, + { + "entropy": 0.051786020305007695, + "epoch": 9.757664063410655, + "grad_norm": 0.6796875, + "learning_rate": 4.610813143154861e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9998482525348663, + "num_tokens": 139435046.0, + "step": 41860 + }, + { + "entropy": 0.060211922880262135, + "epoch": 9.758829700431285, + "grad_norm": 0.1474609375, + "learning_rate": 4.610702457870378e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999869108200073, + "num_tokens": 139453747.0, + "step": 41865 + }, + { + "entropy": 0.06323374919593334, + "epoch": 9.759995337451917, + "grad_norm": 0.1005859375, + "learning_rate": 4.610591759751528e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.999958622455597, + "num_tokens": 139475495.0, + "step": 41870 + }, + { + "entropy": 0.05418466171249747, + "epoch": 9.76116097447255, + "grad_norm": 0.0537109375, + "learning_rate": 4.61048104879996e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.999826854467392, + "num_tokens": 139508996.0, + "step": 41875 + }, + { + "entropy": 0.049932948080822825, + "epoch": 9.762326611493181, + "grad_norm": 0.061279296875, + "learning_rate": 4.610370325017325e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9995410740375519, + "num_tokens": 139537999.0, + "step": 41880 + }, + { + "entropy": 0.053233534656465055, + "epoch": 9.763492248513813, + "grad_norm": 0.0869140625, + "learning_rate": 4.610259588405275e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 139553111.0, + "step": 41885 + }, + { + "entropy": 0.05464749345555901, + "epoch": 9.764657885534444, + "grad_norm": 0.1513671875, + "learning_rate": 4.61014883896546e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9997717440128326, + "num_tokens": 139572733.0, + "step": 41890 + }, + { + "entropy": 0.06824347507208586, + "epoch": 9.765823522555076, + "grad_norm": 1.4609375, + "learning_rate": 4.610038076699532e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9993533074855805, + "num_tokens": 139581909.0, + "step": 41895 + }, + { + "entropy": 0.06773032071068882, + "epoch": 9.766989159575708, + "grad_norm": 0.1015625, + "learning_rate": 4.609927301609143e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 139602961.0, + "step": 41900 + }, + { + "entropy": 0.06699398197233677, + "epoch": 9.76815479659634, + "grad_norm": 0.1474609375, + "learning_rate": 4.609816513695945e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999671339988708, + "num_tokens": 139625934.0, + "step": 41905 + }, + { + "entropy": 0.06858031582087279, + "epoch": 9.769320433616972, + "grad_norm": 0.12890625, + "learning_rate": 4.609705712961589e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9996108949184418, + "num_tokens": 139635226.0, + "step": 41910 + }, + { + "entropy": 0.07053380329161882, + "epoch": 9.770486070637604, + "grad_norm": 2.640625, + "learning_rate": 4.6095948994077276e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9996874988079071, + "num_tokens": 139647792.0, + "step": 41915 + }, + { + "entropy": 0.062039784714579584, + "epoch": 9.771651707658235, + "grad_norm": 0.310546875, + "learning_rate": 4.6094840730360136e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 139661792.0, + "step": 41920 + }, + { + "entropy": 0.05674058310687542, + "epoch": 9.772817344678867, + "grad_norm": 0.17578125, + "learning_rate": 4.6093732338480996e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9998069524765014, + "num_tokens": 139673558.0, + "step": 41925 + }, + { + "entropy": 0.057136714551597835, + "epoch": 9.773982981699499, + "grad_norm": 0.06787109375, + "learning_rate": 4.6092623818456376e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9996316730976105, + "num_tokens": 139693234.0, + "step": 41930 + }, + { + "entropy": 0.056234538089483976, + "epoch": 9.77514861872013, + "grad_norm": 0.23046875, + "learning_rate": 4.609151517030281e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999083876609802, + "num_tokens": 139738061.0, + "step": 41935 + }, + { + "entropy": 0.10356351681984961, + "epoch": 9.776314255740763, + "grad_norm": 0.1728515625, + "learning_rate": 4.609040639403684e-05, + "loss": 0.0942, + "mean_token_accuracy": 0.9904897749423981, + "num_tokens": 139771667.0, + "step": 41940 + }, + { + "entropy": 0.049145910609513524, + "epoch": 9.777479892761393, + "grad_norm": 0.318359375, + "learning_rate": 4.608929748967498e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999116659164429, + "num_tokens": 139797587.0, + "step": 41945 + }, + { + "entropy": 0.04926884109154343, + "epoch": 9.778645529782025, + "grad_norm": 0.1357421875, + "learning_rate": 4.608818845723378e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9998193085193634, + "num_tokens": 139823734.0, + "step": 41950 + }, + { + "entropy": 0.06200911607593298, + "epoch": 9.779811166802657, + "grad_norm": 0.10888671875, + "learning_rate": 4.6087079296729774e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.999501520395279, + "num_tokens": 139843061.0, + "step": 41955 + }, + { + "entropy": 0.06600505784153939, + "epoch": 9.78097680382329, + "grad_norm": 0.44140625, + "learning_rate": 4.6085970008179496e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999466836452484, + "num_tokens": 139861857.0, + "step": 41960 + }, + { + "entropy": 0.05788415623828769, + "epoch": 9.782142440843922, + "grad_norm": 0.06396484375, + "learning_rate": 4.6084860591599497e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9997093617916107, + "num_tokens": 139881212.0, + "step": 41965 + }, + { + "entropy": 0.04697915324941278, + "epoch": 9.783308077864554, + "grad_norm": 0.1025390625, + "learning_rate": 4.608375104700631e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999893486499787, + "num_tokens": 139911891.0, + "step": 41970 + }, + { + "entropy": 0.051614006608724596, + "epoch": 9.784473714885184, + "grad_norm": 0.0272216796875, + "learning_rate": 4.608264137441648e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9994884729385376, + "num_tokens": 139935765.0, + "step": 41975 + }, + { + "entropy": 0.06123457215726376, + "epoch": 9.785639351905816, + "grad_norm": 0.3828125, + "learning_rate": 4.608153157384657e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9993305623531341, + "num_tokens": 139953020.0, + "step": 41980 + }, + { + "entropy": 0.049324381072074173, + "epoch": 9.786804988926448, + "grad_norm": 0.1025390625, + "learning_rate": 4.6080421645313106e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 139976015.0, + "step": 41985 + }, + { + "entropy": 0.06161579126492143, + "epoch": 9.78797062594708, + "grad_norm": 3.046875, + "learning_rate": 4.6079311588832654e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9989308536052703, + "num_tokens": 139988700.0, + "step": 41990 + }, + { + "entropy": 0.06346913799643517, + "epoch": 9.789136262967713, + "grad_norm": 3.4375, + "learning_rate": 4.6078201404421754e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9995437502861023, + "num_tokens": 140002194.0, + "step": 41995 + }, + { + "entropy": 0.07399707436561584, + "epoch": 9.790301899988343, + "grad_norm": 1.15625, + "learning_rate": 4.607709109209698e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9995833337306976, + "num_tokens": 140015771.0, + "step": 42000 + }, + { + "entropy": 0.056039393041282894, + "epoch": 9.791467537008975, + "grad_norm": 0.014404296875, + "learning_rate": 4.607598065187487e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999895513057708, + "num_tokens": 140038244.0, + "step": 42005 + }, + { + "entropy": 0.044340645615011456, + "epoch": 9.792633174029607, + "grad_norm": 0.20703125, + "learning_rate": 4.607487008377199e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9995433449745178, + "num_tokens": 140056539.0, + "step": 42010 + }, + { + "entropy": 0.06757498234510421, + "epoch": 9.79379881105024, + "grad_norm": 0.87109375, + "learning_rate": 4.607375938780491e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9990313827991486, + "num_tokens": 140074674.0, + "step": 42015 + }, + { + "entropy": 0.04378555864095688, + "epoch": 9.794964448070871, + "grad_norm": 0.103515625, + "learning_rate": 4.6072648563990174e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 140090614.0, + "step": 42020 + }, + { + "entropy": 0.047512101009488106, + "epoch": 9.796130085091502, + "grad_norm": 0.50390625, + "learning_rate": 4.607153761234435e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.999925297498703, + "num_tokens": 140115414.0, + "step": 42025 + }, + { + "entropy": 0.046311000688001515, + "epoch": 9.797295722112134, + "grad_norm": 0.1923828125, + "learning_rate": 4.6070426532884016e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999650657176972, + "num_tokens": 140144210.0, + "step": 42030 + }, + { + "entropy": 0.05368498945608735, + "epoch": 9.798461359132766, + "grad_norm": 0.384765625, + "learning_rate": 4.606931532562572e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9998359262943268, + "num_tokens": 140170962.0, + "step": 42035 + }, + { + "entropy": 0.05441671833395958, + "epoch": 9.799626996153398, + "grad_norm": 0.59375, + "learning_rate": 4.6068203990586054e-05, + "loss": 0.0044, + "mean_token_accuracy": 0.9996001899242402, + "num_tokens": 140212748.0, + "step": 42040 + }, + { + "entropy": 0.060715678706765176, + "epoch": 9.80079263317403, + "grad_norm": 1.3046875, + "learning_rate": 4.606709252778158e-05, + "loss": 0.0033, + "mean_token_accuracy": 0.9981433093547821, + "num_tokens": 140223455.0, + "step": 42045 + }, + { + "entropy": 0.06080267839133739, + "epoch": 9.801958270194662, + "grad_norm": 0.043212890625, + "learning_rate": 4.6065980937228865e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.999952882528305, + "num_tokens": 140256426.0, + "step": 42050 + }, + { + "entropy": 0.0371818239800632, + "epoch": 9.803123907215292, + "grad_norm": 0.06103515625, + "learning_rate": 4.6064869218944487e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999569833278656, + "num_tokens": 140295217.0, + "step": 42055 + }, + { + "entropy": 0.0610663041472435, + "epoch": 9.804289544235925, + "grad_norm": 0.640625, + "learning_rate": 4.6063757372945036e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9996368050575256, + "num_tokens": 140318166.0, + "step": 42060 + }, + { + "entropy": 0.060305179469287394, + "epoch": 9.805455181256557, + "grad_norm": 0.07568359375, + "learning_rate": 4.606264539924708e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9997816622257233, + "num_tokens": 140329547.0, + "step": 42065 + }, + { + "entropy": 0.06976679842919112, + "epoch": 9.806620818277189, + "grad_norm": 0.060302734375, + "learning_rate": 4.6061533297867196e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 140342013.0, + "step": 42070 + }, + { + "entropy": 0.07246024245396257, + "epoch": 9.807786455297821, + "grad_norm": 0.3203125, + "learning_rate": 4.606042106882198e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 140357969.0, + "step": 42075 + }, + { + "entropy": 0.05506151458248496, + "epoch": 9.808952092318451, + "grad_norm": 0.09033203125, + "learning_rate": 4.6059308712128005e-05, + "loss": 0.0008, + "mean_token_accuracy": 1.0, + "num_tokens": 140381723.0, + "step": 42080 + }, + { + "entropy": 0.058845835737884045, + "epoch": 9.810117729339083, + "grad_norm": 0.208984375, + "learning_rate": 4.605819622780187e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999732673168182, + "num_tokens": 140406918.0, + "step": 42085 + }, + { + "entropy": 0.053271705750375986, + "epoch": 9.811283366359715, + "grad_norm": 1.546875, + "learning_rate": 4.605708361586015e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9994361996650696, + "num_tokens": 140428351.0, + "step": 42090 + }, + { + "entropy": 0.06628831773996353, + "epoch": 9.812449003380348, + "grad_norm": 1.71875, + "learning_rate": 4.605597087631945e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9993700981140137, + "num_tokens": 140440728.0, + "step": 42095 + }, + { + "entropy": 0.04875661414116621, + "epoch": 9.81361464040098, + "grad_norm": 0.09619140625, + "learning_rate": 4.605485800919635e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9992801308631897, + "num_tokens": 140466856.0, + "step": 42100 + }, + { + "entropy": 0.10183219909667969, + "epoch": 9.814780277421612, + "grad_norm": 2.03125, + "learning_rate": 4.605374501450745e-05, + "loss": 0.1021, + "mean_token_accuracy": 0.9827478647232055, + "num_tokens": 140493294.0, + "step": 42105 + }, + { + "entropy": 0.05042801667004824, + "epoch": 9.815945914442242, + "grad_norm": 0.06982421875, + "learning_rate": 4.605263189226935e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9998808383941651, + "num_tokens": 140527597.0, + "step": 42110 + }, + { + "entropy": 0.05014429837465286, + "epoch": 9.817111551462874, + "grad_norm": 0.1396484375, + "learning_rate": 4.6051518642498635e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9996078431606292, + "num_tokens": 140545375.0, + "step": 42115 + }, + { + "entropy": 0.07266998011618853, + "epoch": 9.818277188483506, + "grad_norm": 0.1904296875, + "learning_rate": 4.605040526521193e-05, + "loss": 0.004, + "mean_token_accuracy": 0.9994434773921966, + "num_tokens": 140575461.0, + "step": 42120 + }, + { + "entropy": 0.04460279541090131, + "epoch": 9.819442825504138, + "grad_norm": 0.048583984375, + "learning_rate": 4.604929176042581e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 140600307.0, + "step": 42125 + }, + { + "entropy": 0.06226409394294023, + "epoch": 9.82060846252477, + "grad_norm": 0.3671875, + "learning_rate": 4.6048178128156895e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999888122081757, + "num_tokens": 140621568.0, + "step": 42130 + }, + { + "entropy": 0.06694214371964335, + "epoch": 9.8217740995454, + "grad_norm": 0.154296875, + "learning_rate": 4.604706436842178e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999789297580719, + "num_tokens": 140647240.0, + "step": 42135 + }, + { + "entropy": 0.06501425355672837, + "epoch": 9.822939736566033, + "grad_norm": 0.271484375, + "learning_rate": 4.604595048123709e-05, + "loss": 0.0048, + "mean_token_accuracy": 0.9990079879760743, + "num_tokens": 140655241.0, + "step": 42140 + }, + { + "entropy": 0.05788929592818022, + "epoch": 9.824105373586665, + "grad_norm": 0.2255859375, + "learning_rate": 4.604483646661942e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9995981812477112, + "num_tokens": 140668117.0, + "step": 42145 + }, + { + "entropy": 0.05733644086867571, + "epoch": 9.825271010607297, + "grad_norm": 0.0673828125, + "learning_rate": 4.6043722324585385e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9997166275978089, + "num_tokens": 140693790.0, + "step": 42150 + }, + { + "entropy": 0.0445280559360981, + "epoch": 9.82643664762793, + "grad_norm": 0.3515625, + "learning_rate": 4.60426080551516e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9996988236904144, + "num_tokens": 140717437.0, + "step": 42155 + }, + { + "entropy": 0.05390657912939787, + "epoch": 9.82760228464856, + "grad_norm": 0.322265625, + "learning_rate": 4.6041493658334685e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9997225880622864, + "num_tokens": 140731334.0, + "step": 42160 + }, + { + "entropy": 0.057373590767383575, + "epoch": 9.828767921669192, + "grad_norm": 0.1435546875, + "learning_rate": 4.6040379134151245e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999842882156372, + "num_tokens": 140759282.0, + "step": 42165 + }, + { + "entropy": 0.0680330197326839, + "epoch": 9.829933558689824, + "grad_norm": 0.51171875, + "learning_rate": 4.603926448261791e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9999366581439972, + "num_tokens": 140778046.0, + "step": 42170 + }, + { + "entropy": 0.044882729463279245, + "epoch": 9.831099195710456, + "grad_norm": 0.138671875, + "learning_rate": 4.6038149703751296e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9997124314308167, + "num_tokens": 140798807.0, + "step": 42175 + }, + { + "entropy": 0.05645276643335819, + "epoch": 9.832264832731088, + "grad_norm": 0.2490234375, + "learning_rate": 4.603703479756803e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 140809550.0, + "step": 42180 + }, + { + "entropy": 0.07319327550940216, + "epoch": 9.83343046975172, + "grad_norm": 0.03369140625, + "learning_rate": 4.603591976408474e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 140833990.0, + "step": 42185 + }, + { + "entropy": 0.050815807469189164, + "epoch": 9.83459610677235, + "grad_norm": 0.12890625, + "learning_rate": 4.603480460331804e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999539196491242, + "num_tokens": 140866343.0, + "step": 42190 + }, + { + "entropy": 0.04648887384682894, + "epoch": 9.835761743792983, + "grad_norm": 1.3203125, + "learning_rate": 4.603368931528457e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9995949506759644, + "num_tokens": 140888272.0, + "step": 42195 + }, + { + "entropy": 0.0669131524860859, + "epoch": 9.836927380813615, + "grad_norm": 1.4140625, + "learning_rate": 4.6032573900000956e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9996757328510284, + "num_tokens": 140900672.0, + "step": 42200 + }, + { + "entropy": 0.05158605314791202, + "epoch": 9.838093017834247, + "grad_norm": 0.181640625, + "learning_rate": 4.603145835748383e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999863147735596, + "num_tokens": 140921895.0, + "step": 42205 + }, + { + "entropy": 0.05032551847398281, + "epoch": 9.839258654854879, + "grad_norm": 0.041015625, + "learning_rate": 4.6030342687749825e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999283790588379, + "num_tokens": 140948592.0, + "step": 42210 + }, + { + "entropy": 0.03406341969966888, + "epoch": 9.84042429187551, + "grad_norm": 0.62890625, + "learning_rate": 4.602922689081559e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9996773183345795, + "num_tokens": 140977425.0, + "step": 42215 + }, + { + "entropy": 0.05516029987484217, + "epoch": 9.841589928896141, + "grad_norm": 0.6640625, + "learning_rate": 4.6028110966697745e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999792456626893, + "num_tokens": 141008850.0, + "step": 42220 + }, + { + "entropy": 0.0569972931407392, + "epoch": 9.842755565916773, + "grad_norm": 0.134765625, + "learning_rate": 4.602699491541294e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999261140823364, + "num_tokens": 141043992.0, + "step": 42225 + }, + { + "entropy": 0.08039007391780614, + "epoch": 9.843921202937405, + "grad_norm": 0.7421875, + "learning_rate": 4.6025878736977817e-05, + "loss": 0.0007, + "mean_token_accuracy": 1.0, + "num_tokens": 141055054.0, + "step": 42230 + }, + { + "entropy": 0.06913083251565695, + "epoch": 9.845086839958038, + "grad_norm": 0.07080078125, + "learning_rate": 4.602476243140902e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999841928482056, + "num_tokens": 141070624.0, + "step": 42235 + }, + { + "entropy": 0.07660244479775428, + "epoch": 9.84625247697867, + "grad_norm": 0.19140625, + "learning_rate": 4.602364599872319e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999557554721832, + "num_tokens": 141088148.0, + "step": 42240 + }, + { + "entropy": 0.06507962569594383, + "epoch": 9.8474181139993, + "grad_norm": 0.037353515625, + "learning_rate": 4.602252943893698e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9996131360530853, + "num_tokens": 141099636.0, + "step": 42245 + }, + { + "entropy": 0.09644481688737869, + "epoch": 9.848583751019932, + "grad_norm": 0.275390625, + "learning_rate": 4.602141275206704e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 141108319.0, + "step": 42250 + }, + { + "entropy": 0.06280481992289424, + "epoch": 9.849749388040564, + "grad_norm": 0.0303955078125, + "learning_rate": 4.602029593813001e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999892771244049, + "num_tokens": 141129078.0, + "step": 42255 + }, + { + "entropy": 0.05322151854634285, + "epoch": 9.850915025061196, + "grad_norm": 0.578125, + "learning_rate": 4.601917899714256e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9996376812458039, + "num_tokens": 141139131.0, + "step": 42260 + }, + { + "entropy": 0.05685127004981041, + "epoch": 9.852080662081828, + "grad_norm": 0.076171875, + "learning_rate": 4.6018061929121335e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999690473079681, + "num_tokens": 141163027.0, + "step": 42265 + }, + { + "entropy": 0.04338296307250857, + "epoch": 9.853246299102459, + "grad_norm": 0.142578125, + "learning_rate": 4.6016944734083e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999771177768707, + "num_tokens": 141185371.0, + "step": 42270 + }, + { + "entropy": 0.0789628304541111, + "epoch": 9.85441193612309, + "grad_norm": 0.051513671875, + "learning_rate": 4.60158274120442e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9997252762317658, + "num_tokens": 141197892.0, + "step": 42275 + }, + { + "entropy": 0.05556338932365179, + "epoch": 9.855577573143723, + "grad_norm": 0.050537109375, + "learning_rate": 4.601470996302161e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9993739485740661, + "num_tokens": 141209692.0, + "step": 42280 + }, + { + "entropy": 0.06027071522548795, + "epoch": 9.856743210164355, + "grad_norm": 0.79296875, + "learning_rate": 4.601359238703188e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999654769897461, + "num_tokens": 141240991.0, + "step": 42285 + }, + { + "entropy": 0.052701130695641044, + "epoch": 9.857908847184987, + "grad_norm": 0.169921875, + "learning_rate": 4.601247468409169e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999734103679657, + "num_tokens": 141261119.0, + "step": 42290 + }, + { + "entropy": 0.050172841548919676, + "epoch": 9.859074484205617, + "grad_norm": 0.045654296875, + "learning_rate": 4.6011356854217695e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 141278454.0, + "step": 42295 + }, + { + "entropy": 0.05537276756949723, + "epoch": 9.86024012122625, + "grad_norm": 0.07568359375, + "learning_rate": 4.601023889742657e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999526858329773, + "num_tokens": 141315661.0, + "step": 42300 + }, + { + "entropy": 0.048711988516151904, + "epoch": 9.861405758246882, + "grad_norm": 0.6015625, + "learning_rate": 4.6009120813734985e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9993138968944549, + "num_tokens": 141342496.0, + "step": 42305 + }, + { + "entropy": 0.05058379173278808, + "epoch": 9.862571395267514, + "grad_norm": 0.0269775390625, + "learning_rate": 4.600800260315961e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999231338500977, + "num_tokens": 141366952.0, + "step": 42310 + }, + { + "entropy": 0.049850033968687056, + "epoch": 9.863737032288146, + "grad_norm": 0.047119140625, + "learning_rate": 4.600688426571711e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9995084285736084, + "num_tokens": 141385183.0, + "step": 42315 + }, + { + "entropy": 0.04541545612737537, + "epoch": 9.864902669308778, + "grad_norm": 0.2578125, + "learning_rate": 4.6005765801424184e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999175369739532, + "num_tokens": 141408920.0, + "step": 42320 + }, + { + "entropy": 0.05511685479432345, + "epoch": 9.866068306329408, + "grad_norm": 1.546875, + "learning_rate": 4.600464721029749e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9994753241539002, + "num_tokens": 141424253.0, + "step": 42325 + }, + { + "entropy": 0.05739809488877654, + "epoch": 9.86723394335004, + "grad_norm": 0.6015625, + "learning_rate": 4.6003528492353714e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9998377740383149, + "num_tokens": 141443989.0, + "step": 42330 + }, + { + "entropy": 0.08445545099675655, + "epoch": 9.868399580370673, + "grad_norm": 0.2353515625, + "learning_rate": 4.600240964760954e-05, + "loss": 0.0032, + "mean_token_accuracy": 0.9985578715801239, + "num_tokens": 141460527.0, + "step": 42335 + }, + { + "entropy": 0.0682337274774909, + "epoch": 9.869565217391305, + "grad_norm": 1.6484375, + "learning_rate": 4.600129067608165e-05, + "loss": 0.001, + "mean_token_accuracy": 1.0, + "num_tokens": 141473532.0, + "step": 42340 + }, + { + "entropy": 0.05082208625972271, + "epoch": 9.870730854411937, + "grad_norm": 0.026611328125, + "learning_rate": 4.600017157778673e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 141490093.0, + "step": 42345 + }, + { + "entropy": 0.0639322081580758, + "epoch": 9.871896491432567, + "grad_norm": 0.06640625, + "learning_rate": 4.5999052352741464e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.99965980052948, + "num_tokens": 141515258.0, + "step": 42350 + }, + { + "entropy": 0.06224740967154503, + "epoch": 9.8730621284532, + "grad_norm": 0.2431640625, + "learning_rate": 4.599793300096255e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9995000004768372, + "num_tokens": 141525066.0, + "step": 42355 + }, + { + "entropy": 0.06684228293597698, + "epoch": 9.874227765473831, + "grad_norm": 0.0791015625, + "learning_rate": 4.599681352246666e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 141535957.0, + "step": 42360 + }, + { + "entropy": 0.07877216022461653, + "epoch": 9.875393402494463, + "grad_norm": 0.396484375, + "learning_rate": 4.599569391727051e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9993925511837005, + "num_tokens": 141554599.0, + "step": 42365 + }, + { + "entropy": 0.0896750895306468, + "epoch": 9.876559039515096, + "grad_norm": 0.1875, + "learning_rate": 4.599457418539079e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 141569724.0, + "step": 42370 + }, + { + "entropy": 0.041266056802123786, + "epoch": 9.877724676535728, + "grad_norm": 0.1787109375, + "learning_rate": 4.5993454326844183e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9990445971488953, + "num_tokens": 141608109.0, + "step": 42375 + }, + { + "entropy": 0.055930220521986485, + "epoch": 9.878890313556358, + "grad_norm": 0.046875, + "learning_rate": 4.59923343416474e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9999681651592255, + "num_tokens": 141642229.0, + "step": 42380 + }, + { + "entropy": 0.05225104205310345, + "epoch": 9.88005595057699, + "grad_norm": 0.97265625, + "learning_rate": 4.599121422981714e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9991886019706726, + "num_tokens": 141657383.0, + "step": 42385 + }, + { + "entropy": 0.05562779037281871, + "epoch": 9.881221587597622, + "grad_norm": 0.0308837890625, + "learning_rate": 4.5990093991370105e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999800384044647, + "num_tokens": 141675797.0, + "step": 42390 + }, + { + "entropy": 0.0824700677767396, + "epoch": 9.882387224618254, + "grad_norm": 0.236328125, + "learning_rate": 4.5988973626323e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999433159828186, + "num_tokens": 141698010.0, + "step": 42395 + }, + { + "entropy": 0.054821851290762426, + "epoch": 9.883552861638886, + "grad_norm": 0.466796875, + "learning_rate": 4.598785313469253e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999880850315094, + "num_tokens": 141723941.0, + "step": 42400 + }, + { + "entropy": 0.05956423785537481, + "epoch": 9.884718498659517, + "grad_norm": 2.25, + "learning_rate": 4.5986732516495394e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9996835470199585, + "num_tokens": 141736876.0, + "step": 42405 + }, + { + "entropy": 0.04881628667935729, + "epoch": 9.885884135680149, + "grad_norm": 0.185546875, + "learning_rate": 4.598561177174832e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9995063245296478, + "num_tokens": 141756360.0, + "step": 42410 + }, + { + "entropy": 0.051187887974083426, + "epoch": 9.887049772700781, + "grad_norm": 0.0162353515625, + "learning_rate": 4.598449090046801e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9998794257640838, + "num_tokens": 141800002.0, + "step": 42415 + }, + { + "entropy": 0.07245470732450485, + "epoch": 9.888215409721413, + "grad_norm": 0.052001953125, + "learning_rate": 4.598336990267118e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 141809958.0, + "step": 42420 + }, + { + "entropy": 0.0633705073967576, + "epoch": 9.889381046742045, + "grad_norm": 0.265625, + "learning_rate": 4.598224877837454e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999843239784241, + "num_tokens": 141832394.0, + "step": 42425 + }, + { + "entropy": 0.054108756873756644, + "epoch": 9.890546683762675, + "grad_norm": 0.0859375, + "learning_rate": 4.598112752759482e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999885141849518, + "num_tokens": 141853885.0, + "step": 42430 + }, + { + "entropy": 0.0420980092138052, + "epoch": 9.891712320783308, + "grad_norm": 3.59375, + "learning_rate": 4.5980006150348734e-05, + "loss": 0.0053, + "mean_token_accuracy": 0.999622642993927, + "num_tokens": 141877275.0, + "step": 42435 + }, + { + "entropy": 0.04029533253051341, + "epoch": 9.89287795780394, + "grad_norm": 0.07861328125, + "learning_rate": 4.597888464665299e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999532997608185, + "num_tokens": 141901488.0, + "step": 42440 + }, + { + "entropy": 0.06203906927257776, + "epoch": 9.894043594824572, + "grad_norm": 0.03173828125, + "learning_rate": 4.5977763016524336e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 141916010.0, + "step": 42445 + }, + { + "entropy": 0.04797376096248627, + "epoch": 9.895209231845204, + "grad_norm": 0.050048828125, + "learning_rate": 4.597664125997947e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999784409999848, + "num_tokens": 141949456.0, + "step": 42450 + }, + { + "entropy": 0.0533231851644814, + "epoch": 9.896374868865836, + "grad_norm": 0.1611328125, + "learning_rate": 4.597551937703515e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9998474419116974, + "num_tokens": 141971590.0, + "step": 42455 + }, + { + "entropy": 0.03850670075044036, + "epoch": 9.897540505886466, + "grad_norm": 0.07421875, + "learning_rate": 4.597439736770807e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999563872814179, + "num_tokens": 142008843.0, + "step": 42460 + }, + { + "entropy": 0.05502571975812316, + "epoch": 9.898706142907098, + "grad_norm": 0.0179443359375, + "learning_rate": 4.597327523201499e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9997801601886749, + "num_tokens": 142034549.0, + "step": 42465 + }, + { + "entropy": 0.05452665351331234, + "epoch": 9.89987177992773, + "grad_norm": 0.255859375, + "learning_rate": 4.597215296997263e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9993485331535339, + "num_tokens": 142049199.0, + "step": 42470 + }, + { + "entropy": 0.054085666034370664, + "epoch": 9.901037416948363, + "grad_norm": 0.267578125, + "learning_rate": 4.597103058159773e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999523103237152, + "num_tokens": 142067986.0, + "step": 42475 + }, + { + "entropy": 0.062441179621964694, + "epoch": 9.902203053968995, + "grad_norm": 0.05859375, + "learning_rate": 4.596990806690702e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9998196721076965, + "num_tokens": 142089011.0, + "step": 42480 + }, + { + "entropy": 0.04920492600649595, + "epoch": 9.903368690989625, + "grad_norm": 0.173828125, + "learning_rate": 4.596878542591724e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9997519671916961, + "num_tokens": 142114493.0, + "step": 42485 + }, + { + "entropy": 0.04822962349280715, + "epoch": 9.904534328010257, + "grad_norm": 0.0615234375, + "learning_rate": 4.5967662658645135e-05, + "loss": 0.0022, + "mean_token_accuracy": 0.9994761765003204, + "num_tokens": 142135758.0, + "step": 42490 + }, + { + "entropy": 0.053807942755520345, + "epoch": 9.90569996503089, + "grad_norm": 0.1767578125, + "learning_rate": 4.5966539765107434e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.999971354007721, + "num_tokens": 142157371.0, + "step": 42495 + }, + { + "entropy": 0.05380201395601034, + "epoch": 9.906865602051521, + "grad_norm": 0.03759765625, + "learning_rate": 4.59654167453209e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999534487724304, + "num_tokens": 142183055.0, + "step": 42500 + }, + { + "entropy": 0.06522977026179433, + "epoch": 9.908031239072153, + "grad_norm": 0.041015625, + "learning_rate": 4.596429359930227e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999375700950622, + "num_tokens": 142206833.0, + "step": 42505 + }, + { + "entropy": 0.06614532265812159, + "epoch": 9.909196876092786, + "grad_norm": 3.375, + "learning_rate": 4.5963170327068286e-05, + "loss": 0.0443, + "mean_token_accuracy": 0.9935894548892975, + "num_tokens": 142251545.0, + "step": 42510 + }, + { + "entropy": 0.04537292215973139, + "epoch": 9.910362513113416, + "grad_norm": 0.23046875, + "learning_rate": 4.5962046928635706e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9999306857585907, + "num_tokens": 142280886.0, + "step": 42515 + }, + { + "entropy": 0.08216597959399223, + "epoch": 9.911528150134048, + "grad_norm": 0.1396484375, + "learning_rate": 4.596092340402128e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 142294713.0, + "step": 42520 + }, + { + "entropy": 0.04948452245444059, + "epoch": 9.91269378715468, + "grad_norm": 0.051513671875, + "learning_rate": 4.595979975324176e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999775171279908, + "num_tokens": 142318059.0, + "step": 42525 + }, + { + "entropy": 0.05829036943614483, + "epoch": 9.913859424175312, + "grad_norm": 0.205078125, + "learning_rate": 4.5958675976313895e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 142336852.0, + "step": 42530 + }, + { + "entropy": 0.07351097948849201, + "epoch": 9.915025061195944, + "grad_norm": 0.029541015625, + "learning_rate": 4.5957552073254456e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 142347592.0, + "step": 42535 + }, + { + "entropy": 0.0548619513399899, + "epoch": 9.916190698216575, + "grad_norm": 0.94140625, + "learning_rate": 4.5956428044080194e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9996331691741943, + "num_tokens": 142365828.0, + "step": 42540 + }, + { + "entropy": 0.08164572985842825, + "epoch": 9.917356335237207, + "grad_norm": 0.14453125, + "learning_rate": 4.5955303888807865e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999838948249817, + "num_tokens": 142382841.0, + "step": 42545 + }, + { + "entropy": 0.06366458348929882, + "epoch": 9.918521972257839, + "grad_norm": 0.050048828125, + "learning_rate": 4.595417960745424e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9994186043739319, + "num_tokens": 142394380.0, + "step": 42550 + }, + { + "entropy": 0.050054412055760625, + "epoch": 9.919687609278471, + "grad_norm": 0.1689453125, + "learning_rate": 4.595305520003609e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9997562050819397, + "num_tokens": 142414179.0, + "step": 42555 + }, + { + "entropy": 0.05203136084601283, + "epoch": 9.920853246299103, + "grad_norm": 0.1943359375, + "learning_rate": 4.5951930666570164e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9996435225009919, + "num_tokens": 142430075.0, + "step": 42560 + }, + { + "entropy": 0.053751358296722174, + "epoch": 9.922018883319733, + "grad_norm": 0.2578125, + "learning_rate": 4.595080600707324e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999409317970276, + "num_tokens": 142457226.0, + "step": 42565 + }, + { + "entropy": 0.06119176633656025, + "epoch": 9.923184520340365, + "grad_norm": 1.4375, + "learning_rate": 4.594968122156209e-05, + "loss": 0.0038, + "mean_token_accuracy": 0.9991643846035003, + "num_tokens": 142471257.0, + "step": 42570 + }, + { + "entropy": 0.04179687043651938, + "epoch": 9.924350157360998, + "grad_norm": 0.0390625, + "learning_rate": 4.594855631005348e-05, + "loss": 0.0031, + "mean_token_accuracy": 0.9988110899925232, + "num_tokens": 142502096.0, + "step": 42575 + }, + { + "entropy": 0.049814547039568426, + "epoch": 9.92551579438163, + "grad_norm": 0.21875, + "learning_rate": 4.594743127256419e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999563336372376, + "num_tokens": 142528528.0, + "step": 42580 + }, + { + "entropy": 0.07513895481824875, + "epoch": 9.926681431402262, + "grad_norm": 0.043212890625, + "learning_rate": 4.594630610911099e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 142539302.0, + "step": 42585 + }, + { + "entropy": 0.05776936365291476, + "epoch": 9.927847068422894, + "grad_norm": 0.12109375, + "learning_rate": 4.5945180819710673e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9997615098953248, + "num_tokens": 142568093.0, + "step": 42590 + }, + { + "entropy": 0.0796022929251194, + "epoch": 9.929012705443524, + "grad_norm": 3.421875, + "learning_rate": 4.594405540438e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999736607074737, + "num_tokens": 142585063.0, + "step": 42595 + }, + { + "entropy": 0.061672138934955004, + "epoch": 9.930178342464156, + "grad_norm": 0.6171875, + "learning_rate": 4.5942929863135765e-05, + "loss": 0.003, + "mean_token_accuracy": 0.9992265224456787, + "num_tokens": 142612185.0, + "step": 42600 + }, + { + "entropy": 0.06622321843169629, + "epoch": 9.931343979484788, + "grad_norm": 0.1005859375, + "learning_rate": 4.594180419599474e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9999887228012085, + "num_tokens": 142640222.0, + "step": 42605 + }, + { + "entropy": 0.04617886040359735, + "epoch": 9.93250961650542, + "grad_norm": 0.051513671875, + "learning_rate": 4.594067840297372e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999405086040497, + "num_tokens": 142673271.0, + "step": 42610 + }, + { + "entropy": 0.06401413748972118, + "epoch": 9.933675253526053, + "grad_norm": 0.046142578125, + "learning_rate": 4.5939552484089485e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999673068523407, + "num_tokens": 142697553.0, + "step": 42615 + }, + { + "entropy": 0.06120873279869556, + "epoch": 9.934840890546683, + "grad_norm": 0.1904296875, + "learning_rate": 4.593842643935884e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.999710202217102, + "num_tokens": 142718665.0, + "step": 42620 + }, + { + "entropy": 0.08795096650719643, + "epoch": 9.936006527567315, + "grad_norm": 0.07177734375, + "learning_rate": 4.593730026879856e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 142725823.0, + "step": 42625 + }, + { + "entropy": 0.0548077093437314, + "epoch": 9.937172164587947, + "grad_norm": 0.0849609375, + "learning_rate": 4.593617397242544e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 142741644.0, + "step": 42630 + }, + { + "entropy": 0.0867973305284977, + "epoch": 9.93833780160858, + "grad_norm": 1.9453125, + "learning_rate": 4.5935047550256274e-05, + "loss": 0.0042, + "mean_token_accuracy": 0.9996503353118896, + "num_tokens": 142770440.0, + "step": 42635 + }, + { + "entropy": 0.060863146930933, + "epoch": 9.939503438629211, + "grad_norm": 2.546875, + "learning_rate": 4.5933921002307875e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9998944044113159, + "num_tokens": 142784755.0, + "step": 42640 + }, + { + "entropy": 0.050374433491379024, + "epoch": 9.940669075649843, + "grad_norm": 0.2392578125, + "learning_rate": 4.593279432859702e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9991772949695588, + "num_tokens": 142813398.0, + "step": 42645 + }, + { + "entropy": 0.06011525299400091, + "epoch": 9.941834712670474, + "grad_norm": 0.1806640625, + "learning_rate": 4.5931667529140516e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9998737394809722, + "num_tokens": 142827724.0, + "step": 42650 + }, + { + "entropy": 0.059044943377375605, + "epoch": 9.943000349691106, + "grad_norm": 0.177734375, + "learning_rate": 4.593054060395517e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9989231169223786, + "num_tokens": 142851612.0, + "step": 42655 + }, + { + "entropy": 0.057979169255122545, + "epoch": 9.944165986711738, + "grad_norm": 0.189453125, + "learning_rate": 4.592941355305778e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9997536420822144, + "num_tokens": 142876056.0, + "step": 42660 + }, + { + "entropy": 0.0455438518896699, + "epoch": 9.94533162373237, + "grad_norm": 0.078125, + "learning_rate": 4.5928286376465156e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999211847782135, + "num_tokens": 142898663.0, + "step": 42665 + }, + { + "entropy": 0.06030480302870274, + "epoch": 9.946497260753002, + "grad_norm": 0.09765625, + "learning_rate": 4.592715907419411e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9997260272502899, + "num_tokens": 142908469.0, + "step": 42670 + }, + { + "entropy": 0.053213020972907546, + "epoch": 9.947662897773633, + "grad_norm": 0.197265625, + "learning_rate": 4.5926031646261445e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9995137393474579, + "num_tokens": 142921349.0, + "step": 42675 + }, + { + "entropy": 0.055709246825426814, + "epoch": 9.948828534794265, + "grad_norm": 0.02490234375, + "learning_rate": 4.5924904092683974e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 142932311.0, + "step": 42680 + }, + { + "entropy": 0.05174860148690641, + "epoch": 9.949994171814897, + "grad_norm": 0.033935546875, + "learning_rate": 4.5923776413478506e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9995390951633454, + "num_tokens": 142970140.0, + "step": 42685 + }, + { + "entropy": 0.06010691300034523, + "epoch": 9.951159808835529, + "grad_norm": 0.609375, + "learning_rate": 4.592264860866187e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 142980699.0, + "step": 42690 + }, + { + "entropy": 0.05457963831722736, + "epoch": 9.952325445856161, + "grad_norm": 1.859375, + "learning_rate": 4.592152067825087e-05, + "loss": 0.002, + "mean_token_accuracy": 0.999420291185379, + "num_tokens": 142993009.0, + "step": 42695 + }, + { + "entropy": 0.15338611789047718, + "epoch": 9.953491082876791, + "grad_norm": 1.4140625, + "learning_rate": 4.592039262226233e-05, + "loss": 0.2135, + "mean_token_accuracy": 0.9780685484409333, + "num_tokens": 143016315.0, + "step": 42700 + }, + { + "entropy": 0.04838618785142899, + "epoch": 9.954656719897423, + "grad_norm": 0.1689453125, + "learning_rate": 4.591926444071307e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999370753765107, + "num_tokens": 143043190.0, + "step": 42705 + }, + { + "entropy": 0.06054220134392381, + "epoch": 9.955822356918056, + "grad_norm": 0.2578125, + "learning_rate": 4.5918136133619924e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999464511871338, + "num_tokens": 143067614.0, + "step": 42710 + }, + { + "entropy": 0.053869908582419156, + "epoch": 9.956987993938688, + "grad_norm": 0.1669921875, + "learning_rate": 4.59170077009997e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9998924374580384, + "num_tokens": 143104360.0, + "step": 42715 + }, + { + "entropy": 0.05953489625826478, + "epoch": 9.95815363095932, + "grad_norm": 0.0260009765625, + "learning_rate": 4.591587914286923e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999382019042968, + "num_tokens": 143121331.0, + "step": 42720 + }, + { + "entropy": 0.06068947771564126, + "epoch": 9.959319267979952, + "grad_norm": 1.90625, + "learning_rate": 4.591475045924534e-05, + "loss": 0.0021, + "mean_token_accuracy": 0.9996376812458039, + "num_tokens": 143137585.0, + "step": 42725 + }, + { + "entropy": 0.05690534273162484, + "epoch": 9.960484905000582, + "grad_norm": 0.038330078125, + "learning_rate": 4.5913621650144866e-05, + "loss": 0.001, + "mean_token_accuracy": 1.0, + "num_tokens": 143160525.0, + "step": 42730 + }, + { + "entropy": 0.0698689054697752, + "epoch": 9.961650542021214, + "grad_norm": 0.0927734375, + "learning_rate": 4.5912492715584645e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 143170908.0, + "step": 42735 + }, + { + "entropy": 0.059409552905708554, + "epoch": 9.962816179041846, + "grad_norm": 0.578125, + "learning_rate": 4.59113636555815e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999563694000244, + "num_tokens": 143186397.0, + "step": 42740 + }, + { + "entropy": 0.05956787364557385, + "epoch": 9.963981816062478, + "grad_norm": 0.1748046875, + "learning_rate": 4.591023447015227e-05, + "loss": 0.0028, + "mean_token_accuracy": 0.9996439158916474, + "num_tokens": 143208394.0, + "step": 42745 + }, + { + "entropy": 0.061505304835736754, + "epoch": 9.96514745308311, + "grad_norm": 0.053466796875, + "learning_rate": 4.5909105159313796e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 143217121.0, + "step": 42750 + }, + { + "entropy": 0.05436940034851432, + "epoch": 9.966313090103741, + "grad_norm": 0.06689453125, + "learning_rate": 4.5907975723082916e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999572575092316, + "num_tokens": 143237233.0, + "step": 42755 + }, + { + "entropy": 0.06057135965675116, + "epoch": 9.967478727124373, + "grad_norm": 0.55859375, + "learning_rate": 4.5906846161476475e-05, + "loss": 0.0024, + "mean_token_accuracy": 0.9984305322170257, + "num_tokens": 143259643.0, + "step": 42760 + }, + { + "entropy": 0.044597274530678986, + "epoch": 9.968644364145005, + "grad_norm": 0.1962890625, + "learning_rate": 4.5905716474511307e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9998950362205505, + "num_tokens": 143283034.0, + "step": 42765 + }, + { + "entropy": 0.06079157404601574, + "epoch": 9.969810001165637, + "grad_norm": 0.212890625, + "learning_rate": 4.590458666220427e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.999790358543396, + "num_tokens": 143299578.0, + "step": 42770 + }, + { + "entropy": 0.0736723642796278, + "epoch": 9.97097563818627, + "grad_norm": 0.212890625, + "learning_rate": 4.59034567245722e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9993957698345184, + "num_tokens": 143315016.0, + "step": 42775 + }, + { + "entropy": 0.07209227010607719, + "epoch": 9.972141275206901, + "grad_norm": 0.06787109375, + "learning_rate": 4.590232666163196e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9989817023277283, + "num_tokens": 143343550.0, + "step": 42780 + }, + { + "entropy": 0.052204666286706926, + "epoch": 9.973306912227532, + "grad_norm": 0.042236328125, + "learning_rate": 4.5901196473400374e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 143365780.0, + "step": 42785 + }, + { + "entropy": 0.06627508513629436, + "epoch": 9.974472549248164, + "grad_norm": 0.0693359375, + "learning_rate": 4.590006615989432e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 143376379.0, + "step": 42790 + }, + { + "entropy": 0.05749188121408224, + "epoch": 9.975638186268796, + "grad_norm": 0.4140625, + "learning_rate": 4.589893572113065e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.99997838139534, + "num_tokens": 143395659.0, + "step": 42795 + }, + { + "entropy": 0.04636556897312403, + "epoch": 9.976803823289428, + "grad_norm": 0.0262451171875, + "learning_rate": 4.589780515712622e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999452292919159, + "num_tokens": 143415294.0, + "step": 42800 + }, + { + "entropy": 0.059783428255468604, + "epoch": 9.97796946031006, + "grad_norm": 0.41796875, + "learning_rate": 4.589667446789787e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999665439128875, + "num_tokens": 143451315.0, + "step": 42805 + }, + { + "entropy": 0.04871940072625876, + "epoch": 9.97913509733069, + "grad_norm": 0.1123046875, + "learning_rate": 4.589554365346248e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 143465653.0, + "step": 42810 + }, + { + "entropy": 0.07266805339604616, + "epoch": 9.980300734351323, + "grad_norm": 0.032958984375, + "learning_rate": 4.5894412713836906e-05, + "loss": 0.0018, + "mean_token_accuracy": 0.9993788838386536, + "num_tokens": 143475465.0, + "step": 42815 + }, + { + "entropy": 0.05912731643766165, + "epoch": 9.981466371371955, + "grad_norm": 0.0247802734375, + "learning_rate": 4.589328164903802e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 143489561.0, + "step": 42820 + }, + { + "entropy": 0.05282509876415133, + "epoch": 9.982632008392587, + "grad_norm": 0.09716796875, + "learning_rate": 4.589215045908267e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.999288922548294, + "num_tokens": 143516069.0, + "step": 42825 + }, + { + "entropy": 0.062217441760003565, + "epoch": 9.983797645413219, + "grad_norm": 0.9375, + "learning_rate": 4.589101914398774e-05, + "loss": 0.0027, + "mean_token_accuracy": 0.9992483615875244, + "num_tokens": 143530501.0, + "step": 42830 + }, + { + "entropy": 0.07485974226146937, + "epoch": 9.98496328243385, + "grad_norm": 0.2001953125, + "learning_rate": 4.58898877037701e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9997389018535614, + "num_tokens": 143550036.0, + "step": 42835 + }, + { + "entropy": 0.04717427408322692, + "epoch": 9.986128919454481, + "grad_norm": 0.74609375, + "learning_rate": 4.58887561384466e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9997922837734222, + "num_tokens": 143571664.0, + "step": 42840 + }, + { + "entropy": 0.049185032676905396, + "epoch": 9.987294556475113, + "grad_norm": 0.1650390625, + "learning_rate": 4.588762444803414e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.999546229839325, + "num_tokens": 143599859.0, + "step": 42845 + }, + { + "entropy": 0.045813700463622806, + "epoch": 9.988460193495746, + "grad_norm": 0.09375, + "learning_rate": 4.5886492632549575e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999786972999573, + "num_tokens": 143629787.0, + "step": 42850 + }, + { + "entropy": 0.07945517208427191, + "epoch": 9.989625830516378, + "grad_norm": 0.1357421875, + "learning_rate": 4.58853606920098e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 143655188.0, + "step": 42855 + }, + { + "entropy": 0.05326961185783148, + "epoch": 9.99079146753701, + "grad_norm": 0.2021484375, + "learning_rate": 4.588422862643168e-05, + "loss": 0.0123, + "mean_token_accuracy": 0.9983867526054382, + "num_tokens": 143680228.0, + "step": 42860 + }, + { + "entropy": 0.061639095610007646, + "epoch": 9.99195710455764, + "grad_norm": 0.65234375, + "learning_rate": 4.58830964358321e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999892473220825, + "num_tokens": 143697468.0, + "step": 42865 + }, + { + "entropy": 0.05384563766419888, + "epoch": 9.993122741578272, + "grad_norm": 0.13671875, + "learning_rate": 4.588196412022795e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.999766594171524, + "num_tokens": 143720560.0, + "step": 42870 + }, + { + "entropy": 0.05485376380383968, + "epoch": 9.994288378598904, + "grad_norm": 0.177734375, + "learning_rate": 4.58808316796361e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9998770773410797, + "num_tokens": 143731863.0, + "step": 42875 + }, + { + "entropy": 0.06633745562285184, + "epoch": 9.995454015619536, + "grad_norm": 0.12255859375, + "learning_rate": 4.587969911407344e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 143748516.0, + "step": 42880 + }, + { + "entropy": 0.041839384008198977, + "epoch": 9.996619652640168, + "grad_norm": 0.1923828125, + "learning_rate": 4.587856642355687e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999523460865021, + "num_tokens": 143773800.0, + "step": 42885 + }, + { + "entropy": 0.03867072528228164, + "epoch": 9.997785289660799, + "grad_norm": 0.10888671875, + "learning_rate": 4.5877433608103275e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9996968567371368, + "num_tokens": 143807957.0, + "step": 42890 + }, + { + "entropy": 0.044101893063634635, + "epoch": 9.998950926681431, + "grad_norm": 0.06591796875, + "learning_rate": 4.587630066772954e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999754369258881, + "num_tokens": 143832484.0, + "step": 42895 + }, + { + "entropy": 0.05562862671083874, + "epoch": 10.0, + "grad_norm": 0.0458984375, + "learning_rate": 4.587516760245257e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9997179905573527, + "num_tokens": 143846000.0, + "step": 42900 + }, + { + "entropy": 0.05632606642320752, + "epoch": 10.001165637020632, + "grad_norm": 0.64453125, + "learning_rate": 4.587403441228924e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 143862104.0, + "step": 42905 + }, + { + "entropy": 0.052796078659594056, + "epoch": 10.002331274041264, + "grad_norm": 0.09130859375, + "learning_rate": 4.5872901097256474e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 143880552.0, + "step": 42910 + }, + { + "entropy": 0.06833078693598509, + "epoch": 10.003496911061895, + "grad_norm": 0.04443359375, + "learning_rate": 4.587176765737116e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 143894599.0, + "step": 42915 + }, + { + "entropy": 0.03659242931753397, + "epoch": 10.004662548082527, + "grad_norm": 0.0216064453125, + "learning_rate": 4.5870634092650186e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 143924185.0, + "step": 42920 + }, + { + "entropy": 0.06099397018551826, + "epoch": 10.005828185103159, + "grad_norm": 0.1708984375, + "learning_rate": 4.586950040311048e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9995762705802917, + "num_tokens": 143936581.0, + "step": 42925 + }, + { + "entropy": 0.06240507024340332, + "epoch": 10.00699382212379, + "grad_norm": 0.020263671875, + "learning_rate": 4.586836658876893e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 143964883.0, + "step": 42930 + }, + { + "entropy": 0.05507293636910617, + "epoch": 10.008159459144423, + "grad_norm": 0.08203125, + "learning_rate": 4.586723264964245e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999892473220825, + "num_tokens": 143984179.0, + "step": 42935 + }, + { + "entropy": 0.05958865638822317, + "epoch": 10.009325096165055, + "grad_norm": 0.0712890625, + "learning_rate": 4.586609858574794e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144000657.0, + "step": 42940 + }, + { + "entropy": 0.06111275050789118, + "epoch": 10.010490733185685, + "grad_norm": 0.115234375, + "learning_rate": 4.5864964397102324e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 144012450.0, + "step": 42945 + }, + { + "entropy": 0.05097737601026893, + "epoch": 10.011656370206317, + "grad_norm": 0.0166015625, + "learning_rate": 4.58638300837225e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144031839.0, + "step": 42950 + }, + { + "entropy": 0.0694691475480795, + "epoch": 10.01282200722695, + "grad_norm": 0.0234375, + "learning_rate": 4.586269564562539e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144050020.0, + "step": 42955 + }, + { + "entropy": 0.06655300110578537, + "epoch": 10.013987644247582, + "grad_norm": 0.0634765625, + "learning_rate": 4.586156108282791e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144061209.0, + "step": 42960 + }, + { + "entropy": 0.046417646063491705, + "epoch": 10.015153281268214, + "grad_norm": 0.0137939453125, + "learning_rate": 4.586042639534699e-05, + "loss": 0.0001, + "mean_token_accuracy": 0.999989265203476, + "num_tokens": 144085648.0, + "step": 42965 + }, + { + "entropy": 0.0366755124181509, + "epoch": 10.016318918288844, + "grad_norm": 0.0118408203125, + "learning_rate": 4.585929158319952e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 144107360.0, + "step": 42970 + }, + { + "entropy": 0.053187299706041816, + "epoch": 10.017484555309476, + "grad_norm": 0.01251220703125, + "learning_rate": 4.585815664640245e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144129854.0, + "step": 42975 + }, + { + "entropy": 0.043479549791663887, + "epoch": 10.018650192330108, + "grad_norm": 0.032470703125, + "learning_rate": 4.585702158497269e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999892294406891, + "num_tokens": 144165456.0, + "step": 42980 + }, + { + "entropy": 0.046352763567119835, + "epoch": 10.01981582935074, + "grad_norm": 0.140625, + "learning_rate": 4.585588639892717e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999870836734772, + "num_tokens": 144186276.0, + "step": 42985 + }, + { + "entropy": 0.03585439161397517, + "epoch": 10.020981466371373, + "grad_norm": 0.0269775390625, + "learning_rate": 4.5854751088282815e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999523818492889, + "num_tokens": 144220443.0, + "step": 42990 + }, + { + "entropy": 0.05620026285760105, + "epoch": 10.022147103392003, + "grad_norm": 0.236328125, + "learning_rate": 4.585361565305656e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144239270.0, + "step": 42995 + }, + { + "entropy": 0.06483755446970463, + "epoch": 10.023312740412635, + "grad_norm": 0.03173828125, + "learning_rate": 4.585248009326532e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144248183.0, + "step": 43000 + }, + { + "entropy": 0.039686411060392855, + "epoch": 10.024478377433267, + "grad_norm": 0.0673828125, + "learning_rate": 4.5851344408926046e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144273189.0, + "step": 43005 + }, + { + "entropy": 0.04534797742962837, + "epoch": 10.0256440144539, + "grad_norm": 0.08642578125, + "learning_rate": 4.585020860005567e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999887585639954, + "num_tokens": 144296910.0, + "step": 43010 + }, + { + "entropy": 0.046770491264760496, + "epoch": 10.026809651474531, + "grad_norm": 0.0147705078125, + "learning_rate": 4.5849072666671116e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144310853.0, + "step": 43015 + }, + { + "entropy": 0.061034923605620864, + "epoch": 10.027975288495163, + "grad_norm": 0.2138671875, + "learning_rate": 4.5847936608789336e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9998384475708008, + "num_tokens": 144324377.0, + "step": 43020 + }, + { + "entropy": 0.07491187937557697, + "epoch": 10.029140925515794, + "grad_norm": 0.06884765625, + "learning_rate": 4.584680042642726e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 144334358.0, + "step": 43025 + }, + { + "entropy": 0.05644649108871817, + "epoch": 10.030306562536426, + "grad_norm": 0.2109375, + "learning_rate": 4.584566411960184e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9999792277812958, + "num_tokens": 144360323.0, + "step": 43030 + }, + { + "entropy": 0.0484986113384366, + "epoch": 10.031472199557058, + "grad_norm": 0.06640625, + "learning_rate": 4.5844527688330013e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144383411.0, + "step": 43035 + }, + { + "entropy": 0.061395833175629376, + "epoch": 10.03263783657769, + "grad_norm": 0.0216064453125, + "learning_rate": 4.584339113262873e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9997023820877076, + "num_tokens": 144396897.0, + "step": 43040 + }, + { + "entropy": 0.05118941427208483, + "epoch": 10.033803473598322, + "grad_norm": 0.038330078125, + "learning_rate": 4.584225445251493e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9996197700500489, + "num_tokens": 144427047.0, + "step": 43045 + }, + { + "entropy": 0.05451108440756798, + "epoch": 10.034969110618952, + "grad_norm": 0.01434326171875, + "learning_rate": 4.584111764800557e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 144439872.0, + "step": 43050 + }, + { + "entropy": 0.0546447460539639, + "epoch": 10.036134747639585, + "grad_norm": 0.048095703125, + "learning_rate": 4.58399807191176e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 144454004.0, + "step": 43055 + }, + { + "entropy": 0.04826419707387686, + "epoch": 10.037300384660217, + "grad_norm": 0.044677734375, + "learning_rate": 4.583884366586798e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9998730480670929, + "num_tokens": 144486122.0, + "step": 43060 + }, + { + "entropy": 0.04088318916037679, + "epoch": 10.038466021680849, + "grad_norm": 0.031494140625, + "learning_rate": 4.583770648827366e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 144510438.0, + "step": 43065 + }, + { + "entropy": 0.06502441363409162, + "epoch": 10.03963165870148, + "grad_norm": 0.1201171875, + "learning_rate": 4.583656918635159e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999777853488923, + "num_tokens": 144534876.0, + "step": 43070 + }, + { + "entropy": 0.04362553898245096, + "epoch": 10.040797295722113, + "grad_norm": 0.055419921875, + "learning_rate": 4.583543176011874e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999790906906127, + "num_tokens": 144568855.0, + "step": 43075 + }, + { + "entropy": 0.06911927331238985, + "epoch": 10.041962932742743, + "grad_norm": 0.026123046875, + "learning_rate": 4.5834294209592055e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 144579570.0, + "step": 43080 + }, + { + "entropy": 0.060473337210714816, + "epoch": 10.043128569763375, + "grad_norm": 0.049072265625, + "learning_rate": 4.5833156534788515e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144607964.0, + "step": 43085 + }, + { + "entropy": 0.0562533063814044, + "epoch": 10.044294206784008, + "grad_norm": 0.0419921875, + "learning_rate": 4.5832018735725066e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 144621562.0, + "step": 43090 + }, + { + "entropy": 0.06382388435304165, + "epoch": 10.04545984380464, + "grad_norm": 0.228515625, + "learning_rate": 4.58308808124187e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 144641956.0, + "step": 43095 + }, + { + "entropy": 0.055002011358737946, + "epoch": 10.046625480825272, + "grad_norm": 0.050537109375, + "learning_rate": 4.5829742764886365e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 144651672.0, + "step": 43100 + }, + { + "entropy": 0.05091058509424329, + "epoch": 10.047791117845902, + "grad_norm": 0.0703125, + "learning_rate": 4.582860459314504e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144673179.0, + "step": 43105 + }, + { + "entropy": 0.07972749415785074, + "epoch": 10.048956754866534, + "grad_norm": 0.1328125, + "learning_rate": 4.582746629721169e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144685739.0, + "step": 43110 + }, + { + "entropy": 0.04776022732257843, + "epoch": 10.050122391887166, + "grad_norm": 0.036865234375, + "learning_rate": 4.5826327877103294e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999884486198425, + "num_tokens": 144722168.0, + "step": 43115 + }, + { + "entropy": 0.05045590978115797, + "epoch": 10.051288028907798, + "grad_norm": 0.03564453125, + "learning_rate": 4.5825189332836826e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 144742010.0, + "step": 43120 + }, + { + "entropy": 0.04327065721154213, + "epoch": 10.05245366592843, + "grad_norm": 0.06787109375, + "learning_rate": 4.582405066442926e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999754190444946, + "num_tokens": 144761437.0, + "step": 43125 + }, + { + "entropy": 0.053365825302898885, + "epoch": 10.05361930294906, + "grad_norm": 0.0181884765625, + "learning_rate": 4.582291187189758e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 144775121.0, + "step": 43130 + }, + { + "entropy": 0.05459557678550482, + "epoch": 10.054784939969693, + "grad_norm": 0.049072265625, + "learning_rate": 4.5821772955258766e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 144790489.0, + "step": 43135 + }, + { + "entropy": 0.07486938592046499, + "epoch": 10.055950576990325, + "grad_norm": 0.04052734375, + "learning_rate": 4.58206339145298e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999890625476837, + "num_tokens": 144809884.0, + "step": 43140 + }, + { + "entropy": 0.06250056177377701, + "epoch": 10.057116214010957, + "grad_norm": 0.062255859375, + "learning_rate": 4.5819494749727673e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144843171.0, + "step": 43145 + }, + { + "entropy": 0.07772515416145324, + "epoch": 10.05828185103159, + "grad_norm": 0.1142578125, + "learning_rate": 4.581835546086936e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999781608581543, + "num_tokens": 144863844.0, + "step": 43150 + }, + { + "entropy": 0.06099540926516056, + "epoch": 10.059447488052221, + "grad_norm": 0.0255126953125, + "learning_rate": 4.581721604797186e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9997014939785004, + "num_tokens": 144875069.0, + "step": 43155 + }, + { + "entropy": 0.04972206288948655, + "epoch": 10.060613125072852, + "grad_norm": 0.0150146484375, + "learning_rate": 4.5816076511052156e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999792039394378, + "num_tokens": 144894553.0, + "step": 43160 + }, + { + "entropy": 0.07169721573591233, + "epoch": 10.061778762093484, + "grad_norm": 0.064453125, + "learning_rate": 4.5814936850127246e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999859213829041, + "num_tokens": 144910261.0, + "step": 43165 + }, + { + "entropy": 0.06280940165743232, + "epoch": 10.062944399114116, + "grad_norm": 0.0157470703125, + "learning_rate": 4.5813797065214114e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 144926046.0, + "step": 43170 + }, + { + "entropy": 0.06737614311277866, + "epoch": 10.064110036134748, + "grad_norm": 0.10009765625, + "learning_rate": 4.581265715632977e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144941287.0, + "step": 43175 + }, + { + "entropy": 0.06027284953743219, + "epoch": 10.06527567315538, + "grad_norm": 0.0159912109375, + "learning_rate": 4.5811517123491197e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9996774196624756, + "num_tokens": 144963613.0, + "step": 43180 + }, + { + "entropy": 0.05328985899686813, + "epoch": 10.06644131017601, + "grad_norm": 0.1435546875, + "learning_rate": 4.5810376966715415e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 144984479.0, + "step": 43185 + }, + { + "entropy": 0.057596872001886366, + "epoch": 10.067606947196643, + "grad_norm": 10.0625, + "learning_rate": 4.5809236686019404e-05, + "loss": 0.002, + "mean_token_accuracy": 0.9995798289775848, + "num_tokens": 144995502.0, + "step": 43190 + }, + { + "entropy": 0.05896051302552223, + "epoch": 10.068772584217275, + "grad_norm": 0.0291748046875, + "learning_rate": 4.580809628142018e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145009177.0, + "step": 43195 + }, + { + "entropy": 0.048872120585292576, + "epoch": 10.069938221237907, + "grad_norm": 0.1181640625, + "learning_rate": 4.5806955752934736e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145038730.0, + "step": 43200 + }, + { + "entropy": 0.055742715485394, + "epoch": 10.071103858258539, + "grad_norm": 0.031494140625, + "learning_rate": 4.5805815100580096e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 145048608.0, + "step": 43205 + }, + { + "entropy": 0.05829884652048349, + "epoch": 10.072269495279171, + "grad_norm": 0.034912109375, + "learning_rate": 4.580467432437326e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 145065347.0, + "step": 43210 + }, + { + "entropy": 0.069200593046844, + "epoch": 10.073435132299801, + "grad_norm": 0.0308837890625, + "learning_rate": 4.580353342433124e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 145077310.0, + "step": 43215 + }, + { + "entropy": 0.05498428577557206, + "epoch": 10.074600769320433, + "grad_norm": 0.0849609375, + "learning_rate": 4.580239240047104e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999893128871917, + "num_tokens": 145099238.0, + "step": 43220 + }, + { + "entropy": 0.0957097552716732, + "epoch": 10.075766406341065, + "grad_norm": 0.111328125, + "learning_rate": 4.5801251252809686e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145109037.0, + "step": 43225 + }, + { + "entropy": 0.06944717965088784, + "epoch": 10.076932043361698, + "grad_norm": 0.01708984375, + "learning_rate": 4.5800109981364194e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145127293.0, + "step": 43230 + }, + { + "entropy": 0.051800630846992136, + "epoch": 10.07809768038233, + "grad_norm": 0.039306640625, + "learning_rate": 4.579896858615157e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999708771705628, + "num_tokens": 145151103.0, + "step": 43235 + }, + { + "entropy": 0.05186516717076302, + "epoch": 10.07926331740296, + "grad_norm": 0.0260009765625, + "learning_rate": 4.5797827067188847e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145166630.0, + "step": 43240 + }, + { + "entropy": 0.07183772623538971, + "epoch": 10.080428954423592, + "grad_norm": 0.06884765625, + "learning_rate": 4.579668542449304e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999724447727203, + "num_tokens": 145179262.0, + "step": 43245 + }, + { + "entropy": 0.0538714830763638, + "epoch": 10.081594591444224, + "grad_norm": 0.435546875, + "learning_rate": 4.579554365808118e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 145207774.0, + "step": 43250 + }, + { + "entropy": 0.06259715519845485, + "epoch": 10.082760228464856, + "grad_norm": 0.08740234375, + "learning_rate": 4.579440176797028e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145229440.0, + "step": 43255 + }, + { + "entropy": 0.050983167439699176, + "epoch": 10.083925865485488, + "grad_norm": 0.0361328125, + "learning_rate": 4.5793259754177376e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145242812.0, + "step": 43260 + }, + { + "entropy": 0.08244118951261044, + "epoch": 10.085091502506119, + "grad_norm": 0.033203125, + "learning_rate": 4.5792117616719496e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145251750.0, + "step": 43265 + }, + { + "entropy": 0.06357784420251847, + "epoch": 10.08625713952675, + "grad_norm": 0.0267333984375, + "learning_rate": 4.579097535561367e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999827563762664, + "num_tokens": 145268581.0, + "step": 43270 + }, + { + "entropy": 0.07260563299059868, + "epoch": 10.087422776547383, + "grad_norm": 0.072265625, + "learning_rate": 4.578983297087693e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145278177.0, + "step": 43275 + }, + { + "entropy": 0.04490415137261152, + "epoch": 10.088588413568015, + "grad_norm": 0.029052734375, + "learning_rate": 4.5788690462526315e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145292038.0, + "step": 43280 + }, + { + "entropy": 0.04673054702579975, + "epoch": 10.089754050588647, + "grad_norm": 0.01177978515625, + "learning_rate": 4.5787547830578855e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 145309088.0, + "step": 43285 + }, + { + "entropy": 0.07409633975476027, + "epoch": 10.09091968760928, + "grad_norm": 0.035400390625, + "learning_rate": 4.578640507505159e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145321036.0, + "step": 43290 + }, + { + "entropy": 0.05097045348957181, + "epoch": 10.09208532462991, + "grad_norm": 0.10791015625, + "learning_rate": 4.5785262195961567e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999896228313446, + "num_tokens": 145345470.0, + "step": 43295 + }, + { + "entropy": 0.05900251679122448, + "epoch": 10.093250961650542, + "grad_norm": 0.0203857421875, + "learning_rate": 4.5784119193325824e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 145355866.0, + "step": 43300 + }, + { + "entropy": 0.04195380094461143, + "epoch": 10.094416598671174, + "grad_norm": 0.026123046875, + "learning_rate": 4.57829760671614e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999420940876007, + "num_tokens": 145391273.0, + "step": 43305 + }, + { + "entropy": 0.06040984988212585, + "epoch": 10.095582235691806, + "grad_norm": 0.021484375, + "learning_rate": 4.5781832817485345e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 145404008.0, + "step": 43310 + }, + { + "entropy": 0.05451854122802615, + "epoch": 10.096747872712438, + "grad_norm": 0.1279296875, + "learning_rate": 4.578068944431471e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 145428004.0, + "step": 43315 + }, + { + "entropy": 0.07288854941725731, + "epoch": 10.097913509733068, + "grad_norm": 0.0146484375, + "learning_rate": 4.577954594766653e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 145437308.0, + "step": 43320 + }, + { + "entropy": 0.04537887102924287, + "epoch": 10.0990791467537, + "grad_norm": 0.07470703125, + "learning_rate": 4.5778402327557874e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 145466503.0, + "step": 43325 + }, + { + "entropy": 0.05773584768176079, + "epoch": 10.100244783774333, + "grad_norm": 0.057861328125, + "learning_rate": 4.577725858400579e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999887704849243, + "num_tokens": 145488191.0, + "step": 43330 + }, + { + "entropy": 0.06356157390400767, + "epoch": 10.101410420794965, + "grad_norm": 0.018798828125, + "learning_rate": 4.577611471702733e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 145500787.0, + "step": 43335 + }, + { + "entropy": 0.05118578230030835, + "epoch": 10.102576057815597, + "grad_norm": 0.0164794921875, + "learning_rate": 4.5774970726639546e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999885082244873, + "num_tokens": 145524395.0, + "step": 43340 + }, + { + "entropy": 0.07704499680548907, + "epoch": 10.103741694836229, + "grad_norm": 0.02734375, + "learning_rate": 4.5773826612859505e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145535984.0, + "step": 43345 + }, + { + "entropy": 0.04595871288329363, + "epoch": 10.10490733185686, + "grad_norm": 0.1015625, + "learning_rate": 4.577268237570427e-05, + "loss": 0.0039, + "mean_token_accuracy": 0.9998726367950439, + "num_tokens": 145572556.0, + "step": 43350 + }, + { + "entropy": 0.05619607055559754, + "epoch": 10.106072968877491, + "grad_norm": 0.0299072265625, + "learning_rate": 4.577153801519089e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 145588353.0, + "step": 43355 + }, + { + "entropy": 0.05760320192202926, + "epoch": 10.107238605898123, + "grad_norm": 0.01513671875, + "learning_rate": 4.577039353133644e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999895393848419, + "num_tokens": 145613438.0, + "step": 43360 + }, + { + "entropy": 0.06066440977156162, + "epoch": 10.108404242918756, + "grad_norm": 0.111328125, + "learning_rate": 4.576924892415799e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999894499778748, + "num_tokens": 145635962.0, + "step": 43365 + }, + { + "entropy": 0.05439153090119362, + "epoch": 10.109569879939388, + "grad_norm": 0.0184326171875, + "learning_rate": 4.576810419367259e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.999989140033722, + "num_tokens": 145666330.0, + "step": 43370 + }, + { + "entropy": 0.06398234134539962, + "epoch": 10.110735516960018, + "grad_norm": 0.0252685546875, + "learning_rate": 4.5766959339897325e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145686348.0, + "step": 43375 + }, + { + "entropy": 0.06492618154734373, + "epoch": 10.11190115398065, + "grad_norm": 0.06298828125, + "learning_rate": 4.576581436284926e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145698197.0, + "step": 43380 + }, + { + "entropy": 0.05615097945556045, + "epoch": 10.113066791001282, + "grad_norm": 0.0478515625, + "learning_rate": 4.576466926254547e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145714330.0, + "step": 43385 + }, + { + "entropy": 0.047344126645475625, + "epoch": 10.114232428021914, + "grad_norm": 0.0849609375, + "learning_rate": 4.576352403900304e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145738619.0, + "step": 43390 + }, + { + "entropy": 0.05977406506426632, + "epoch": 10.115398065042546, + "grad_norm": 0.034423828125, + "learning_rate": 4.5762378692239025e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145761272.0, + "step": 43395 + }, + { + "entropy": 0.0675832625478506, + "epoch": 10.116563702063177, + "grad_norm": 0.060546875, + "learning_rate": 4.576123322227053e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999685943126678, + "num_tokens": 145782553.0, + "step": 43400 + }, + { + "entropy": 0.05482005216181278, + "epoch": 10.117729339083809, + "grad_norm": 0.04345703125, + "learning_rate": 4.576008762911461e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 145801907.0, + "step": 43405 + }, + { + "entropy": 0.06072709150612354, + "epoch": 10.11889497610444, + "grad_norm": 0.047119140625, + "learning_rate": 4.5758941912788364e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 145810627.0, + "step": 43410 + }, + { + "entropy": 0.04225499797612429, + "epoch": 10.120060613125073, + "grad_norm": 0.0546875, + "learning_rate": 4.575779607330887e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999840795993805, + "num_tokens": 145833376.0, + "step": 43415 + }, + { + "entropy": 0.06278912676498294, + "epoch": 10.121226250145705, + "grad_norm": 0.039306640625, + "learning_rate": 4.5756650110693225e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 145852253.0, + "step": 43420 + }, + { + "entropy": 0.05084234895184636, + "epoch": 10.122391887166337, + "grad_norm": 0.07666015625, + "learning_rate": 4.5755504024958493e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999851584434509, + "num_tokens": 145870162.0, + "step": 43425 + }, + { + "entropy": 0.048171498160809276, + "epoch": 10.123557524186968, + "grad_norm": 0.125, + "learning_rate": 4.575435781612179e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999879360198974, + "num_tokens": 145899437.0, + "step": 43430 + }, + { + "entropy": 0.04637369932606816, + "epoch": 10.1247231612076, + "grad_norm": 0.0791015625, + "learning_rate": 4.5753211484200195e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 145921647.0, + "step": 43435 + }, + { + "entropy": 0.046095291478559376, + "epoch": 10.125888798228232, + "grad_norm": 0.024169921875, + "learning_rate": 4.57520650292108e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 145952959.0, + "step": 43440 + }, + { + "entropy": 0.03387051681056619, + "epoch": 10.127054435248864, + "grad_norm": 0.11865234375, + "learning_rate": 4.57509184511707e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999893724918365, + "num_tokens": 145988615.0, + "step": 43445 + }, + { + "entropy": 0.07406612485647202, + "epoch": 10.128220072269496, + "grad_norm": 0.00927734375, + "learning_rate": 4.5749771750097e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146009149.0, + "step": 43450 + }, + { + "entropy": 0.05293272449634969, + "epoch": 10.129385709290126, + "grad_norm": 0.0269775390625, + "learning_rate": 4.5748624926006806e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999882042407989, + "num_tokens": 146036249.0, + "step": 43455 + }, + { + "entropy": 0.07195199579000473, + "epoch": 10.130551346310758, + "grad_norm": 0.2451171875, + "learning_rate": 4.5747477978917196e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146046878.0, + "step": 43460 + }, + { + "entropy": 0.049850366963073614, + "epoch": 10.13171698333139, + "grad_norm": 0.0308837890625, + "learning_rate": 4.5746330908845286e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9996632993221283, + "num_tokens": 146067672.0, + "step": 43465 + }, + { + "entropy": 0.046344765927642584, + "epoch": 10.132882620352023, + "grad_norm": 0.294921875, + "learning_rate": 4.574518371580818e-05, + "loss": 0.0071, + "mean_token_accuracy": 0.9994359970092773, + "num_tokens": 146102759.0, + "step": 43470 + }, + { + "entropy": 0.058653326518833634, + "epoch": 10.134048257372655, + "grad_norm": 0.0208740234375, + "learning_rate": 4.574403639982298e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 146124319.0, + "step": 43475 + }, + { + "entropy": 0.06518659181892872, + "epoch": 10.135213894393287, + "grad_norm": 0.0206298828125, + "learning_rate": 4.57428889609068e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146136593.0, + "step": 43480 + }, + { + "entropy": 0.05135323880240321, + "epoch": 10.136379531413917, + "grad_norm": 0.048583984375, + "learning_rate": 4.574174139907675e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146158241.0, + "step": 43485 + }, + { + "entropy": 0.18844906222075225, + "epoch": 10.13754516843455, + "grad_norm": 5.40625, + "learning_rate": 4.574059371434993e-05, + "loss": 0.2538, + "mean_token_accuracy": 0.9702293097972869, + "num_tokens": 146197255.0, + "step": 43490 + }, + { + "entropy": 0.03568605692125857, + "epoch": 10.138710805455181, + "grad_norm": 0.046630859375, + "learning_rate": 4.573944590674347e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999648869037628, + "num_tokens": 146230349.0, + "step": 43495 + }, + { + "entropy": 0.05035318732261658, + "epoch": 10.139876442475813, + "grad_norm": 0.0361328125, + "learning_rate": 4.573829797627449e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 146243193.0, + "step": 43500 + }, + { + "entropy": 0.06015748176723719, + "epoch": 10.141042079496446, + "grad_norm": 0.03857421875, + "learning_rate": 4.573714992296008e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999892771244049, + "num_tokens": 146266143.0, + "step": 43505 + }, + { + "entropy": 0.07109114537015557, + "epoch": 10.142207716517076, + "grad_norm": 0.0537109375, + "learning_rate": 4.573600174681738e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146284877.0, + "step": 43510 + }, + { + "entropy": 0.06056566163897514, + "epoch": 10.143373353537708, + "grad_norm": 0.1669921875, + "learning_rate": 4.573485344786351e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999842584133148, + "num_tokens": 146310349.0, + "step": 43515 + }, + { + "entropy": 0.040589083032682535, + "epoch": 10.14453899055834, + "grad_norm": 0.06689453125, + "learning_rate": 4.5733705026115584e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146340851.0, + "step": 43520 + }, + { + "entropy": 0.06647374760359526, + "epoch": 10.145704627578972, + "grad_norm": 0.05419921875, + "learning_rate": 4.5732556481590736e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146354591.0, + "step": 43525 + }, + { + "entropy": 0.0501191689632833, + "epoch": 10.146870264599604, + "grad_norm": 0.01611328125, + "learning_rate": 4.573140781430608e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146369355.0, + "step": 43530 + }, + { + "entropy": 0.0644486665725708, + "epoch": 10.148035901620235, + "grad_norm": 0.10791015625, + "learning_rate": 4.573025902427876e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146378703.0, + "step": 43535 + }, + { + "entropy": 0.048806856386363506, + "epoch": 10.149201538640867, + "grad_norm": 0.0177001953125, + "learning_rate": 4.5729110111525904e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999835252761841, + "num_tokens": 146406804.0, + "step": 43540 + }, + { + "entropy": 0.060486155655235053, + "epoch": 10.150367175661499, + "grad_norm": 0.0186767578125, + "learning_rate": 4.572796107606463e-05, + "loss": 0.0001, + "mean_token_accuracy": 0.9999890804290772, + "num_tokens": 146435773.0, + "step": 43545 + }, + { + "entropy": 0.057251500664278865, + "epoch": 10.151532812682131, + "grad_norm": 0.0269775390625, + "learning_rate": 4.572681191791208e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999665915966034, + "num_tokens": 146462467.0, + "step": 43550 + }, + { + "entropy": 0.0698289708700031, + "epoch": 10.152698449702763, + "grad_norm": 1.71875, + "learning_rate": 4.57256626370854e-05, + "loss": 0.0159, + "mean_token_accuracy": 0.9981661915779114, + "num_tokens": 146489551.0, + "step": 43555 + }, + { + "entropy": 0.06030179150402546, + "epoch": 10.153864086723395, + "grad_norm": 0.0712890625, + "learning_rate": 4.572451323360171e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 146502896.0, + "step": 43560 + }, + { + "entropy": 0.06003944650292396, + "epoch": 10.155029723744025, + "grad_norm": 0.01373291015625, + "learning_rate": 4.572336370747816e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 146513190.0, + "step": 43565 + }, + { + "entropy": 0.0528954841196537, + "epoch": 10.156195360764658, + "grad_norm": 0.0537109375, + "learning_rate": 4.572221405873189e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 146529473.0, + "step": 43570 + }, + { + "entropy": 0.05009576361626387, + "epoch": 10.15736099778529, + "grad_norm": 0.04638671875, + "learning_rate": 4.572106428738003e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 146547608.0, + "step": 43575 + }, + { + "entropy": 0.05700251702219248, + "epoch": 10.158526634805922, + "grad_norm": 0.07421875, + "learning_rate": 4.571991439343976e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146575736.0, + "step": 43580 + }, + { + "entropy": 0.058728303946554664, + "epoch": 10.159692271826554, + "grad_norm": 0.049560546875, + "learning_rate": 4.571876437692818e-05, + "loss": 0.0101, + "mean_token_accuracy": 0.9980162858963013, + "num_tokens": 146603676.0, + "step": 43585 + }, + { + "entropy": 0.05301784612238407, + "epoch": 10.160857908847184, + "grad_norm": 0.0189208984375, + "learning_rate": 4.571761423786247e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146622318.0, + "step": 43590 + }, + { + "entropy": 0.07999129854142666, + "epoch": 10.162023545867816, + "grad_norm": 0.034423828125, + "learning_rate": 4.571646397625977e-05, + "loss": 0.0067, + "mean_token_accuracy": 0.9984594106674194, + "num_tokens": 146645929.0, + "step": 43595 + }, + { + "entropy": 0.07351434212177992, + "epoch": 10.163189182888448, + "grad_norm": 0.06787109375, + "learning_rate": 4.5715313592137235e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146661177.0, + "step": 43600 + }, + { + "entropy": 0.05000458974391222, + "epoch": 10.16435481990908, + "grad_norm": 0.025146484375, + "learning_rate": 4.5714163085512024e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999616265296936, + "num_tokens": 146687619.0, + "step": 43605 + }, + { + "entropy": 0.05602185511961579, + "epoch": 10.165520456929713, + "grad_norm": 0.1865234375, + "learning_rate": 4.571301245640128e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999705493450165, + "num_tokens": 146710382.0, + "step": 43610 + }, + { + "entropy": 0.0893972160294652, + "epoch": 10.166686093950345, + "grad_norm": 0.080078125, + "learning_rate": 4.5711861704822165e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 146722652.0, + "step": 43615 + }, + { + "entropy": 0.053737116511911154, + "epoch": 10.167851730970975, + "grad_norm": 0.019775390625, + "learning_rate": 4.5710710830791846e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999891698360444, + "num_tokens": 146751021.0, + "step": 43620 + }, + { + "entropy": 0.04591502221301198, + "epoch": 10.169017367991607, + "grad_norm": 0.026611328125, + "learning_rate": 4.5709559834327475e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999671936035156, + "num_tokens": 146784280.0, + "step": 43625 + }, + { + "entropy": 0.06887899702414871, + "epoch": 10.17018300501224, + "grad_norm": 0.030029296875, + "learning_rate": 4.570840871544622e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999736905097961, + "num_tokens": 146823176.0, + "step": 43630 + }, + { + "entropy": 0.045278893690556286, + "epoch": 10.171348642032871, + "grad_norm": 0.020751953125, + "learning_rate": 4.570725747416525e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146861771.0, + "step": 43635 + }, + { + "entropy": 0.06565503738820552, + "epoch": 10.172514279053503, + "grad_norm": 0.062255859375, + "learning_rate": 4.5706106110501725e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 146883279.0, + "step": 43640 + }, + { + "entropy": 0.07009013332426547, + "epoch": 10.173679916074134, + "grad_norm": 0.06201171875, + "learning_rate": 4.570495462447281e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 146893649.0, + "step": 43645 + }, + { + "entropy": 0.06315956339240074, + "epoch": 10.174845553094766, + "grad_norm": 0.0498046875, + "learning_rate": 4.570380301609568e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 146904111.0, + "step": 43650 + }, + { + "entropy": 0.056829939410090444, + "epoch": 10.176011190115398, + "grad_norm": 0.0247802734375, + "learning_rate": 4.570265128538751e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 146917594.0, + "step": 43655 + }, + { + "entropy": 0.05483479988761246, + "epoch": 10.17717682713603, + "grad_norm": 0.01177978515625, + "learning_rate": 4.5701499432365474e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 146936107.0, + "step": 43660 + }, + { + "entropy": 0.04961704201996327, + "epoch": 10.178342464156662, + "grad_norm": 0.0194091796875, + "learning_rate": 4.570034745704674e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 146954632.0, + "step": 43665 + }, + { + "entropy": 0.047052628640085456, + "epoch": 10.179508101177293, + "grad_norm": 0.019287109375, + "learning_rate": 4.56991953594485e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.999933409690857, + "num_tokens": 146974621.0, + "step": 43670 + }, + { + "entropy": 0.06985645415261388, + "epoch": 10.180673738197925, + "grad_norm": 0.0380859375, + "learning_rate": 4.569804313958792e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 146998567.0, + "step": 43675 + }, + { + "entropy": 0.05309359449893236, + "epoch": 10.181839375218557, + "grad_norm": 0.0380859375, + "learning_rate": 4.569689079748219e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147017026.0, + "step": 43680 + }, + { + "entropy": 0.058295493759214875, + "epoch": 10.183005012239189, + "grad_norm": 0.0201416015625, + "learning_rate": 4.569573833314849e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 147034319.0, + "step": 43685 + }, + { + "entropy": 0.05003042472526431, + "epoch": 10.184170649259821, + "grad_norm": 0.0263671875, + "learning_rate": 4.5694585746604e-05, + "loss": 0.0029, + "mean_token_accuracy": 0.9996855318546295, + "num_tokens": 147052788.0, + "step": 43690 + }, + { + "entropy": 0.04796261852607131, + "epoch": 10.185336286280453, + "grad_norm": 0.06396484375, + "learning_rate": 4.5693433037865905e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 147076027.0, + "step": 43695 + }, + { + "entropy": 0.04751263782382011, + "epoch": 10.186501923301083, + "grad_norm": 0.036376953125, + "learning_rate": 4.569228020695141e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147099569.0, + "step": 43700 + }, + { + "entropy": 0.050695561431348325, + "epoch": 10.187667560321715, + "grad_norm": 0.019775390625, + "learning_rate": 4.569112725387769e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147117475.0, + "step": 43705 + }, + { + "entropy": 0.04532196051441133, + "epoch": 10.188833197342348, + "grad_norm": 0.06640625, + "learning_rate": 4.568997417866195e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 147147684.0, + "step": 43710 + }, + { + "entropy": 0.052403824776411055, + "epoch": 10.18999883436298, + "grad_norm": 0.038330078125, + "learning_rate": 4.5688820981321364e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147172304.0, + "step": 43715 + }, + { + "entropy": 0.054084846563637254, + "epoch": 10.191164471383612, + "grad_norm": 0.0125732421875, + "learning_rate": 4.5687667661873154e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147191474.0, + "step": 43720 + }, + { + "entropy": 0.06174150565639138, + "epoch": 10.192330108404242, + "grad_norm": 0.01422119140625, + "learning_rate": 4.5686514220334495e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.999846625328064, + "num_tokens": 147206627.0, + "step": 43725 + }, + { + "entropy": 0.07426982149481773, + "epoch": 10.193495745424874, + "grad_norm": 0.033935546875, + "learning_rate": 4.56853606567226e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9997890293598175, + "num_tokens": 147216221.0, + "step": 43730 + }, + { + "entropy": 0.04849873506464064, + "epoch": 10.194661382445506, + "grad_norm": 0.0361328125, + "learning_rate": 4.5684206971054666e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147243363.0, + "step": 43735 + }, + { + "entropy": 0.05038243047893047, + "epoch": 10.195827019466138, + "grad_norm": 0.01953125, + "learning_rate": 4.568305316334789e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147265800.0, + "step": 43740 + }, + { + "entropy": 0.06462544896639884, + "epoch": 10.19699265648677, + "grad_norm": 0.0169677734375, + "learning_rate": 4.568189923361949e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999897480010986, + "num_tokens": 147294528.0, + "step": 43745 + }, + { + "entropy": 0.04675520276650787, + "epoch": 10.198158293507403, + "grad_norm": 0.0262451171875, + "learning_rate": 4.5680745181886665e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147316504.0, + "step": 43750 + }, + { + "entropy": 0.06306911818683147, + "epoch": 10.199323930528033, + "grad_norm": 0.38671875, + "learning_rate": 4.567959100816663e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147338573.0, + "step": 43755 + }, + { + "entropy": 0.05053731370717287, + "epoch": 10.200489567548665, + "grad_norm": 0.0257568359375, + "learning_rate": 4.567843671247658e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147349623.0, + "step": 43760 + }, + { + "entropy": 0.04980887686833739, + "epoch": 10.201655204569297, + "grad_norm": 0.078125, + "learning_rate": 4.567728229483374e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147370342.0, + "step": 43765 + }, + { + "entropy": 0.06197886522859335, + "epoch": 10.20282084158993, + "grad_norm": 0.0498046875, + "learning_rate": 4.567612775525533e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147386351.0, + "step": 43770 + }, + { + "entropy": 0.049138284660875796, + "epoch": 10.203986478610561, + "grad_norm": 0.041748046875, + "learning_rate": 4.567497309375854e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9997716903686523, + "num_tokens": 147403614.0, + "step": 43775 + }, + { + "entropy": 0.0651495123282075, + "epoch": 10.205152115631192, + "grad_norm": 0.01806640625, + "learning_rate": 4.567381831036062e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999730885028839, + "num_tokens": 147428634.0, + "step": 43780 + }, + { + "entropy": 0.04915805528871715, + "epoch": 10.206317752651824, + "grad_norm": 0.0263671875, + "learning_rate": 4.5672663405078775e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147448143.0, + "step": 43785 + }, + { + "entropy": 0.07239555716514587, + "epoch": 10.207483389672456, + "grad_norm": 0.0166015625, + "learning_rate": 4.5671508377930224e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147467194.0, + "step": 43790 + }, + { + "entropy": 0.053422879241406915, + "epoch": 10.208649026693088, + "grad_norm": 0.029541015625, + "learning_rate": 4.567035322893219e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147486871.0, + "step": 43795 + }, + { + "entropy": 0.049904408678412435, + "epoch": 10.20981466371372, + "grad_norm": 0.0361328125, + "learning_rate": 4.56691979581019e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147509184.0, + "step": 43800 + }, + { + "entropy": 0.05743863061070442, + "epoch": 10.21098030073435, + "grad_norm": 0.022705078125, + "learning_rate": 4.566804256545659e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9995495498180389, + "num_tokens": 147523305.0, + "step": 43805 + }, + { + "entropy": 0.047840119106695056, + "epoch": 10.212145937754983, + "grad_norm": 0.026123046875, + "learning_rate": 4.5666887051013466e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147554093.0, + "step": 43810 + }, + { + "entropy": 0.05289085814729333, + "epoch": 10.213311574775615, + "grad_norm": 0.045166015625, + "learning_rate": 4.566573141478978e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9997448980808258, + "num_tokens": 147583164.0, + "step": 43815 + }, + { + "entropy": 0.046384062990546225, + "epoch": 10.214477211796247, + "grad_norm": 0.0267333984375, + "learning_rate": 4.566457565680275e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9995967745780945, + "num_tokens": 147607328.0, + "step": 43820 + }, + { + "entropy": 0.05956487860530615, + "epoch": 10.215642848816879, + "grad_norm": 0.0225830078125, + "learning_rate": 4.566341977706963e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147632027.0, + "step": 43825 + }, + { + "entropy": 0.057423918321728704, + "epoch": 10.216808485837511, + "grad_norm": 0.062255859375, + "learning_rate": 4.5662263775607636e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147646068.0, + "step": 43830 + }, + { + "entropy": 0.0800579123198986, + "epoch": 10.217974122858141, + "grad_norm": 0.03125, + "learning_rate": 4.566110765243401e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147662620.0, + "step": 43835 + }, + { + "entropy": 0.0751149789430201, + "epoch": 10.219139759878773, + "grad_norm": 0.0263671875, + "learning_rate": 4.5659951407565995e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147676792.0, + "step": 43840 + }, + { + "entropy": 0.057619784399867056, + "epoch": 10.220305396899406, + "grad_norm": 0.08935546875, + "learning_rate": 4.5658795041020834e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999888896942138, + "num_tokens": 147701947.0, + "step": 43845 + }, + { + "entropy": 0.07080321535468101, + "epoch": 10.221471033920038, + "grad_norm": 0.037109375, + "learning_rate": 4.565763855281576e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147711260.0, + "step": 43850 + }, + { + "entropy": 0.042559560388326645, + "epoch": 10.22263667094067, + "grad_norm": 0.0191650390625, + "learning_rate": 4.5656481942968035e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999866664409638, + "num_tokens": 147741647.0, + "step": 43855 + }, + { + "entropy": 0.05230944976210594, + "epoch": 10.2238023079613, + "grad_norm": 0.0263671875, + "learning_rate": 4.5655325211494896e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999886631965638, + "num_tokens": 147771976.0, + "step": 43860 + }, + { + "entropy": 0.07495128940790892, + "epoch": 10.224967944981932, + "grad_norm": 0.0390625, + "learning_rate": 4.565416835841358e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147786026.0, + "step": 43865 + }, + { + "entropy": 0.060472107119858266, + "epoch": 10.226133582002564, + "grad_norm": 0.0242919921875, + "learning_rate": 4.565301138374136e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999890267848969, + "num_tokens": 147806748.0, + "step": 43870 + }, + { + "entropy": 0.04094974570907652, + "epoch": 10.227299219023196, + "grad_norm": 0.036376953125, + "learning_rate": 4.5651854287495474e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147829803.0, + "step": 43875 + }, + { + "entropy": 0.06098997332155705, + "epoch": 10.228464856043828, + "grad_norm": 0.0224609375, + "learning_rate": 4.5650697069693185e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147841209.0, + "step": 43880 + }, + { + "entropy": 0.062342527136206624, + "epoch": 10.22963049306446, + "grad_norm": 0.028564453125, + "learning_rate": 4.564953973035174e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147857426.0, + "step": 43885 + }, + { + "entropy": 0.054973821109160784, + "epoch": 10.230796130085091, + "grad_norm": 0.54296875, + "learning_rate": 4.5648382269488396e-05, + "loss": 0.0011, + "mean_token_accuracy": 1.0, + "num_tokens": 147876279.0, + "step": 43890 + }, + { + "entropy": 0.05381541419774294, + "epoch": 10.231961767105723, + "grad_norm": 0.01513671875, + "learning_rate": 4.5647224687120416e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.999957150220871, + "num_tokens": 147904289.0, + "step": 43895 + }, + { + "entropy": 0.042642751894891265, + "epoch": 10.233127404126355, + "grad_norm": 0.0225830078125, + "learning_rate": 4.564606698326506e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 147930693.0, + "step": 43900 + }, + { + "entropy": 0.06699599586427211, + "epoch": 10.234293041146987, + "grad_norm": 0.040771484375, + "learning_rate": 4.5644909157939605e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147939675.0, + "step": 43905 + }, + { + "entropy": 0.061661123856902124, + "epoch": 10.23545867816762, + "grad_norm": 0.09033203125, + "learning_rate": 4.564375121116129e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 147954674.0, + "step": 43910 + }, + { + "entropy": 0.047797265090048315, + "epoch": 10.23662431518825, + "grad_norm": 0.03564453125, + "learning_rate": 4.5642593142947404e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9997142851352692, + "num_tokens": 147991518.0, + "step": 43915 + }, + { + "entropy": 0.04810534734278917, + "epoch": 10.237789952208882, + "grad_norm": 0.04443359375, + "learning_rate": 4.5641434953315205e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999752402305603, + "num_tokens": 148017831.0, + "step": 43920 + }, + { + "entropy": 0.06693259365856648, + "epoch": 10.238955589229514, + "grad_norm": 0.7890625, + "learning_rate": 4.564027664228196e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9996845424175262, + "num_tokens": 148029953.0, + "step": 43925 + }, + { + "entropy": 0.060024634934961796, + "epoch": 10.240121226250146, + "grad_norm": 0.015380859375, + "learning_rate": 4.563911820986495e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148044411.0, + "step": 43930 + }, + { + "entropy": 0.08300753589719534, + "epoch": 10.241286863270778, + "grad_norm": 0.0299072265625, + "learning_rate": 4.563795965608145e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148054513.0, + "step": 43935 + }, + { + "entropy": 0.057464625034481284, + "epoch": 10.242452500291408, + "grad_norm": 0.0164794921875, + "learning_rate": 4.563680098094874e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148070212.0, + "step": 43940 + }, + { + "entropy": 0.04505125693976879, + "epoch": 10.24361813731204, + "grad_norm": 0.026123046875, + "learning_rate": 4.5635642184484076e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9993670880794525, + "num_tokens": 148102139.0, + "step": 43945 + }, + { + "entropy": 0.07114105895161629, + "epoch": 10.244783774332673, + "grad_norm": 0.039306640625, + "learning_rate": 4.563448326670475e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148113838.0, + "step": 43950 + }, + { + "entropy": 0.04451997820287943, + "epoch": 10.245949411353305, + "grad_norm": 0.0203857421875, + "learning_rate": 4.5633324227628044e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999620139598846, + "num_tokens": 148135553.0, + "step": 43955 + }, + { + "entropy": 0.05354665564373136, + "epoch": 10.247115048373937, + "grad_norm": 0.0419921875, + "learning_rate": 4.5632165067271246e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148163500.0, + "step": 43960 + }, + { + "entropy": 0.07594393603503705, + "epoch": 10.248280685394569, + "grad_norm": 0.0147705078125, + "learning_rate": 4.5631005785651625e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 148175929.0, + "step": 43965 + }, + { + "entropy": 0.07074794918298721, + "epoch": 10.2494463224152, + "grad_norm": 0.0233154296875, + "learning_rate": 4.562984638278649e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 148184960.0, + "step": 43970 + }, + { + "entropy": 0.04213942000642419, + "epoch": 10.250611959435831, + "grad_norm": 0.0390625, + "learning_rate": 4.562868685869311e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999892354011536, + "num_tokens": 148218408.0, + "step": 43975 + }, + { + "entropy": 0.05735755441710353, + "epoch": 10.251777596456463, + "grad_norm": 0.0135498046875, + "learning_rate": 4.5627527213388786e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148244867.0, + "step": 43980 + }, + { + "entropy": 0.07946206014603377, + "epoch": 10.252943233477096, + "grad_norm": 0.09716796875, + "learning_rate": 4.5626367446890806e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 148266308.0, + "step": 43985 + }, + { + "entropy": 0.04815430268645286, + "epoch": 10.254108870497728, + "grad_norm": 0.03125, + "learning_rate": 4.562520755921647e-05, + "loss": 0.0058, + "mean_token_accuracy": 0.9998537182807923, + "num_tokens": 148288621.0, + "step": 43990 + }, + { + "entropy": 0.04941950519569218, + "epoch": 10.255274507518358, + "grad_norm": 0.017333984375, + "learning_rate": 4.562404755038305e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9997955024242401, + "num_tokens": 148315529.0, + "step": 43995 + }, + { + "entropy": 0.07427412495017052, + "epoch": 10.25644014453899, + "grad_norm": 0.0233154296875, + "learning_rate": 4.562288742040788e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148325547.0, + "step": 44000 + }, + { + "entropy": 0.04696356984786689, + "epoch": 10.257605781559622, + "grad_norm": 0.392578125, + "learning_rate": 4.5621727169308237e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9997361481189728, + "num_tokens": 148354682.0, + "step": 44005 + }, + { + "entropy": 0.04103820836171508, + "epoch": 10.258771418580254, + "grad_norm": 0.0240478515625, + "learning_rate": 4.562056679710142e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 148391296.0, + "step": 44010 + }, + { + "entropy": 0.0682970798574388, + "epoch": 10.259937055600886, + "grad_norm": 0.2431640625, + "learning_rate": 4.561940630380475e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 148403790.0, + "step": 44015 + }, + { + "entropy": 0.04868617909960449, + "epoch": 10.261102692621517, + "grad_norm": 0.0240478515625, + "learning_rate": 4.561824568943551e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 148423715.0, + "step": 44020 + }, + { + "entropy": 0.05339447120204568, + "epoch": 10.262268329642149, + "grad_norm": 0.013427734375, + "learning_rate": 4.561708495401102e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148439737.0, + "step": 44025 + }, + { + "entropy": 0.059647050127387045, + "epoch": 10.263433966662781, + "grad_norm": 0.021728515625, + "learning_rate": 4.561592409754858e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148451360.0, + "step": 44030 + }, + { + "entropy": 0.049726602528244256, + "epoch": 10.264599603683413, + "grad_norm": 0.0211181640625, + "learning_rate": 4.5614763120065505e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999891757965088, + "num_tokens": 148474790.0, + "step": 44035 + }, + { + "entropy": 0.04956343998201192, + "epoch": 10.265765240704045, + "grad_norm": 0.07177734375, + "learning_rate": 4.5613602021579105e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9995555579662323, + "num_tokens": 148503518.0, + "step": 44040 + }, + { + "entropy": 0.047596500441432, + "epoch": 10.266930877724677, + "grad_norm": 0.020751953125, + "learning_rate": 4.5612440802106706e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 148529529.0, + "step": 44045 + }, + { + "entropy": 0.07422376275062562, + "epoch": 10.268096514745308, + "grad_norm": 0.1083984375, + "learning_rate": 4.56112794616656e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 148543502.0, + "step": 44050 + }, + { + "entropy": 0.04665753650479019, + "epoch": 10.26926215176594, + "grad_norm": 0.0135498046875, + "learning_rate": 4.5610118000273126e-05, + "loss": 0.0007, + "mean_token_accuracy": 1.0, + "num_tokens": 148569697.0, + "step": 44055 + }, + { + "entropy": 0.05942644737660885, + "epoch": 10.270427788786572, + "grad_norm": 0.189453125, + "learning_rate": 4.560895641794659e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.99957115650177, + "num_tokens": 148592942.0, + "step": 44060 + }, + { + "entropy": 0.059556891489773986, + "epoch": 10.271593425807204, + "grad_norm": 0.07666015625, + "learning_rate": 4.5607794714703325e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 148615967.0, + "step": 44065 + }, + { + "entropy": 0.056974831596016885, + "epoch": 10.272759062827836, + "grad_norm": 0.037109375, + "learning_rate": 4.5606632890560634e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 148646794.0, + "step": 44070 + }, + { + "entropy": 0.05813450757414103, + "epoch": 10.273924699848466, + "grad_norm": 0.037353515625, + "learning_rate": 4.560547094553586e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999894082546235, + "num_tokens": 148669486.0, + "step": 44075 + }, + { + "entropy": 0.06914445832371711, + "epoch": 10.275090336869098, + "grad_norm": 0.0159912109375, + "learning_rate": 4.5604308879646326e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 148680933.0, + "step": 44080 + }, + { + "entropy": 0.05653935582377016, + "epoch": 10.27625597388973, + "grad_norm": 0.03125, + "learning_rate": 4.5603146692909356e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 148703638.0, + "step": 44085 + }, + { + "entropy": 0.050142921041697264, + "epoch": 10.277421610910363, + "grad_norm": 0.0184326171875, + "learning_rate": 4.5601984385342275e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148720770.0, + "step": 44090 + }, + { + "entropy": 0.051549593452364206, + "epoch": 10.278587247930995, + "grad_norm": 0.033203125, + "learning_rate": 4.5600821956962424e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999508917331695, + "num_tokens": 148754773.0, + "step": 44095 + }, + { + "entropy": 0.06519920136779547, + "epoch": 10.279752884951627, + "grad_norm": 0.0771484375, + "learning_rate": 4.559965940778713e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 148772964.0, + "step": 44100 + }, + { + "entropy": 0.059289961494505404, + "epoch": 10.280918521972257, + "grad_norm": 0.1865234375, + "learning_rate": 4.559849673783374e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 148787873.0, + "step": 44105 + }, + { + "entropy": 0.06752317813225091, + "epoch": 10.28208415899289, + "grad_norm": 0.0228271484375, + "learning_rate": 4.559733394711958e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148802377.0, + "step": 44110 + }, + { + "entropy": 0.04858728079125285, + "epoch": 10.283249796013521, + "grad_norm": 0.427734375, + "learning_rate": 4.559617103566199e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999848067760467, + "num_tokens": 148836126.0, + "step": 44115 + }, + { + "entropy": 0.04201396256685257, + "epoch": 10.284415433034154, + "grad_norm": 0.134765625, + "learning_rate": 4.559500800347831e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 148855915.0, + "step": 44120 + }, + { + "entropy": 0.08059627935290337, + "epoch": 10.285581070054786, + "grad_norm": 0.0234375, + "learning_rate": 4.559384485058589e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 148874005.0, + "step": 44125 + }, + { + "entropy": 0.044201585277915, + "epoch": 10.286746707075416, + "grad_norm": 0.013427734375, + "learning_rate": 4.559268157700206e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148895357.0, + "step": 44130 + }, + { + "entropy": 0.053767064865678546, + "epoch": 10.287912344096048, + "grad_norm": 0.00860595703125, + "learning_rate": 4.559151818274419e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148908927.0, + "step": 44135 + }, + { + "entropy": 0.0679398087784648, + "epoch": 10.28907798111668, + "grad_norm": 0.0218505859375, + "learning_rate": 4.5590354667829606e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148919535.0, + "step": 44140 + }, + { + "entropy": 0.05648909267038107, + "epoch": 10.290243618137312, + "grad_norm": 0.068359375, + "learning_rate": 4.5589191032275665e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 148941648.0, + "step": 44145 + }, + { + "entropy": 0.05573352323845029, + "epoch": 10.291409255157944, + "grad_norm": 0.01904296875, + "learning_rate": 4.558802727609972e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 148969075.0, + "step": 44150 + }, + { + "entropy": 0.0648234311491251, + "epoch": 10.292574892178575, + "grad_norm": 0.05517578125, + "learning_rate": 4.558686339931912e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148987435.0, + "step": 44155 + }, + { + "entropy": 0.05349012557417154, + "epoch": 10.293740529199207, + "grad_norm": 0.0361328125, + "learning_rate": 4.558569940195122e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 148999688.0, + "step": 44160 + }, + { + "entropy": 0.059884701296687125, + "epoch": 10.294906166219839, + "grad_norm": 0.33984375, + "learning_rate": 4.558453528401339e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9997921824455261, + "num_tokens": 149022227.0, + "step": 44165 + }, + { + "entropy": 0.06587412673979998, + "epoch": 10.296071803240471, + "grad_norm": 0.01300048828125, + "learning_rate": 4.5583371045522975e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149035404.0, + "step": 44170 + }, + { + "entropy": 0.06613674918189645, + "epoch": 10.297237440261103, + "grad_norm": 0.1201171875, + "learning_rate": 4.558220668649734e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149063640.0, + "step": 44175 + }, + { + "entropy": 0.07241187756881118, + "epoch": 10.298403077281735, + "grad_norm": 0.12353515625, + "learning_rate": 4.558104220695385e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 149088361.0, + "step": 44180 + }, + { + "entropy": 0.05228366181254387, + "epoch": 10.299568714302366, + "grad_norm": 0.0927734375, + "learning_rate": 4.557987760690986e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149111431.0, + "step": 44185 + }, + { + "entropy": 0.05424889232963324, + "epoch": 10.300734351322998, + "grad_norm": 0.1416015625, + "learning_rate": 4.557871288638275e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999862551689148, + "num_tokens": 149128277.0, + "step": 44190 + }, + { + "entropy": 0.0512849391438067, + "epoch": 10.30189998834363, + "grad_norm": 0.10498046875, + "learning_rate": 4.557754804538987e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999889254570007, + "num_tokens": 149150972.0, + "step": 44195 + }, + { + "entropy": 0.06689666323363781, + "epoch": 10.303065625364262, + "grad_norm": 0.08447265625, + "learning_rate": 4.55763830839486e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 149170074.0, + "step": 44200 + }, + { + "entropy": 0.060508682392537594, + "epoch": 10.304231262384894, + "grad_norm": 0.2451171875, + "learning_rate": 4.557521800207632e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999882757663727, + "num_tokens": 149198425.0, + "step": 44205 + }, + { + "entropy": 0.07404768951237202, + "epoch": 10.305396899405524, + "grad_norm": 0.0322265625, + "learning_rate": 4.557405279979039e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149206503.0, + "step": 44210 + }, + { + "entropy": 0.05456382445991039, + "epoch": 10.306562536426156, + "grad_norm": 0.0213623046875, + "learning_rate": 4.5572887477108184e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149226799.0, + "step": 44215 + }, + { + "entropy": 0.0596602609846741, + "epoch": 10.307728173446788, + "grad_norm": 0.0272216796875, + "learning_rate": 4.557172203404709e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999854505062103, + "num_tokens": 149247136.0, + "step": 44220 + }, + { + "entropy": 0.07430512486025691, + "epoch": 10.30889381046742, + "grad_norm": 0.040771484375, + "learning_rate": 4.5570556470624475e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149259111.0, + "step": 44225 + }, + { + "entropy": 0.048424013145267966, + "epoch": 10.310059447488053, + "grad_norm": 0.02490234375, + "learning_rate": 4.5569390786857725e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149285200.0, + "step": 44230 + }, + { + "entropy": 0.0636987192556262, + "epoch": 10.311225084508685, + "grad_norm": 0.027099609375, + "learning_rate": 4.5568224982764225e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999885141849518, + "num_tokens": 149312337.0, + "step": 44235 + }, + { + "entropy": 0.03847971297800541, + "epoch": 10.312390721529315, + "grad_norm": 0.0196533203125, + "learning_rate": 4.556705905836135e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149333963.0, + "step": 44240 + }, + { + "entropy": 0.05180752845481038, + "epoch": 10.313556358549947, + "grad_norm": 0.064453125, + "learning_rate": 4.5565893013666495e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149360883.0, + "step": 44245 + }, + { + "entropy": 0.054119398910552265, + "epoch": 10.31472199557058, + "grad_norm": 0.02392578125, + "learning_rate": 4.556472684869704e-05, + "loss": 0.0008, + "mean_token_accuracy": 1.0, + "num_tokens": 149387376.0, + "step": 44250 + }, + { + "entropy": 0.07822755612432956, + "epoch": 10.315887632591211, + "grad_norm": 0.036865234375, + "learning_rate": 4.556356056347038e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149395371.0, + "step": 44255 + }, + { + "entropy": 0.06847381629049779, + "epoch": 10.317053269611844, + "grad_norm": 0.23046875, + "learning_rate": 4.5562394158003906e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149407104.0, + "step": 44260 + }, + { + "entropy": 0.06189041472971439, + "epoch": 10.318218906632474, + "grad_norm": 0.0125732421875, + "learning_rate": 4.5561227632315e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149418449.0, + "step": 44265 + }, + { + "entropy": 0.05848919115960598, + "epoch": 10.319384543653106, + "grad_norm": 0.046142578125, + "learning_rate": 4.556006098642107e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9998461544513703, + "num_tokens": 149432000.0, + "step": 44270 + }, + { + "entropy": 0.047734024748206136, + "epoch": 10.320550180673738, + "grad_norm": 0.03173828125, + "learning_rate": 4.555889422033951e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149477400.0, + "step": 44275 + }, + { + "entropy": 0.0656460294034332, + "epoch": 10.32171581769437, + "grad_norm": 0.05810546875, + "learning_rate": 4.555772733408772e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149499213.0, + "step": 44280 + }, + { + "entropy": 0.07494772598147392, + "epoch": 10.322881454715002, + "grad_norm": 0.015625, + "learning_rate": 4.5556560327683085e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149511555.0, + "step": 44285 + }, + { + "entropy": 0.04982608687132597, + "epoch": 10.324047091735633, + "grad_norm": 0.0208740234375, + "learning_rate": 4.5555393201143025e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149527136.0, + "step": 44290 + }, + { + "entropy": 0.05682174563407898, + "epoch": 10.325212728756265, + "grad_norm": 0.0498046875, + "learning_rate": 4.555422595448494e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149540226.0, + "step": 44295 + }, + { + "entropy": 0.05086037190631032, + "epoch": 10.326378365776897, + "grad_norm": 0.01019287109375, + "learning_rate": 4.555305858772622e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149553341.0, + "step": 44300 + }, + { + "entropy": 0.06853776425123215, + "epoch": 10.327544002797529, + "grad_norm": 0.021240234375, + "learning_rate": 4.555189110088429e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999893784523011, + "num_tokens": 149570177.0, + "step": 44305 + }, + { + "entropy": 0.07153114788234234, + "epoch": 10.328709639818161, + "grad_norm": 0.03271484375, + "learning_rate": 4.555072349397656e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149584394.0, + "step": 44310 + }, + { + "entropy": 0.0588343221694231, + "epoch": 10.329875276838793, + "grad_norm": 0.0311279296875, + "learning_rate": 4.5549555767020425e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149599151.0, + "step": 44315 + }, + { + "entropy": 0.06345712374895811, + "epoch": 10.331040913859423, + "grad_norm": 0.0303955078125, + "learning_rate": 4.5548387920033306e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 149619838.0, + "step": 44320 + }, + { + "entropy": 0.06382895335555076, + "epoch": 10.332206550880056, + "grad_norm": 0.0301513671875, + "learning_rate": 4.554721995303262e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999876201152802, + "num_tokens": 149648081.0, + "step": 44325 + }, + { + "entropy": 0.06957971584051847, + "epoch": 10.333372187900688, + "grad_norm": 0.0712890625, + "learning_rate": 4.554605186603578e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 149673066.0, + "step": 44330 + }, + { + "entropy": 0.04718512548133731, + "epoch": 10.33453782492132, + "grad_norm": 0.08740234375, + "learning_rate": 4.554488365906021e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149702132.0, + "step": 44335 + }, + { + "entropy": 0.05456408876925707, + "epoch": 10.335703461941952, + "grad_norm": 0.0137939453125, + "learning_rate": 4.5543715332123316e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149720216.0, + "step": 44340 + }, + { + "entropy": 0.05885306540876627, + "epoch": 10.336869098962582, + "grad_norm": 0.1591796875, + "learning_rate": 4.5542546885242535e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149740075.0, + "step": 44345 + }, + { + "entropy": 0.044297129614278674, + "epoch": 10.338034735983214, + "grad_norm": 0.01348876953125, + "learning_rate": 4.554137831843528e-05, + "loss": 0.0001, + "mean_token_accuracy": 0.999986058473587, + "num_tokens": 149764370.0, + "step": 44350 + }, + { + "entropy": 0.05702058784663677, + "epoch": 10.339200373003846, + "grad_norm": 0.025146484375, + "learning_rate": 4.554020963171898e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149776968.0, + "step": 44355 + }, + { + "entropy": 0.057385982014238834, + "epoch": 10.340366010024479, + "grad_norm": 0.0274658203125, + "learning_rate": 4.553904082511106e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999786972999573, + "num_tokens": 149814014.0, + "step": 44360 + }, + { + "entropy": 0.047365696541965006, + "epoch": 10.34153164704511, + "grad_norm": 0.017578125, + "learning_rate": 4.553787189862895e-05, + "loss": 0.0001, + "mean_token_accuracy": 0.9997907936573028, + "num_tokens": 149839742.0, + "step": 44365 + }, + { + "entropy": 0.05646844375878572, + "epoch": 10.342697284065743, + "grad_norm": 0.02587890625, + "learning_rate": 4.553670285229008e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149851155.0, + "step": 44370 + }, + { + "entropy": 0.058501888811588285, + "epoch": 10.343862921086373, + "grad_norm": 0.07421875, + "learning_rate": 4.553553368611188e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9993243217468262, + "num_tokens": 149865248.0, + "step": 44375 + }, + { + "entropy": 0.06298747211694718, + "epoch": 10.345028558107005, + "grad_norm": 0.0211181640625, + "learning_rate": 4.55343644001118e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9994152069091797, + "num_tokens": 149878095.0, + "step": 44380 + }, + { + "entropy": 0.06646413435228168, + "epoch": 10.346194195127637, + "grad_norm": 0.1279296875, + "learning_rate": 4.553319499430725e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999783575534821, + "num_tokens": 149906489.0, + "step": 44385 + }, + { + "entropy": 0.05169821633026004, + "epoch": 10.34735983214827, + "grad_norm": 0.0654296875, + "learning_rate": 4.5532025468715675e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999868750572205, + "num_tokens": 149934017.0, + "step": 44390 + }, + { + "entropy": 0.06346538104116917, + "epoch": 10.348525469168901, + "grad_norm": 0.0311279296875, + "learning_rate": 4.5530855823354526e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 149944762.0, + "step": 44395 + }, + { + "entropy": 0.049836630932986736, + "epoch": 10.349691106189532, + "grad_norm": 0.0260009765625, + "learning_rate": 4.552968605824125e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999893188476563, + "num_tokens": 149970492.0, + "step": 44400 + }, + { + "entropy": 0.06093708393163979, + "epoch": 10.350856743210164, + "grad_norm": 0.03271484375, + "learning_rate": 4.552851617339326e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 149989599.0, + "step": 44405 + }, + { + "entropy": 0.06286875121295452, + "epoch": 10.352022380230796, + "grad_norm": 0.018310546875, + "learning_rate": 4.5527346168828025e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9997101426124573, + "num_tokens": 150000033.0, + "step": 44410 + }, + { + "entropy": 0.05082651572301984, + "epoch": 10.353188017251428, + "grad_norm": 0.0179443359375, + "learning_rate": 4.5526176044562985e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 150028772.0, + "step": 44415 + }, + { + "entropy": 0.14444229658693075, + "epoch": 10.35435365427206, + "grad_norm": 5.0625, + "learning_rate": 4.5525005800615585e-05, + "loss": 0.1725, + "mean_token_accuracy": 0.9819698393344879, + "num_tokens": 150070583.0, + "step": 44420 + }, + { + "entropy": 0.08742633331567048, + "epoch": 10.35551929129269, + "grad_norm": 0.04150390625, + "learning_rate": 4.552383543700328e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9999787509441376, + "num_tokens": 150091656.0, + "step": 44425 + }, + { + "entropy": 0.05972275361418724, + "epoch": 10.356684928313323, + "grad_norm": 0.072265625, + "learning_rate": 4.552266495374352e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9994139850139618, + "num_tokens": 150108536.0, + "step": 44430 + }, + { + "entropy": 0.07121363878250123, + "epoch": 10.357850565333955, + "grad_norm": 0.0303955078125, + "learning_rate": 4.5521494350853764e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 150117275.0, + "step": 44435 + }, + { + "entropy": 0.049450202379375696, + "epoch": 10.359016202354587, + "grad_norm": 0.057861328125, + "learning_rate": 4.552032362835146e-05, + "loss": 0.0043, + "mean_token_accuracy": 0.9999368906021118, + "num_tokens": 150151732.0, + "step": 44440 + }, + { + "entropy": 0.06740026157349348, + "epoch": 10.360181839375219, + "grad_norm": 0.017578125, + "learning_rate": 4.551915278625406e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 150161764.0, + "step": 44445 + }, + { + "entropy": 0.07839980935677886, + "epoch": 10.361347476395851, + "grad_norm": 0.01544189453125, + "learning_rate": 4.5517981824579034e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 150177487.0, + "step": 44450 + }, + { + "entropy": 0.04942024620249867, + "epoch": 10.362513113416481, + "grad_norm": 0.034423828125, + "learning_rate": 4.551681074334384e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150201662.0, + "step": 44455 + }, + { + "entropy": 0.05563904563896358, + "epoch": 10.363678750437114, + "grad_norm": 0.078125, + "learning_rate": 4.5515639542565946e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999758481979371, + "num_tokens": 150224405.0, + "step": 44460 + }, + { + "entropy": 0.05794704202562571, + "epoch": 10.364844387457746, + "grad_norm": 0.044189453125, + "learning_rate": 4.55144682222628e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150244252.0, + "step": 44465 + }, + { + "entropy": 0.0887676641345024, + "epoch": 10.366010024478378, + "grad_norm": 0.1484375, + "learning_rate": 4.551329678245189e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150255084.0, + "step": 44470 + }, + { + "entropy": 0.06721811229363084, + "epoch": 10.36717566149901, + "grad_norm": 0.0284423828125, + "learning_rate": 4.5512125223150656e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150271849.0, + "step": 44475 + }, + { + "entropy": 0.07131273578852415, + "epoch": 10.36834129851964, + "grad_norm": 0.08935546875, + "learning_rate": 4.55109535443766e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150297391.0, + "step": 44480 + }, + { + "entropy": 0.061979838041588665, + "epoch": 10.369506935540272, + "grad_norm": 0.02783203125, + "learning_rate": 4.550978174614717e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 150315599.0, + "step": 44485 + }, + { + "entropy": 0.06966787008568645, + "epoch": 10.370672572560904, + "grad_norm": 0.023681640625, + "learning_rate": 4.550860982847985e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9997916400432587, + "num_tokens": 150344717.0, + "step": 44490 + }, + { + "entropy": 0.054807602986693384, + "epoch": 10.371838209581536, + "grad_norm": 0.123046875, + "learning_rate": 4.550743779139211e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 150362497.0, + "step": 44495 + }, + { + "entropy": 0.06291921064257622, + "epoch": 10.373003846602169, + "grad_norm": 0.01434326171875, + "learning_rate": 4.550626563490143e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 150378889.0, + "step": 44500 + }, + { + "entropy": 0.05275652538985014, + "epoch": 10.3741694836228, + "grad_norm": 0.0595703125, + "learning_rate": 4.550509335902529e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 150401787.0, + "step": 44505 + }, + { + "entropy": 0.02468291549012065, + "epoch": 10.375335120643431, + "grad_norm": 0.01904296875, + "learning_rate": 4.5503920963781156e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999732375144958, + "num_tokens": 150443772.0, + "step": 44510 + }, + { + "entropy": 0.06841404465958476, + "epoch": 10.376500757664063, + "grad_norm": 0.0458984375, + "learning_rate": 4.550274844918653e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 150468813.0, + "step": 44515 + }, + { + "entropy": 0.05546982828527689, + "epoch": 10.377666394684695, + "grad_norm": 0.0419921875, + "learning_rate": 4.550157581525889e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150489590.0, + "step": 44520 + }, + { + "entropy": 0.05243567517027259, + "epoch": 10.378832031705327, + "grad_norm": 0.0966796875, + "learning_rate": 4.550040306201571e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999762177467346, + "num_tokens": 150509588.0, + "step": 44525 + }, + { + "entropy": 0.07348482059314847, + "epoch": 10.37999766872596, + "grad_norm": 0.05810546875, + "learning_rate": 4.5499230189474496e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9998459160327912, + "num_tokens": 150522711.0, + "step": 44530 + }, + { + "entropy": 0.05543697997927666, + "epoch": 10.38116330574659, + "grad_norm": 0.0260009765625, + "learning_rate": 4.5498057197652726e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 150537080.0, + "step": 44535 + }, + { + "entropy": 0.052227580547332765, + "epoch": 10.382328942767222, + "grad_norm": 0.0211181640625, + "learning_rate": 4.549688408656789e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999841928482056, + "num_tokens": 150556156.0, + "step": 44540 + }, + { + "entropy": 0.051933661289513114, + "epoch": 10.383494579787854, + "grad_norm": 0.0279541015625, + "learning_rate": 4.549571085623749e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150573522.0, + "step": 44545 + }, + { + "entropy": 0.05841661635786295, + "epoch": 10.384660216808486, + "grad_norm": 0.07568359375, + "learning_rate": 4.549453750667901e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 150602640.0, + "step": 44550 + }, + { + "entropy": 0.08519041016697884, + "epoch": 10.385825853829118, + "grad_norm": 0.035888671875, + "learning_rate": 4.549336403790996e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150612023.0, + "step": 44555 + }, + { + "entropy": 0.06693806014955044, + "epoch": 10.386991490849748, + "grad_norm": 0.01470947265625, + "learning_rate": 4.549219044994782e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150633112.0, + "step": 44560 + }, + { + "entropy": 0.06107230139896273, + "epoch": 10.38815712787038, + "grad_norm": 0.025634765625, + "learning_rate": 4.54910167428101e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150661439.0, + "step": 44565 + }, + { + "entropy": 0.0495242724660784, + "epoch": 10.389322764891013, + "grad_norm": 0.0181884765625, + "learning_rate": 4.548984291651431e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999850571155549, + "num_tokens": 150687714.0, + "step": 44570 + }, + { + "entropy": 0.05606261203065514, + "epoch": 10.390488401911645, + "grad_norm": 0.279296875, + "learning_rate": 4.548866897107794e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999887406826019, + "num_tokens": 150710467.0, + "step": 44575 + }, + { + "entropy": 0.05214941115118563, + "epoch": 10.391654038932277, + "grad_norm": 0.021728515625, + "learning_rate": 4.54874949065185e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150732070.0, + "step": 44580 + }, + { + "entropy": 0.0623500001616776, + "epoch": 10.392819675952909, + "grad_norm": 0.23046875, + "learning_rate": 4.548632072285349e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150746236.0, + "step": 44585 + }, + { + "entropy": 0.052656793221831324, + "epoch": 10.39398531297354, + "grad_norm": 0.1015625, + "learning_rate": 4.548514642010043e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 150769149.0, + "step": 44590 + }, + { + "entropy": 0.05154966562986374, + "epoch": 10.395150949994171, + "grad_norm": 0.01055908203125, + "learning_rate": 4.5483971998276834e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.999904578924179, + "num_tokens": 150794023.0, + "step": 44595 + }, + { + "entropy": 0.058824519719928504, + "epoch": 10.396316587014804, + "grad_norm": 0.0233154296875, + "learning_rate": 4.54827974574002e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.999965363740921, + "num_tokens": 150821718.0, + "step": 44600 + }, + { + "entropy": 0.05286461748182773, + "epoch": 10.397482224035436, + "grad_norm": 0.062255859375, + "learning_rate": 4.548162279748805e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150832539.0, + "step": 44605 + }, + { + "entropy": 0.07234964519739151, + "epoch": 10.398647861056068, + "grad_norm": 0.07373046875, + "learning_rate": 4.54804480185579e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 150843175.0, + "step": 44610 + }, + { + "entropy": 0.05047530680894852, + "epoch": 10.399813498076698, + "grad_norm": 0.02734375, + "learning_rate": 4.5479273120627266e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 150862858.0, + "step": 44615 + }, + { + "entropy": 0.051994080375880006, + "epoch": 10.40097913509733, + "grad_norm": 0.036865234375, + "learning_rate": 4.547809810371367e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 150879504.0, + "step": 44620 + }, + { + "entropy": 0.051050873938947916, + "epoch": 10.402144772117962, + "grad_norm": 0.2470703125, + "learning_rate": 4.547692296783463e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999808728694916, + "num_tokens": 150902392.0, + "step": 44625 + }, + { + "entropy": 0.07185981124639511, + "epoch": 10.403310409138594, + "grad_norm": 0.026123046875, + "learning_rate": 4.5475747713007674e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 150913796.0, + "step": 44630 + }, + { + "entropy": 0.08481485210359097, + "epoch": 10.404476046159226, + "grad_norm": 0.08740234375, + "learning_rate": 4.547457233925031e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 150943637.0, + "step": 44635 + }, + { + "entropy": 0.06651202514767647, + "epoch": 10.405641683179859, + "grad_norm": 0.0223388671875, + "learning_rate": 4.5473396846580096e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 150954871.0, + "step": 44640 + }, + { + "entropy": 0.04281379147432744, + "epoch": 10.406807320200489, + "grad_norm": 0.0303955078125, + "learning_rate": 4.5472221235014535e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999888718128205, + "num_tokens": 150988430.0, + "step": 44645 + }, + { + "entropy": 0.06591433379799128, + "epoch": 10.407972957221121, + "grad_norm": 0.024658203125, + "learning_rate": 4.5471045504571166e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999487698078156, + "num_tokens": 151015790.0, + "step": 44650 + }, + { + "entropy": 0.05050462838262319, + "epoch": 10.409138594241753, + "grad_norm": 0.01953125, + "learning_rate": 4.546986965526751e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 151047021.0, + "step": 44655 + }, + { + "entropy": 0.08088272651657462, + "epoch": 10.410304231262385, + "grad_norm": 0.02880859375, + "learning_rate": 4.5468693687121124e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151066724.0, + "step": 44660 + }, + { + "entropy": 0.06316407779231667, + "epoch": 10.411469868283017, + "grad_norm": 0.03271484375, + "learning_rate": 4.546751760014952e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151090250.0, + "step": 44665 + }, + { + "entropy": 0.07306670425459742, + "epoch": 10.412635505303648, + "grad_norm": 0.0257568359375, + "learning_rate": 4.546634139437025e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151106420.0, + "step": 44670 + }, + { + "entropy": 0.06450021881610155, + "epoch": 10.41380114232428, + "grad_norm": 0.032470703125, + "learning_rate": 4.546516506980084e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151119827.0, + "step": 44675 + }, + { + "entropy": 0.05164213851094246, + "epoch": 10.414966779344912, + "grad_norm": 0.059326171875, + "learning_rate": 4.546398862645885e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151135806.0, + "step": 44680 + }, + { + "entropy": 0.06281434707343578, + "epoch": 10.416132416365544, + "grad_norm": 0.01251220703125, + "learning_rate": 4.5462812064361806e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151145452.0, + "step": 44685 + }, + { + "entropy": 0.07266581580042838, + "epoch": 10.417298053386176, + "grad_norm": 0.11865234375, + "learning_rate": 4.546163538352725e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151158311.0, + "step": 44690 + }, + { + "entropy": 0.0545011417940259, + "epoch": 10.418463690406806, + "grad_norm": 0.0302734375, + "learning_rate": 4.5460458583972745e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151175781.0, + "step": 44695 + }, + { + "entropy": 0.06281043328344822, + "epoch": 10.419629327427439, + "grad_norm": 0.041015625, + "learning_rate": 4.545928166571582e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151187268.0, + "step": 44700 + }, + { + "entropy": 0.045202738512307404, + "epoch": 10.42079496444807, + "grad_norm": 0.03759765625, + "learning_rate": 4.5458104628774046e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151215418.0, + "step": 44705 + }, + { + "entropy": 0.06615263698622584, + "epoch": 10.421960601468703, + "grad_norm": 0.05810546875, + "learning_rate": 4.545692747316495e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151231048.0, + "step": 44710 + }, + { + "entropy": 0.03947287751361728, + "epoch": 10.423126238489335, + "grad_norm": 0.051025390625, + "learning_rate": 4.5455750198906114e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.999989128112793, + "num_tokens": 151274419.0, + "step": 44715 + }, + { + "entropy": 0.054187464900314807, + "epoch": 10.424291875509967, + "grad_norm": 0.3125, + "learning_rate": 4.545457280601506e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999715149402618, + "num_tokens": 151293841.0, + "step": 44720 + }, + { + "entropy": 0.064159763045609, + "epoch": 10.425457512530597, + "grad_norm": 0.06494140625, + "learning_rate": 4.545339529450937e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151319691.0, + "step": 44725 + }, + { + "entropy": 0.060284094978123905, + "epoch": 10.42662314955123, + "grad_norm": 0.56640625, + "learning_rate": 4.545221766440659e-05, + "loss": 0.0072, + "mean_token_accuracy": 0.9996629536151886, + "num_tokens": 151353006.0, + "step": 44730 + }, + { + "entropy": 0.06142573980614543, + "epoch": 10.427788786571861, + "grad_norm": 0.02587890625, + "learning_rate": 4.545103991572428e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151372233.0, + "step": 44735 + }, + { + "entropy": 0.05108613050542772, + "epoch": 10.428954423592494, + "grad_norm": 0.1513671875, + "learning_rate": 4.5449862048480015e-05, + "loss": 0.0064, + "mean_token_accuracy": 0.9988063097000122, + "num_tokens": 151410291.0, + "step": 44740 + }, + { + "entropy": 0.05234553683549166, + "epoch": 10.430120060613126, + "grad_norm": 0.051513671875, + "learning_rate": 4.544868406269134e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 151435611.0, + "step": 44745 + }, + { + "entropy": 0.059111610800027845, + "epoch": 10.431285697633756, + "grad_norm": 0.018310546875, + "learning_rate": 4.544750595837584e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151454651.0, + "step": 44750 + }, + { + "entropy": 0.06212139260023832, + "epoch": 10.432451334654388, + "grad_norm": 0.12060546875, + "learning_rate": 4.544632773555107e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999787032604217, + "num_tokens": 151476996.0, + "step": 44755 + }, + { + "entropy": 0.04963586870580912, + "epoch": 10.43361697167502, + "grad_norm": 0.0198974609375, + "learning_rate": 4.5445149394234596e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999685645103454, + "num_tokens": 151495009.0, + "step": 44760 + }, + { + "entropy": 0.06097169406712055, + "epoch": 10.434782608695652, + "grad_norm": 0.01470947265625, + "learning_rate": 4.5443970934444e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151516934.0, + "step": 44765 + }, + { + "entropy": 0.06304800482466817, + "epoch": 10.435948245716284, + "grad_norm": 0.0230712890625, + "learning_rate": 4.5442792356196846e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151532322.0, + "step": 44770 + }, + { + "entropy": 0.045744067151099445, + "epoch": 10.437113882736917, + "grad_norm": 0.103515625, + "learning_rate": 4.544161365951071e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999655902385711, + "num_tokens": 151560417.0, + "step": 44775 + }, + { + "entropy": 0.05613409299403429, + "epoch": 10.438279519757547, + "grad_norm": 0.0634765625, + "learning_rate": 4.5440434844403173e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151585302.0, + "step": 44780 + }, + { + "entropy": 0.05387907605618238, + "epoch": 10.439445156778179, + "grad_norm": 0.021484375, + "learning_rate": 4.543925591089181e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9995951414108276, + "num_tokens": 151606647.0, + "step": 44785 + }, + { + "entropy": 0.09123647324740887, + "epoch": 10.440610793798811, + "grad_norm": 0.0294189453125, + "learning_rate": 4.543807685899419e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151619649.0, + "step": 44790 + }, + { + "entropy": 0.045269330497831106, + "epoch": 10.441776430819443, + "grad_norm": 0.016845703125, + "learning_rate": 4.543689768872792e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151649258.0, + "step": 44795 + }, + { + "entropy": 0.06868037879467011, + "epoch": 10.442942067840075, + "grad_norm": 0.048095703125, + "learning_rate": 4.543571840011056e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151659263.0, + "step": 44800 + }, + { + "entropy": 0.059371945634484294, + "epoch": 10.444107704860706, + "grad_norm": 0.216796875, + "learning_rate": 4.54345389931597e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151675122.0, + "step": 44805 + }, + { + "entropy": 0.08021088363602757, + "epoch": 10.445273341881338, + "grad_norm": 0.035400390625, + "learning_rate": 4.5433359467892935e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151699196.0, + "step": 44810 + }, + { + "entropy": 0.040476453490555286, + "epoch": 10.44643897890197, + "grad_norm": 0.035888671875, + "learning_rate": 4.543217982432784e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151719643.0, + "step": 44815 + }, + { + "entropy": 0.06719337925314903, + "epoch": 10.447604615922602, + "grad_norm": 0.0400390625, + "learning_rate": 4.543100006248202e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999888837337494, + "num_tokens": 151736668.0, + "step": 44820 + }, + { + "entropy": 0.055458275601267815, + "epoch": 10.448770252943234, + "grad_norm": 0.03369140625, + "learning_rate": 4.542982018237306e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151750220.0, + "step": 44825 + }, + { + "entropy": 0.0656410625204444, + "epoch": 10.449935889963864, + "grad_norm": 0.12060546875, + "learning_rate": 4.542864018401855e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999665200710297, + "num_tokens": 151773498.0, + "step": 44830 + }, + { + "entropy": 0.05906023997813463, + "epoch": 10.451101526984496, + "grad_norm": 0.08740234375, + "learning_rate": 4.542746006743609e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 151786423.0, + "step": 44835 + }, + { + "entropy": 0.06858272757381201, + "epoch": 10.452267164005129, + "grad_norm": 0.318359375, + "learning_rate": 4.542627983264328e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 151804263.0, + "step": 44840 + }, + { + "entropy": 0.07860703244805337, + "epoch": 10.45343280102576, + "grad_norm": 0.1298828125, + "learning_rate": 4.542509947965772e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 151812919.0, + "step": 44845 + }, + { + "entropy": 0.05585416806861758, + "epoch": 10.454598438046393, + "grad_norm": 0.020263671875, + "learning_rate": 4.5423919008497e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151829565.0, + "step": 44850 + }, + { + "entropy": 0.04956377726048231, + "epoch": 10.455764075067025, + "grad_norm": 0.04833984375, + "learning_rate": 4.542273841917873e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999893546104431, + "num_tokens": 151854530.0, + "step": 44855 + }, + { + "entropy": 0.054505852470174435, + "epoch": 10.456929712087655, + "grad_norm": 0.01190185546875, + "learning_rate": 4.5421557711720506e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151881715.0, + "step": 44860 + }, + { + "entropy": 0.04270871412009001, + "epoch": 10.458095349108287, + "grad_norm": 0.018310546875, + "learning_rate": 4.5420376886139954e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151900522.0, + "step": 44865 + }, + { + "entropy": 0.05425464333966375, + "epoch": 10.45926098612892, + "grad_norm": 0.038818359375, + "learning_rate": 4.5419195942454665e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999880194664001, + "num_tokens": 151919944.0, + "step": 44870 + }, + { + "entropy": 0.057429822022095324, + "epoch": 10.460426623149552, + "grad_norm": 0.041015625, + "learning_rate": 4.5418014880682256e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999831736087799, + "num_tokens": 151942421.0, + "step": 44875 + }, + { + "entropy": 0.06060660080984235, + "epoch": 10.461592260170184, + "grad_norm": 0.038818359375, + "learning_rate": 4.541683370084033e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999891996383667, + "num_tokens": 151962688.0, + "step": 44880 + }, + { + "entropy": 0.07495781434699893, + "epoch": 10.462757897190814, + "grad_norm": 0.035400390625, + "learning_rate": 4.541565240294651e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 151982335.0, + "step": 44885 + }, + { + "entropy": 0.0536651149392128, + "epoch": 10.463923534211446, + "grad_norm": 0.00994873046875, + "learning_rate": 4.541447098701841e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 152003296.0, + "step": 44890 + }, + { + "entropy": 0.07172315865755081, + "epoch": 10.465089171232078, + "grad_norm": 0.035400390625, + "learning_rate": 4.5413289453073645e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999733746051789, + "num_tokens": 152019012.0, + "step": 44895 + }, + { + "entropy": 0.0632179843261838, + "epoch": 10.46625480825271, + "grad_norm": 0.0201416015625, + "learning_rate": 4.5412107801129824e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 152029719.0, + "step": 44900 + }, + { + "entropy": 0.05272108670324087, + "epoch": 10.467420445273342, + "grad_norm": 0.09521484375, + "learning_rate": 4.541092603120458e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 152051695.0, + "step": 44905 + }, + { + "entropy": 0.036588351242244244, + "epoch": 10.468586082293974, + "grad_norm": 0.034912109375, + "learning_rate": 4.540974414331552e-05, + "loss": 0.0001, + "mean_token_accuracy": 0.9999890267848969, + "num_tokens": 152084842.0, + "step": 44910 + }, + { + "entropy": 0.06178782312199473, + "epoch": 10.469751719314605, + "grad_norm": 0.0101318359375, + "learning_rate": 4.540856213748029e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9998004019260407, + "num_tokens": 152100061.0, + "step": 44915 + }, + { + "entropy": 0.05155020691454411, + "epoch": 10.470917356335237, + "grad_norm": 0.061767578125, + "learning_rate": 4.5407380013716506e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 152116996.0, + "step": 44920 + }, + { + "entropy": 0.032804567646235225, + "epoch": 10.472082993355869, + "grad_norm": 0.1259765625, + "learning_rate": 4.5406197772041784e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999784171581269, + "num_tokens": 152160796.0, + "step": 44925 + }, + { + "entropy": 0.04573766337707639, + "epoch": 10.473248630376501, + "grad_norm": 0.035400390625, + "learning_rate": 4.5405015412473764e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999883711338043, + "num_tokens": 152190185.0, + "step": 44930 + }, + { + "entropy": 0.05881201233714819, + "epoch": 10.474414267397133, + "grad_norm": 0.0250244140625, + "learning_rate": 4.540383293503008e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 152199805.0, + "step": 44935 + }, + { + "entropy": 0.06688203103840351, + "epoch": 10.475579904417764, + "grad_norm": 0.16796875, + "learning_rate": 4.540265033972835e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9996675014495849, + "num_tokens": 152217785.0, + "step": 44940 + }, + { + "entropy": 0.04776301896199584, + "epoch": 10.476745541438396, + "grad_norm": 0.04931640625, + "learning_rate": 4.540146762658622e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999693930149078, + "num_tokens": 152236196.0, + "step": 44945 + }, + { + "entropy": 0.05963452514261007, + "epoch": 10.477911178459028, + "grad_norm": 0.034912109375, + "learning_rate": 4.540028479562132e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 152247581.0, + "step": 44950 + }, + { + "entropy": 0.04292171949055046, + "epoch": 10.47907681547966, + "grad_norm": 0.032958984375, + "learning_rate": 4.53991018468513e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 152286578.0, + "step": 44955 + }, + { + "entropy": 0.04976120926439762, + "epoch": 10.480242452500292, + "grad_norm": 1.5390625, + "learning_rate": 4.539791878029378e-05, + "loss": 0.0015, + "mean_token_accuracy": 0.9997435867786407, + "num_tokens": 152328454.0, + "step": 44960 + }, + { + "entropy": 0.06413271725177765, + "epoch": 10.481408089520922, + "grad_norm": 0.10791015625, + "learning_rate": 4.539673559596642e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 152338536.0, + "step": 44965 + }, + { + "entropy": 0.047244657296687365, + "epoch": 10.482573726541554, + "grad_norm": 0.054443359375, + "learning_rate": 4.539555229388685e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.999899297952652, + "num_tokens": 152372224.0, + "step": 44970 + }, + { + "entropy": 0.0595405210275203, + "epoch": 10.483739363562186, + "grad_norm": 0.0242919921875, + "learning_rate": 4.5394368874072725e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 152391307.0, + "step": 44975 + }, + { + "entropy": 0.07148723490536213, + "epoch": 10.484905000582819, + "grad_norm": 0.042724609375, + "learning_rate": 4.5393185336541684e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 152400417.0, + "step": 44980 + }, + { + "entropy": 0.06768373921513557, + "epoch": 10.48607063760345, + "grad_norm": 0.016357421875, + "learning_rate": 4.5392001681311376e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 152411955.0, + "step": 44985 + }, + { + "entropy": 0.07932289410382509, + "epoch": 10.487236274624083, + "grad_norm": 0.0289306640625, + "learning_rate": 4.539081790839945e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9995620608329773, + "num_tokens": 152430368.0, + "step": 44990 + }, + { + "entropy": 0.050478702038526536, + "epoch": 10.488401911644713, + "grad_norm": 0.373046875, + "learning_rate": 4.538963401782357e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999386966228485, + "num_tokens": 152456906.0, + "step": 44995 + }, + { + "entropy": 0.05402056300081313, + "epoch": 10.489567548665345, + "grad_norm": 0.1923828125, + "learning_rate": 4.5388450009601367e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999709129333496, + "num_tokens": 152479603.0, + "step": 45000 + }, + { + "entropy": 0.04246578803285957, + "epoch": 10.490733185685977, + "grad_norm": 0.076171875, + "learning_rate": 4.538726588375052e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999894201755524, + "num_tokens": 152511917.0, + "step": 45005 + }, + { + "entropy": 0.046724597364664076, + "epoch": 10.49189882270661, + "grad_norm": 0.0230712890625, + "learning_rate": 4.538608164028867e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 152538081.0, + "step": 45010 + }, + { + "entropy": 0.06224795989692211, + "epoch": 10.493064459727242, + "grad_norm": 0.1865234375, + "learning_rate": 4.538489727923348e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 152560465.0, + "step": 45015 + }, + { + "entropy": 0.0667582368478179, + "epoch": 10.494230096747872, + "grad_norm": 0.058837890625, + "learning_rate": 4.5383712800602616e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999842643737793, + "num_tokens": 152577602.0, + "step": 45020 + }, + { + "entropy": 0.056950070336461066, + "epoch": 10.495395733768504, + "grad_norm": 0.017333984375, + "learning_rate": 4.538252820441374e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9998468577861785, + "num_tokens": 152598903.0, + "step": 45025 + }, + { + "entropy": 0.06032578498125076, + "epoch": 10.496561370789136, + "grad_norm": 0.0245361328125, + "learning_rate": 4.538134349068451e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 152611466.0, + "step": 45030 + }, + { + "entropy": 0.06809374392032623, + "epoch": 10.497727007809768, + "grad_norm": 0.0257568359375, + "learning_rate": 4.538015865943259e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 152621414.0, + "step": 45035 + }, + { + "entropy": 0.061638028174638745, + "epoch": 10.4988926448304, + "grad_norm": 0.0458984375, + "learning_rate": 4.537897371067565e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 152633531.0, + "step": 45040 + }, + { + "entropy": 0.04594460809603333, + "epoch": 10.500058281851032, + "grad_norm": 0.14453125, + "learning_rate": 4.537778864443137e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999785780906677, + "num_tokens": 152665138.0, + "step": 45045 + }, + { + "entropy": 0.059464158676564696, + "epoch": 10.501223918871663, + "grad_norm": 0.0279541015625, + "learning_rate": 4.537660346071741e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999689579010009, + "num_tokens": 152685895.0, + "step": 45050 + }, + { + "entropy": 0.058475431287661195, + "epoch": 10.502389555892295, + "grad_norm": 0.064453125, + "learning_rate": 4.5375418159551444e-05, + "loss": 0.0018, + "mean_token_accuracy": 1.0, + "num_tokens": 152716201.0, + "step": 45055 + }, + { + "entropy": 0.03777650678530335, + "epoch": 10.503555192912927, + "grad_norm": 0.04638671875, + "learning_rate": 4.537423274095115e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9998415231704711, + "num_tokens": 152746424.0, + "step": 45060 + }, + { + "entropy": 0.06340892501175403, + "epoch": 10.504720829933559, + "grad_norm": 0.07177734375, + "learning_rate": 4.5373047204934197e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999887526035309, + "num_tokens": 152767071.0, + "step": 45065 + }, + { + "entropy": 0.06310759540647268, + "epoch": 10.505886466954191, + "grad_norm": 0.01708984375, + "learning_rate": 4.5371861551518275e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 152779962.0, + "step": 45070 + }, + { + "entropy": 0.06093995273113251, + "epoch": 10.507052103974821, + "grad_norm": 0.0123291015625, + "learning_rate": 4.537067578072105e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 152797320.0, + "step": 45075 + }, + { + "entropy": 0.04854728020727635, + "epoch": 10.508217740995454, + "grad_norm": 0.025634765625, + "learning_rate": 4.536948989256021e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999867856502533, + "num_tokens": 152817688.0, + "step": 45080 + }, + { + "entropy": 0.06601851750165225, + "epoch": 10.509383378016086, + "grad_norm": 0.09375, + "learning_rate": 4.536830388705346e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 152829817.0, + "step": 45085 + }, + { + "entropy": 0.05404104976914823, + "epoch": 10.510549015036718, + "grad_norm": 0.671875, + "learning_rate": 4.5367117764218436e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 152861766.0, + "step": 45090 + }, + { + "entropy": 0.05845761811360717, + "epoch": 10.51171465205735, + "grad_norm": 0.009521484375, + "learning_rate": 4.536593152407287e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 152884718.0, + "step": 45095 + }, + { + "entropy": 0.048073732480406764, + "epoch": 10.51288028907798, + "grad_norm": 0.146484375, + "learning_rate": 4.5364745166634426e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999864935874939, + "num_tokens": 152920331.0, + "step": 45100 + }, + { + "entropy": 0.056105440203100446, + "epoch": 10.514045926098612, + "grad_norm": 0.060546875, + "learning_rate": 4.5363558691920803e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 152938039.0, + "step": 45105 + }, + { + "entropy": 0.05227671144530177, + "epoch": 10.515211563119244, + "grad_norm": 0.0152587890625, + "learning_rate": 4.53623720999497e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999771773815155, + "num_tokens": 152960373.0, + "step": 45110 + }, + { + "entropy": 0.05718650780618191, + "epoch": 10.516377200139877, + "grad_norm": 0.259765625, + "learning_rate": 4.53611853907388e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 152969974.0, + "step": 45115 + }, + { + "entropy": 0.06564706424251199, + "epoch": 10.517542837160509, + "grad_norm": 0.09765625, + "learning_rate": 4.53599985643058e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999891757965088, + "num_tokens": 152991494.0, + "step": 45120 + }, + { + "entropy": 0.061748063564300536, + "epoch": 10.51870847418114, + "grad_norm": 0.09375, + "learning_rate": 4.53588116206684e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999875843524932, + "num_tokens": 153015942.0, + "step": 45125 + }, + { + "entropy": 0.04634425612166524, + "epoch": 10.519874111201771, + "grad_norm": 0.08154296875, + "learning_rate": 4.53576245598443e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999892115592957, + "num_tokens": 153050371.0, + "step": 45130 + }, + { + "entropy": 0.06081217750906944, + "epoch": 10.521039748222403, + "grad_norm": 0.07080078125, + "learning_rate": 4.5356437381851194e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153065419.0, + "step": 45135 + }, + { + "entropy": 0.03465043311007321, + "epoch": 10.522205385243035, + "grad_norm": 0.044677734375, + "learning_rate": 4.53552500867068e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153097010.0, + "step": 45140 + }, + { + "entropy": 0.05195081997662783, + "epoch": 10.523371022263667, + "grad_norm": 0.028564453125, + "learning_rate": 4.53540626744288e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153123199.0, + "step": 45145 + }, + { + "entropy": 0.0526338548399508, + "epoch": 10.5245366592843, + "grad_norm": 0.0294189453125, + "learning_rate": 4.5352875145034926e-05, + "loss": 0.0019, + "mean_token_accuracy": 0.9996376812458039, + "num_tokens": 153139993.0, + "step": 45150 + }, + { + "entropy": 0.0496996458619833, + "epoch": 10.52570229630493, + "grad_norm": 0.034423828125, + "learning_rate": 4.5351687498542863e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999785900115967, + "num_tokens": 153167766.0, + "step": 45155 + }, + { + "entropy": 0.046733937319368125, + "epoch": 10.526867933325562, + "grad_norm": 0.0693359375, + "learning_rate": 4.535049973497033e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 153192549.0, + "step": 45160 + }, + { + "entropy": 0.06857496574521064, + "epoch": 10.528033570346194, + "grad_norm": 0.1640625, + "learning_rate": 4.5349311854335035e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 153202218.0, + "step": 45165 + }, + { + "entropy": 0.054048574063926935, + "epoch": 10.529199207366826, + "grad_norm": 0.10693359375, + "learning_rate": 4.53481238566547e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999850869178772, + "num_tokens": 153222058.0, + "step": 45170 + }, + { + "entropy": 0.05584253640845418, + "epoch": 10.530364844387458, + "grad_norm": 0.026123046875, + "learning_rate": 4.534693574194703e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 153235925.0, + "step": 45175 + }, + { + "entropy": 0.058231227286159995, + "epoch": 10.53153048140809, + "grad_norm": 0.0284423828125, + "learning_rate": 4.5345747510229754e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153254301.0, + "step": 45180 + }, + { + "entropy": 0.0526955584064126, + "epoch": 10.53269611842872, + "grad_norm": 0.099609375, + "learning_rate": 4.534455916152057e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999891638755798, + "num_tokens": 153276488.0, + "step": 45185 + }, + { + "entropy": 0.06668624244630336, + "epoch": 10.533861755449353, + "grad_norm": 0.2373046875, + "learning_rate": 4.5343370695837215e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 153289492.0, + "step": 45190 + }, + { + "entropy": 0.06033003106713295, + "epoch": 10.535027392469985, + "grad_norm": 0.06298828125, + "learning_rate": 4.5342182113197414e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 153309181.0, + "step": 45195 + }, + { + "entropy": 0.05025592148303985, + "epoch": 10.536193029490617, + "grad_norm": 0.02197265625, + "learning_rate": 4.534099341361887e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 153335484.0, + "step": 45200 + }, + { + "entropy": 0.061456915270537135, + "epoch": 10.537358666511249, + "grad_norm": 0.0693359375, + "learning_rate": 4.5339804597119325e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153355824.0, + "step": 45205 + }, + { + "entropy": 0.041024369467049834, + "epoch": 10.53852430353188, + "grad_norm": 1.4296875, + "learning_rate": 4.533861566371651e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9995857894420623, + "num_tokens": 153386252.0, + "step": 45210 + }, + { + "entropy": 0.07406266573816538, + "epoch": 10.539689940552512, + "grad_norm": 0.09326171875, + "learning_rate": 4.533742661342813e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999888896942138, + "num_tokens": 153414762.0, + "step": 45215 + }, + { + "entropy": 0.07470035180449486, + "epoch": 10.540855577573144, + "grad_norm": 0.0164794921875, + "learning_rate": 4.533623744627194e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 153425035.0, + "step": 45220 + }, + { + "entropy": 0.04976415578275919, + "epoch": 10.542021214593776, + "grad_norm": 0.181640625, + "learning_rate": 4.533504816226567e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999687135219574, + "num_tokens": 153449577.0, + "step": 45225 + }, + { + "entropy": 0.056877507083117965, + "epoch": 10.543186851614408, + "grad_norm": 0.01507568359375, + "learning_rate": 4.533385876142704e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153461321.0, + "step": 45230 + }, + { + "entropy": 0.053165735118091105, + "epoch": 10.544352488635038, + "grad_norm": 0.0181884765625, + "learning_rate": 4.533266924377379e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153479189.0, + "step": 45235 + }, + { + "entropy": 0.05576419588178396, + "epoch": 10.54551812565567, + "grad_norm": 0.04296875, + "learning_rate": 4.533147960932366e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153500382.0, + "step": 45240 + }, + { + "entropy": 0.05502867121249437, + "epoch": 10.546683762676302, + "grad_norm": 0.2890625, + "learning_rate": 4.5330289858094396e-05, + "loss": 0.0016, + "mean_token_accuracy": 0.9996170341968537, + "num_tokens": 153523829.0, + "step": 45245 + }, + { + "entropy": 0.05236600376665592, + "epoch": 10.547849399696934, + "grad_norm": 0.020263671875, + "learning_rate": 4.532909999010373e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 153540333.0, + "step": 45250 + }, + { + "entropy": 0.047642444260418415, + "epoch": 10.549015036717567, + "grad_norm": 0.022216796875, + "learning_rate": 4.5327910005369414e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153558943.0, + "step": 45255 + }, + { + "entropy": 0.06373915579169989, + "epoch": 10.550180673738199, + "grad_norm": 0.053466796875, + "learning_rate": 4.5326719903909176e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153570140.0, + "step": 45260 + }, + { + "entropy": 0.05331633798778057, + "epoch": 10.551346310758829, + "grad_norm": 0.035400390625, + "learning_rate": 4.5325529685740775e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153587093.0, + "step": 45265 + }, + { + "entropy": 0.05336455744691193, + "epoch": 10.552511947779461, + "grad_norm": 0.1240234375, + "learning_rate": 4.5324339350881964e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999893069267273, + "num_tokens": 153613028.0, + "step": 45270 + }, + { + "entropy": 0.04905619155615568, + "epoch": 10.553677584800093, + "grad_norm": 0.01806640625, + "learning_rate": 4.532314889935048e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 153635947.0, + "step": 45275 + }, + { + "entropy": 0.07199727166444063, + "epoch": 10.554843221820725, + "grad_norm": 0.10791015625, + "learning_rate": 4.5321958331164074e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9998493969440461, + "num_tokens": 153648137.0, + "step": 45280 + }, + { + "entropy": 0.07504986342974007, + "epoch": 10.556008858841357, + "grad_norm": 0.125, + "learning_rate": 4.53207676463405e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999784767627716, + "num_tokens": 153677806.0, + "step": 45285 + }, + { + "entropy": 0.06699912054464222, + "epoch": 10.557174495861988, + "grad_norm": 0.022705078125, + "learning_rate": 4.531957684489753e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153697423.0, + "step": 45290 + }, + { + "entropy": 0.04461238384246826, + "epoch": 10.55834013288262, + "grad_norm": 0.0260009765625, + "learning_rate": 4.53183859268529e-05, + "loss": 0.0011, + "mean_token_accuracy": 0.9996108174324035, + "num_tokens": 153728335.0, + "step": 45295 + }, + { + "entropy": 0.04771164106205106, + "epoch": 10.559505769903252, + "grad_norm": 0.06298828125, + "learning_rate": 4.531719489222438e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153754782.0, + "step": 45300 + }, + { + "entropy": 0.04681495912373066, + "epoch": 10.560671406923884, + "grad_norm": 0.0264892578125, + "learning_rate": 4.531600374102973e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999852657318116, + "num_tokens": 153778119.0, + "step": 45305 + }, + { + "entropy": 0.05865760799497366, + "epoch": 10.561837043944516, + "grad_norm": 0.0179443359375, + "learning_rate": 4.53148124732867e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153797206.0, + "step": 45310 + }, + { + "entropy": 0.05252798534929752, + "epoch": 10.563002680965148, + "grad_norm": 0.043701171875, + "learning_rate": 4.531362108901306e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 153825044.0, + "step": 45315 + }, + { + "entropy": 0.07219594558700919, + "epoch": 10.564168317985779, + "grad_norm": 0.0147705078125, + "learning_rate": 4.5312429588226593e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999789476394654, + "num_tokens": 153849354.0, + "step": 45320 + }, + { + "entropy": 0.0687197322025895, + "epoch": 10.56533395500641, + "grad_norm": 0.021240234375, + "learning_rate": 4.531123797094503e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 153868137.0, + "step": 45325 + }, + { + "entropy": 0.04669546764343977, + "epoch": 10.566499592027043, + "grad_norm": 0.09912109375, + "learning_rate": 4.531004623718618e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9997701466083526, + "num_tokens": 153893125.0, + "step": 45330 + }, + { + "entropy": 0.061506491433829066, + "epoch": 10.567665229047675, + "grad_norm": 0.11279296875, + "learning_rate": 4.530885438696778e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999893963336944, + "num_tokens": 153918793.0, + "step": 45335 + }, + { + "entropy": 0.047557246033102275, + "epoch": 10.568830866068307, + "grad_norm": 0.01171875, + "learning_rate": 4.530766242030762e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 153944278.0, + "step": 45340 + }, + { + "entropy": 0.05632236008532345, + "epoch": 10.569996503088937, + "grad_norm": 0.357421875, + "learning_rate": 4.530647033722347e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999646008014679, + "num_tokens": 153967238.0, + "step": 45345 + }, + { + "entropy": 0.04898662269115448, + "epoch": 10.57116214010957, + "grad_norm": 0.015869140625, + "learning_rate": 4.53052781377331e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 153989791.0, + "step": 45350 + }, + { + "entropy": 0.06470034439116716, + "epoch": 10.572327777130202, + "grad_norm": 0.016357421875, + "learning_rate": 4.53040858218543e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154000301.0, + "step": 45355 + }, + { + "entropy": 0.04806965310126543, + "epoch": 10.573493414150834, + "grad_norm": 0.7890625, + "learning_rate": 4.530289338960484e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9995983958244323, + "num_tokens": 154026538.0, + "step": 45360 + }, + { + "entropy": 0.1315355844795704, + "epoch": 10.574659051171466, + "grad_norm": 3.5, + "learning_rate": 4.530170084100251e-05, + "loss": 0.1375, + "mean_token_accuracy": 0.9813533067703247, + "num_tokens": 154046506.0, + "step": 45365 + }, + { + "entropy": 0.05549485264346003, + "epoch": 10.575824688192096, + "grad_norm": 0.048828125, + "learning_rate": 4.5300508176065074e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999867022037506, + "num_tokens": 154074785.0, + "step": 45370 + }, + { + "entropy": 0.03857192704454064, + "epoch": 10.576990325212728, + "grad_norm": 0.0181884765625, + "learning_rate": 4.529931539481034e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9999784290790558, + "num_tokens": 154115955.0, + "step": 45375 + }, + { + "entropy": 0.06022445512935519, + "epoch": 10.57815596223336, + "grad_norm": 0.057861328125, + "learning_rate": 4.529812249725608e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 154131298.0, + "step": 45380 + }, + { + "entropy": 0.06327858474105597, + "epoch": 10.579321599253992, + "grad_norm": 0.1318359375, + "learning_rate": 4.529692948342008e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999776542186737, + "num_tokens": 154160613.0, + "step": 45385 + }, + { + "entropy": 0.06446757782250642, + "epoch": 10.580487236274625, + "grad_norm": 0.044677734375, + "learning_rate": 4.5295736353320135e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154177442.0, + "step": 45390 + }, + { + "entropy": 0.06058889739215374, + "epoch": 10.581652873295257, + "grad_norm": 0.0189208984375, + "learning_rate": 4.5294543106974036e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154188632.0, + "step": 45395 + }, + { + "entropy": 0.05697990693151951, + "epoch": 10.582818510315887, + "grad_norm": 0.0224609375, + "learning_rate": 4.5293349744399574e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999100744724274, + "num_tokens": 154202061.0, + "step": 45400 + }, + { + "entropy": 0.049638493172824386, + "epoch": 10.583984147336519, + "grad_norm": 0.049560546875, + "learning_rate": 4.529215626561455e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.999984622001648, + "num_tokens": 154219966.0, + "step": 45405 + }, + { + "entropy": 0.062036858219653365, + "epoch": 10.585149784357151, + "grad_norm": 0.06201171875, + "learning_rate": 4.529096267063676e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 154236505.0, + "step": 45410 + }, + { + "entropy": 0.04993337215855718, + "epoch": 10.586315421377783, + "grad_norm": 0.0159912109375, + "learning_rate": 4.5289768959483985e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999766051769257, + "num_tokens": 154267797.0, + "step": 45415 + }, + { + "entropy": 0.05615239506587386, + "epoch": 10.587481058398415, + "grad_norm": 0.0296630859375, + "learning_rate": 4.528857513217405e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154286964.0, + "step": 45420 + }, + { + "entropy": 0.057061603758484125, + "epoch": 10.588646695419046, + "grad_norm": 0.09716796875, + "learning_rate": 4.5287381188724745e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999895453453064, + "num_tokens": 154307212.0, + "step": 45425 + }, + { + "entropy": 0.05221607079729438, + "epoch": 10.589812332439678, + "grad_norm": 0.03759765625, + "learning_rate": 4.528618712915386e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154326983.0, + "step": 45430 + }, + { + "entropy": 0.06867706384509802, + "epoch": 10.59097796946031, + "grad_norm": 0.020263671875, + "learning_rate": 4.528499295347923e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 154339509.0, + "step": 45435 + }, + { + "entropy": 0.05803119344636798, + "epoch": 10.592143606480942, + "grad_norm": 0.0166015625, + "learning_rate": 4.5283798661718635e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999827980995178, + "num_tokens": 154360403.0, + "step": 45440 + }, + { + "entropy": 0.05287682805210352, + "epoch": 10.593309243501574, + "grad_norm": 0.111328125, + "learning_rate": 4.5282604253889896e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999831140041351, + "num_tokens": 154380649.0, + "step": 45445 + }, + { + "entropy": 0.057697663782164454, + "epoch": 10.594474880522206, + "grad_norm": 0.0159912109375, + "learning_rate": 4.528140973001083e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9998168468475341, + "num_tokens": 154408373.0, + "step": 45450 + }, + { + "entropy": 0.05991329280659556, + "epoch": 10.595640517542837, + "grad_norm": 0.02685546875, + "learning_rate": 4.5280215090099235e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154421905.0, + "step": 45455 + }, + { + "entropy": 0.05662989765405655, + "epoch": 10.596806154563469, + "grad_norm": 0.1142578125, + "learning_rate": 4.527902033417293e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154442673.0, + "step": 45460 + }, + { + "entropy": 0.05534627726301551, + "epoch": 10.5979717915841, + "grad_norm": 0.03759765625, + "learning_rate": 4.527782546224973e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 154462617.0, + "step": 45465 + }, + { + "entropy": 0.06300230165943503, + "epoch": 10.599137428604733, + "grad_norm": 0.0177001953125, + "learning_rate": 4.527663047434746e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999778687953949, + "num_tokens": 154482320.0, + "step": 45470 + }, + { + "entropy": 0.06169234653934837, + "epoch": 10.600303065625365, + "grad_norm": 0.0289306640625, + "learning_rate": 4.527543537048392e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999802231788635, + "num_tokens": 154498935.0, + "step": 45475 + }, + { + "entropy": 0.0762554974295199, + "epoch": 10.601468702645995, + "grad_norm": 0.10205078125, + "learning_rate": 4.527424015067696e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999882817268372, + "num_tokens": 154521923.0, + "step": 45480 + }, + { + "entropy": 0.045643165893852713, + "epoch": 10.602634339666627, + "grad_norm": 0.05322265625, + "learning_rate": 4.527304481494438e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999760866165162, + "num_tokens": 154550699.0, + "step": 45485 + }, + { + "entropy": 0.06465123752132058, + "epoch": 10.60379997668726, + "grad_norm": 0.03076171875, + "learning_rate": 4.527184936330401e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 154582411.0, + "step": 45490 + }, + { + "entropy": 0.05987545819953084, + "epoch": 10.604965613707892, + "grad_norm": 0.080078125, + "learning_rate": 4.5270653795773676e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999893605709076, + "num_tokens": 154607072.0, + "step": 45495 + }, + { + "entropy": 0.05700749680399895, + "epoch": 10.606131250728524, + "grad_norm": 0.02197265625, + "learning_rate": 4.52694581123712e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 154619307.0, + "step": 45500 + }, + { + "entropy": 0.050329295732080934, + "epoch": 10.607296887749154, + "grad_norm": 0.0703125, + "learning_rate": 4.526826231311443e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 154638654.0, + "step": 45505 + }, + { + "entropy": 0.06347782760858536, + "epoch": 10.608462524769786, + "grad_norm": 0.0272216796875, + "learning_rate": 4.5267066398021174e-05, + "loss": 0.005, + "mean_token_accuracy": 0.9992537200450897, + "num_tokens": 154652379.0, + "step": 45510 + }, + { + "entropy": 0.0653164304792881, + "epoch": 10.609628161790418, + "grad_norm": 0.0194091796875, + "learning_rate": 4.526587036710928e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154665309.0, + "step": 45515 + }, + { + "entropy": 0.10336335860192776, + "epoch": 10.61079379881105, + "grad_norm": 0.0186767578125, + "learning_rate": 4.526467422039658e-05, + "loss": 0.051, + "mean_token_accuracy": 0.9942165672779083, + "num_tokens": 154689631.0, + "step": 45520 + }, + { + "entropy": 0.04605412124656141, + "epoch": 10.611959435831682, + "grad_norm": 0.404296875, + "learning_rate": 4.526347795790091e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999624967575074, + "num_tokens": 154719478.0, + "step": 45525 + }, + { + "entropy": 0.06172304879873991, + "epoch": 10.613125072852315, + "grad_norm": 0.0245361328125, + "learning_rate": 4.5262281579640106e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 154742502.0, + "step": 45530 + }, + { + "entropy": 0.06773852966725827, + "epoch": 10.614290709872945, + "grad_norm": 0.032958984375, + "learning_rate": 4.526108508563201e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154752874.0, + "step": 45535 + }, + { + "entropy": 0.06439133267849684, + "epoch": 10.615456346893577, + "grad_norm": 0.01202392578125, + "learning_rate": 4.5259888475894454e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999867498874664, + "num_tokens": 154776941.0, + "step": 45540 + }, + { + "entropy": 0.05004207184538245, + "epoch": 10.616621983914209, + "grad_norm": 0.043212890625, + "learning_rate": 4.5258691750445295e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154799157.0, + "step": 45545 + }, + { + "entropy": 0.05588539754971862, + "epoch": 10.617787620934841, + "grad_norm": 0.025390625, + "learning_rate": 4.5257494909302366e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 154818796.0, + "step": 45550 + }, + { + "entropy": 0.04940931098535657, + "epoch": 10.618953257955473, + "grad_norm": 0.016845703125, + "learning_rate": 4.525629795248353e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154840657.0, + "step": 45555 + }, + { + "entropy": 0.05910136494785547, + "epoch": 10.620118894976104, + "grad_norm": 0.0157470703125, + "learning_rate": 4.525510088000662e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154852226.0, + "step": 45560 + }, + { + "entropy": 0.0557030210737139, + "epoch": 10.621284531996736, + "grad_norm": 0.027099609375, + "learning_rate": 4.525390369188949e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9997493743896484, + "num_tokens": 154878594.0, + "step": 45565 + }, + { + "entropy": 0.05533176232129335, + "epoch": 10.622450169017368, + "grad_norm": 0.76953125, + "learning_rate": 4.525270638814999e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9996478855609894, + "num_tokens": 154898526.0, + "step": 45570 + }, + { + "entropy": 0.058144222619012, + "epoch": 10.623615806038, + "grad_norm": 0.0361328125, + "learning_rate": 4.525150896880599e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154925922.0, + "step": 45575 + }, + { + "entropy": 0.062190312519669534, + "epoch": 10.624781443058632, + "grad_norm": 0.1767578125, + "learning_rate": 4.525031143387533e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154946470.0, + "step": 45580 + }, + { + "entropy": 0.08605777956545353, + "epoch": 10.625947080079264, + "grad_norm": 1.84375, + "learning_rate": 4.5249113783375855e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9994974851608276, + "num_tokens": 154955771.0, + "step": 45585 + }, + { + "entropy": 0.06026861779391766, + "epoch": 10.627112717099894, + "grad_norm": 0.0146484375, + "learning_rate": 4.524791601732545e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 154967988.0, + "step": 45590 + }, + { + "entropy": 0.05325524704530835, + "epoch": 10.628278354120527, + "grad_norm": 0.017822265625, + "learning_rate": 4.524671813574196e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999680340290069, + "num_tokens": 154982691.0, + "step": 45595 + }, + { + "entropy": 0.06144336890429258, + "epoch": 10.629443991141159, + "grad_norm": 0.02294921875, + "learning_rate": 4.5245520138643254e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999618291854858, + "num_tokens": 155000448.0, + "step": 45600 + }, + { + "entropy": 0.052808401314541696, + "epoch": 10.63060962816179, + "grad_norm": 0.017822265625, + "learning_rate": 4.5244322026047195e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 155018584.0, + "step": 45605 + }, + { + "entropy": 0.03916442524641752, + "epoch": 10.631775265182423, + "grad_norm": 0.06494140625, + "learning_rate": 4.5243123797971644e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155042625.0, + "step": 45610 + }, + { + "entropy": 0.049193437211215496, + "epoch": 10.632940902203053, + "grad_norm": 0.443359375, + "learning_rate": 4.524192545443446e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 155057525.0, + "step": 45615 + }, + { + "entropy": 0.05743730580434203, + "epoch": 10.634106539223685, + "grad_norm": 0.06298828125, + "learning_rate": 4.5240726995453536e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 155073628.0, + "step": 45620 + }, + { + "entropy": 0.06278720712289214, + "epoch": 10.635272176244317, + "grad_norm": 0.10595703125, + "learning_rate": 4.523952842104673e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999800860881806, + "num_tokens": 155105856.0, + "step": 45625 + }, + { + "entropy": 0.05687279971316457, + "epoch": 10.63643781326495, + "grad_norm": 0.11474609375, + "learning_rate": 4.523832973123191e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999889671802521, + "num_tokens": 155128659.0, + "step": 45630 + }, + { + "entropy": 0.06879281736910343, + "epoch": 10.637603450285582, + "grad_norm": 0.080078125, + "learning_rate": 4.523713092602695e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155143525.0, + "step": 45635 + }, + { + "entropy": 0.06541344905272126, + "epoch": 10.638769087306212, + "grad_norm": 0.185546875, + "learning_rate": 4.523593200544974e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999894678592682, + "num_tokens": 155163360.0, + "step": 45640 + }, + { + "entropy": 0.04915583487600088, + "epoch": 10.639934724326844, + "grad_norm": 0.025634765625, + "learning_rate": 4.523473296951814e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 155187475.0, + "step": 45645 + }, + { + "entropy": 0.04946916834451258, + "epoch": 10.641100361347476, + "grad_norm": 0.03369140625, + "learning_rate": 4.523353381825004e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.999989241361618, + "num_tokens": 155209442.0, + "step": 45650 + }, + { + "entropy": 0.05402911715209484, + "epoch": 10.642265998368108, + "grad_norm": 0.009765625, + "learning_rate": 4.5232334551663326e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155223725.0, + "step": 45655 + }, + { + "entropy": 0.05086983609944582, + "epoch": 10.64343163538874, + "grad_norm": 0.0228271484375, + "learning_rate": 4.523113516977586e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 155250672.0, + "step": 45660 + }, + { + "entropy": 0.05399879598990083, + "epoch": 10.644597272409372, + "grad_norm": 0.1171875, + "learning_rate": 4.5229935672605554e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999883711338043, + "num_tokens": 155280736.0, + "step": 45665 + }, + { + "entropy": 0.0479968911036849, + "epoch": 10.645762909430003, + "grad_norm": 0.047607421875, + "learning_rate": 4.5228736060170274e-05, + "loss": 0.0009, + "mean_token_accuracy": 1.0, + "num_tokens": 155314206.0, + "step": 45670 + }, + { + "entropy": 0.04139636503532529, + "epoch": 10.646928546450635, + "grad_norm": 0.01904296875, + "learning_rate": 4.522753633248792e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155352210.0, + "step": 45675 + }, + { + "entropy": 0.0488029814325273, + "epoch": 10.648094183471267, + "grad_norm": 0.0966796875, + "learning_rate": 4.522633648957637e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 155386905.0, + "step": 45680 + }, + { + "entropy": 0.034656665613874794, + "epoch": 10.6492598204919, + "grad_norm": 0.0908203125, + "learning_rate": 4.522513653145352e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999785244464874, + "num_tokens": 155429889.0, + "step": 45685 + }, + { + "entropy": 0.058836123626679185, + "epoch": 10.650425457512531, + "grad_norm": 0.053955078125, + "learning_rate": 4.5223936458137276e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 155442744.0, + "step": 45690 + }, + { + "entropy": 0.04692996209487319, + "epoch": 10.651591094533162, + "grad_norm": 0.041748046875, + "learning_rate": 4.5222736269645514e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 155476379.0, + "step": 45695 + }, + { + "entropy": 0.0463966122828424, + "epoch": 10.652756731553794, + "grad_norm": 0.01190185546875, + "learning_rate": 4.522153596599614e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155493833.0, + "step": 45700 + }, + { + "entropy": 0.05225074263289571, + "epoch": 10.653922368574426, + "grad_norm": 0.048095703125, + "learning_rate": 4.522033554720705e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 155514467.0, + "step": 45705 + }, + { + "entropy": 0.05257317861542106, + "epoch": 10.655088005595058, + "grad_norm": 0.05908203125, + "learning_rate": 4.5219135013296145e-05, + "loss": 0.001, + "mean_token_accuracy": 1.0, + "num_tokens": 155547168.0, + "step": 45710 + }, + { + "entropy": 0.07927863914519548, + "epoch": 10.65625364261569, + "grad_norm": 0.0712890625, + "learning_rate": 4.5217934364281324e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 155565651.0, + "step": 45715 + }, + { + "entropy": 0.05985526898875833, + "epoch": 10.657419279636322, + "grad_norm": 0.0223388671875, + "learning_rate": 4.52167336001805e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155580332.0, + "step": 45720 + }, + { + "entropy": 0.062313320487737654, + "epoch": 10.658584916656952, + "grad_norm": 0.0211181640625, + "learning_rate": 4.5215532721011563e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155592185.0, + "step": 45725 + }, + { + "entropy": 0.06476543843746185, + "epoch": 10.659750553677585, + "grad_norm": 0.0233154296875, + "learning_rate": 4.5214331726792436e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155602391.0, + "step": 45730 + }, + { + "entropy": 0.06466516926884651, + "epoch": 10.660916190698217, + "grad_norm": 0.01214599609375, + "learning_rate": 4.5213130617541016e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155614267.0, + "step": 45735 + }, + { + "entropy": 0.07169229295104743, + "epoch": 10.662081827718849, + "grad_norm": 0.11474609375, + "learning_rate": 4.521192939327522e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 155627312.0, + "step": 45740 + }, + { + "entropy": 0.08903030175715684, + "epoch": 10.66324746473948, + "grad_norm": 0.0111083984375, + "learning_rate": 4.521072805401296e-05, + "loss": 0.0249, + "mean_token_accuracy": 0.9973048746585846, + "num_tokens": 155653835.0, + "step": 45745 + }, + { + "entropy": 0.07297799242660404, + "epoch": 10.664413101760111, + "grad_norm": 0.040283203125, + "learning_rate": 4.520952659977214e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999803006649017, + "num_tokens": 155668901.0, + "step": 45750 + }, + { + "entropy": 0.08213113537058234, + "epoch": 10.665578738780743, + "grad_norm": 2.96875, + "learning_rate": 4.520832503057069e-05, + "loss": 0.0383, + "mean_token_accuracy": 0.9947204291820526, + "num_tokens": 155690681.0, + "step": 45755 + }, + { + "entropy": 0.06827007010579109, + "epoch": 10.666744375801375, + "grad_norm": 0.0225830078125, + "learning_rate": 4.5207123346426513e-05, + "loss": 0.0023, + "mean_token_accuracy": 0.9998970210552216, + "num_tokens": 155709925.0, + "step": 45760 + }, + { + "entropy": 0.06171514326706529, + "epoch": 10.667910012822007, + "grad_norm": 0.0274658203125, + "learning_rate": 4.520592154735753e-05, + "loss": 0.0141, + "mean_token_accuracy": 0.9981945633888245, + "num_tokens": 155740138.0, + "step": 45765 + }, + { + "entropy": 0.056561283254995945, + "epoch": 10.66907564984264, + "grad_norm": 0.03076171875, + "learning_rate": 4.5204719633381676e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 155769312.0, + "step": 45770 + }, + { + "entropy": 0.07458315212279558, + "epoch": 10.67024128686327, + "grad_norm": 0.033203125, + "learning_rate": 4.520351760451686e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155779671.0, + "step": 45775 + }, + { + "entropy": 0.07302795853465796, + "epoch": 10.671406923883902, + "grad_norm": 0.02001953125, + "learning_rate": 4.520231546078101e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 155793274.0, + "step": 45780 + }, + { + "entropy": 0.06928151659667492, + "epoch": 10.672572560904534, + "grad_norm": 0.0115966796875, + "learning_rate": 4.5201113202192056e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 155806913.0, + "step": 45785 + }, + { + "entropy": 0.07158723613247275, + "epoch": 10.673738197925166, + "grad_norm": 0.0203857421875, + "learning_rate": 4.5199910828767916e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999862253665924, + "num_tokens": 155828497.0, + "step": 45790 + }, + { + "entropy": 0.06955606564879417, + "epoch": 10.674903834945798, + "grad_norm": 0.0181884765625, + "learning_rate": 4.519870834052652e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155838451.0, + "step": 45795 + }, + { + "entropy": 0.04313288903795183, + "epoch": 10.67606947196643, + "grad_norm": 0.09033203125, + "learning_rate": 4.519750573748581e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155858054.0, + "step": 45800 + }, + { + "entropy": 0.05261556897312403, + "epoch": 10.67723510898706, + "grad_norm": 0.078125, + "learning_rate": 4.5196303019663715e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 155885122.0, + "step": 45805 + }, + { + "entropy": 0.055707470141351224, + "epoch": 10.678400746007693, + "grad_norm": 0.0277099609375, + "learning_rate": 4.519510018707817e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 155905105.0, + "step": 45810 + }, + { + "entropy": 0.05604796879924834, + "epoch": 10.679566383028325, + "grad_norm": 0.0225830078125, + "learning_rate": 4.519389723974709e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999892711639404, + "num_tokens": 155939506.0, + "step": 45815 + }, + { + "entropy": 0.06491890689358115, + "epoch": 10.680732020048957, + "grad_norm": 0.0283203125, + "learning_rate": 4.519269417768844e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 155959895.0, + "step": 45820 + }, + { + "entropy": 0.0519371272996068, + "epoch": 10.68189765706959, + "grad_norm": 0.01348876953125, + "learning_rate": 4.519149100092015e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999669194221497, + "num_tokens": 155983069.0, + "step": 45825 + }, + { + "entropy": 0.05116039318963885, + "epoch": 10.68306329409022, + "grad_norm": 0.01495361328125, + "learning_rate": 4.519028770946016e-05, + "loss": 0.0017, + "mean_token_accuracy": 0.9999583482742309, + "num_tokens": 156014111.0, + "step": 45830 + }, + { + "entropy": 0.04381315428763628, + "epoch": 10.684228931110852, + "grad_norm": 0.021484375, + "learning_rate": 4.518908430332641e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 156031814.0, + "step": 45835 + }, + { + "entropy": 0.09437487740069628, + "epoch": 10.685394568131484, + "grad_norm": 0.0244140625, + "learning_rate": 4.518788078253685e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 156042372.0, + "step": 45840 + }, + { + "entropy": 0.05893740877509117, + "epoch": 10.686560205152116, + "grad_norm": 0.0235595703125, + "learning_rate": 4.5186677147109426e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 156054300.0, + "step": 45845 + }, + { + "entropy": 0.03940586084499955, + "epoch": 10.687725842172748, + "grad_norm": 0.04296875, + "learning_rate": 4.518547339706208e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 156081050.0, + "step": 45850 + }, + { + "entropy": 0.05552921891212463, + "epoch": 10.68889147919338, + "grad_norm": 0.1005859375, + "learning_rate": 4.518426953241277e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 156090944.0, + "step": 45855 + }, + { + "entropy": 0.03403740664944053, + "epoch": 10.69005711621401, + "grad_norm": 0.0233154296875, + "learning_rate": 4.518306555317944e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 156125244.0, + "step": 45860 + }, + { + "entropy": 0.051710548158735034, + "epoch": 10.691222753234642, + "grad_norm": 0.041015625, + "learning_rate": 4.518186145938005e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 156152733.0, + "step": 45865 + }, + { + "entropy": 0.046452429797500375, + "epoch": 10.692388390255275, + "grad_norm": 0.0184326171875, + "learning_rate": 4.518065725103255e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 156176237.0, + "step": 45870 + }, + { + "entropy": 0.05762659339234233, + "epoch": 10.693554027275907, + "grad_norm": 0.01446533203125, + "learning_rate": 4.51794529281549e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999857604503631, + "num_tokens": 156203627.0, + "step": 45875 + }, + { + "entropy": 0.05502113308757543, + "epoch": 10.694719664296539, + "grad_norm": 0.0296630859375, + "learning_rate": 4.5178248490765054e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 156220487.0, + "step": 45880 + }, + { + "entropy": 0.04440567507408559, + "epoch": 10.695885301317169, + "grad_norm": 0.07763671875, + "learning_rate": 4.517704393888097e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999863386154175, + "num_tokens": 156255165.0, + "step": 45885 + }, + { + "entropy": 0.038073526509106156, + "epoch": 10.697050938337801, + "grad_norm": 0.012939453125, + "learning_rate": 4.517583927252062e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999889850616455, + "num_tokens": 156292113.0, + "step": 45890 + }, + { + "entropy": 0.0740324473939836, + "epoch": 10.698216575358433, + "grad_norm": 0.03759765625, + "learning_rate": 4.517463449170196e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 156304723.0, + "step": 45895 + }, + { + "entropy": 0.09565671551972628, + "epoch": 10.699382212379065, + "grad_norm": 0.05615234375, + "learning_rate": 4.5173429596442955e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 156322150.0, + "step": 45900 + }, + { + "entropy": 0.06854218691587448, + "epoch": 10.700547849399697, + "grad_norm": 0.0252685546875, + "learning_rate": 4.517222458676158e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 156333929.0, + "step": 45905 + }, + { + "entropy": 0.054353891499340536, + "epoch": 10.701713486420328, + "grad_norm": 0.04736328125, + "learning_rate": 4.517101946267579e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9997032642364502, + "num_tokens": 156347801.0, + "step": 45910 + }, + { + "entropy": 0.05088144233450294, + "epoch": 10.70287912344096, + "grad_norm": 0.0147705078125, + "learning_rate": 4.516981422420356e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999614655971527, + "num_tokens": 156362209.0, + "step": 45915 + }, + { + "entropy": 0.064677076600492, + "epoch": 10.704044760461592, + "grad_norm": 0.016357421875, + "learning_rate": 4.516860887136287e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 156377040.0, + "step": 45920 + }, + { + "entropy": 0.050365370139479636, + "epoch": 10.705210397482224, + "grad_norm": 0.04443359375, + "learning_rate": 4.516740340417168e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 156399740.0, + "step": 45925 + }, + { + "entropy": 0.04361268552020192, + "epoch": 10.706376034502856, + "grad_norm": 0.349609375, + "learning_rate": 4.516619782264798e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999781847000122, + "num_tokens": 156421815.0, + "step": 45930 + }, + { + "entropy": 0.0545174271799624, + "epoch": 10.707541671523488, + "grad_norm": 0.181640625, + "learning_rate": 4.516499212680974e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999893307685852, + "num_tokens": 156442944.0, + "step": 45935 + }, + { + "entropy": 0.04827831089496613, + "epoch": 10.708707308544119, + "grad_norm": 0.024658203125, + "learning_rate": 4.5163786316674934e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 156471993.0, + "step": 45940 + }, + { + "entropy": 0.05747170811519027, + "epoch": 10.70987294556475, + "grad_norm": 0.053466796875, + "learning_rate": 4.516258039226155e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 156488309.0, + "step": 45945 + }, + { + "entropy": 0.052263769414275886, + "epoch": 10.711038582585383, + "grad_norm": 0.05126953125, + "learning_rate": 4.516137435358757e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 156520541.0, + "step": 45950 + }, + { + "entropy": 0.05521758422255516, + "epoch": 10.712204219606015, + "grad_norm": 0.052734375, + "learning_rate": 4.516016820067096e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 156554999.0, + "step": 45955 + }, + { + "entropy": 0.07212651148438454, + "epoch": 10.713369856626647, + "grad_norm": 0.1484375, + "learning_rate": 4.515896193352974e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 156566072.0, + "step": 45960 + }, + { + "entropy": 0.05750099988654256, + "epoch": 10.714535493647277, + "grad_norm": 0.029296875, + "learning_rate": 4.515775555218187e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 156578638.0, + "step": 45965 + }, + { + "entropy": 0.06050075925886631, + "epoch": 10.71570113066791, + "grad_norm": 0.07177734375, + "learning_rate": 4.515654905664535e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 156605760.0, + "step": 45970 + }, + { + "entropy": 0.05188793493434787, + "epoch": 10.716866767688542, + "grad_norm": 0.01214599609375, + "learning_rate": 4.515534244693816e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999775767326355, + "num_tokens": 156631387.0, + "step": 45975 + }, + { + "entropy": 0.06129320347681642, + "epoch": 10.718032404709174, + "grad_norm": 0.052001953125, + "learning_rate": 4.5154135723078306e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 156650853.0, + "step": 45980 + }, + { + "entropy": 0.07202781811356544, + "epoch": 10.719198041729806, + "grad_norm": 0.01361083984375, + "learning_rate": 4.515292888508377e-05, + "loss": 0.0005, + "mean_token_accuracy": 1.0, + "num_tokens": 156672371.0, + "step": 45985 + }, + { + "entropy": 0.05794037450104952, + "epoch": 10.720363678750438, + "grad_norm": 0.03662109375, + "learning_rate": 4.515172193297256e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 156693221.0, + "step": 45990 + }, + { + "entropy": 0.06433992027305066, + "epoch": 10.721529315771068, + "grad_norm": 0.1474609375, + "learning_rate": 4.515051486676266e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999689877033233, + "num_tokens": 156717306.0, + "step": 45995 + }, + { + "entropy": 0.05043230140581727, + "epoch": 10.7226949527917, + "grad_norm": 0.0281982421875, + "learning_rate": 4.514930768647209e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999823093414306, + "num_tokens": 156735116.0, + "step": 46000 + }, + { + "entropy": 0.053006827272474764, + "epoch": 10.723860589812332, + "grad_norm": 0.0216064453125, + "learning_rate": 4.514810039211882e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 156760057.0, + "step": 46005 + }, + { + "entropy": 0.04849845627322793, + "epoch": 10.725026226832965, + "grad_norm": 0.130859375, + "learning_rate": 4.514689298372088e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9998394846916199, + "num_tokens": 156786045.0, + "step": 46010 + }, + { + "entropy": 0.07077356418594719, + "epoch": 10.726191863853597, + "grad_norm": 0.0289306640625, + "learning_rate": 4.514568546129626e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999822556972504, + "num_tokens": 156819310.0, + "step": 46015 + }, + { + "entropy": 0.06431170925498009, + "epoch": 10.727357500874227, + "grad_norm": 0.392578125, + "learning_rate": 4.5144477824862976e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999893486499787, + "num_tokens": 156843448.0, + "step": 46020 + }, + { + "entropy": 0.059689549077302215, + "epoch": 10.72852313789486, + "grad_norm": 0.423828125, + "learning_rate": 4.514327007443903e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 156860177.0, + "step": 46025 + }, + { + "entropy": 0.05863263411447406, + "epoch": 10.729688774915491, + "grad_norm": 0.0791015625, + "learning_rate": 4.514206221004242e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999895572662354, + "num_tokens": 156882655.0, + "step": 46030 + }, + { + "entropy": 0.06030096784234047, + "epoch": 10.730854411936123, + "grad_norm": 0.048095703125, + "learning_rate": 4.514085423169118e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999851763248444, + "num_tokens": 156905529.0, + "step": 46035 + }, + { + "entropy": 0.06778318686410785, + "epoch": 10.732020048956755, + "grad_norm": 0.0546875, + "learning_rate": 4.51396461394033e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 156928035.0, + "step": 46040 + }, + { + "entropy": 0.04988482487387955, + "epoch": 10.733185685977386, + "grad_norm": 0.18359375, + "learning_rate": 4.513843793319681e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999284684658051, + "num_tokens": 156978769.0, + "step": 46045 + }, + { + "entropy": 0.053737777099013326, + "epoch": 10.734351322998018, + "grad_norm": 0.050048828125, + "learning_rate": 4.513722961308973e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999889194965362, + "num_tokens": 157000562.0, + "step": 46050 + }, + { + "entropy": 0.0412151537835598, + "epoch": 10.73551696001865, + "grad_norm": 0.12060546875, + "learning_rate": 4.5136021179100055e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 157024160.0, + "step": 46055 + }, + { + "entropy": 0.04722663722932339, + "epoch": 10.736682597039282, + "grad_norm": 1.1640625, + "learning_rate": 4.513481263124583e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9996932506561279, + "num_tokens": 157035689.0, + "step": 46060 + }, + { + "entropy": 0.07388071976602077, + "epoch": 10.737848234059914, + "grad_norm": 0.0361328125, + "learning_rate": 4.5133603969545054e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 157047511.0, + "step": 46065 + }, + { + "entropy": 0.05361750088632107, + "epoch": 10.739013871080546, + "grad_norm": 0.1484375, + "learning_rate": 4.513239519401578e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999772608280182, + "num_tokens": 157083063.0, + "step": 46070 + }, + { + "entropy": 0.07026615431532264, + "epoch": 10.740179508101177, + "grad_norm": 0.03271484375, + "learning_rate": 4.513118630467599e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 157099612.0, + "step": 46075 + }, + { + "entropy": 0.05888519417494535, + "epoch": 10.741345145121809, + "grad_norm": 0.02197265625, + "learning_rate": 4.512997730154375e-05, + "loss": 0.0162, + "mean_token_accuracy": 0.9984605610370636, + "num_tokens": 157139175.0, + "step": 46080 + }, + { + "entropy": 0.0555834517814219, + "epoch": 10.74251078214244, + "grad_norm": 0.013916015625, + "learning_rate": 4.512876818463707e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.999962842464447, + "num_tokens": 157160147.0, + "step": 46085 + }, + { + "entropy": 0.0965851410292089, + "epoch": 10.743676419163073, + "grad_norm": 0.021484375, + "learning_rate": 4.5127558953973984e-05, + "loss": 0.0307, + "mean_token_accuracy": 0.9936910271644592, + "num_tokens": 157198608.0, + "step": 46090 + }, + { + "entropy": 0.04910420798696578, + "epoch": 10.744842056183705, + "grad_norm": 0.07080078125, + "learning_rate": 4.5126349609572515e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 157222442.0, + "step": 46095 + }, + { + "entropy": 0.04693204928189516, + "epoch": 10.746007693204335, + "grad_norm": 0.025146484375, + "learning_rate": 4.5125140151450706e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9997084558010101, + "num_tokens": 157239643.0, + "step": 46100 + }, + { + "entropy": 0.06487049106508494, + "epoch": 10.747173330224967, + "grad_norm": 0.029052734375, + "learning_rate": 4.512393057962659e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 157254305.0, + "step": 46105 + }, + { + "entropy": 0.07090209368616343, + "epoch": 10.7483389672456, + "grad_norm": 0.01031494140625, + "learning_rate": 4.5122720894118196e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 157263233.0, + "step": 46110 + }, + { + "entropy": 0.0584601731505245, + "epoch": 10.749504604266232, + "grad_norm": 0.044189453125, + "learning_rate": 4.512151109494357e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9993981122970581, + "num_tokens": 157292283.0, + "step": 46115 + }, + { + "entropy": 0.07151546142995358, + "epoch": 10.750670241286864, + "grad_norm": 0.119140625, + "learning_rate": 4.512030118212076e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.999989241361618, + "num_tokens": 157312137.0, + "step": 46120 + }, + { + "entropy": 0.05258299568668008, + "epoch": 10.751835878307496, + "grad_norm": 0.0218505859375, + "learning_rate": 4.511909115566779e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 157331948.0, + "step": 46125 + }, + { + "entropy": 0.058688655495643616, + "epoch": 10.753001515328126, + "grad_norm": 0.0224609375, + "learning_rate": 4.511788101560271e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 157352978.0, + "step": 46130 + }, + { + "entropy": 0.05229517556726933, + "epoch": 10.754167152348758, + "grad_norm": 0.0194091796875, + "learning_rate": 4.511667076194357e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 157364802.0, + "step": 46135 + }, + { + "entropy": 0.060772106423974036, + "epoch": 10.75533278936939, + "grad_norm": 0.1103515625, + "learning_rate": 4.511546039470841e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 157380506.0, + "step": 46140 + }, + { + "entropy": 0.07069172933697701, + "epoch": 10.756498426390023, + "grad_norm": 0.2109375, + "learning_rate": 4.511424991391528e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 157391149.0, + "step": 46145 + }, + { + "entropy": 0.048912717308849094, + "epoch": 10.757664063410655, + "grad_norm": 0.020751953125, + "learning_rate": 4.511303931958224e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 157414839.0, + "step": 46150 + }, + { + "entropy": 0.05329639483243227, + "epoch": 10.758829700431285, + "grad_norm": 0.033935546875, + "learning_rate": 4.5111828611727324e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 157431406.0, + "step": 46155 + }, + { + "entropy": 0.06239355998113751, + "epoch": 10.759995337451917, + "grad_norm": 0.28515625, + "learning_rate": 4.51106177903686e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999683320522308, + "num_tokens": 157458439.0, + "step": 46160 + }, + { + "entropy": 0.04378145812079311, + "epoch": 10.76116097447255, + "grad_norm": 0.0283203125, + "learning_rate": 4.510940685552411e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 157486671.0, + "step": 46165 + }, + { + "entropy": 0.06428087167441845, + "epoch": 10.762326611493181, + "grad_norm": 0.025390625, + "learning_rate": 4.5108195807211925e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 157496563.0, + "step": 46170 + }, + { + "entropy": 0.048792037460952994, + "epoch": 10.763492248513813, + "grad_norm": 0.0196533203125, + "learning_rate": 4.510698464545009e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 157518160.0, + "step": 46175 + }, + { + "entropy": 0.05703177060931921, + "epoch": 10.764657885534444, + "grad_norm": 0.030029296875, + "learning_rate": 4.510577337025668e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 157536400.0, + "step": 46180 + }, + { + "entropy": 0.058706553932279346, + "epoch": 10.765823522555076, + "grad_norm": 0.027587890625, + "learning_rate": 4.5104561981649754e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 157557203.0, + "step": 46185 + }, + { + "entropy": 0.04635893451049924, + "epoch": 10.766989159575708, + "grad_norm": 0.03515625, + "learning_rate": 4.510335047964736e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 157588463.0, + "step": 46190 + }, + { + "entropy": 0.05046179071068764, + "epoch": 10.76815479659634, + "grad_norm": 0.050048828125, + "learning_rate": 4.510213886426758e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 157622759.0, + "step": 46195 + }, + { + "entropy": 0.06388626941479743, + "epoch": 10.769320433616972, + "grad_norm": 0.05224609375, + "learning_rate": 4.510092713552847e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 157640856.0, + "step": 46200 + }, + { + "entropy": 0.04669735683128238, + "epoch": 10.770486070637604, + "grad_norm": 0.0238037109375, + "learning_rate": 4.5099715293448106e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 157668198.0, + "step": 46205 + }, + { + "entropy": 0.06003896193578839, + "epoch": 10.771651707658235, + "grad_norm": 0.671875, + "learning_rate": 4.5098503338044564e-05, + "loss": 0.001, + "mean_token_accuracy": 0.9995143592357636, + "num_tokens": 157698155.0, + "step": 46210 + }, + { + "entropy": 0.05959401056170464, + "epoch": 10.772817344678867, + "grad_norm": 0.058837890625, + "learning_rate": 4.50972912693359e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 157713655.0, + "step": 46215 + }, + { + "entropy": 0.06324677541851997, + "epoch": 10.773982981699499, + "grad_norm": 0.0233154296875, + "learning_rate": 4.5096079087340196e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.999968820810318, + "num_tokens": 157733745.0, + "step": 46220 + }, + { + "entropy": 0.08075557686388493, + "epoch": 10.77514861872013, + "grad_norm": 0.046142578125, + "learning_rate": 4.509486679207553e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 157742039.0, + "step": 46225 + }, + { + "entropy": 0.058673990145325663, + "epoch": 10.776314255740763, + "grad_norm": 0.041259765625, + "learning_rate": 4.509365438355998e-05, + "loss": 0.0006, + "mean_token_accuracy": 1.0, + "num_tokens": 157760679.0, + "step": 46230 + }, + { + "entropy": 0.06012842683121562, + "epoch": 10.777479892761393, + "grad_norm": 0.01239013671875, + "learning_rate": 4.509244186181162e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 157780551.0, + "step": 46235 + }, + { + "entropy": 0.05824370728805661, + "epoch": 10.778645529782025, + "grad_norm": 0.052734375, + "learning_rate": 4.509122922684853e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 157804333.0, + "step": 46240 + }, + { + "entropy": 0.05847919774241746, + "epoch": 10.779811166802657, + "grad_norm": 0.039306640625, + "learning_rate": 4.50900164786888e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999836504459381, + "num_tokens": 157819699.0, + "step": 46245 + }, + { + "entropy": 0.059690111130475995, + "epoch": 10.78097680382329, + "grad_norm": 0.03271484375, + "learning_rate": 4.50888036173505e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 157832701.0, + "step": 46250 + }, + { + "entropy": 0.0757465548813343, + "epoch": 10.782142440843922, + "grad_norm": 0.380859375, + "learning_rate": 4.508759064285173e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999886095523834, + "num_tokens": 157851007.0, + "step": 46255 + }, + { + "entropy": 0.055960895121097566, + "epoch": 10.783308077864554, + "grad_norm": 0.1220703125, + "learning_rate": 4.508637755521057e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999889612197876, + "num_tokens": 157883389.0, + "step": 46260 + }, + { + "entropy": 0.05103008281439543, + "epoch": 10.784473714885184, + "grad_norm": 0.12060546875, + "learning_rate": 4.5085164354445106e-05, + "loss": 0.0001, + "mean_token_accuracy": 0.9999893248081207, + "num_tokens": 157905917.0, + "step": 46265 + }, + { + "entropy": 0.07026658989489079, + "epoch": 10.785639351905816, + "grad_norm": 0.0322265625, + "learning_rate": 4.508395104057344e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 157926464.0, + "step": 46270 + }, + { + "entropy": 0.06333841476589441, + "epoch": 10.786804988926448, + "grad_norm": 0.038330078125, + "learning_rate": 4.508273761361365e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999777853488923, + "num_tokens": 157949618.0, + "step": 46275 + }, + { + "entropy": 0.04195570405572653, + "epoch": 10.78797062594708, + "grad_norm": 0.0208740234375, + "learning_rate": 4.508152407358384e-05, + "loss": 0.0009, + "mean_token_accuracy": 1.0, + "num_tokens": 158003139.0, + "step": 46280 + }, + { + "entropy": 0.06566681675612926, + "epoch": 10.789136262967713, + "grad_norm": 0.0439453125, + "learning_rate": 4.5080310420502104e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 158012765.0, + "step": 46285 + }, + { + "entropy": 0.05819013142026961, + "epoch": 10.790301899988343, + "grad_norm": 0.0625, + "learning_rate": 4.5079096654386534e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999180316925049, + "num_tokens": 158041200.0, + "step": 46290 + }, + { + "entropy": 0.07342990711331368, + "epoch": 10.791467537008975, + "grad_norm": 0.0301513671875, + "learning_rate": 4.5077882775255235e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999577939510346, + "num_tokens": 158060295.0, + "step": 46295 + }, + { + "entropy": 0.04707777537405491, + "epoch": 10.792633174029607, + "grad_norm": 0.0830078125, + "learning_rate": 4.5076668783126304e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.99990394115448, + "num_tokens": 158074145.0, + "step": 46300 + }, + { + "entropy": 0.04368063462898135, + "epoch": 10.79379881105024, + "grad_norm": 0.06689453125, + "learning_rate": 4.507545467801785e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 158102757.0, + "step": 46305 + }, + { + "entropy": 0.0559994793497026, + "epoch": 10.794964448070871, + "grad_norm": 0.31640625, + "learning_rate": 4.507424045994797e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999407052993774, + "num_tokens": 158121960.0, + "step": 46310 + }, + { + "entropy": 0.06052645109593868, + "epoch": 10.796130085091502, + "grad_norm": 0.03955078125, + "learning_rate": 4.507302612893478e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158130632.0, + "step": 46315 + }, + { + "entropy": 0.05715808067470789, + "epoch": 10.797295722112134, + "grad_norm": 0.03515625, + "learning_rate": 4.5071811684996365e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 158155311.0, + "step": 46320 + }, + { + "entropy": 0.06015035463497043, + "epoch": 10.798461359132766, + "grad_norm": 0.040771484375, + "learning_rate": 4.507059712815086e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158171695.0, + "step": 46325 + }, + { + "entropy": 0.06758764693513512, + "epoch": 10.799626996153398, + "grad_norm": 0.057861328125, + "learning_rate": 4.506938245841636e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999662697315216, + "num_tokens": 158196574.0, + "step": 46330 + }, + { + "entropy": 0.0585243116132915, + "epoch": 10.80079263317403, + "grad_norm": 0.04296875, + "learning_rate": 4.506816767581099e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158220399.0, + "step": 46335 + }, + { + "entropy": 0.054940407443791625, + "epoch": 10.801958270194662, + "grad_norm": 0.34765625, + "learning_rate": 4.506695278035285e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 158244022.0, + "step": 46340 + }, + { + "entropy": 0.05187456281855703, + "epoch": 10.803123907215292, + "grad_norm": 0.138671875, + "learning_rate": 4.506573777206006e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158264788.0, + "step": 46345 + }, + { + "entropy": 0.07387198638170958, + "epoch": 10.804289544235925, + "grad_norm": 0.06640625, + "learning_rate": 4.5064522650950745e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.999989116191864, + "num_tokens": 158285073.0, + "step": 46350 + }, + { + "entropy": 0.04432422863319516, + "epoch": 10.805455181256557, + "grad_norm": 0.01904296875, + "learning_rate": 4.506330741704302e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999845564365387, + "num_tokens": 158312097.0, + "step": 46355 + }, + { + "entropy": 0.047852301597595216, + "epoch": 10.806620818277189, + "grad_norm": 0.0181884765625, + "learning_rate": 4.506209207035501e-05, + "loss": 0.0008, + "mean_token_accuracy": 1.0, + "num_tokens": 158334327.0, + "step": 46360 + }, + { + "entropy": 0.0673111722804606, + "epoch": 10.807786455297821, + "grad_norm": 0.07470703125, + "learning_rate": 4.506087661090483e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158349202.0, + "step": 46365 + }, + { + "entropy": 0.043902119528502224, + "epoch": 10.808952092318451, + "grad_norm": 0.0225830078125, + "learning_rate": 4.50596610387106e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999886274337768, + "num_tokens": 158379655.0, + "step": 46370 + }, + { + "entropy": 0.0513251107186079, + "epoch": 10.810117729339083, + "grad_norm": 0.18359375, + "learning_rate": 4.505844535379046e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999780595302582, + "num_tokens": 158404138.0, + "step": 46375 + }, + { + "entropy": 0.07172129093669355, + "epoch": 10.811283366359715, + "grad_norm": 0.025634765625, + "learning_rate": 4.505722955616254e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158423565.0, + "step": 46380 + }, + { + "entropy": 0.048972241766750815, + "epoch": 10.812449003380348, + "grad_norm": 0.047607421875, + "learning_rate": 4.505601364584495e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999634981155395, + "num_tokens": 158450117.0, + "step": 46385 + }, + { + "entropy": 0.05351090729236603, + "epoch": 10.81361464040098, + "grad_norm": 0.01275634765625, + "learning_rate": 4.5054797622855825e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158465495.0, + "step": 46390 + }, + { + "entropy": 0.039100353326648477, + "epoch": 10.814780277421612, + "grad_norm": 0.267578125, + "learning_rate": 4.505358148721332e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999880850315094, + "num_tokens": 158501337.0, + "step": 46395 + }, + { + "entropy": 0.04310502801090479, + "epoch": 10.815945914442242, + "grad_norm": 0.028564453125, + "learning_rate": 4.505236523893554e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158520456.0, + "step": 46400 + }, + { + "entropy": 0.05818876605480909, + "epoch": 10.817111551462874, + "grad_norm": 0.03759765625, + "learning_rate": 4.5051148878040646e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158533301.0, + "step": 46405 + }, + { + "entropy": 0.074424147605896, + "epoch": 10.818277188483506, + "grad_norm": 0.0118408203125, + "learning_rate": 4.5049932404546755e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158550858.0, + "step": 46410 + }, + { + "entropy": 0.07016548365354539, + "epoch": 10.819442825504138, + "grad_norm": 0.1328125, + "learning_rate": 4.504871581847202e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 158561725.0, + "step": 46415 + }, + { + "entropy": 0.05289169065654278, + "epoch": 10.82060846252477, + "grad_norm": 0.0196533203125, + "learning_rate": 4.504749911983458e-05, + "loss": 0.0059, + "mean_token_accuracy": 0.9993242502212525, + "num_tokens": 158586226.0, + "step": 46420 + }, + { + "entropy": 0.042893216293305156, + "epoch": 10.8217740995454, + "grad_norm": 0.0164794921875, + "learning_rate": 4.504628230865258e-05, + "loss": 0.0035, + "mean_token_accuracy": 0.9993669390678406, + "num_tokens": 158615230.0, + "step": 46425 + }, + { + "entropy": 0.06487907837145031, + "epoch": 10.822939736566033, + "grad_norm": 0.134765625, + "learning_rate": 4.504506538494415e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999787032604217, + "num_tokens": 158637921.0, + "step": 46430 + }, + { + "entropy": 0.07466004192829132, + "epoch": 10.824105373586665, + "grad_norm": 2.453125, + "learning_rate": 4.504384834872745e-05, + "loss": 0.0014, + "mean_token_accuracy": 0.9998039186000824, + "num_tokens": 158648773.0, + "step": 46435 + }, + { + "entropy": 0.05474613988772035, + "epoch": 10.825271010607297, + "grad_norm": 0.01953125, + "learning_rate": 4.504263120002063e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158677327.0, + "step": 46440 + }, + { + "entropy": 0.07186794616281986, + "epoch": 10.82643664762793, + "grad_norm": 0.01177978515625, + "learning_rate": 4.504141393884183e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158689482.0, + "step": 46445 + }, + { + "entropy": 0.06628307662904262, + "epoch": 10.82760228464856, + "grad_norm": 0.28515625, + "learning_rate": 4.50401965652092e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 158701872.0, + "step": 46450 + }, + { + "entropy": 0.06336401142179966, + "epoch": 10.828767921669192, + "grad_norm": 0.029052734375, + "learning_rate": 4.503897907914091e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 158715592.0, + "step": 46455 + }, + { + "entropy": 0.06207911539822817, + "epoch": 10.829933558689824, + "grad_norm": 0.029296875, + "learning_rate": 4.503776148065509e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 158732624.0, + "step": 46460 + }, + { + "entropy": 0.0868097135797143, + "epoch": 10.831099195710456, + "grad_norm": 0.029296875, + "learning_rate": 4.503654376976992e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 158743880.0, + "step": 46465 + }, + { + "entropy": 0.0856756535358727, + "epoch": 10.832264832731088, + "grad_norm": 0.018798828125, + "learning_rate": 4.503532594650355e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158757618.0, + "step": 46470 + }, + { + "entropy": 0.055490178242325786, + "epoch": 10.83343046975172, + "grad_norm": 0.03564453125, + "learning_rate": 4.503410801087412e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 158772979.0, + "step": 46475 + }, + { + "entropy": 0.05386331751942634, + "epoch": 10.83459610677235, + "grad_norm": 0.029296875, + "learning_rate": 4.503288996289982e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 158787215.0, + "step": 46480 + }, + { + "entropy": 0.04529502475634217, + "epoch": 10.835761743792983, + "grad_norm": 0.01336669921875, + "learning_rate": 4.50316718025988e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9998682498931885, + "num_tokens": 158803337.0, + "step": 46485 + }, + { + "entropy": 0.055859196837991475, + "epoch": 10.836927380813615, + "grad_norm": 0.02001953125, + "learning_rate": 4.503045352998922e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158818801.0, + "step": 46490 + }, + { + "entropy": 0.050908899027854206, + "epoch": 10.838093017834247, + "grad_norm": 0.00799560546875, + "learning_rate": 4.502923514508926e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.999989241361618, + "num_tokens": 158840131.0, + "step": 46495 + }, + { + "entropy": 0.04821221772581339, + "epoch": 10.839258654854879, + "grad_norm": 0.024169921875, + "learning_rate": 4.5028016647917064e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158854498.0, + "step": 46500 + }, + { + "entropy": 0.06585240634158254, + "epoch": 10.84042429187551, + "grad_norm": 0.037109375, + "learning_rate": 4.5026798038490826e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158867837.0, + "step": 46505 + }, + { + "entropy": 0.051115707214921716, + "epoch": 10.841589928896141, + "grad_norm": 0.0201416015625, + "learning_rate": 4.502557931682872e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 158882615.0, + "step": 46510 + }, + { + "entropy": 0.0701981533318758, + "epoch": 10.842755565916773, + "grad_norm": 0.014892578125, + "learning_rate": 4.5024360482948885e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158891294.0, + "step": 46515 + }, + { + "entropy": 0.054075379855930805, + "epoch": 10.843921202937405, + "grad_norm": 0.2353515625, + "learning_rate": 4.502314153686953e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 158908722.0, + "step": 46520 + }, + { + "entropy": 0.059608714934438464, + "epoch": 10.845086839958038, + "grad_norm": 0.0146484375, + "learning_rate": 4.502192247860882e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158925185.0, + "step": 46525 + }, + { + "entropy": 0.051519249100238085, + "epoch": 10.84625247697867, + "grad_norm": 0.049072265625, + "learning_rate": 4.502070330818493e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999891877174377, + "num_tokens": 158944524.0, + "step": 46530 + }, + { + "entropy": 0.06343580950051546, + "epoch": 10.8474181139993, + "grad_norm": 0.07958984375, + "learning_rate": 4.501948402561604e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 158957367.0, + "step": 46535 + }, + { + "entropy": 0.06394379865378141, + "epoch": 10.848583751019932, + "grad_norm": 0.030029296875, + "learning_rate": 4.5018264630920335e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999693334102631, + "num_tokens": 158977627.0, + "step": 46540 + }, + { + "entropy": 0.056028542760759595, + "epoch": 10.849749388040564, + "grad_norm": 0.0164794921875, + "learning_rate": 4.5017045124116e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.999985671043396, + "num_tokens": 158994400.0, + "step": 46545 + }, + { + "entropy": 0.0522186104208231, + "epoch": 10.850915025061196, + "grad_norm": 0.134765625, + "learning_rate": 4.501582550522121e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999893844127655, + "num_tokens": 159015220.0, + "step": 46550 + }, + { + "entropy": 0.0532087117433548, + "epoch": 10.852080662081828, + "grad_norm": 0.029296875, + "learning_rate": 4.5014605774254157e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.999976909160614, + "num_tokens": 159043131.0, + "step": 46555 + }, + { + "entropy": 0.0711324105039239, + "epoch": 10.853246299102459, + "grad_norm": 0.2255859375, + "learning_rate": 4.5013385931233034e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999675273895263, + "num_tokens": 159059578.0, + "step": 46560 + }, + { + "entropy": 0.0600858336314559, + "epoch": 10.85441193612309, + "grad_norm": 0.02783203125, + "learning_rate": 4.501216597617602e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159077354.0, + "step": 46565 + }, + { + "entropy": 0.05426020985469222, + "epoch": 10.855577573143723, + "grad_norm": 0.0283203125, + "learning_rate": 4.501094590910132e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159098793.0, + "step": 46570 + }, + { + "entropy": 0.054496968165040016, + "epoch": 10.856743210164355, + "grad_norm": 0.130859375, + "learning_rate": 4.5009725730027115e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999775528907776, + "num_tokens": 159135013.0, + "step": 46575 + }, + { + "entropy": 0.05475781839340925, + "epoch": 10.857908847184987, + "grad_norm": 0.05908203125, + "learning_rate": 4.500850543897161e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999852359294892, + "num_tokens": 159152297.0, + "step": 46580 + }, + { + "entropy": 0.05325304474681616, + "epoch": 10.859074484205617, + "grad_norm": 0.314453125, + "learning_rate": 4.500728503595298e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 159172531.0, + "step": 46585 + }, + { + "entropy": 0.15666724601760507, + "epoch": 10.86024012122625, + "grad_norm": 0.01708984375, + "learning_rate": 4.500606452098945e-05, + "loss": 0.2038, + "mean_token_accuracy": 0.9648593962192535, + "num_tokens": 159195114.0, + "step": 46590 + }, + { + "entropy": 0.062145916000008586, + "epoch": 10.861405758246882, + "grad_norm": 0.07958984375, + "learning_rate": 4.500484389409921e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 159214629.0, + "step": 46595 + }, + { + "entropy": 0.06958608105778694, + "epoch": 10.862571395267514, + "grad_norm": 0.037841796875, + "learning_rate": 4.5003623155300455e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999890923500061, + "num_tokens": 159239052.0, + "step": 46600 + }, + { + "entropy": 0.05691871186718345, + "epoch": 10.863737032288146, + "grad_norm": 0.031494140625, + "learning_rate": 4.50024023046114e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 159266360.0, + "step": 46605 + }, + { + "entropy": 0.04313572673127055, + "epoch": 10.864902669308778, + "grad_norm": 0.058349609375, + "learning_rate": 4.500118134205023e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999039709568024, + "num_tokens": 159287232.0, + "step": 46610 + }, + { + "entropy": 0.06896574310958385, + "epoch": 10.866068306329408, + "grad_norm": 0.02978515625, + "learning_rate": 4.499996026763517e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 159296067.0, + "step": 46615 + }, + { + "entropy": 0.05110810687765479, + "epoch": 10.86723394335004, + "grad_norm": 0.0361328125, + "learning_rate": 4.499873908138442e-05, + "loss": 0.0061, + "mean_token_accuracy": 0.9997102320194244, + "num_tokens": 159324520.0, + "step": 46620 + }, + { + "entropy": 0.050146241392940286, + "epoch": 10.868399580370673, + "grad_norm": 0.076171875, + "learning_rate": 4.49975177833162e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159342964.0, + "step": 46625 + }, + { + "entropy": 0.049869694840162994, + "epoch": 10.869565217391305, + "grad_norm": 0.02490234375, + "learning_rate": 4.4996296373448706e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 159376393.0, + "step": 46630 + }, + { + "entropy": 0.055807786621153356, + "epoch": 10.870730854411937, + "grad_norm": 0.01806640625, + "learning_rate": 4.499507485180016e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159391769.0, + "step": 46635 + }, + { + "entropy": 0.044060825975611805, + "epoch": 10.871896491432567, + "grad_norm": 0.034423828125, + "learning_rate": 4.499385321838877e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 159425953.0, + "step": 46640 + }, + { + "entropy": 0.051666715648025274, + "epoch": 10.8730621284532, + "grad_norm": 0.0673828125, + "learning_rate": 4.499263147323276e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9997613370418549, + "num_tokens": 159443888.0, + "step": 46645 + }, + { + "entropy": 0.06478715725243092, + "epoch": 10.874227765473831, + "grad_norm": 0.09912109375, + "learning_rate": 4.499140961635035e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999750196933747, + "num_tokens": 159466030.0, + "step": 46650 + }, + { + "entropy": 0.05626236498355865, + "epoch": 10.875393402494463, + "grad_norm": 0.026123046875, + "learning_rate": 4.499018764775975e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.999988055229187, + "num_tokens": 159495164.0, + "step": 46655 + }, + { + "entropy": 0.06515727676451206, + "epoch": 10.876559039515096, + "grad_norm": 0.03466796875, + "learning_rate": 4.4988965567479186e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159512208.0, + "step": 46660 + }, + { + "entropy": 0.0587312781251967, + "epoch": 10.877724676535728, + "grad_norm": 0.048095703125, + "learning_rate": 4.498774337552688e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 159532669.0, + "step": 46665 + }, + { + "entropy": 0.0911343522835523, + "epoch": 10.878890313556358, + "grad_norm": 0.016845703125, + "learning_rate": 4.4986521071921064e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159547697.0, + "step": 46670 + }, + { + "entropy": 0.09437187728472055, + "epoch": 10.88005595057699, + "grad_norm": 0.0279541015625, + "learning_rate": 4.498529865667995e-05, + "loss": 0.0334, + "mean_token_accuracy": 0.9862868785858154, + "num_tokens": 159574345.0, + "step": 46675 + }, + { + "entropy": 0.056803678441792724, + "epoch": 10.881221587597622, + "grad_norm": 0.111328125, + "learning_rate": 4.498407612982178e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 159597605.0, + "step": 46680 + }, + { + "entropy": 0.0474022360984236, + "epoch": 10.882387224618254, + "grad_norm": 0.146484375, + "learning_rate": 4.4982853491364786e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.999748706817627, + "num_tokens": 159634069.0, + "step": 46685 + }, + { + "entropy": 0.062248018197715284, + "epoch": 10.883552861638886, + "grad_norm": 0.0177001953125, + "learning_rate": 4.498163074132718e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159649192.0, + "step": 46690 + }, + { + "entropy": 0.05585647188127041, + "epoch": 10.884718498659517, + "grad_norm": 0.09716796875, + "learning_rate": 4.498040787972721e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999870717525482, + "num_tokens": 159669437.0, + "step": 46695 + }, + { + "entropy": 0.05814083656296134, + "epoch": 10.885884135680149, + "grad_norm": 0.02197265625, + "learning_rate": 4.4979184906583105e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.999438202381134, + "num_tokens": 159690161.0, + "step": 46700 + }, + { + "entropy": 0.052232743427157405, + "epoch": 10.887049772700781, + "grad_norm": 0.01806640625, + "learning_rate": 4.49779618219131e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159705554.0, + "step": 46705 + }, + { + "entropy": 0.05564236463978887, + "epoch": 10.888215409721413, + "grad_norm": 0.0284423828125, + "learning_rate": 4.497673862573545e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159722870.0, + "step": 46710 + }, + { + "entropy": 0.050008386839181185, + "epoch": 10.889381046742045, + "grad_norm": 0.01416015625, + "learning_rate": 4.497551531806837e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999751627445221, + "num_tokens": 159748801.0, + "step": 46715 + }, + { + "entropy": 0.05154750719666481, + "epoch": 10.890546683762675, + "grad_norm": 0.06103515625, + "learning_rate": 4.497429189893012e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159770977.0, + "step": 46720 + }, + { + "entropy": 0.060008395742624995, + "epoch": 10.891712320783308, + "grad_norm": 0.0556640625, + "learning_rate": 4.4973068368338935e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159793862.0, + "step": 46725 + }, + { + "entropy": 0.04886681037023664, + "epoch": 10.89287795780394, + "grad_norm": 0.10302734375, + "learning_rate": 4.4971844726313055e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9997793674468994, + "num_tokens": 159821291.0, + "step": 46730 + }, + { + "entropy": 0.06569123174995184, + "epoch": 10.894043594824572, + "grad_norm": 0.01202392578125, + "learning_rate": 4.497062097287074e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159832940.0, + "step": 46735 + }, + { + "entropy": 0.0484948112629354, + "epoch": 10.895209231845204, + "grad_norm": 0.0242919921875, + "learning_rate": 4.496939710803022e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.999989491701126, + "num_tokens": 159856860.0, + "step": 46740 + }, + { + "entropy": 0.06075437283143401, + "epoch": 10.896374868865836, + "grad_norm": 0.01416015625, + "learning_rate": 4.496817313180976e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999761402606964, + "num_tokens": 159888601.0, + "step": 46745 + }, + { + "entropy": 0.052485890313982965, + "epoch": 10.897540505886466, + "grad_norm": 0.0235595703125, + "learning_rate": 4.49669490442276e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999137938022613, + "num_tokens": 159918599.0, + "step": 46750 + }, + { + "entropy": 0.043839930184185506, + "epoch": 10.898706142907098, + "grad_norm": 0.024658203125, + "learning_rate": 4.496572484530201e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999860525131226, + "num_tokens": 159943656.0, + "step": 46755 + }, + { + "entropy": 0.070591782592237, + "epoch": 10.89987177992773, + "grad_norm": 0.034423828125, + "learning_rate": 4.4964500535051224e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 159955076.0, + "step": 46760 + }, + { + "entropy": 0.05396025264635682, + "epoch": 10.901037416948363, + "grad_norm": 0.032470703125, + "learning_rate": 4.4963276113493516e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999826550483704, + "num_tokens": 159985512.0, + "step": 46765 + }, + { + "entropy": 0.039633904490619896, + "epoch": 10.902203053968995, + "grad_norm": 0.016845703125, + "learning_rate": 4.496205158064713e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160008250.0, + "step": 46770 + }, + { + "entropy": 0.06101240310817957, + "epoch": 10.903368690989625, + "grad_norm": 1.9453125, + "learning_rate": 4.496082693653033e-05, + "loss": 0.0013, + "mean_token_accuracy": 0.9997867822647095, + "num_tokens": 160022169.0, + "step": 46775 + }, + { + "entropy": 0.037630663625895976, + "epoch": 10.904534328010257, + "grad_norm": 0.2021484375, + "learning_rate": 4.495960218116138e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999630093574524, + "num_tokens": 160053029.0, + "step": 46780 + }, + { + "entropy": 0.049717345740646124, + "epoch": 10.90569996503089, + "grad_norm": 0.032958984375, + "learning_rate": 4.495837731455854e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.999989253282547, + "num_tokens": 160077100.0, + "step": 46785 + }, + { + "entropy": 0.09623774792999029, + "epoch": 10.906865602051521, + "grad_norm": 0.00897216796875, + "learning_rate": 4.495715233674008e-05, + "loss": 0.0667, + "mean_token_accuracy": 0.9903517365455627, + "num_tokens": 160099240.0, + "step": 46790 + }, + { + "entropy": 0.06155298855155707, + "epoch": 10.908031239072153, + "grad_norm": 0.0281982421875, + "learning_rate": 4.495592724772427e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999715268611908, + "num_tokens": 160123176.0, + "step": 46795 + }, + { + "entropy": 0.047423532139509915, + "epoch": 10.909196876092786, + "grad_norm": 0.01092529296875, + "learning_rate": 4.4954702047529354e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 160149639.0, + "step": 46800 + }, + { + "entropy": 0.05371604720130563, + "epoch": 10.910362513113416, + "grad_norm": 0.017822265625, + "learning_rate": 4.4953476736173624e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9999548852443695, + "num_tokens": 160173688.0, + "step": 46805 + }, + { + "entropy": 0.05411314023658633, + "epoch": 10.911528150134048, + "grad_norm": 0.058837890625, + "learning_rate": 4.4952251313675354e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160190111.0, + "step": 46810 + }, + { + "entropy": 0.10832246728241443, + "epoch": 10.91269378715468, + "grad_norm": 0.0186767578125, + "learning_rate": 4.49510257800528e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160200466.0, + "step": 46815 + }, + { + "entropy": 0.05945599777624011, + "epoch": 10.913859424175312, + "grad_norm": 0.036865234375, + "learning_rate": 4.494980013532424e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160216988.0, + "step": 46820 + }, + { + "entropy": 0.06891081742942333, + "epoch": 10.915025061195944, + "grad_norm": 0.06787109375, + "learning_rate": 4.494857437950797e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160232389.0, + "step": 46825 + }, + { + "entropy": 0.06697693895548582, + "epoch": 10.916190698216575, + "grad_norm": 0.0238037109375, + "learning_rate": 4.494734851262224e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160250277.0, + "step": 46830 + }, + { + "entropy": 0.0647424777969718, + "epoch": 10.917356335237207, + "grad_norm": 0.00958251953125, + "learning_rate": 4.494612253468534e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160265324.0, + "step": 46835 + }, + { + "entropy": 0.06983225960284471, + "epoch": 10.918521972257839, + "grad_norm": 0.050537109375, + "learning_rate": 4.494489644571556e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 160280959.0, + "step": 46840 + }, + { + "entropy": 0.05160042904317379, + "epoch": 10.919687609278471, + "grad_norm": 0.064453125, + "learning_rate": 4.4943670245731176e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999881267547608, + "num_tokens": 160305171.0, + "step": 46845 + }, + { + "entropy": 0.067512839846313, + "epoch": 10.920853246299103, + "grad_norm": 0.02685546875, + "learning_rate": 4.494244393475047e-05, + "loss": 0.0068, + "mean_token_accuracy": 0.9995011687278748, + "num_tokens": 160331333.0, + "step": 46850 + }, + { + "entropy": 0.05696789734065533, + "epoch": 10.922018883319733, + "grad_norm": 0.02783203125, + "learning_rate": 4.4941217512791736e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999892592430115, + "num_tokens": 160350125.0, + "step": 46855 + }, + { + "entropy": 0.05613101264461875, + "epoch": 10.923184520340365, + "grad_norm": 0.1064453125, + "learning_rate": 4.493999097987325e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 160366326.0, + "step": 46860 + }, + { + "entropy": 0.0436920034699142, + "epoch": 10.924350157360998, + "grad_norm": 0.07177734375, + "learning_rate": 4.4938764336013314e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999226748943328, + "num_tokens": 160405365.0, + "step": 46865 + }, + { + "entropy": 0.0488032216206193, + "epoch": 10.92551579438163, + "grad_norm": 0.0177001953125, + "learning_rate": 4.493753758123021e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160424476.0, + "step": 46870 + }, + { + "entropy": 0.057715128920972344, + "epoch": 10.926681431402262, + "grad_norm": 0.1884765625, + "learning_rate": 4.4936310715542235e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9995918929576874, + "num_tokens": 160454477.0, + "step": 46875 + }, + { + "entropy": 0.07326898816972971, + "epoch": 10.927847068422894, + "grad_norm": 0.03076171875, + "learning_rate": 4.493508373896768e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160466285.0, + "step": 46880 + }, + { + "entropy": 0.07479946129024029, + "epoch": 10.929012705443524, + "grad_norm": 0.322265625, + "learning_rate": 4.493385665152485e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999879539012909, + "num_tokens": 160482048.0, + "step": 46885 + }, + { + "entropy": 0.04308393271639943, + "epoch": 10.930178342464156, + "grad_norm": 0.033203125, + "learning_rate": 4.493262945323203e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999845504760743, + "num_tokens": 160511233.0, + "step": 46890 + }, + { + "entropy": 0.05206470335833728, + "epoch": 10.931343979484788, + "grad_norm": 0.0125732421875, + "learning_rate": 4.4931402144107525e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999897003173828, + "num_tokens": 160532775.0, + "step": 46895 + }, + { + "entropy": 0.0450290129519999, + "epoch": 10.93250961650542, + "grad_norm": 0.259765625, + "learning_rate": 4.493017472416964e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999562263488769, + "num_tokens": 160554549.0, + "step": 46900 + }, + { + "entropy": 0.05536229070276022, + "epoch": 10.933675253526053, + "grad_norm": 0.059814453125, + "learning_rate": 4.4928947193436675e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999775588512421, + "num_tokens": 160584324.0, + "step": 46905 + }, + { + "entropy": 0.06466497108340263, + "epoch": 10.934840890546683, + "grad_norm": 0.052001953125, + "learning_rate": 4.492771955192693e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 160611199.0, + "step": 46910 + }, + { + "entropy": 0.05528691895306111, + "epoch": 10.936006527567315, + "grad_norm": 0.0546875, + "learning_rate": 4.492649179965872e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999731123447418, + "num_tokens": 160635898.0, + "step": 46915 + }, + { + "entropy": 0.05148557173088193, + "epoch": 10.937172164587947, + "grad_norm": 0.015869140625, + "learning_rate": 4.492526393665034e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999773383140564, + "num_tokens": 160659185.0, + "step": 46920 + }, + { + "entropy": 0.046163988672196864, + "epoch": 10.93833780160858, + "grad_norm": 0.00921630859375, + "learning_rate": 4.4924035962920114e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160675283.0, + "step": 46925 + }, + { + "entropy": 0.06352129988372326, + "epoch": 10.939503438629211, + "grad_norm": 0.11572265625, + "learning_rate": 4.492280787848634e-05, + "loss": 0.0012, + "mean_token_accuracy": 1.0, + "num_tokens": 160700580.0, + "step": 46930 + }, + { + "entropy": 0.053053854452446106, + "epoch": 10.940669075649843, + "grad_norm": 0.03466796875, + "learning_rate": 4.492157968336734e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 160720175.0, + "step": 46935 + }, + { + "entropy": 0.05099070845171809, + "epoch": 10.941834712670474, + "grad_norm": 0.154296875, + "learning_rate": 4.492035137758141e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.999974662065506, + "num_tokens": 160743372.0, + "step": 46940 + }, + { + "entropy": 0.057633578404784204, + "epoch": 10.943000349691106, + "grad_norm": 0.0233154296875, + "learning_rate": 4.4919122961146895e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160756529.0, + "step": 46945 + }, + { + "entropy": 0.03396057607606053, + "epoch": 10.944165986711738, + "grad_norm": 0.03173828125, + "learning_rate": 4.4917894434082094e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999751329421998, + "num_tokens": 160805843.0, + "step": 46950 + }, + { + "entropy": 0.0497897163964808, + "epoch": 10.94533162373237, + "grad_norm": 0.08837890625, + "learning_rate": 4.4916665796405335e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 160827488.0, + "step": 46955 + }, + { + "entropy": 0.057886832021176816, + "epoch": 10.946497260753002, + "grad_norm": 0.0208740234375, + "learning_rate": 4.4915437048134926e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160843657.0, + "step": 46960 + }, + { + "entropy": 0.06596755646169186, + "epoch": 10.947662897773633, + "grad_norm": 0.0361328125, + "learning_rate": 4.4914208189289203e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160854533.0, + "step": 46965 + }, + { + "entropy": 0.06427727779373527, + "epoch": 10.948828534794265, + "grad_norm": 0.1240234375, + "learning_rate": 4.491297921988648e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999892592430115, + "num_tokens": 160872862.0, + "step": 46970 + }, + { + "entropy": 0.050669254176318644, + "epoch": 10.949994171814897, + "grad_norm": 0.015869140625, + "learning_rate": 4.4911750139945085e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160891437.0, + "step": 46975 + }, + { + "entropy": 0.060540583729743955, + "epoch": 10.951159808835529, + "grad_norm": 0.080078125, + "learning_rate": 4.4910520949483355e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 160900617.0, + "step": 46980 + }, + { + "entropy": 0.0568017577752471, + "epoch": 10.952325445856161, + "grad_norm": 0.0146484375, + "learning_rate": 4.490929164851961e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160917838.0, + "step": 46985 + }, + { + "entropy": 0.06554779931902885, + "epoch": 10.953491082876791, + "grad_norm": 0.27734375, + "learning_rate": 4.4908062237072176e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9997151017189025, + "num_tokens": 160929706.0, + "step": 46990 + }, + { + "entropy": 0.060347382165491584, + "epoch": 10.954656719897423, + "grad_norm": 0.043701171875, + "learning_rate": 4.49068327151594e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 160951452.0, + "step": 46995 + }, + { + "entropy": 0.05922064566984773, + "epoch": 10.955822356918056, + "grad_norm": 0.07763671875, + "learning_rate": 4.49056030827996e-05, + "loss": 0.0012, + "mean_token_accuracy": 0.9995283007621765, + "num_tokens": 160972788.0, + "step": 47000 + }, + { + "entropy": 0.04474808312952518, + "epoch": 10.956987993938688, + "grad_norm": 0.01904296875, + "learning_rate": 4.490437334001112e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 160986235.0, + "step": 47005 + }, + { + "entropy": 0.05137424096465111, + "epoch": 10.95815363095932, + "grad_norm": 0.04052734375, + "learning_rate": 4.49031434868123e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999865055084228, + "num_tokens": 161001817.0, + "step": 47010 + }, + { + "entropy": 0.0649847850203514, + "epoch": 10.959319267979952, + "grad_norm": 0.0308837890625, + "learning_rate": 4.4901913523221474e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999863147735596, + "num_tokens": 161028501.0, + "step": 47015 + }, + { + "entropy": 0.05216891895979643, + "epoch": 10.960484905000582, + "grad_norm": 0.12890625, + "learning_rate": 4.490068344925699e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 161043699.0, + "step": 47020 + }, + { + "entropy": 0.059045640379190446, + "epoch": 10.961650542021214, + "grad_norm": 0.01092529296875, + "learning_rate": 4.489945326493717e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 161055626.0, + "step": 47025 + }, + { + "entropy": 0.039859526231884955, + "epoch": 10.962816179041846, + "grad_norm": 0.0186767578125, + "learning_rate": 4.489822297028039e-05, + "loss": 0.0007, + "mean_token_accuracy": 0.9999716818332672, + "num_tokens": 161084220.0, + "step": 47030 + }, + { + "entropy": 0.0398352628108114, + "epoch": 10.963981816062478, + "grad_norm": 0.023681640625, + "learning_rate": 4.489699256530497e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 161120641.0, + "step": 47035 + }, + { + "entropy": 0.07531622983515263, + "epoch": 10.96514745308311, + "grad_norm": 0.043212890625, + "learning_rate": 4.489576205002926e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999892890453339, + "num_tokens": 161139755.0, + "step": 47040 + }, + { + "entropy": 0.0534830316901207, + "epoch": 10.966313090103741, + "grad_norm": 0.0147705078125, + "learning_rate": 4.489453142447163e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999890923500061, + "num_tokens": 161168833.0, + "step": 47045 + }, + { + "entropy": 0.0778751790523529, + "epoch": 10.967478727124373, + "grad_norm": 1.2109375, + "learning_rate": 4.4893300688650396e-05, + "loss": 0.0008, + "mean_token_accuracy": 0.9997368395328522, + "num_tokens": 161180900.0, + "step": 47050 + }, + { + "entropy": 0.047051459364593026, + "epoch": 10.968644364145005, + "grad_norm": 0.0184326171875, + "learning_rate": 4.489206984258394e-05, + "loss": 0.0004, + "mean_token_accuracy": 0.9999641895294189, + "num_tokens": 161207943.0, + "step": 47055 + }, + { + "entropy": 0.07386998878791928, + "epoch": 10.969810001165637, + "grad_norm": 0.02392578125, + "learning_rate": 4.48908388862906e-05, + "loss": 0.0003, + "mean_token_accuracy": 0.9999883055686951, + "num_tokens": 161238852.0, + "step": 47060 + }, + { + "entropy": 0.06613675840198993, + "epoch": 10.97097563818627, + "grad_norm": 0.0179443359375, + "learning_rate": 4.488960781978874e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 161247685.0, + "step": 47065 + }, + { + "entropy": 0.06145631754770875, + "epoch": 10.972141275206901, + "grad_norm": 0.08203125, + "learning_rate": 4.488837664309671e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 161261824.0, + "step": 47070 + }, + { + "entropy": 0.05057299751788378, + "epoch": 10.973306912227532, + "grad_norm": 0.0546875, + "learning_rate": 4.488714535623286e-05, + "loss": 0.0004, + "mean_token_accuracy": 1.0, + "num_tokens": 161289547.0, + "step": 47075 + }, + { + "entropy": 0.05460757054388523, + "epoch": 10.974472549248164, + "grad_norm": 0.015625, + "learning_rate": 4.4885913959215575e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 161305775.0, + "step": 47080 + }, + { + "entropy": 0.04940736414864659, + "epoch": 10.975638186268796, + "grad_norm": 0.0189208984375, + "learning_rate": 4.48846824520632e-05, + "loss": 0.0084, + "mean_token_accuracy": 0.9985802948474884, + "num_tokens": 161331070.0, + "step": 47085 + }, + { + "entropy": 0.04240815499797464, + "epoch": 10.976803823289428, + "grad_norm": 0.0218505859375, + "learning_rate": 4.48834508347941e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.99996976852417, + "num_tokens": 161369429.0, + "step": 47090 + }, + { + "entropy": 0.049704886972904205, + "epoch": 10.97796946031006, + "grad_norm": 0.0155029296875, + "learning_rate": 4.4882219107426655e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 161394674.0, + "step": 47095 + }, + { + "entropy": 0.07719376794993878, + "epoch": 10.97913509733069, + "grad_norm": 0.1513671875, + "learning_rate": 4.488098726997921e-05, + "loss": 0.0008, + "mean_token_accuracy": 1.0, + "num_tokens": 161405704.0, + "step": 47100 + }, + { + "entropy": 0.03706503426656127, + "epoch": 10.980300734351323, + "grad_norm": 0.10791015625, + "learning_rate": 4.4879755322470146e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 161445858.0, + "step": 47105 + }, + { + "entropy": 0.05613263482227922, + "epoch": 10.981466371371955, + "grad_norm": 0.126953125, + "learning_rate": 4.4878523264917825e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 161467141.0, + "step": 47110 + }, + { + "entropy": 0.04681310541927815, + "epoch": 10.982632008392587, + "grad_norm": 0.044677734375, + "learning_rate": 4.487729109734063e-05, + "loss": 0.0006, + "mean_token_accuracy": 0.9999776303768158, + "num_tokens": 161499793.0, + "step": 47115 + }, + { + "entropy": 0.04940250525251031, + "epoch": 10.983797645413219, + "grad_norm": 0.09228515625, + "learning_rate": 4.487605881975693e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 161522900.0, + "step": 47120 + }, + { + "entropy": 0.06434418596327304, + "epoch": 10.98496328243385, + "grad_norm": 0.05712890625, + "learning_rate": 4.4874826432185094e-05, + "loss": 0.0009, + "mean_token_accuracy": 0.9998936176300048, + "num_tokens": 161532228.0, + "step": 47125 + }, + { + "entropy": 0.05575905358418822, + "epoch": 10.986128919454481, + "grad_norm": 0.1240234375, + "learning_rate": 4.487359393464351e-05, + "loss": 0.0001, + "mean_token_accuracy": 0.999989265203476, + "num_tokens": 161557700.0, + "step": 47130 + }, + { + "entropy": 0.06682649105787278, + "epoch": 10.987294556475113, + "grad_norm": 0.0208740234375, + "learning_rate": 4.487236132715054e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 161568934.0, + "step": 47135 + }, + { + "entropy": 0.04514258056879043, + "epoch": 10.988460193495746, + "grad_norm": 0.130859375, + "learning_rate": 4.487112860972458e-05, + "loss": 0.0005, + "mean_token_accuracy": 0.9999634981155395, + "num_tokens": 161592366.0, + "step": 47140 + }, + { + "entropy": 0.06385567653924226, + "epoch": 10.989625830516378, + "grad_norm": 0.0277099609375, + "learning_rate": 4.4869895782384e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 161608094.0, + "step": 47145 + }, + { + "entropy": 0.057418633997440335, + "epoch": 10.99079146753701, + "grad_norm": 0.05712890625, + "learning_rate": 4.486866284514719e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 161627583.0, + "step": 47150 + }, + { + "entropy": 0.048961544316262005, + "epoch": 10.99195710455764, + "grad_norm": 0.2294921875, + "learning_rate": 4.486742979803254e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999533712863922, + "num_tokens": 161645226.0, + "step": 47155 + }, + { + "entropy": 0.08157783066853881, + "epoch": 10.993122741578272, + "grad_norm": 0.0625, + "learning_rate": 4.486619664105843e-05, + "loss": 0.0103, + "mean_token_accuracy": 0.9992714405059815, + "num_tokens": 161668211.0, + "step": 47160 + }, + { + "entropy": 0.04942806586623192, + "epoch": 10.994288378598904, + "grad_norm": 0.0341796875, + "learning_rate": 4.486496337424325e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 161688874.0, + "step": 47165 + }, + { + "entropy": 0.054880039487034085, + "epoch": 10.995454015619536, + "grad_norm": 0.095703125, + "learning_rate": 4.486372999760538e-05, + "loss": 0.0002, + "mean_token_accuracy": 1.0, + "num_tokens": 161727249.0, + "step": 47170 + }, + { + "entropy": 0.048123320937156676, + "epoch": 10.996619652640168, + "grad_norm": 0.0228271484375, + "learning_rate": 4.486249651116323e-05, + "loss": 0.0003, + "mean_token_accuracy": 1.0, + "num_tokens": 161755299.0, + "step": 47175 + }, + { + "entropy": 0.07769046975299716, + "epoch": 10.997785289660799, + "grad_norm": 0.03857421875, + "learning_rate": 4.4861262914935174e-05, + "loss": 0.0001, + "mean_token_accuracy": 1.0, + "num_tokens": 161769230.0, + "step": 47180 + }, + { + "entropy": 0.06101873740553856, + "epoch": 10.998950926681431, + "grad_norm": 0.1005859375, + "learning_rate": 4.4860029208939616e-05, + "loss": 0.0002, + "mean_token_accuracy": 0.9999891459941864, + "num_tokens": 161791922.0, + "step": 47185 + }, + { + "entropy": 0.0431011370383203, + "epoch": 11.0, + "grad_norm": 0.75390625, + "learning_rate": 4.485879539319496e-05, + "loss": 0.0025, + "mean_token_accuracy": 0.9998559223281013, + "num_tokens": 161826750.0, + "step": 47190 + } + ], + "logging_steps": 5, + "max_steps": 128700, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.311995803053312e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}