{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.912087912087912, "eval_steps": 500, "global_step": 2160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018315018315018316, "grad_norm": 26.125, "learning_rate": 1.2509723013743402e-06, "loss": 1.4391, "mean_token_accuracy": 0.7123644590377808, "num_tokens": 9264.0, "step": 5 }, { "epoch": 0.03663003663003663, "grad_norm": 24.5, "learning_rate": 2.814687678092266e-06, "loss": 1.2334, "mean_token_accuracy": 0.7464994072914124, "num_tokens": 18573.0, "step": 10 }, { "epoch": 0.054945054945054944, "grad_norm": 14.6875, "learning_rate": 4.378403054810191e-06, "loss": 1.2219, "mean_token_accuracy": 0.7385512232780457, "num_tokens": 28324.0, "step": 15 }, { "epoch": 0.07326007326007326, "grad_norm": 8.625, "learning_rate": 5.942118431528117e-06, "loss": 1.1542, "mean_token_accuracy": 0.7414855003356934, "num_tokens": 37365.0, "step": 20 }, { "epoch": 0.09157509157509157, "grad_norm": 7.75, "learning_rate": 7.505833808246043e-06, "loss": 0.963, "mean_token_accuracy": 0.7728639602661133, "num_tokens": 47169.0, "step": 25 }, { "epoch": 0.10989010989010989, "grad_norm": 7.6875, "learning_rate": 9.069549184963967e-06, "loss": 0.687, "mean_token_accuracy": 0.8294244527816772, "num_tokens": 56922.0, "step": 30 }, { "epoch": 0.1282051282051282, "grad_norm": 6.09375, "learning_rate": 1.0633264561681893e-05, "loss": 0.5571, "mean_token_accuracy": 0.8617596507072449, "num_tokens": 65860.0, "step": 35 }, { "epoch": 0.14652014652014653, "grad_norm": 7.0625, "learning_rate": 1.2196979938399817e-05, "loss": 0.1916, "mean_token_accuracy": 0.9526763677597045, "num_tokens": 75644.0, "step": 40 }, { "epoch": 0.16483516483516483, "grad_norm": 5.9375, "learning_rate": 1.3760695315117745e-05, "loss": 0.195, "mean_token_accuracy": 0.9427057504653931, "num_tokens": 84597.0, "step": 45 }, { "epoch": 0.18315018315018314, "grad_norm": 2.046875, "learning_rate": 1.532441069183567e-05, "loss": 0.272, "mean_token_accuracy": 0.9327008485794067, "num_tokens": 93032.0, "step": 50 }, { "epoch": 0.20146520146520147, "grad_norm": 7.5, "learning_rate": 1.6888126068553595e-05, "loss": 0.2524, "mean_token_accuracy": 0.9316304802894593, "num_tokens": 101726.0, "step": 55 }, { "epoch": 0.21978021978021978, "grad_norm": 2.5625, "learning_rate": 1.845184144527152e-05, "loss": 0.2507, "mean_token_accuracy": 0.9321518659591674, "num_tokens": 110505.0, "step": 60 }, { "epoch": 0.23809523809523808, "grad_norm": 0.55859375, "learning_rate": 2.0015556821989444e-05, "loss": 0.1683, "mean_token_accuracy": 0.9499430179595947, "num_tokens": 119502.0, "step": 65 }, { "epoch": 0.2564102564102564, "grad_norm": 5.0625, "learning_rate": 2.157927219870737e-05, "loss": 0.3318, "mean_token_accuracy": 0.911142885684967, "num_tokens": 129089.0, "step": 70 }, { "epoch": 0.27472527472527475, "grad_norm": 1.59375, "learning_rate": 2.3142987575425293e-05, "loss": 0.1793, "mean_token_accuracy": 0.950410532951355, "num_tokens": 138645.0, "step": 75 }, { "epoch": 0.29304029304029305, "grad_norm": 1.53125, "learning_rate": 2.4706702952143225e-05, "loss": 0.0869, "mean_token_accuracy": 0.9751996159553528, "num_tokens": 148602.0, "step": 80 }, { "epoch": 0.31135531135531136, "grad_norm": 2.25, "learning_rate": 2.627041832886115e-05, "loss": 0.1275, "mean_token_accuracy": 0.9607115983963013, "num_tokens": 157517.0, "step": 85 }, { "epoch": 0.32967032967032966, "grad_norm": 2.03125, "learning_rate": 2.7834133705579074e-05, "loss": 0.0966, "mean_token_accuracy": 0.9684963464736939, "num_tokens": 166575.0, "step": 90 }, { "epoch": 0.34798534798534797, "grad_norm": 0.55078125, "learning_rate": 2.9397849082297e-05, "loss": 0.133, "mean_token_accuracy": 0.9579505920410156, "num_tokens": 175735.0, "step": 95 }, { "epoch": 0.3663003663003663, "grad_norm": 1.375, "learning_rate": 3.096156445901492e-05, "loss": 0.1709, "mean_token_accuracy": 0.9461557865142822, "num_tokens": 183857.0, "step": 100 }, { "epoch": 0.38461538461538464, "grad_norm": 3.015625, "learning_rate": 3.2525279835732844e-05, "loss": 0.1466, "mean_token_accuracy": 0.9522614717483521, "num_tokens": 191716.0, "step": 105 }, { "epoch": 0.40293040293040294, "grad_norm": 5.65625, "learning_rate": 3.4088995212450776e-05, "loss": 0.2514, "mean_token_accuracy": 0.9299401640892029, "num_tokens": 200518.0, "step": 110 }, { "epoch": 0.42124542124542125, "grad_norm": 0.92578125, "learning_rate": 3.56527105891687e-05, "loss": 0.0687, "mean_token_accuracy": 0.975311815738678, "num_tokens": 209961.0, "step": 115 }, { "epoch": 0.43956043956043955, "grad_norm": 2.53125, "learning_rate": 3.7216425965886625e-05, "loss": 0.1955, "mean_token_accuracy": 0.9394341588020325, "num_tokens": 218838.0, "step": 120 }, { "epoch": 0.45787545787545786, "grad_norm": 1.6953125, "learning_rate": 3.878014134260455e-05, "loss": 0.1805, "mean_token_accuracy": 0.944377863407135, "num_tokens": 227866.0, "step": 125 }, { "epoch": 0.47619047619047616, "grad_norm": 2.03125, "learning_rate": 4.0343856719322474e-05, "loss": 0.2456, "mean_token_accuracy": 0.9300832152366638, "num_tokens": 237211.0, "step": 130 }, { "epoch": 0.4945054945054945, "grad_norm": 3.5625, "learning_rate": 4.19075720960404e-05, "loss": 0.1599, "mean_token_accuracy": 0.9527908086776733, "num_tokens": 247238.0, "step": 135 }, { "epoch": 0.5128205128205128, "grad_norm": 1.0078125, "learning_rate": 4.3471287472758323e-05, "loss": 0.1544, "mean_token_accuracy": 0.9530697703361511, "num_tokens": 255982.0, "step": 140 }, { "epoch": 0.5311355311355311, "grad_norm": 3.28125, "learning_rate": 4.5035002849476255e-05, "loss": 0.1674, "mean_token_accuracy": 0.949328339099884, "num_tokens": 265058.0, "step": 145 }, { "epoch": 0.5494505494505495, "grad_norm": 2.375, "learning_rate": 4.659871822619417e-05, "loss": 0.1525, "mean_token_accuracy": 0.9563624501228333, "num_tokens": 274227.0, "step": 150 }, { "epoch": 0.5677655677655677, "grad_norm": 2.28125, "learning_rate": 4.8162433602912104e-05, "loss": 0.2012, "mean_token_accuracy": 0.940619957447052, "num_tokens": 282699.0, "step": 155 }, { "epoch": 0.5860805860805861, "grad_norm": 2.078125, "learning_rate": 4.9726148979630036e-05, "loss": 0.0786, "mean_token_accuracy": 0.9756275773048401, "num_tokens": 292119.0, "step": 160 }, { "epoch": 0.6043956043956044, "grad_norm": 1.40625, "learning_rate": 5.128986435634795e-05, "loss": 0.1985, "mean_token_accuracy": 0.9421133875846863, "num_tokens": 301406.0, "step": 165 }, { "epoch": 0.6227106227106227, "grad_norm": 1.5078125, "learning_rate": 5.2853579733065885e-05, "loss": 0.1154, "mean_token_accuracy": 0.9618610620498658, "num_tokens": 310481.0, "step": 170 }, { "epoch": 0.6410256410256411, "grad_norm": 0.359375, "learning_rate": 5.44172951097838e-05, "loss": 0.1248, "mean_token_accuracy": 0.9580595254898071, "num_tokens": 319932.0, "step": 175 }, { "epoch": 0.6593406593406593, "grad_norm": 1.546875, "learning_rate": 5.5981010486501734e-05, "loss": 0.0783, "mean_token_accuracy": 0.972282862663269, "num_tokens": 329237.0, "step": 180 }, { "epoch": 0.6776556776556777, "grad_norm": 0.478515625, "learning_rate": 5.754472586321966e-05, "loss": 0.0932, "mean_token_accuracy": 0.9683905124664307, "num_tokens": 338362.0, "step": 185 }, { "epoch": 0.6959706959706959, "grad_norm": 1.9453125, "learning_rate": 5.910844123993758e-05, "loss": 0.0953, "mean_token_accuracy": 0.9647938251495362, "num_tokens": 347632.0, "step": 190 }, { "epoch": 0.7142857142857143, "grad_norm": 1.0234375, "learning_rate": 6.067215661665551e-05, "loss": 0.1225, "mean_token_accuracy": 0.9624135136604309, "num_tokens": 357312.0, "step": 195 }, { "epoch": 0.7326007326007326, "grad_norm": 6.5, "learning_rate": 6.223587199337343e-05, "loss": 0.1022, "mean_token_accuracy": 0.9669649362564087, "num_tokens": 366372.0, "step": 200 }, { "epoch": 0.7509157509157509, "grad_norm": 1.9609375, "learning_rate": 6.379958737009136e-05, "loss": 0.1294, "mean_token_accuracy": 0.9620457410812377, "num_tokens": 376261.0, "step": 205 }, { "epoch": 0.7692307692307693, "grad_norm": 2.328125, "learning_rate": 6.536330274680927e-05, "loss": 0.1534, "mean_token_accuracy": 0.9578806042671204, "num_tokens": 385931.0, "step": 210 }, { "epoch": 0.7875457875457875, "grad_norm": 1.6484375, "learning_rate": 6.69270181235272e-05, "loss": 0.1488, "mean_token_accuracy": 0.9553104996681213, "num_tokens": 395126.0, "step": 215 }, { "epoch": 0.8058608058608059, "grad_norm": 1.5625, "learning_rate": 6.849073350024514e-05, "loss": 0.2034, "mean_token_accuracy": 0.9399782657623291, "num_tokens": 404038.0, "step": 220 }, { "epoch": 0.8241758241758241, "grad_norm": 0.95703125, "learning_rate": 7.005444887696306e-05, "loss": 0.1123, "mean_token_accuracy": 0.9649486184120178, "num_tokens": 412865.0, "step": 225 }, { "epoch": 0.8424908424908425, "grad_norm": 12.125, "learning_rate": 7.161816425368099e-05, "loss": 0.2582, "mean_token_accuracy": 0.9260981917381287, "num_tokens": 421875.0, "step": 230 }, { "epoch": 0.8608058608058609, "grad_norm": 1.0078125, "learning_rate": 7.31818796303989e-05, "loss": 0.1237, "mean_token_accuracy": 0.958947730064392, "num_tokens": 430974.0, "step": 235 }, { "epoch": 0.8791208791208791, "grad_norm": 0.890625, "learning_rate": 7.474559500711684e-05, "loss": 0.094, "mean_token_accuracy": 0.9669261336326599, "num_tokens": 440593.0, "step": 240 }, { "epoch": 0.8974358974358975, "grad_norm": 1.5234375, "learning_rate": 7.630931038383477e-05, "loss": 0.1799, "mean_token_accuracy": 0.9489495992660523, "num_tokens": 449999.0, "step": 245 }, { "epoch": 0.9157509157509157, "grad_norm": 0.1806640625, "learning_rate": 7.787302576055269e-05, "loss": 0.0973, "mean_token_accuracy": 0.9693061113357544, "num_tokens": 458710.0, "step": 250 }, { "epoch": 0.9340659340659341, "grad_norm": 1.359375, "learning_rate": 7.943674113727062e-05, "loss": 0.1313, "mean_token_accuracy": 0.9576280236244201, "num_tokens": 468138.0, "step": 255 }, { "epoch": 0.9523809523809523, "grad_norm": 3.21875, "learning_rate": 8.100045651398853e-05, "loss": 0.2788, "mean_token_accuracy": 0.9308066725730896, "num_tokens": 478094.0, "step": 260 }, { "epoch": 0.9706959706959707, "grad_norm": 15.75, "learning_rate": 8.256417189070647e-05, "loss": 0.1394, "mean_token_accuracy": 0.9597956895828247, "num_tokens": 487236.0, "step": 265 }, { "epoch": 0.989010989010989, "grad_norm": 0.73046875, "learning_rate": 8.41278872674244e-05, "loss": 0.105, "mean_token_accuracy": 0.9711844086647033, "num_tokens": 496836.0, "step": 270 }, { "epoch": 1.0073260073260073, "grad_norm": 4.125, "learning_rate": 8.537885369635508e-05, "loss": 0.1623, "mean_token_accuracy": 0.9566316485404969, "num_tokens": 505486.0, "step": 275 }, { "epoch": 1.0256410256410255, "grad_norm": 38.0, "learning_rate": 8.537864816105374e-05, "loss": 0.1003, "mean_token_accuracy": 0.9712117314338684, "num_tokens": 515232.0, "step": 280 }, { "epoch": 1.043956043956044, "grad_norm": 1.671875, "learning_rate": 8.537814900572437e-05, "loss": 0.1104, "mean_token_accuracy": 0.9652750253677368, "num_tokens": 523979.0, "step": 285 }, { "epoch": 1.0622710622710623, "grad_norm": 0.361328125, "learning_rate": 8.537735623494464e-05, "loss": 0.0756, "mean_token_accuracy": 0.9749327659606933, "num_tokens": 533573.0, "step": 290 }, { "epoch": 1.0805860805860805, "grad_norm": 3.5, "learning_rate": 8.537626985598489e-05, "loss": 0.5942, "mean_token_accuracy": 0.8817975878715515, "num_tokens": 543009.0, "step": 295 }, { "epoch": 1.098901098901099, "grad_norm": 1.5390625, "learning_rate": 8.537488987880808e-05, "loss": 0.1367, "mean_token_accuracy": 0.9571704030036926, "num_tokens": 552940.0, "step": 300 }, { "epoch": 1.1172161172161172, "grad_norm": 0.89453125, "learning_rate": 8.537321631606968e-05, "loss": 0.1217, "mean_token_accuracy": 0.9623419761657714, "num_tokens": 562364.0, "step": 305 }, { "epoch": 1.1355311355311355, "grad_norm": 1.578125, "learning_rate": 8.537124918311761e-05, "loss": 0.1608, "mean_token_accuracy": 0.949942660331726, "num_tokens": 571646.0, "step": 310 }, { "epoch": 1.1538461538461537, "grad_norm": 1.5, "learning_rate": 8.536898849799202e-05, "loss": 0.0852, "mean_token_accuracy": 0.971439003944397, "num_tokens": 581084.0, "step": 315 }, { "epoch": 1.1721611721611722, "grad_norm": 1.8046875, "learning_rate": 8.53664342814252e-05, "loss": 0.1247, "mean_token_accuracy": 0.9593337297439575, "num_tokens": 590812.0, "step": 320 }, { "epoch": 1.1904761904761905, "grad_norm": 1.2578125, "learning_rate": 8.536358655684135e-05, "loss": 0.1186, "mean_token_accuracy": 0.957237160205841, "num_tokens": 599646.0, "step": 325 }, { "epoch": 1.2087912087912087, "grad_norm": 0.5078125, "learning_rate": 8.536044535035635e-05, "loss": 0.1778, "mean_token_accuracy": 0.9485100388526917, "num_tokens": 608962.0, "step": 330 }, { "epoch": 1.2271062271062272, "grad_norm": 0.91796875, "learning_rate": 8.535701069077756e-05, "loss": 0.131, "mean_token_accuracy": 0.9616786003112793, "num_tokens": 617832.0, "step": 335 }, { "epoch": 1.2454212454212454, "grad_norm": 1.28125, "learning_rate": 8.535328260960355e-05, "loss": 0.1983, "mean_token_accuracy": 0.9394309878349304, "num_tokens": 626739.0, "step": 340 }, { "epoch": 1.2637362637362637, "grad_norm": 1.5234375, "learning_rate": 8.534926114102375e-05, "loss": 0.064, "mean_token_accuracy": 0.9792219161987304, "num_tokens": 636553.0, "step": 345 }, { "epoch": 1.282051282051282, "grad_norm": 0.73828125, "learning_rate": 8.534494632191824e-05, "loss": 0.1623, "mean_token_accuracy": 0.9512728333473206, "num_tokens": 645322.0, "step": 350 }, { "epoch": 1.3003663003663004, "grad_norm": 0.302734375, "learning_rate": 8.534033819185732e-05, "loss": 0.1257, "mean_token_accuracy": 0.9629031181335449, "num_tokens": 654789.0, "step": 355 }, { "epoch": 1.3186813186813187, "grad_norm": 2.78125, "learning_rate": 8.533543679310125e-05, "loss": 0.1226, "mean_token_accuracy": 0.962236201763153, "num_tokens": 664164.0, "step": 360 }, { "epoch": 1.3369963369963371, "grad_norm": 0.1669921875, "learning_rate": 8.533024217059969e-05, "loss": 0.0789, "mean_token_accuracy": 0.9735670685768127, "num_tokens": 674019.0, "step": 365 }, { "epoch": 1.3553113553113554, "grad_norm": 6.84375, "learning_rate": 8.53247543719915e-05, "loss": 0.168, "mean_token_accuracy": 0.9529448866844177, "num_tokens": 683484.0, "step": 370 }, { "epoch": 1.3736263736263736, "grad_norm": 1.359375, "learning_rate": 8.531897344760409e-05, "loss": 0.1943, "mean_token_accuracy": 0.9423548102378845, "num_tokens": 691983.0, "step": 375 }, { "epoch": 1.3919413919413919, "grad_norm": 0.97265625, "learning_rate": 8.531289945045318e-05, "loss": 0.1563, "mean_token_accuracy": 0.957783043384552, "num_tokens": 701252.0, "step": 380 }, { "epoch": 1.4102564102564101, "grad_norm": 0.93359375, "learning_rate": 8.530653243624211e-05, "loss": 0.2077, "mean_token_accuracy": 0.9389472723007202, "num_tokens": 709727.0, "step": 385 }, { "epoch": 1.4285714285714286, "grad_norm": 11.875, "learning_rate": 8.529987246336146e-05, "loss": 0.0909, "mean_token_accuracy": 0.9703719019889832, "num_tokens": 719138.0, "step": 390 }, { "epoch": 1.4468864468864469, "grad_norm": 3.4375, "learning_rate": 8.529291959288849e-05, "loss": 0.0696, "mean_token_accuracy": 0.9730043172836303, "num_tokens": 728302.0, "step": 395 }, { "epoch": 1.4652014652014653, "grad_norm": 0.93359375, "learning_rate": 8.528567388858653e-05, "loss": 0.1001, "mean_token_accuracy": 0.962419056892395, "num_tokens": 737476.0, "step": 400 }, { "epoch": 1.4835164835164836, "grad_norm": 0.7109375, "learning_rate": 8.527813541690442e-05, "loss": 0.1422, "mean_token_accuracy": 0.9559484243392944, "num_tokens": 746604.0, "step": 405 }, { "epoch": 1.5018315018315018, "grad_norm": 0.62890625, "learning_rate": 8.527030424697596e-05, "loss": 0.1023, "mean_token_accuracy": 0.9671313047409058, "num_tokens": 756047.0, "step": 410 }, { "epoch": 1.52014652014652, "grad_norm": 0.54296875, "learning_rate": 8.526218045061917e-05, "loss": 0.1209, "mean_token_accuracy": 0.9592770576477051, "num_tokens": 764805.0, "step": 415 }, { "epoch": 1.5384615384615383, "grad_norm": 1.1796875, "learning_rate": 8.525376410233573e-05, "loss": 0.1895, "mean_token_accuracy": 0.942843246459961, "num_tokens": 773770.0, "step": 420 }, { "epoch": 1.5567765567765568, "grad_norm": 0.90625, "learning_rate": 8.524505527931021e-05, "loss": 0.1104, "mean_token_accuracy": 0.9629818797111511, "num_tokens": 782555.0, "step": 425 }, { "epoch": 1.575091575091575, "grad_norm": 1.953125, "learning_rate": 8.523605406140945e-05, "loss": 0.079, "mean_token_accuracy": 0.9723729610443115, "num_tokens": 791364.0, "step": 430 }, { "epoch": 1.5934065934065935, "grad_norm": 0.765625, "learning_rate": 8.522676053118176e-05, "loss": 0.1355, "mean_token_accuracy": 0.9603265643119812, "num_tokens": 801577.0, "step": 435 }, { "epoch": 1.6117216117216118, "grad_norm": 38.0, "learning_rate": 8.521717477385618e-05, "loss": 0.0925, "mean_token_accuracy": 0.9714651226997375, "num_tokens": 810680.0, "step": 440 }, { "epoch": 1.63003663003663, "grad_norm": 54.0, "learning_rate": 8.520729687734172e-05, "loss": 0.4008, "mean_token_accuracy": 0.9063192009925842, "num_tokens": 819733.0, "step": 445 }, { "epoch": 1.6483516483516483, "grad_norm": 17.0, "learning_rate": 8.519712693222653e-05, "loss": 0.2733, "mean_token_accuracy": 0.9280066013336181, "num_tokens": 828640.0, "step": 450 }, { "epoch": 1.6666666666666665, "grad_norm": 0.26171875, "learning_rate": 8.518666503177708e-05, "loss": 0.3508, "mean_token_accuracy": 0.912005627155304, "num_tokens": 837843.0, "step": 455 }, { "epoch": 1.684981684981685, "grad_norm": 0.73046875, "learning_rate": 8.517591127193731e-05, "loss": 0.0529, "mean_token_accuracy": 0.9821884870529175, "num_tokens": 847611.0, "step": 460 }, { "epoch": 1.7032967032967035, "grad_norm": 6.09375, "learning_rate": 8.516486575132771e-05, "loss": 0.1331, "mean_token_accuracy": 0.959692919254303, "num_tokens": 856321.0, "step": 465 }, { "epoch": 1.7216117216117217, "grad_norm": 5.0, "learning_rate": 8.515352857124449e-05, "loss": 0.0689, "mean_token_accuracy": 0.9763989567756652, "num_tokens": 865828.0, "step": 470 }, { "epoch": 1.73992673992674, "grad_norm": 1.5703125, "learning_rate": 8.514189983565859e-05, "loss": 0.0946, "mean_token_accuracy": 0.9694816589355468, "num_tokens": 875232.0, "step": 475 }, { "epoch": 1.7582417582417582, "grad_norm": 21.25, "learning_rate": 8.512997965121474e-05, "loss": 0.0833, "mean_token_accuracy": 0.9721729278564453, "num_tokens": 884274.0, "step": 480 }, { "epoch": 1.7765567765567765, "grad_norm": 0.7265625, "learning_rate": 8.511776812723049e-05, "loss": 0.0723, "mean_token_accuracy": 0.9744561910629272, "num_tokens": 893656.0, "step": 485 }, { "epoch": 1.7948717948717947, "grad_norm": 0.6015625, "learning_rate": 8.510526537569522e-05, "loss": 0.0605, "mean_token_accuracy": 0.9765355348587036, "num_tokens": 902461.0, "step": 490 }, { "epoch": 1.8131868131868132, "grad_norm": 1.6328125, "learning_rate": 8.509247151126907e-05, "loss": 0.097, "mean_token_accuracy": 0.9699956893920898, "num_tokens": 911366.0, "step": 495 }, { "epoch": 1.8315018315018317, "grad_norm": 0.5625, "learning_rate": 8.507938665128194e-05, "loss": 0.0759, "mean_token_accuracy": 0.9745470285415649, "num_tokens": 920856.0, "step": 500 }, { "epoch": 1.84981684981685, "grad_norm": 2.125, "learning_rate": 8.506601091573238e-05, "loss": 0.1981, "mean_token_accuracy": 0.9415134191513062, "num_tokens": 929641.0, "step": 505 }, { "epoch": 1.8681318681318682, "grad_norm": 0.5625, "learning_rate": 8.505234442728651e-05, "loss": 0.1232, "mean_token_accuracy": 0.9627613186836242, "num_tokens": 939594.0, "step": 510 }, { "epoch": 1.8864468864468864, "grad_norm": 35.5, "learning_rate": 8.503838731127686e-05, "loss": 0.1638, "mean_token_accuracy": 0.9507665157318115, "num_tokens": 948528.0, "step": 515 }, { "epoch": 1.9047619047619047, "grad_norm": 1.1640625, "learning_rate": 8.502413969570129e-05, "loss": 0.1643, "mean_token_accuracy": 0.9527613878250122, "num_tokens": 957817.0, "step": 520 }, { "epoch": 1.9230769230769231, "grad_norm": 0.18359375, "learning_rate": 8.500960171122171e-05, "loss": 0.1136, "mean_token_accuracy": 0.9619654774665832, "num_tokens": 966584.0, "step": 525 }, { "epoch": 1.9413919413919414, "grad_norm": 2.75, "learning_rate": 8.4994773491163e-05, "loss": 0.1588, "mean_token_accuracy": 0.9544906854629517, "num_tokens": 975280.0, "step": 530 }, { "epoch": 1.9597069597069599, "grad_norm": 1.2734375, "learning_rate": 8.497965517151172e-05, "loss": 0.2298, "mean_token_accuracy": 0.9354098320007325, "num_tokens": 984056.0, "step": 535 }, { "epoch": 1.978021978021978, "grad_norm": 0.50390625, "learning_rate": 8.49642468909148e-05, "loss": 0.0629, "mean_token_accuracy": 0.97831951379776, "num_tokens": 993635.0, "step": 540 }, { "epoch": 1.9963369963369964, "grad_norm": 1.140625, "learning_rate": 8.494854879067847e-05, "loss": 0.1468, "mean_token_accuracy": 0.9564722418785095, "num_tokens": 1003151.0, "step": 545 }, { "epoch": 2.0146520146520146, "grad_norm": 0.73046875, "learning_rate": 8.493256101476675e-05, "loss": 0.1093, "mean_token_accuracy": 0.9693841338157654, "num_tokens": 1011069.0, "step": 550 }, { "epoch": 2.032967032967033, "grad_norm": 0.5390625, "learning_rate": 8.491628370980023e-05, "loss": 0.1015, "mean_token_accuracy": 0.9635228157043457, "num_tokens": 1019386.0, "step": 555 }, { "epoch": 2.051282051282051, "grad_norm": 0.23828125, "learning_rate": 8.489971702505472e-05, "loss": 0.105, "mean_token_accuracy": 0.969475531578064, "num_tokens": 1028915.0, "step": 560 }, { "epoch": 2.06959706959707, "grad_norm": 0.41796875, "learning_rate": 8.488286111245985e-05, "loss": 0.0659, "mean_token_accuracy": 0.9784932613372803, "num_tokens": 1038671.0, "step": 565 }, { "epoch": 2.087912087912088, "grad_norm": 0.64453125, "learning_rate": 8.486571612659775e-05, "loss": 0.1141, "mean_token_accuracy": 0.9647136211395264, "num_tokens": 1048771.0, "step": 570 }, { "epoch": 2.1062271062271063, "grad_norm": 1.1015625, "learning_rate": 8.484828222470152e-05, "loss": 0.0762, "mean_token_accuracy": 0.9740150094032287, "num_tokens": 1058283.0, "step": 575 }, { "epoch": 2.1245421245421245, "grad_norm": 7.71875, "learning_rate": 8.48305595666539e-05, "loss": 0.1661, "mean_token_accuracy": 0.9484933137893676, "num_tokens": 1067021.0, "step": 580 }, { "epoch": 2.142857142857143, "grad_norm": 0.21484375, "learning_rate": 8.481254831498573e-05, "loss": 0.0481, "mean_token_accuracy": 0.9831829905509949, "num_tokens": 1076203.0, "step": 585 }, { "epoch": 2.161172161172161, "grad_norm": 0.88671875, "learning_rate": 8.479424863487448e-05, "loss": 0.0859, "mean_token_accuracy": 0.9719721555709839, "num_tokens": 1085924.0, "step": 590 }, { "epoch": 2.1794871794871793, "grad_norm": 0.1650390625, "learning_rate": 8.477566069414271e-05, "loss": 0.1872, "mean_token_accuracy": 0.9515769362449646, "num_tokens": 1095974.0, "step": 595 }, { "epoch": 2.197802197802198, "grad_norm": 83.0, "learning_rate": 8.475678466325665e-05, "loss": 0.3268, "mean_token_accuracy": 0.9221652507781982, "num_tokens": 1105194.0, "step": 600 }, { "epoch": 2.2161172161172162, "grad_norm": 0.9765625, "learning_rate": 8.473762071532443e-05, "loss": 0.0572, "mean_token_accuracy": 0.9784234523773193, "num_tokens": 1113857.0, "step": 605 }, { "epoch": 2.2344322344322345, "grad_norm": 0.6171875, "learning_rate": 8.471816902609471e-05, "loss": 0.0721, "mean_token_accuracy": 0.975789201259613, "num_tokens": 1123835.0, "step": 610 }, { "epoch": 2.2527472527472527, "grad_norm": 0.43359375, "learning_rate": 8.46984297739549e-05, "loss": 0.0894, "mean_token_accuracy": 0.968826174736023, "num_tokens": 1132971.0, "step": 615 }, { "epoch": 2.271062271062271, "grad_norm": 0.3515625, "learning_rate": 8.467840313992962e-05, "loss": 0.044, "mean_token_accuracy": 0.9843294978141784, "num_tokens": 1142825.0, "step": 620 }, { "epoch": 2.2893772893772892, "grad_norm": 1.09375, "learning_rate": 8.465808930767897e-05, "loss": 0.1133, "mean_token_accuracy": 0.9606971025466919, "num_tokens": 1152228.0, "step": 625 }, { "epoch": 2.3076923076923075, "grad_norm": 0.76171875, "learning_rate": 8.463748846349694e-05, "loss": 0.0924, "mean_token_accuracy": 0.968487274646759, "num_tokens": 1161178.0, "step": 630 }, { "epoch": 2.326007326007326, "grad_norm": 0.6875, "learning_rate": 8.461660079630962e-05, "loss": 0.1203, "mean_token_accuracy": 0.9595974802970886, "num_tokens": 1170764.0, "step": 635 }, { "epoch": 2.3443223443223444, "grad_norm": 0.6171875, "learning_rate": 8.45954264976735e-05, "loss": 0.0458, "mean_token_accuracy": 0.9816165566444397, "num_tokens": 1180611.0, "step": 640 }, { "epoch": 2.3626373626373627, "grad_norm": 1.2734375, "learning_rate": 8.457396576177369e-05, "loss": 0.109, "mean_token_accuracy": 0.9632077097892762, "num_tokens": 1188843.0, "step": 645 }, { "epoch": 2.380952380952381, "grad_norm": 0.5625, "learning_rate": 8.455221878542219e-05, "loss": 0.0454, "mean_token_accuracy": 0.982891297340393, "num_tokens": 1198270.0, "step": 650 }, { "epoch": 2.399267399267399, "grad_norm": 0.447265625, "learning_rate": 8.453018576805604e-05, "loss": 0.0807, "mean_token_accuracy": 0.9707582116127014, "num_tokens": 1207459.0, "step": 655 }, { "epoch": 2.4175824175824174, "grad_norm": 0.796875, "learning_rate": 8.450786691173547e-05, "loss": 0.1203, "mean_token_accuracy": 0.9592945575714111, "num_tokens": 1217120.0, "step": 660 }, { "epoch": 2.435897435897436, "grad_norm": 0.8125, "learning_rate": 8.448526242114215e-05, "loss": 0.0962, "mean_token_accuracy": 0.9632490515708924, "num_tokens": 1226485.0, "step": 665 }, { "epoch": 2.4542124542124544, "grad_norm": 306.0, "learning_rate": 8.446237250357716e-05, "loss": 0.2642, "mean_token_accuracy": 0.9311501502990722, "num_tokens": 1234436.0, "step": 670 }, { "epoch": 2.4725274725274726, "grad_norm": 0.6875, "learning_rate": 8.443919736895923e-05, "loss": 0.0925, "mean_token_accuracy": 0.9679561376571655, "num_tokens": 1243656.0, "step": 675 }, { "epoch": 2.490842490842491, "grad_norm": 0.326171875, "learning_rate": 8.441573722982275e-05, "loss": 0.1172, "mean_token_accuracy": 0.9645622253417969, "num_tokens": 1252230.0, "step": 680 }, { "epoch": 2.509157509157509, "grad_norm": 4.59375, "learning_rate": 8.439199230131578e-05, "loss": 0.2658, "mean_token_accuracy": 0.9416054487228394, "num_tokens": 1261738.0, "step": 685 }, { "epoch": 2.5274725274725274, "grad_norm": 16.25, "learning_rate": 8.436796280119821e-05, "loss": 0.1691, "mean_token_accuracy": 0.959836196899414, "num_tokens": 1270639.0, "step": 690 }, { "epoch": 2.5457875457875456, "grad_norm": 2.140625, "learning_rate": 8.43436489498396e-05, "loss": 0.1439, "mean_token_accuracy": 0.9541051030158997, "num_tokens": 1279566.0, "step": 695 }, { "epoch": 2.564102564102564, "grad_norm": 184.0, "learning_rate": 8.431905097021727e-05, "loss": 0.0963, "mean_token_accuracy": 0.9705996751785279, "num_tokens": 1288968.0, "step": 700 }, { "epoch": 2.5824175824175826, "grad_norm": 0.984375, "learning_rate": 8.429416908791423e-05, "loss": 0.106, "mean_token_accuracy": 0.9661186218261719, "num_tokens": 1298605.0, "step": 705 }, { "epoch": 2.600732600732601, "grad_norm": 0.84375, "learning_rate": 8.426900353111708e-05, "loss": 0.1213, "mean_token_accuracy": 0.9659365892410279, "num_tokens": 1306963.0, "step": 710 }, { "epoch": 2.619047619047619, "grad_norm": 0.310546875, "learning_rate": 8.424355453061395e-05, "loss": 0.11, "mean_token_accuracy": 0.9625479221343994, "num_tokens": 1315486.0, "step": 715 }, { "epoch": 2.6373626373626373, "grad_norm": 0.7265625, "learning_rate": 8.421782231979236e-05, "loss": 0.095, "mean_token_accuracy": 0.9687173247337342, "num_tokens": 1325156.0, "step": 720 }, { "epoch": 2.6556776556776556, "grad_norm": 0.671875, "learning_rate": 8.419180713463716e-05, "loss": 0.0597, "mean_token_accuracy": 0.9778618574142456, "num_tokens": 1334588.0, "step": 725 }, { "epoch": 2.6739926739926743, "grad_norm": 0.56640625, "learning_rate": 8.416550921372818e-05, "loss": 0.0961, "mean_token_accuracy": 0.965964937210083, "num_tokens": 1343414.0, "step": 730 }, { "epoch": 2.6923076923076925, "grad_norm": 1.609375, "learning_rate": 8.413892879823828e-05, "loss": 0.1179, "mean_token_accuracy": 0.9629538536071778, "num_tokens": 1353270.0, "step": 735 }, { "epoch": 2.7106227106227108, "grad_norm": 0.2275390625, "learning_rate": 8.411206613193094e-05, "loss": 0.0733, "mean_token_accuracy": 0.9732595682144165, "num_tokens": 1362123.0, "step": 740 }, { "epoch": 2.728937728937729, "grad_norm": 0.228515625, "learning_rate": 8.408492146115815e-05, "loss": 0.1294, "mean_token_accuracy": 0.9595796465873718, "num_tokens": 1371808.0, "step": 745 }, { "epoch": 2.7472527472527473, "grad_norm": 0.185546875, "learning_rate": 8.405749503485807e-05, "loss": 0.0698, "mean_token_accuracy": 0.9742272734642029, "num_tokens": 1380865.0, "step": 750 }, { "epoch": 2.7655677655677655, "grad_norm": 1.078125, "learning_rate": 8.402978710455282e-05, "loss": 0.0939, "mean_token_accuracy": 0.9644173741340637, "num_tokens": 1389329.0, "step": 755 }, { "epoch": 2.7838827838827838, "grad_norm": 0.67578125, "learning_rate": 8.400179792434609e-05, "loss": 0.0986, "mean_token_accuracy": 0.9603239297866821, "num_tokens": 1397560.0, "step": 760 }, { "epoch": 2.802197802197802, "grad_norm": 1.0859375, "learning_rate": 8.397352775092089e-05, "loss": 0.1284, "mean_token_accuracy": 0.9567705154418945, "num_tokens": 1406742.0, "step": 765 }, { "epoch": 2.8205128205128203, "grad_norm": 0.1708984375, "learning_rate": 8.394497684353717e-05, "loss": 0.1174, "mean_token_accuracy": 0.9635369896888732, "num_tokens": 1416577.0, "step": 770 }, { "epoch": 2.838827838827839, "grad_norm": 1.09375, "learning_rate": 8.391614546402936e-05, "loss": 0.1623, "mean_token_accuracy": 0.947747004032135, "num_tokens": 1424952.0, "step": 775 }, { "epoch": 2.857142857142857, "grad_norm": 0.6171875, "learning_rate": 8.388703387680416e-05, "loss": 0.0449, "mean_token_accuracy": 0.9851283431053162, "num_tokens": 1434524.0, "step": 780 }, { "epoch": 2.8754578754578755, "grad_norm": 0.93359375, "learning_rate": 8.385764234883788e-05, "loss": 0.1116, "mean_token_accuracy": 0.9607040166854859, "num_tokens": 1443089.0, "step": 785 }, { "epoch": 2.8937728937728937, "grad_norm": 0.59765625, "learning_rate": 8.382797114967418e-05, "loss": 0.0757, "mean_token_accuracy": 0.9740386247634888, "num_tokens": 1452624.0, "step": 790 }, { "epoch": 2.912087912087912, "grad_norm": 0.76953125, "learning_rate": 8.379802055142151e-05, "loss": 0.0641, "mean_token_accuracy": 0.9792454838752747, "num_tokens": 1462735.0, "step": 795 }, { "epoch": 2.9304029304029307, "grad_norm": 0.8671875, "learning_rate": 8.376779082875063e-05, "loss": 0.0948, "mean_token_accuracy": 0.9742291688919067, "num_tokens": 1472169.0, "step": 800 }, { "epoch": 2.948717948717949, "grad_norm": 1.0, "learning_rate": 8.37372822588921e-05, "loss": 0.1112, "mean_token_accuracy": 0.9607684016227722, "num_tokens": 1481405.0, "step": 805 }, { "epoch": 2.967032967032967, "grad_norm": 0.55859375, "learning_rate": 8.370649512163369e-05, "loss": 0.1755, "mean_token_accuracy": 0.9431592702865601, "num_tokens": 1490690.0, "step": 810 }, { "epoch": 2.9853479853479854, "grad_norm": 0.1259765625, "learning_rate": 8.367542969931792e-05, "loss": 0.0996, "mean_token_accuracy": 0.9651033759117127, "num_tokens": 1499659.0, "step": 815 }, { "epoch": 3.0036630036630036, "grad_norm": 0.1181640625, "learning_rate": 8.364408627683935e-05, "loss": 0.0761, "mean_token_accuracy": 0.9716196894645691, "num_tokens": 1507916.0, "step": 820 }, { "epoch": 3.021978021978022, "grad_norm": 0.080078125, "learning_rate": 8.361246514164205e-05, "loss": 0.0558, "mean_token_accuracy": 0.9798445224761962, "num_tokens": 1517236.0, "step": 825 }, { "epoch": 3.04029304029304, "grad_norm": 0.30078125, "learning_rate": 8.358056658371692e-05, "loss": 0.0628, "mean_token_accuracy": 0.9788961172103882, "num_tokens": 1526056.0, "step": 830 }, { "epoch": 3.0586080586080584, "grad_norm": 0.8359375, "learning_rate": 8.35483908955991e-05, "loss": 0.064, "mean_token_accuracy": 0.9767962694168091, "num_tokens": 1535105.0, "step": 835 }, { "epoch": 3.076923076923077, "grad_norm": 0.71875, "learning_rate": 8.351593837236514e-05, "loss": 0.0672, "mean_token_accuracy": 0.9740965247154236, "num_tokens": 1543572.0, "step": 840 }, { "epoch": 3.0952380952380953, "grad_norm": 0.765625, "learning_rate": 8.348320931163043e-05, "loss": 0.1008, "mean_token_accuracy": 0.962606143951416, "num_tokens": 1553371.0, "step": 845 }, { "epoch": 3.1135531135531136, "grad_norm": 1.4765625, "learning_rate": 8.345020401354646e-05, "loss": 0.0652, "mean_token_accuracy": 0.9775573253631592, "num_tokens": 1563374.0, "step": 850 }, { "epoch": 3.131868131868132, "grad_norm": 1.03125, "learning_rate": 8.341692278079804e-05, "loss": 0.0701, "mean_token_accuracy": 0.9748265624046326, "num_tokens": 1572311.0, "step": 855 }, { "epoch": 3.15018315018315, "grad_norm": 0.447265625, "learning_rate": 8.338336591860042e-05, "loss": 0.0616, "mean_token_accuracy": 0.9770539045333863, "num_tokens": 1581662.0, "step": 860 }, { "epoch": 3.1684981684981683, "grad_norm": 0.8046875, "learning_rate": 8.334953373469673e-05, "loss": 0.0847, "mean_token_accuracy": 0.9684791564941406, "num_tokens": 1590875.0, "step": 865 }, { "epoch": 3.186813186813187, "grad_norm": 0.7265625, "learning_rate": 8.331542653935491e-05, "loss": 0.0618, "mean_token_accuracy": 0.9756833434104919, "num_tokens": 1599508.0, "step": 870 }, { "epoch": 3.2051282051282053, "grad_norm": 0.484375, "learning_rate": 8.328104464536502e-05, "loss": 0.0538, "mean_token_accuracy": 0.9818952322006226, "num_tokens": 1609055.0, "step": 875 }, { "epoch": 3.2234432234432235, "grad_norm": 1.578125, "learning_rate": 8.324638836803633e-05, "loss": 0.066, "mean_token_accuracy": 0.9749167203903198, "num_tokens": 1618211.0, "step": 880 }, { "epoch": 3.241758241758242, "grad_norm": 0.482421875, "learning_rate": 8.32114580251944e-05, "loss": 0.0943, "mean_token_accuracy": 0.9674638390541077, "num_tokens": 1627192.0, "step": 885 }, { "epoch": 3.26007326007326, "grad_norm": 0.09521484375, "learning_rate": 8.317625393717823e-05, "loss": 0.0424, "mean_token_accuracy": 0.9843096375465393, "num_tokens": 1636468.0, "step": 890 }, { "epoch": 3.2783882783882783, "grad_norm": 1.1953125, "learning_rate": 8.314077642683719e-05, "loss": 0.0866, "mean_token_accuracy": 0.971860671043396, "num_tokens": 1645884.0, "step": 895 }, { "epoch": 3.2967032967032965, "grad_norm": 0.796875, "learning_rate": 8.310502581952828e-05, "loss": 0.0579, "mean_token_accuracy": 0.9784857869148255, "num_tokens": 1655337.0, "step": 900 }, { "epoch": 3.315018315018315, "grad_norm": 0.48828125, "learning_rate": 8.306900244311288e-05, "loss": 0.1142, "mean_token_accuracy": 0.961796247959137, "num_tokens": 1663976.0, "step": 905 }, { "epoch": 3.3333333333333335, "grad_norm": 0.287109375, "learning_rate": 8.303270662795399e-05, "loss": 0.075, "mean_token_accuracy": 0.9726075410842896, "num_tokens": 1673433.0, "step": 910 }, { "epoch": 3.3516483516483517, "grad_norm": 0.7578125, "learning_rate": 8.299613870691302e-05, "loss": 0.0939, "mean_token_accuracy": 0.9688026666641235, "num_tokens": 1683030.0, "step": 915 }, { "epoch": 3.36996336996337, "grad_norm": 0.373046875, "learning_rate": 8.295929901534686e-05, "loss": 0.0319, "mean_token_accuracy": 0.9874111294746399, "num_tokens": 1693029.0, "step": 920 }, { "epoch": 3.3882783882783882, "grad_norm": 0.2392578125, "learning_rate": 8.29221878911047e-05, "loss": 0.0532, "mean_token_accuracy": 0.9800354242324829, "num_tokens": 1703258.0, "step": 925 }, { "epoch": 3.4065934065934065, "grad_norm": 0.73046875, "learning_rate": 8.288480567452501e-05, "loss": 0.1188, "mean_token_accuracy": 0.9611821174621582, "num_tokens": 1712754.0, "step": 930 }, { "epoch": 3.4249084249084247, "grad_norm": 0.99609375, "learning_rate": 8.284715270843238e-05, "loss": 0.0829, "mean_token_accuracy": 0.9708463668823242, "num_tokens": 1721472.0, "step": 935 }, { "epoch": 3.4432234432234434, "grad_norm": 10.3125, "learning_rate": 8.280922933813442e-05, "loss": 0.04, "mean_token_accuracy": 0.9824108600616455, "num_tokens": 1730959.0, "step": 940 }, { "epoch": 3.4615384615384617, "grad_norm": 0.6015625, "learning_rate": 8.277103591141852e-05, "loss": 0.0678, "mean_token_accuracy": 0.9735846400260926, "num_tokens": 1739674.0, "step": 945 }, { "epoch": 3.47985347985348, "grad_norm": 0.39453125, "learning_rate": 8.273257277854872e-05, "loss": 0.0424, "mean_token_accuracy": 0.9842739105224609, "num_tokens": 1749137.0, "step": 950 }, { "epoch": 3.498168498168498, "grad_norm": 0.1630859375, "learning_rate": 8.269384029226248e-05, "loss": 0.0285, "mean_token_accuracy": 0.9885275959968567, "num_tokens": 1758530.0, "step": 955 }, { "epoch": 3.5164835164835164, "grad_norm": 0.0927734375, "learning_rate": 8.265483880776745e-05, "loss": 0.0741, "mean_token_accuracy": 0.9747227191925049, "num_tokens": 1767672.0, "step": 960 }, { "epoch": 3.5347985347985347, "grad_norm": 0.44921875, "learning_rate": 8.26155686827382e-05, "loss": 0.068, "mean_token_accuracy": 0.975150191783905, "num_tokens": 1776694.0, "step": 965 }, { "epoch": 3.553113553113553, "grad_norm": 2.25, "learning_rate": 8.257603027731291e-05, "loss": 0.0536, "mean_token_accuracy": 0.9809759497642517, "num_tokens": 1785904.0, "step": 970 }, { "epoch": 3.571428571428571, "grad_norm": 0.53125, "learning_rate": 8.253622395409019e-05, "loss": 0.0555, "mean_token_accuracy": 0.9794698238372803, "num_tokens": 1795028.0, "step": 975 }, { "epoch": 3.58974358974359, "grad_norm": 0.9609375, "learning_rate": 8.24961500781256e-05, "loss": 0.1048, "mean_token_accuracy": 0.9619524002075195, "num_tokens": 1802957.0, "step": 980 }, { "epoch": 3.608058608058608, "grad_norm": 0.337890625, "learning_rate": 8.24558090169284e-05, "loss": 0.0801, "mean_token_accuracy": 0.9719898581504822, "num_tokens": 1811233.0, "step": 985 }, { "epoch": 3.6263736263736264, "grad_norm": 0.30859375, "learning_rate": 8.241520114045813e-05, "loss": 0.0932, "mean_token_accuracy": 0.9668406844139099, "num_tokens": 1820206.0, "step": 990 }, { "epoch": 3.6446886446886446, "grad_norm": 0.291015625, "learning_rate": 8.237432682112127e-05, "loss": 0.0814, "mean_token_accuracy": 0.968066930770874, "num_tokens": 1828757.0, "step": 995 }, { "epoch": 3.663003663003663, "grad_norm": 0.51171875, "learning_rate": 8.233318643376773e-05, "loss": 0.0786, "mean_token_accuracy": 0.972130823135376, "num_tokens": 1837693.0, "step": 1000 }, { "epoch": 3.6813186813186816, "grad_norm": 0.138671875, "learning_rate": 8.229178035568755e-05, "loss": 0.0772, "mean_token_accuracy": 0.9723419427871705, "num_tokens": 1847020.0, "step": 1005 }, { "epoch": 3.6996336996337, "grad_norm": 1.1796875, "learning_rate": 8.225010896660734e-05, "loss": 0.1051, "mean_token_accuracy": 0.9616187572479248, "num_tokens": 1855982.0, "step": 1010 }, { "epoch": 3.717948717948718, "grad_norm": 0.44140625, "learning_rate": 8.220817264868678e-05, "loss": 0.0785, "mean_token_accuracy": 0.9704046010971069, "num_tokens": 1865186.0, "step": 1015 }, { "epoch": 3.7362637362637363, "grad_norm": 0.82421875, "learning_rate": 8.216597178651523e-05, "loss": 0.0473, "mean_token_accuracy": 0.9826258182525635, "num_tokens": 1874733.0, "step": 1020 }, { "epoch": 3.7545787545787546, "grad_norm": 1.078125, "learning_rate": 8.212350676710807e-05, "loss": 0.0746, "mean_token_accuracy": 0.9718662738800049, "num_tokens": 1884155.0, "step": 1025 }, { "epoch": 3.772893772893773, "grad_norm": 0.58984375, "learning_rate": 8.208077797990322e-05, "loss": 0.0739, "mean_token_accuracy": 0.9724728226661682, "num_tokens": 1892962.0, "step": 1030 }, { "epoch": 3.791208791208791, "grad_norm": 0.5234375, "learning_rate": 8.203778581675761e-05, "loss": 0.0665, "mean_token_accuracy": 0.9769334554672241, "num_tokens": 1902461.0, "step": 1035 }, { "epoch": 3.8095238095238093, "grad_norm": 0.6640625, "learning_rate": 8.199453067194351e-05, "loss": 0.0952, "mean_token_accuracy": 0.9705726265907287, "num_tokens": 1911844.0, "step": 1040 }, { "epoch": 3.8278388278388276, "grad_norm": 0.82421875, "learning_rate": 8.195101294214486e-05, "loss": 0.0615, "mean_token_accuracy": 0.9792343139648437, "num_tokens": 1921110.0, "step": 1045 }, { "epoch": 3.8461538461538463, "grad_norm": 0.546875, "learning_rate": 8.190723302645387e-05, "loss": 0.0671, "mean_token_accuracy": 0.9760551929473877, "num_tokens": 1930834.0, "step": 1050 }, { "epoch": 3.8644688644688645, "grad_norm": 0.70703125, "learning_rate": 8.186319132636706e-05, "loss": 0.0888, "mean_token_accuracy": 0.9672855019569397, "num_tokens": 1939564.0, "step": 1055 }, { "epoch": 3.8827838827838828, "grad_norm": 0.125, "learning_rate": 8.18188882457818e-05, "loss": 0.0718, "mean_token_accuracy": 0.9734614849090576, "num_tokens": 1948652.0, "step": 1060 }, { "epoch": 3.901098901098901, "grad_norm": 0.52734375, "learning_rate": 8.177432419099249e-05, "loss": 0.0496, "mean_token_accuracy": 0.9841477632522583, "num_tokens": 1958891.0, "step": 1065 }, { "epoch": 3.9194139194139193, "grad_norm": 0.482421875, "learning_rate": 8.172949957068689e-05, "loss": 0.0773, "mean_token_accuracy": 0.9700749635696411, "num_tokens": 1968507.0, "step": 1070 }, { "epoch": 3.937728937728938, "grad_norm": 0.90234375, "learning_rate": 8.168441479594237e-05, "loss": 0.0839, "mean_token_accuracy": 0.9697647333145142, "num_tokens": 1977929.0, "step": 1075 }, { "epoch": 3.956043956043956, "grad_norm": 0.63671875, "learning_rate": 8.163907028022208e-05, "loss": 0.0534, "mean_token_accuracy": 0.9822108268737793, "num_tokens": 1987374.0, "step": 1080 }, { "epoch": 3.9743589743589745, "grad_norm": 0.490234375, "learning_rate": 8.159346643937122e-05, "loss": 0.0895, "mean_token_accuracy": 0.9678827285766601, "num_tokens": 1995512.0, "step": 1085 }, { "epoch": 3.9926739926739927, "grad_norm": 1.046875, "learning_rate": 8.154760369161322e-05, "loss": 0.0842, "mean_token_accuracy": 0.9745811820030212, "num_tokens": 2005014.0, "step": 1090 }, { "epoch": 4.010989010989011, "grad_norm": 5.25, "learning_rate": 8.150148245754586e-05, "loss": 0.0784, "mean_token_accuracy": 0.9694916486740113, "num_tokens": 2013958.0, "step": 1095 }, { "epoch": 4.029304029304029, "grad_norm": 0.482421875, "learning_rate": 8.145510316013748e-05, "loss": 0.0379, "mean_token_accuracy": 0.9864168405532837, "num_tokens": 2023416.0, "step": 1100 }, { "epoch": 4.0476190476190474, "grad_norm": 0.11328125, "learning_rate": 8.140846622472304e-05, "loss": 0.0336, "mean_token_accuracy": 0.9863126277923584, "num_tokens": 2032892.0, "step": 1105 }, { "epoch": 4.065934065934066, "grad_norm": 0.1259765625, "learning_rate": 8.13615720790003e-05, "loss": 0.0572, "mean_token_accuracy": 0.9810018301010132, "num_tokens": 2042824.0, "step": 1110 }, { "epoch": 4.084249084249084, "grad_norm": 1.078125, "learning_rate": 8.131442115302573e-05, "loss": 0.0579, "mean_token_accuracy": 0.9789334416389466, "num_tokens": 2052256.0, "step": 1115 }, { "epoch": 4.102564102564102, "grad_norm": 2.3125, "learning_rate": 8.12670138792108e-05, "loss": 0.0372, "mean_token_accuracy": 0.9866159200668335, "num_tokens": 2061743.0, "step": 1120 }, { "epoch": 4.1208791208791204, "grad_norm": 0.3828125, "learning_rate": 8.121935069231779e-05, "loss": 0.0484, "mean_token_accuracy": 0.9815837264060974, "num_tokens": 2069937.0, "step": 1125 }, { "epoch": 4.13919413919414, "grad_norm": 1.015625, "learning_rate": 8.1171432029456e-05, "loss": 0.0687, "mean_token_accuracy": 0.9753403425216675, "num_tokens": 2079121.0, "step": 1130 }, { "epoch": 4.157509157509158, "grad_norm": 0.77734375, "learning_rate": 8.11232583300776e-05, "loss": 0.06, "mean_token_accuracy": 0.9811612010002136, "num_tokens": 2088105.0, "step": 1135 }, { "epoch": 4.175824175824176, "grad_norm": 1.5234375, "learning_rate": 8.107483003597365e-05, "loss": 0.0537, "mean_token_accuracy": 0.9808408856391907, "num_tokens": 2096831.0, "step": 1140 }, { "epoch": 4.194139194139194, "grad_norm": 0.1767578125, "learning_rate": 8.102614759127002e-05, "loss": 0.0222, "mean_token_accuracy": 0.9910707116127014, "num_tokens": 2106634.0, "step": 1145 }, { "epoch": 4.212454212454213, "grad_norm": 0.37109375, "learning_rate": 8.097721144242338e-05, "loss": 0.0617, "mean_token_accuracy": 0.9770854115486145, "num_tokens": 2116094.0, "step": 1150 }, { "epoch": 4.230769230769231, "grad_norm": 0.1806640625, "learning_rate": 8.092802203821708e-05, "loss": 0.0256, "mean_token_accuracy": 0.9892764806747436, "num_tokens": 2125097.0, "step": 1155 }, { "epoch": 4.249084249084249, "grad_norm": 0.130859375, "learning_rate": 8.087857982975698e-05, "loss": 0.0323, "mean_token_accuracy": 0.9884976744651794, "num_tokens": 2134122.0, "step": 1160 }, { "epoch": 4.267399267399267, "grad_norm": 1.625, "learning_rate": 8.082888527046738e-05, "loss": 0.0549, "mean_token_accuracy": 0.9823671579360962, "num_tokens": 2142806.0, "step": 1165 }, { "epoch": 4.285714285714286, "grad_norm": 0.703125, "learning_rate": 8.077893881608685e-05, "loss": 0.0772, "mean_token_accuracy": 0.9735370635986328, "num_tokens": 2151281.0, "step": 1170 }, { "epoch": 4.304029304029304, "grad_norm": 0.97265625, "learning_rate": 8.072874092466398e-05, "loss": 0.065, "mean_token_accuracy": 0.9773920774459839, "num_tokens": 2160764.0, "step": 1175 }, { "epoch": 4.322344322344322, "grad_norm": 0.69140625, "learning_rate": 8.067829205655333e-05, "loss": 0.0588, "mean_token_accuracy": 0.9791547775268554, "num_tokens": 2169484.0, "step": 1180 }, { "epoch": 4.34065934065934, "grad_norm": 0.64453125, "learning_rate": 8.062759267441103e-05, "loss": 0.0444, "mean_token_accuracy": 0.9826448798179627, "num_tokens": 2178294.0, "step": 1185 }, { "epoch": 4.358974358974359, "grad_norm": 0.09423828125, "learning_rate": 8.057664324319065e-05, "loss": 0.0673, "mean_token_accuracy": 0.9759621739387512, "num_tokens": 2187496.0, "step": 1190 }, { "epoch": 4.377289377289378, "grad_norm": 1.2421875, "learning_rate": 8.052544423013895e-05, "loss": 0.0366, "mean_token_accuracy": 0.9850521922111511, "num_tokens": 2196704.0, "step": 1195 }, { "epoch": 4.395604395604396, "grad_norm": 0.73828125, "learning_rate": 8.047399610479149e-05, "loss": 0.0487, "mean_token_accuracy": 0.9822589874267578, "num_tokens": 2205968.0, "step": 1200 }, { "epoch": 4.413919413919414, "grad_norm": 0.9140625, "learning_rate": 8.042229933896844e-05, "loss": 0.0742, "mean_token_accuracy": 0.9758718729019165, "num_tokens": 2215463.0, "step": 1205 }, { "epoch": 4.4322344322344325, "grad_norm": 0.5703125, "learning_rate": 8.037035440677016e-05, "loss": 0.0424, "mean_token_accuracy": 0.9832551598548889, "num_tokens": 2224409.0, "step": 1210 }, { "epoch": 4.450549450549451, "grad_norm": 0.396484375, "learning_rate": 8.03181617845729e-05, "loss": 0.0614, "mean_token_accuracy": 0.9775595307350159, "num_tokens": 2233382.0, "step": 1215 }, { "epoch": 4.468864468864469, "grad_norm": 0.671875, "learning_rate": 8.026572195102447e-05, "loss": 0.0516, "mean_token_accuracy": 0.982862401008606, "num_tokens": 2242990.0, "step": 1220 }, { "epoch": 4.487179487179487, "grad_norm": 0.2255859375, "learning_rate": 8.021303538703972e-05, "loss": 0.0649, "mean_token_accuracy": 0.9760475039482117, "num_tokens": 2251597.0, "step": 1225 }, { "epoch": 4.5054945054945055, "grad_norm": 0.384765625, "learning_rate": 8.01601025757963e-05, "loss": 0.0872, "mean_token_accuracy": 0.9712528467178345, "num_tokens": 2260017.0, "step": 1230 }, { "epoch": 4.523809523809524, "grad_norm": 0.1015625, "learning_rate": 8.010692400273009e-05, "loss": 0.0514, "mean_token_accuracy": 0.9804089546203614, "num_tokens": 2270253.0, "step": 1235 }, { "epoch": 4.542124542124542, "grad_norm": 0.53515625, "learning_rate": 8.00535001555308e-05, "loss": 0.0392, "mean_token_accuracy": 0.9841665983200073, "num_tokens": 2279714.0, "step": 1240 }, { "epoch": 4.56043956043956, "grad_norm": 0.68359375, "learning_rate": 7.999983152413753e-05, "loss": 0.0686, "mean_token_accuracy": 0.9762724280357361, "num_tokens": 2289630.0, "step": 1245 }, { "epoch": 4.5787545787545785, "grad_norm": 1.515625, "learning_rate": 7.994591860073424e-05, "loss": 0.0641, "mean_token_accuracy": 0.9772836685180664, "num_tokens": 2297980.0, "step": 1250 }, { "epoch": 4.597069597069597, "grad_norm": 3.890625, "learning_rate": 7.989176187974522e-05, "loss": 0.0768, "mean_token_accuracy": 0.9789605379104614, "num_tokens": 2307539.0, "step": 1255 }, { "epoch": 4.615384615384615, "grad_norm": 22.25, "learning_rate": 7.983736185783057e-05, "loss": 0.1298, "mean_token_accuracy": 0.9655901789665222, "num_tokens": 2316208.0, "step": 1260 }, { "epoch": 4.633699633699633, "grad_norm": 19.875, "learning_rate": 7.97827190338817e-05, "loss": 0.068, "mean_token_accuracy": 0.977760374546051, "num_tokens": 2325231.0, "step": 1265 }, { "epoch": 4.652014652014652, "grad_norm": 0.404296875, "learning_rate": 7.972783390901666e-05, "loss": 0.0608, "mean_token_accuracy": 0.9790961384773255, "num_tokens": 2334422.0, "step": 1270 }, { "epoch": 4.670329670329671, "grad_norm": 17.25, "learning_rate": 7.967270698657563e-05, "loss": 0.077, "mean_token_accuracy": 0.9756144642829895, "num_tokens": 2343150.0, "step": 1275 }, { "epoch": 4.688644688644689, "grad_norm": 0.578125, "learning_rate": 7.96173387721162e-05, "loss": 0.0358, "mean_token_accuracy": 0.9854479551315307, "num_tokens": 2352172.0, "step": 1280 }, { "epoch": 4.706959706959707, "grad_norm": 0.53125, "learning_rate": 7.95617297734089e-05, "loss": 0.0488, "mean_token_accuracy": 0.9814175963401794, "num_tokens": 2362355.0, "step": 1285 }, { "epoch": 4.725274725274725, "grad_norm": 0.54296875, "learning_rate": 7.950588050043236e-05, "loss": 0.0309, "mean_token_accuracy": 0.9877290248870849, "num_tokens": 2372065.0, "step": 1290 }, { "epoch": 4.743589743589744, "grad_norm": 0.37109375, "learning_rate": 7.944979146536874e-05, "loss": 0.0425, "mean_token_accuracy": 0.9832926988601685, "num_tokens": 2381430.0, "step": 1295 }, { "epoch": 4.761904761904762, "grad_norm": 0.55859375, "learning_rate": 7.939346318259904e-05, "loss": 0.0294, "mean_token_accuracy": 0.9892051696777344, "num_tokens": 2390948.0, "step": 1300 }, { "epoch": 4.78021978021978, "grad_norm": 0.734375, "learning_rate": 7.933689616869828e-05, "loss": 0.0499, "mean_token_accuracy": 0.9788974165916443, "num_tokens": 2399937.0, "step": 1305 }, { "epoch": 4.798534798534798, "grad_norm": 1.2265625, "learning_rate": 7.92800909424309e-05, "loss": 0.059, "mean_token_accuracy": 0.9804568767547608, "num_tokens": 2408310.0, "step": 1310 }, { "epoch": 4.816849816849817, "grad_norm": 0.53515625, "learning_rate": 7.922304802474593e-05, "loss": 0.0689, "mean_token_accuracy": 0.9761590838432312, "num_tokens": 2417776.0, "step": 1315 }, { "epoch": 4.835164835164835, "grad_norm": 0.1728515625, "learning_rate": 7.916576793877218e-05, "loss": 0.096, "mean_token_accuracy": 0.9676541090011597, "num_tokens": 2427560.0, "step": 1320 }, { "epoch": 4.853479853479853, "grad_norm": 0.142578125, "learning_rate": 7.91082512098135e-05, "loss": 0.0481, "mean_token_accuracy": 0.9823224782943726, "num_tokens": 2437579.0, "step": 1325 }, { "epoch": 4.871794871794872, "grad_norm": 0.123046875, "learning_rate": 7.905049836534396e-05, "loss": 0.0371, "mean_token_accuracy": 0.9855931043624878, "num_tokens": 2446578.0, "step": 1330 }, { "epoch": 4.8901098901098905, "grad_norm": 1.0703125, "learning_rate": 7.8992509935003e-05, "loss": 0.0589, "mean_token_accuracy": 0.9781901359558105, "num_tokens": 2455332.0, "step": 1335 }, { "epoch": 4.908424908424909, "grad_norm": 0.25, "learning_rate": 7.893428645059053e-05, "loss": 0.0478, "mean_token_accuracy": 0.9811420202255249, "num_tokens": 2464469.0, "step": 1340 }, { "epoch": 4.926739926739927, "grad_norm": 0.55859375, "learning_rate": 7.887582844606212e-05, "loss": 0.0416, "mean_token_accuracy": 0.9845540761947632, "num_tokens": 2474332.0, "step": 1345 }, { "epoch": 4.945054945054945, "grad_norm": 0.1943359375, "learning_rate": 7.881713645752409e-05, "loss": 0.0567, "mean_token_accuracy": 0.977370023727417, "num_tokens": 2483586.0, "step": 1350 }, { "epoch": 4.9633699633699635, "grad_norm": 0.57421875, "learning_rate": 7.875821102322853e-05, "loss": 0.0489, "mean_token_accuracy": 0.9813120841979981, "num_tokens": 2492757.0, "step": 1355 }, { "epoch": 4.981684981684982, "grad_norm": 0.158203125, "learning_rate": 7.869905268356847e-05, "loss": 0.0683, "mean_token_accuracy": 0.9747755646705627, "num_tokens": 2501516.0, "step": 1360 }, { "epoch": 5.0, "grad_norm": 0.41015625, "learning_rate": 7.863966198107285e-05, "loss": 0.0371, "mean_token_accuracy": 0.9825921297073364, "num_tokens": 2509780.0, "step": 1365 }, { "epoch": 5.018315018315018, "grad_norm": 0.1923828125, "learning_rate": 7.858003946040152e-05, "loss": 0.0349, "mean_token_accuracy": 0.9879397273063659, "num_tokens": 2518456.0, "step": 1370 }, { "epoch": 5.0366300366300365, "grad_norm": 0.38671875, "learning_rate": 7.852018566834035e-05, "loss": 0.0623, "mean_token_accuracy": 0.9814428091049194, "num_tokens": 2526696.0, "step": 1375 }, { "epoch": 5.054945054945055, "grad_norm": 15.5625, "learning_rate": 7.846010115379609e-05, "loss": 0.0851, "mean_token_accuracy": 0.9754502534866333, "num_tokens": 2535458.0, "step": 1380 }, { "epoch": 5.073260073260073, "grad_norm": 0.1337890625, "learning_rate": 7.839978646779148e-05, "loss": 0.0302, "mean_token_accuracy": 0.9864336133003235, "num_tokens": 2544932.0, "step": 1385 }, { "epoch": 5.091575091575091, "grad_norm": 0.474609375, "learning_rate": 7.833924216346e-05, "loss": 0.044, "mean_token_accuracy": 0.9824022054672241, "num_tokens": 2553707.0, "step": 1390 }, { "epoch": 5.1098901098901095, "grad_norm": 0.62890625, "learning_rate": 7.827846879604103e-05, "loss": 0.0476, "mean_token_accuracy": 0.9825940251350402, "num_tokens": 2562559.0, "step": 1395 }, { "epoch": 5.128205128205128, "grad_norm": 0.3046875, "learning_rate": 7.821746692287458e-05, "loss": 0.0301, "mean_token_accuracy": 0.9876471161842346, "num_tokens": 2571694.0, "step": 1400 }, { "epoch": 5.146520146520147, "grad_norm": 0.2236328125, "learning_rate": 7.815623710339623e-05, "loss": 0.041, "mean_token_accuracy": 0.9862527489662171, "num_tokens": 2580208.0, "step": 1405 }, { "epoch": 5.164835164835165, "grad_norm": 0.96484375, "learning_rate": 7.809477989913203e-05, "loss": 0.0618, "mean_token_accuracy": 0.9779723167419434, "num_tokens": 2588156.0, "step": 1410 }, { "epoch": 5.183150183150183, "grad_norm": 0.20703125, "learning_rate": 7.803309587369332e-05, "loss": 0.0303, "mean_token_accuracy": 0.9863034844398498, "num_tokens": 2596974.0, "step": 1415 }, { "epoch": 5.201465201465202, "grad_norm": 0.65234375, "learning_rate": 7.79711855927716e-05, "loss": 0.0248, "mean_token_accuracy": 0.9895213365554809, "num_tokens": 2605921.0, "step": 1420 }, { "epoch": 5.21978021978022, "grad_norm": 0.6171875, "learning_rate": 7.790904962413324e-05, "loss": 0.0588, "mean_token_accuracy": 0.9793162941932678, "num_tokens": 2615551.0, "step": 1425 }, { "epoch": 5.238095238095238, "grad_norm": 0.61328125, "learning_rate": 7.784668853761446e-05, "loss": 0.0344, "mean_token_accuracy": 0.987682557106018, "num_tokens": 2624766.0, "step": 1430 }, { "epoch": 5.256410256410256, "grad_norm": 0.138671875, "learning_rate": 7.778410290511585e-05, "loss": 0.0183, "mean_token_accuracy": 0.9921578407287598, "num_tokens": 2634654.0, "step": 1435 }, { "epoch": 5.274725274725275, "grad_norm": 0.16796875, "learning_rate": 7.772129330059739e-05, "loss": 0.025, "mean_token_accuracy": 0.991031551361084, "num_tokens": 2644515.0, "step": 1440 }, { "epoch": 5.293040293040293, "grad_norm": 0.126953125, "learning_rate": 7.7658260300073e-05, "loss": 0.0384, "mean_token_accuracy": 0.9862215161323548, "num_tokens": 2653798.0, "step": 1445 }, { "epoch": 5.311355311355311, "grad_norm": 5.0, "learning_rate": 7.759500448160529e-05, "loss": 0.0418, "mean_token_accuracy": 0.9845625400543213, "num_tokens": 2662880.0, "step": 1450 }, { "epoch": 5.329670329670329, "grad_norm": 12.0625, "learning_rate": 7.753152642530036e-05, "loss": 0.0456, "mean_token_accuracy": 0.9851066589355468, "num_tokens": 2671685.0, "step": 1455 }, { "epoch": 5.347985347985348, "grad_norm": 0.953125, "learning_rate": 7.746782671330237e-05, "loss": 0.0476, "mean_token_accuracy": 0.9837478876113892, "num_tokens": 2680763.0, "step": 1460 }, { "epoch": 5.366300366300366, "grad_norm": 0.1611328125, "learning_rate": 7.740390592978824e-05, "loss": 0.0462, "mean_token_accuracy": 0.9852417230606079, "num_tokens": 2690295.0, "step": 1465 }, { "epoch": 5.384615384615385, "grad_norm": 1.109375, "learning_rate": 7.733976466096226e-05, "loss": 0.0513, "mean_token_accuracy": 0.9832407712936402, "num_tokens": 2699104.0, "step": 1470 }, { "epoch": 5.402930402930403, "grad_norm": 0.3984375, "learning_rate": 7.727540349505082e-05, "loss": 0.0598, "mean_token_accuracy": 0.9803775191307068, "num_tokens": 2708621.0, "step": 1475 }, { "epoch": 5.4212454212454215, "grad_norm": 0.240234375, "learning_rate": 7.721082302229688e-05, "loss": 0.042, "mean_token_accuracy": 0.9849236011505127, "num_tokens": 2718857.0, "step": 1480 }, { "epoch": 5.43956043956044, "grad_norm": 0.5078125, "learning_rate": 7.714602383495464e-05, "loss": 0.0365, "mean_token_accuracy": 0.986870002746582, "num_tokens": 2728231.0, "step": 1485 }, { "epoch": 5.457875457875458, "grad_norm": 0.578125, "learning_rate": 7.708100652728407e-05, "loss": 0.076, "mean_token_accuracy": 0.9744701385498047, "num_tokens": 2737360.0, "step": 1490 }, { "epoch": 5.476190476190476, "grad_norm": 1.421875, "learning_rate": 7.70157716955455e-05, "loss": 0.0334, "mean_token_accuracy": 0.9870843052864074, "num_tokens": 2746745.0, "step": 1495 }, { "epoch": 5.4945054945054945, "grad_norm": 3.28125, "learning_rate": 7.695031993799411e-05, "loss": 0.0462, "mean_token_accuracy": 0.982709014415741, "num_tokens": 2756089.0, "step": 1500 }, { "epoch": 5.512820512820513, "grad_norm": 0.2294921875, "learning_rate": 7.688465185487447e-05, "loss": 0.0319, "mean_token_accuracy": 0.9880306243896484, "num_tokens": 2766072.0, "step": 1505 }, { "epoch": 5.531135531135531, "grad_norm": 0.390625, "learning_rate": 7.681876804841504e-05, "loss": 0.0392, "mean_token_accuracy": 0.9848615050315856, "num_tokens": 2775370.0, "step": 1510 }, { "epoch": 5.549450549450549, "grad_norm": 0.8125, "learning_rate": 7.675266912282259e-05, "loss": 0.0376, "mean_token_accuracy": 0.9833124279975891, "num_tokens": 2784606.0, "step": 1515 }, { "epoch": 5.5677655677655675, "grad_norm": 0.44140625, "learning_rate": 7.668635568427677e-05, "loss": 0.0412, "mean_token_accuracy": 0.9833675742149353, "num_tokens": 2794109.0, "step": 1520 }, { "epoch": 5.586080586080586, "grad_norm": 8.375, "learning_rate": 7.661982834092442e-05, "loss": 0.0441, "mean_token_accuracy": 0.9847989916801453, "num_tokens": 2804339.0, "step": 1525 }, { "epoch": 5.604395604395604, "grad_norm": 1.4609375, "learning_rate": 7.65530877028741e-05, "loss": 0.0445, "mean_token_accuracy": 0.9856428861618042, "num_tokens": 2813328.0, "step": 1530 }, { "epoch": 5.622710622710622, "grad_norm": 0.93359375, "learning_rate": 7.648613438219043e-05, "loss": 0.0599, "mean_token_accuracy": 0.9801060795783997, "num_tokens": 2822846.0, "step": 1535 }, { "epoch": 5.641025641025641, "grad_norm": 2.09375, "learning_rate": 7.64189689928885e-05, "loss": 0.0703, "mean_token_accuracy": 0.9766062736511231, "num_tokens": 2832793.0, "step": 1540 }, { "epoch": 5.65934065934066, "grad_norm": 4.40625, "learning_rate": 7.635159215092825e-05, "loss": 0.0482, "mean_token_accuracy": 0.984825873374939, "num_tokens": 2842566.0, "step": 1545 }, { "epoch": 5.677655677655678, "grad_norm": 0.23828125, "learning_rate": 7.62840044742088e-05, "loss": 0.028, "mean_token_accuracy": 0.9913866996765137, "num_tokens": 2852366.0, "step": 1550 }, { "epoch": 5.695970695970696, "grad_norm": 0.4921875, "learning_rate": 7.621620658256279e-05, "loss": 0.0232, "mean_token_accuracy": 0.9910756468772888, "num_tokens": 2861611.0, "step": 1555 }, { "epoch": 5.714285714285714, "grad_norm": 0.55078125, "learning_rate": 7.61481990977507e-05, "loss": 0.0417, "mean_token_accuracy": 0.9838123202323914, "num_tokens": 2870193.0, "step": 1560 }, { "epoch": 5.732600732600733, "grad_norm": 0.26953125, "learning_rate": 7.607998264345515e-05, "loss": 0.0427, "mean_token_accuracy": 0.9834349632263184, "num_tokens": 2879533.0, "step": 1565 }, { "epoch": 5.750915750915751, "grad_norm": 0.09814453125, "learning_rate": 7.601155784527516e-05, "loss": 0.0395, "mean_token_accuracy": 0.9865917205810547, "num_tokens": 2888588.0, "step": 1570 }, { "epoch": 5.769230769230769, "grad_norm": 0.63671875, "learning_rate": 7.594292533072048e-05, "loss": 0.0359, "mean_token_accuracy": 0.9868963241577149, "num_tokens": 2898690.0, "step": 1575 }, { "epoch": 5.787545787545787, "grad_norm": 0.205078125, "learning_rate": 7.587408572920568e-05, "loss": 0.0284, "mean_token_accuracy": 0.9881658792495728, "num_tokens": 2908343.0, "step": 1580 }, { "epoch": 5.805860805860806, "grad_norm": 0.3671875, "learning_rate": 7.58050396720446e-05, "loss": 0.0257, "mean_token_accuracy": 0.990417754650116, "num_tokens": 2918583.0, "step": 1585 }, { "epoch": 5.824175824175824, "grad_norm": 0.470703125, "learning_rate": 7.573578779244438e-05, "loss": 0.0461, "mean_token_accuracy": 0.9842254996299744, "num_tokens": 2927267.0, "step": 1590 }, { "epoch": 5.842490842490842, "grad_norm": 0.462890625, "learning_rate": 7.566633072549971e-05, "loss": 0.0422, "mean_token_accuracy": 0.9848962306976319, "num_tokens": 2937053.0, "step": 1595 }, { "epoch": 5.860805860805861, "grad_norm": 0.388671875, "learning_rate": 7.559666910818704e-05, "loss": 0.0444, "mean_token_accuracy": 0.9840786457061768, "num_tokens": 2945426.0, "step": 1600 }, { "epoch": 5.8791208791208796, "grad_norm": 0.431640625, "learning_rate": 7.552680357935869e-05, "loss": 0.04, "mean_token_accuracy": 0.9841191053390503, "num_tokens": 2953945.0, "step": 1605 }, { "epoch": 5.897435897435898, "grad_norm": 0.2255859375, "learning_rate": 7.5456734779737e-05, "loss": 0.0426, "mean_token_accuracy": 0.9842138767242432, "num_tokens": 2962787.0, "step": 1610 }, { "epoch": 5.915750915750916, "grad_norm": 0.671875, "learning_rate": 7.53864633519085e-05, "loss": 0.045, "mean_token_accuracy": 0.9817765593528748, "num_tokens": 2971473.0, "step": 1615 }, { "epoch": 5.934065934065934, "grad_norm": 0.75390625, "learning_rate": 7.531598994031796e-05, "loss": 0.0691, "mean_token_accuracy": 0.9776899933815002, "num_tokens": 2980183.0, "step": 1620 }, { "epoch": 5.9523809523809526, "grad_norm": 0.474609375, "learning_rate": 7.524531519126248e-05, "loss": 0.0509, "mean_token_accuracy": 0.9816537737846375, "num_tokens": 2989666.0, "step": 1625 }, { "epoch": 5.970695970695971, "grad_norm": 0.984375, "learning_rate": 7.517443975288563e-05, "loss": 0.0569, "mean_token_accuracy": 0.9782140016555786, "num_tokens": 2998079.0, "step": 1630 }, { "epoch": 5.989010989010989, "grad_norm": 1.109375, "learning_rate": 7.510336427517143e-05, "loss": 0.0628, "mean_token_accuracy": 0.9775285959243775, "num_tokens": 3007389.0, "step": 1635 }, { "epoch": 6.007326007326007, "grad_norm": 0.3203125, "learning_rate": 7.503208940993842e-05, "loss": 0.0693, "mean_token_accuracy": 0.9747921347618103, "num_tokens": 3015549.0, "step": 1640 }, { "epoch": 6.0256410256410255, "grad_norm": 0.69140625, "learning_rate": 7.49606158108337e-05, "loss": 0.0209, "mean_token_accuracy": 0.9922340869903564, "num_tokens": 3025486.0, "step": 1645 }, { "epoch": 6.043956043956044, "grad_norm": 0.095703125, "learning_rate": 7.488894413332689e-05, "loss": 0.162, "mean_token_accuracy": 0.9714855909347534, "num_tokens": 3034529.0, "step": 1650 }, { "epoch": 6.062271062271062, "grad_norm": 26.0, "learning_rate": 7.481707503470417e-05, "loss": 0.0312, "mean_token_accuracy": 0.9905990958213806, "num_tokens": 3043834.0, "step": 1655 }, { "epoch": 6.08058608058608, "grad_norm": 1.6875, "learning_rate": 7.474500917406223e-05, "loss": 0.0467, "mean_token_accuracy": 0.9854714870452881, "num_tokens": 3053424.0, "step": 1660 }, { "epoch": 6.0989010989010985, "grad_norm": 0.1240234375, "learning_rate": 7.467274721230221e-05, "loss": 0.0179, "mean_token_accuracy": 0.9933658838272095, "num_tokens": 3063201.0, "step": 1665 }, { "epoch": 6.117216117216117, "grad_norm": 0.06640625, "learning_rate": 7.460028981212365e-05, "loss": 0.0242, "mean_token_accuracy": 0.9913597822189331, "num_tokens": 3072991.0, "step": 1670 }, { "epoch": 6.135531135531136, "grad_norm": 0.921875, "learning_rate": 7.452763763801842e-05, "loss": 0.032, "mean_token_accuracy": 0.9883728504180909, "num_tokens": 3082543.0, "step": 1675 }, { "epoch": 6.153846153846154, "grad_norm": 0.09619140625, "learning_rate": 7.445479135626463e-05, "loss": 0.0306, "mean_token_accuracy": 0.9895648956298828, "num_tokens": 3091715.0, "step": 1680 }, { "epoch": 6.172161172161172, "grad_norm": 0.345703125, "learning_rate": 7.43817516349205e-05, "loss": 0.0357, "mean_token_accuracy": 0.9860040664672851, "num_tokens": 3100483.0, "step": 1685 }, { "epoch": 6.190476190476191, "grad_norm": 0.255859375, "learning_rate": 7.430851914381827e-05, "loss": 0.0254, "mean_token_accuracy": 0.9893843650817871, "num_tokens": 3109988.0, "step": 1690 }, { "epoch": 6.208791208791209, "grad_norm": 0.384765625, "learning_rate": 7.423509455455799e-05, "loss": 0.0415, "mean_token_accuracy": 0.9854371070861816, "num_tokens": 3119503.0, "step": 1695 }, { "epoch": 6.227106227106227, "grad_norm": 0.095703125, "learning_rate": 7.416147854050143e-05, "loss": 0.0345, "mean_token_accuracy": 0.9878103852272033, "num_tokens": 3128641.0, "step": 1700 }, { "epoch": 6.245421245421245, "grad_norm": 0.447265625, "learning_rate": 7.408767177676586e-05, "loss": 0.0222, "mean_token_accuracy": 0.9916223526000977, "num_tokens": 3137766.0, "step": 1705 }, { "epoch": 6.263736263736264, "grad_norm": 1.140625, "learning_rate": 7.40136749402179e-05, "loss": 0.0295, "mean_token_accuracy": 0.9880544662475585, "num_tokens": 3146326.0, "step": 1710 }, { "epoch": 6.282051282051282, "grad_norm": 0.09765625, "learning_rate": 7.393948870946729e-05, "loss": 0.0219, "mean_token_accuracy": 0.9904427409172059, "num_tokens": 3155485.0, "step": 1715 }, { "epoch": 6.3003663003663, "grad_norm": 0.62890625, "learning_rate": 7.386511376486061e-05, "loss": 0.0399, "mean_token_accuracy": 0.984571659564972, "num_tokens": 3164816.0, "step": 1720 }, { "epoch": 6.318681318681318, "grad_norm": 0.318359375, "learning_rate": 7.37905507884752e-05, "loss": 0.0201, "mean_token_accuracy": 0.992521858215332, "num_tokens": 3174718.0, "step": 1725 }, { "epoch": 6.336996336996337, "grad_norm": 0.578125, "learning_rate": 7.371580046411267e-05, "loss": 0.0357, "mean_token_accuracy": 0.986468493938446, "num_tokens": 3183409.0, "step": 1730 }, { "epoch": 6.355311355311355, "grad_norm": 0.39453125, "learning_rate": 7.364086347729285e-05, "loss": 0.0232, "mean_token_accuracy": 0.9905255913734436, "num_tokens": 3192371.0, "step": 1735 }, { "epoch": 6.373626373626374, "grad_norm": 0.333984375, "learning_rate": 7.356574051524742e-05, "loss": 0.0307, "mean_token_accuracy": 0.9887727737426758, "num_tokens": 3201677.0, "step": 1740 }, { "epoch": 6.391941391941392, "grad_norm": 0.703125, "learning_rate": 7.349043226691354e-05, "loss": 0.0274, "mean_token_accuracy": 0.9895096063613892, "num_tokens": 3211218.0, "step": 1745 }, { "epoch": 6.410256410256411, "grad_norm": 4.03125, "learning_rate": 7.341493942292763e-05, "loss": 0.0364, "mean_token_accuracy": 0.9867009520530701, "num_tokens": 3219808.0, "step": 1750 }, { "epoch": 6.428571428571429, "grad_norm": 1.0703125, "learning_rate": 7.333926267561898e-05, "loss": 0.0389, "mean_token_accuracy": 0.9854157328605652, "num_tokens": 3229195.0, "step": 1755 }, { "epoch": 6.446886446886447, "grad_norm": 0.61328125, "learning_rate": 7.326340271900346e-05, "loss": 0.0479, "mean_token_accuracy": 0.9841797947883606, "num_tokens": 3237885.0, "step": 1760 }, { "epoch": 6.465201465201465, "grad_norm": 0.95703125, "learning_rate": 7.318736024877707e-05, "loss": 0.04, "mean_token_accuracy": 0.9877835512161255, "num_tokens": 3247579.0, "step": 1765 }, { "epoch": 6.483516483516484, "grad_norm": 0.1904296875, "learning_rate": 7.31111359623096e-05, "loss": 0.0299, "mean_token_accuracy": 0.9874194860458374, "num_tokens": 3256966.0, "step": 1770 }, { "epoch": 6.501831501831502, "grad_norm": 0.99609375, "learning_rate": 7.30347305586383e-05, "loss": 0.0335, "mean_token_accuracy": 0.9878918051719665, "num_tokens": 3266350.0, "step": 1775 }, { "epoch": 6.52014652014652, "grad_norm": 0.57421875, "learning_rate": 7.295814473846134e-05, "loss": 0.0326, "mean_token_accuracy": 0.9886090993881226, "num_tokens": 3274954.0, "step": 1780 }, { "epoch": 6.538461538461538, "grad_norm": 0.55078125, "learning_rate": 7.288137920413148e-05, "loss": 0.0426, "mean_token_accuracy": 0.9847822427749634, "num_tokens": 3282952.0, "step": 1785 }, { "epoch": 6.556776556776557, "grad_norm": 0.1943359375, "learning_rate": 7.280443465964961e-05, "loss": 0.0197, "mean_token_accuracy": 0.9919935941696167, "num_tokens": 3292733.0, "step": 1790 }, { "epoch": 6.575091575091575, "grad_norm": 0.365234375, "learning_rate": 7.272731181065829e-05, "loss": 0.0531, "mean_token_accuracy": 0.9843693256378174, "num_tokens": 3302237.0, "step": 1795 }, { "epoch": 6.593406593406593, "grad_norm": 0.45703125, "learning_rate": 7.265001136443525e-05, "loss": 0.0221, "mean_token_accuracy": 0.9909010767936707, "num_tokens": 3311751.0, "step": 1800 }, { "epoch": 6.611721611721611, "grad_norm": 0.275390625, "learning_rate": 7.257253402988693e-05, "loss": 0.032, "mean_token_accuracy": 0.9890513896942139, "num_tokens": 3320625.0, "step": 1805 }, { "epoch": 6.63003663003663, "grad_norm": 0.48828125, "learning_rate": 7.249488051754199e-05, "loss": 0.0278, "mean_token_accuracy": 0.989040732383728, "num_tokens": 3329629.0, "step": 1810 }, { "epoch": 6.648351648351649, "grad_norm": 0.158203125, "learning_rate": 7.241705153954479e-05, "loss": 0.0291, "mean_token_accuracy": 0.988949990272522, "num_tokens": 3338881.0, "step": 1815 }, { "epoch": 6.666666666666667, "grad_norm": 1.484375, "learning_rate": 7.23390478096488e-05, "loss": 0.0316, "mean_token_accuracy": 0.9878135204315186, "num_tokens": 3348032.0, "step": 1820 }, { "epoch": 6.684981684981685, "grad_norm": 0.65234375, "learning_rate": 7.226087004321018e-05, "loss": 0.032, "mean_token_accuracy": 0.9878805875778198, "num_tokens": 3357685.0, "step": 1825 }, { "epoch": 6.7032967032967035, "grad_norm": 0.51171875, "learning_rate": 7.218251895718108e-05, "loss": 0.0314, "mean_token_accuracy": 0.9871991038322449, "num_tokens": 3366081.0, "step": 1830 }, { "epoch": 6.721611721611722, "grad_norm": 0.52734375, "learning_rate": 7.210399527010315e-05, "loss": 0.0371, "mean_token_accuracy": 0.9859683156013489, "num_tokens": 3375665.0, "step": 1835 }, { "epoch": 6.73992673992674, "grad_norm": 0.6875, "learning_rate": 7.202529970210093e-05, "loss": 0.0437, "mean_token_accuracy": 0.9847039103507995, "num_tokens": 3385407.0, "step": 1840 }, { "epoch": 6.758241758241758, "grad_norm": 0.9296875, "learning_rate": 7.194643297487525e-05, "loss": 0.0482, "mean_token_accuracy": 0.9819490432739257, "num_tokens": 3394276.0, "step": 1845 }, { "epoch": 6.7765567765567765, "grad_norm": 0.46875, "learning_rate": 7.186739581169659e-05, "loss": 0.0768, "mean_token_accuracy": 0.9809007167816162, "num_tokens": 3403876.0, "step": 1850 }, { "epoch": 6.794871794871795, "grad_norm": 0.921875, "learning_rate": 7.178818893739847e-05, "loss": 0.0345, "mean_token_accuracy": 0.9873276352882385, "num_tokens": 3413010.0, "step": 1855 }, { "epoch": 6.813186813186813, "grad_norm": 0.3203125, "learning_rate": 7.170881307837081e-05, "loss": 0.0364, "mean_token_accuracy": 0.9852291464805603, "num_tokens": 3420921.0, "step": 1860 }, { "epoch": 6.831501831501831, "grad_norm": 0.44921875, "learning_rate": 7.162926896255323e-05, "loss": 0.0379, "mean_token_accuracy": 0.9871521234512329, "num_tokens": 3429748.0, "step": 1865 }, { "epoch": 6.8498168498168495, "grad_norm": 0.09765625, "learning_rate": 7.154955731942842e-05, "loss": 0.0338, "mean_token_accuracy": 0.9871858716011047, "num_tokens": 3438647.0, "step": 1870 }, { "epoch": 6.868131868131869, "grad_norm": 0.111328125, "learning_rate": 7.146967888001541e-05, "loss": 0.0384, "mean_token_accuracy": 0.9856713056564331, "num_tokens": 3448087.0, "step": 1875 }, { "epoch": 6.886446886446887, "grad_norm": 0.51171875, "learning_rate": 7.138963437686289e-05, "loss": 0.0423, "mean_token_accuracy": 0.9847253203392029, "num_tokens": 3457095.0, "step": 1880 }, { "epoch": 6.904761904761905, "grad_norm": 0.09521484375, "learning_rate": 7.13094245440425e-05, "loss": 0.0335, "mean_token_accuracy": 0.9881868481636047, "num_tokens": 3466170.0, "step": 1885 }, { "epoch": 6.923076923076923, "grad_norm": 0.255859375, "learning_rate": 7.122905011714206e-05, "loss": 0.0331, "mean_token_accuracy": 0.987188744544983, "num_tokens": 3475299.0, "step": 1890 }, { "epoch": 6.941391941391942, "grad_norm": 0.365234375, "learning_rate": 7.114851183325886e-05, "loss": 0.0412, "mean_token_accuracy": 0.984969186782837, "num_tokens": 3485021.0, "step": 1895 }, { "epoch": 6.95970695970696, "grad_norm": 0.609375, "learning_rate": 7.10678104309929e-05, "loss": 0.0348, "mean_token_accuracy": 0.9886750221252442, "num_tokens": 3493774.0, "step": 1900 }, { "epoch": 6.978021978021978, "grad_norm": 0.703125, "learning_rate": 7.098694665044011e-05, "loss": 0.0339, "mean_token_accuracy": 0.9876073241233826, "num_tokens": 3503382.0, "step": 1905 }, { "epoch": 6.996336996336996, "grad_norm": 1.0078125, "learning_rate": 7.090592123318553e-05, "loss": 0.0437, "mean_token_accuracy": 0.9858802318572998, "num_tokens": 3512668.0, "step": 1910 }, { "epoch": 7.014652014652015, "grad_norm": 0.1640625, "learning_rate": 7.082473492229653e-05, "loss": 0.0192, "mean_token_accuracy": 0.9916712999343872, "num_tokens": 3520969.0, "step": 1915 }, { "epoch": 7.032967032967033, "grad_norm": 0.52734375, "learning_rate": 7.074338846231605e-05, "loss": 0.0239, "mean_token_accuracy": 0.9903509378433227, "num_tokens": 3529196.0, "step": 1920 }, { "epoch": 7.051282051282051, "grad_norm": 0.84765625, "learning_rate": 7.066188259925569e-05, "loss": 0.0569, "mean_token_accuracy": 0.9826701760292054, "num_tokens": 3538654.0, "step": 1925 }, { "epoch": 7.069597069597069, "grad_norm": 0.373046875, "learning_rate": 7.05802180805889e-05, "loss": 0.0255, "mean_token_accuracy": 0.9906298637390136, "num_tokens": 3547981.0, "step": 1930 }, { "epoch": 7.087912087912088, "grad_norm": 0.94921875, "learning_rate": 7.049839565524414e-05, "loss": 0.0212, "mean_token_accuracy": 0.9925713777542114, "num_tokens": 3557721.0, "step": 1935 }, { "epoch": 7.106227106227106, "grad_norm": 1.0, "learning_rate": 7.041641607359798e-05, "loss": 0.024, "mean_token_accuracy": 0.991481339931488, "num_tokens": 3566312.0, "step": 1940 }, { "epoch": 7.124542124542124, "grad_norm": 0.2197265625, "learning_rate": 7.033428008746831e-05, "loss": 0.0199, "mean_token_accuracy": 0.9931520938873291, "num_tokens": 3575508.0, "step": 1945 }, { "epoch": 7.142857142857143, "grad_norm": 0.21484375, "learning_rate": 7.025198845010726e-05, "loss": 0.016, "mean_token_accuracy": 0.9932388305664063, "num_tokens": 3584603.0, "step": 1950 }, { "epoch": 7.1611721611721615, "grad_norm": 11.4375, "learning_rate": 7.016954191619448e-05, "loss": 0.0315, "mean_token_accuracy": 0.9889041304588317, "num_tokens": 3593828.0, "step": 1955 }, { "epoch": 7.17948717948718, "grad_norm": 1.15625, "learning_rate": 7.008694124183013e-05, "loss": 0.0246, "mean_token_accuracy": 0.9922425985336304, "num_tokens": 3602200.0, "step": 1960 }, { "epoch": 7.197802197802198, "grad_norm": 0.326171875, "learning_rate": 7.000418718452799e-05, "loss": 0.0145, "mean_token_accuracy": 0.993871533870697, "num_tokens": 3611380.0, "step": 1965 }, { "epoch": 7.216117216117216, "grad_norm": 0.5078125, "learning_rate": 6.992128050320839e-05, "loss": 0.024, "mean_token_accuracy": 0.9902616381645203, "num_tokens": 3621064.0, "step": 1970 }, { "epoch": 7.2344322344322345, "grad_norm": 0.07763671875, "learning_rate": 6.983822195819146e-05, "loss": 0.0157, "mean_token_accuracy": 0.9932525634765625, "num_tokens": 3630093.0, "step": 1975 }, { "epoch": 7.252747252747253, "grad_norm": 0.2353515625, "learning_rate": 6.975501231118994e-05, "loss": 0.0236, "mean_token_accuracy": 0.9911327123641968, "num_tokens": 3639168.0, "step": 1980 }, { "epoch": 7.271062271062271, "grad_norm": 0.5859375, "learning_rate": 6.967165232530237e-05, "loss": 0.0216, "mean_token_accuracy": 0.991173791885376, "num_tokens": 3647754.0, "step": 1985 }, { "epoch": 7.289377289377289, "grad_norm": 0.2080078125, "learning_rate": 6.958814276500599e-05, "loss": 0.0162, "mean_token_accuracy": 0.9926367402076721, "num_tokens": 3657191.0, "step": 1990 }, { "epoch": 7.3076923076923075, "grad_norm": 0.1357421875, "learning_rate": 6.950448439614973e-05, "loss": 0.0163, "mean_token_accuracy": 0.9930072546005249, "num_tokens": 3667054.0, "step": 1995 }, { "epoch": 7.326007326007326, "grad_norm": 0.384765625, "learning_rate": 6.942067798594726e-05, "loss": 0.0211, "mean_token_accuracy": 0.9917723655700683, "num_tokens": 3677025.0, "step": 2000 }, { "epoch": 7.344322344322344, "grad_norm": 2.3125, "learning_rate": 6.933672430296986e-05, "loss": 0.0389, "mean_token_accuracy": 0.9862228393554687, "num_tokens": 3685112.0, "step": 2005 }, { "epoch": 7.362637362637362, "grad_norm": 0.126953125, "learning_rate": 6.925262411713945e-05, "loss": 0.0225, "mean_token_accuracy": 0.9903972864151, "num_tokens": 3694862.0, "step": 2010 }, { "epoch": 7.380952380952381, "grad_norm": 0.26171875, "learning_rate": 6.916837819972149e-05, "loss": 0.0274, "mean_token_accuracy": 0.9918236613273621, "num_tokens": 3704805.0, "step": 2015 }, { "epoch": 7.3992673992674, "grad_norm": 0.08349609375, "learning_rate": 6.908398732331793e-05, "loss": 0.017, "mean_token_accuracy": 0.9925737380981445, "num_tokens": 3714304.0, "step": 2020 }, { "epoch": 7.417582417582418, "grad_norm": 0.375, "learning_rate": 6.899945226186005e-05, "loss": 0.0219, "mean_token_accuracy": 0.9916564226150513, "num_tokens": 3723305.0, "step": 2025 }, { "epoch": 7.435897435897436, "grad_norm": 1.15625, "learning_rate": 6.89147737906015e-05, "loss": 0.0269, "mean_token_accuracy": 0.9908839344978333, "num_tokens": 3732073.0, "step": 2030 }, { "epoch": 7.454212454212454, "grad_norm": 0.091796875, "learning_rate": 6.882995268611106e-05, "loss": 0.0226, "mean_token_accuracy": 0.9915896058082581, "num_tokens": 3741071.0, "step": 2035 }, { "epoch": 7.472527472527473, "grad_norm": 0.62890625, "learning_rate": 6.874498972626559e-05, "loss": 0.0204, "mean_token_accuracy": 0.9924831748008728, "num_tokens": 3750420.0, "step": 2040 }, { "epoch": 7.490842490842491, "grad_norm": 0.15625, "learning_rate": 6.865988569024286e-05, "loss": 0.03, "mean_token_accuracy": 0.9895938873291016, "num_tokens": 3760153.0, "step": 2045 }, { "epoch": 7.509157509157509, "grad_norm": 0.416015625, "learning_rate": 6.857464135851444e-05, "loss": 0.0304, "mean_token_accuracy": 0.9892897367477417, "num_tokens": 3768898.0, "step": 2050 }, { "epoch": 7.527472527472527, "grad_norm": 0.2578125, "learning_rate": 6.848925751283853e-05, "loss": 0.0203, "mean_token_accuracy": 0.9921239137649536, "num_tokens": 3778718.0, "step": 2055 }, { "epoch": 7.545787545787546, "grad_norm": 0.06298828125, "learning_rate": 6.840373493625274e-05, "loss": 0.0203, "mean_token_accuracy": 0.9922136068344116, "num_tokens": 3788022.0, "step": 2060 }, { "epoch": 7.564102564102564, "grad_norm": 0.69921875, "learning_rate": 6.831807441306698e-05, "loss": 0.036, "mean_token_accuracy": 0.988727355003357, "num_tokens": 3797131.0, "step": 2065 }, { "epoch": 7.582417582417582, "grad_norm": 0.263671875, "learning_rate": 6.823227672885628e-05, "loss": 0.0319, "mean_token_accuracy": 0.9886006474494934, "num_tokens": 3806894.0, "step": 2070 }, { "epoch": 7.6007326007326, "grad_norm": 0.435546875, "learning_rate": 6.814634267045346e-05, "loss": 0.0245, "mean_token_accuracy": 0.9917014598846435, "num_tokens": 3815606.0, "step": 2075 }, { "epoch": 7.619047619047619, "grad_norm": 0.26171875, "learning_rate": 6.806027302594206e-05, "loss": 0.0308, "mean_token_accuracy": 0.988996148109436, "num_tokens": 3824337.0, "step": 2080 }, { "epoch": 7.637362637362637, "grad_norm": 0.1240234375, "learning_rate": 6.797406858464905e-05, "loss": 0.0289, "mean_token_accuracy": 0.9898035883903503, "num_tokens": 3833957.0, "step": 2085 }, { "epoch": 7.655677655677656, "grad_norm": 0.369140625, "learning_rate": 6.788773013713758e-05, "loss": 0.0265, "mean_token_accuracy": 0.9909451246261597, "num_tokens": 3843114.0, "step": 2090 }, { "epoch": 7.673992673992674, "grad_norm": 0.62109375, "learning_rate": 6.780125847519971e-05, "loss": 0.0206, "mean_token_accuracy": 0.9923561453819275, "num_tokens": 3852882.0, "step": 2095 }, { "epoch": 7.6923076923076925, "grad_norm": 0.3984375, "learning_rate": 6.771465439184927e-05, "loss": 0.026, "mean_token_accuracy": 0.9902096509933471, "num_tokens": 3862419.0, "step": 2100 }, { "epoch": 7.710622710622711, "grad_norm": 0.69921875, "learning_rate": 6.762791868131442e-05, "loss": 0.0226, "mean_token_accuracy": 0.9907670021057129, "num_tokens": 3871716.0, "step": 2105 }, { "epoch": 7.728937728937729, "grad_norm": 0.10888671875, "learning_rate": 6.754105213903045e-05, "loss": 0.023, "mean_token_accuracy": 0.9909697294235229, "num_tokens": 3880542.0, "step": 2110 }, { "epoch": 7.747252747252747, "grad_norm": 0.1943359375, "learning_rate": 6.745405556163253e-05, "loss": 0.0346, "mean_token_accuracy": 0.9859986186027527, "num_tokens": 3889484.0, "step": 2115 }, { "epoch": 7.7655677655677655, "grad_norm": 0.37890625, "learning_rate": 6.736692974694833e-05, "loss": 0.022, "mean_token_accuracy": 0.9915480494499207, "num_tokens": 3898636.0, "step": 2120 }, { "epoch": 7.783882783882784, "grad_norm": 0.41796875, "learning_rate": 6.727967549399072e-05, "loss": 0.0253, "mean_token_accuracy": 0.9902077794075013, "num_tokens": 3907808.0, "step": 2125 }, { "epoch": 7.802197802197802, "grad_norm": 0.1796875, "learning_rate": 6.719229360295044e-05, "loss": 0.0352, "mean_token_accuracy": 0.9878880977630615, "num_tokens": 3916667.0, "step": 2130 }, { "epoch": 7.82051282051282, "grad_norm": 0.26171875, "learning_rate": 6.710478487518882e-05, "loss": 0.0247, "mean_token_accuracy": 0.9887702345848084, "num_tokens": 3926085.0, "step": 2135 }, { "epoch": 7.8388278388278385, "grad_norm": 0.5703125, "learning_rate": 6.701715011323034e-05, "loss": 0.0318, "mean_token_accuracy": 0.9896463632583619, "num_tokens": 3934900.0, "step": 2140 }, { "epoch": 7.857142857142857, "grad_norm": 0.09326171875, "learning_rate": 6.692939012075532e-05, "loss": 0.0124, "mean_token_accuracy": 0.9943976402282715, "num_tokens": 3944656.0, "step": 2145 }, { "epoch": 7.875457875457876, "grad_norm": 0.291015625, "learning_rate": 6.684150570259256e-05, "loss": 0.0216, "mean_token_accuracy": 0.9906636476516724, "num_tokens": 3954257.0, "step": 2150 }, { "epoch": 7.893772893772894, "grad_norm": 0.3984375, "learning_rate": 6.675349766471193e-05, "loss": 0.0266, "mean_token_accuracy": 0.9893643379211425, "num_tokens": 3962549.0, "step": 2155 }, { "epoch": 7.912087912087912, "grad_norm": 0.3125, "learning_rate": 6.6665366814217e-05, "loss": 0.0203, "mean_token_accuracy": 0.9919966578483581, "num_tokens": 3971970.0, "step": 2160 } ], "logging_steps": 5, "max_steps": 5460, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.623835599352627e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }