| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 7.912087912087912, |
| "eval_steps": 500, |
| "global_step": 2160, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.018315018315018316, |
| "grad_norm": 26.125, |
| "learning_rate": 1.2509723013743402e-06, |
| "loss": 1.4391, |
| "mean_token_accuracy": 0.7123644590377808, |
| "num_tokens": 9264.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.03663003663003663, |
| "grad_norm": 24.5, |
| "learning_rate": 2.814687678092266e-06, |
| "loss": 1.2334, |
| "mean_token_accuracy": 0.7464994072914124, |
| "num_tokens": 18573.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.054945054945054944, |
| "grad_norm": 14.6875, |
| "learning_rate": 4.378403054810191e-06, |
| "loss": 1.2219, |
| "mean_token_accuracy": 0.7385512232780457, |
| "num_tokens": 28324.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.07326007326007326, |
| "grad_norm": 8.625, |
| "learning_rate": 5.942118431528117e-06, |
| "loss": 1.1542, |
| "mean_token_accuracy": 0.7414855003356934, |
| "num_tokens": 37365.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.09157509157509157, |
| "grad_norm": 7.75, |
| "learning_rate": 7.505833808246043e-06, |
| "loss": 0.963, |
| "mean_token_accuracy": 0.7728639602661133, |
| "num_tokens": 47169.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.10989010989010989, |
| "grad_norm": 7.6875, |
| "learning_rate": 9.069549184963967e-06, |
| "loss": 0.687, |
| "mean_token_accuracy": 0.8294244527816772, |
| "num_tokens": 56922.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.1282051282051282, |
| "grad_norm": 6.09375, |
| "learning_rate": 1.0633264561681893e-05, |
| "loss": 0.5571, |
| "mean_token_accuracy": 0.8617596507072449, |
| "num_tokens": 65860.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.14652014652014653, |
| "grad_norm": 7.0625, |
| "learning_rate": 1.2196979938399817e-05, |
| "loss": 0.1916, |
| "mean_token_accuracy": 0.9526763677597045, |
| "num_tokens": 75644.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.16483516483516483, |
| "grad_norm": 5.9375, |
| "learning_rate": 1.3760695315117745e-05, |
| "loss": 0.195, |
| "mean_token_accuracy": 0.9427057504653931, |
| "num_tokens": 84597.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.18315018315018314, |
| "grad_norm": 2.046875, |
| "learning_rate": 1.532441069183567e-05, |
| "loss": 0.272, |
| "mean_token_accuracy": 0.9327008485794067, |
| "num_tokens": 93032.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.20146520146520147, |
| "grad_norm": 7.5, |
| "learning_rate": 1.6888126068553595e-05, |
| "loss": 0.2524, |
| "mean_token_accuracy": 0.9316304802894593, |
| "num_tokens": 101726.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.21978021978021978, |
| "grad_norm": 2.5625, |
| "learning_rate": 1.845184144527152e-05, |
| "loss": 0.2507, |
| "mean_token_accuracy": 0.9321518659591674, |
| "num_tokens": 110505.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.23809523809523808, |
| "grad_norm": 0.55859375, |
| "learning_rate": 2.0015556821989444e-05, |
| "loss": 0.1683, |
| "mean_token_accuracy": 0.9499430179595947, |
| "num_tokens": 119502.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.2564102564102564, |
| "grad_norm": 5.0625, |
| "learning_rate": 2.157927219870737e-05, |
| "loss": 0.3318, |
| "mean_token_accuracy": 0.911142885684967, |
| "num_tokens": 129089.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.27472527472527475, |
| "grad_norm": 1.59375, |
| "learning_rate": 2.3142987575425293e-05, |
| "loss": 0.1793, |
| "mean_token_accuracy": 0.950410532951355, |
| "num_tokens": 138645.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.29304029304029305, |
| "grad_norm": 1.53125, |
| "learning_rate": 2.4706702952143225e-05, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9751996159553528, |
| "num_tokens": 148602.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.31135531135531136, |
| "grad_norm": 2.25, |
| "learning_rate": 2.627041832886115e-05, |
| "loss": 0.1275, |
| "mean_token_accuracy": 0.9607115983963013, |
| "num_tokens": 157517.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.32967032967032966, |
| "grad_norm": 2.03125, |
| "learning_rate": 2.7834133705579074e-05, |
| "loss": 0.0966, |
| "mean_token_accuracy": 0.9684963464736939, |
| "num_tokens": 166575.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.34798534798534797, |
| "grad_norm": 0.55078125, |
| "learning_rate": 2.9397849082297e-05, |
| "loss": 0.133, |
| "mean_token_accuracy": 0.9579505920410156, |
| "num_tokens": 175735.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.3663003663003663, |
| "grad_norm": 1.375, |
| "learning_rate": 3.096156445901492e-05, |
| "loss": 0.1709, |
| "mean_token_accuracy": 0.9461557865142822, |
| "num_tokens": 183857.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.38461538461538464, |
| "grad_norm": 3.015625, |
| "learning_rate": 3.2525279835732844e-05, |
| "loss": 0.1466, |
| "mean_token_accuracy": 0.9522614717483521, |
| "num_tokens": 191716.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.40293040293040294, |
| "grad_norm": 5.65625, |
| "learning_rate": 3.4088995212450776e-05, |
| "loss": 0.2514, |
| "mean_token_accuracy": 0.9299401640892029, |
| "num_tokens": 200518.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.42124542124542125, |
| "grad_norm": 0.92578125, |
| "learning_rate": 3.56527105891687e-05, |
| "loss": 0.0687, |
| "mean_token_accuracy": 0.975311815738678, |
| "num_tokens": 209961.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.43956043956043955, |
| "grad_norm": 2.53125, |
| "learning_rate": 3.7216425965886625e-05, |
| "loss": 0.1955, |
| "mean_token_accuracy": 0.9394341588020325, |
| "num_tokens": 218838.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.45787545787545786, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.878014134260455e-05, |
| "loss": 0.1805, |
| "mean_token_accuracy": 0.944377863407135, |
| "num_tokens": 227866.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.47619047619047616, |
| "grad_norm": 2.03125, |
| "learning_rate": 4.0343856719322474e-05, |
| "loss": 0.2456, |
| "mean_token_accuracy": 0.9300832152366638, |
| "num_tokens": 237211.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.4945054945054945, |
| "grad_norm": 3.5625, |
| "learning_rate": 4.19075720960404e-05, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9527908086776733, |
| "num_tokens": 247238.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.5128205128205128, |
| "grad_norm": 1.0078125, |
| "learning_rate": 4.3471287472758323e-05, |
| "loss": 0.1544, |
| "mean_token_accuracy": 0.9530697703361511, |
| "num_tokens": 255982.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5311355311355311, |
| "grad_norm": 3.28125, |
| "learning_rate": 4.5035002849476255e-05, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.949328339099884, |
| "num_tokens": 265058.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.5494505494505495, |
| "grad_norm": 2.375, |
| "learning_rate": 4.659871822619417e-05, |
| "loss": 0.1525, |
| "mean_token_accuracy": 0.9563624501228333, |
| "num_tokens": 274227.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5677655677655677, |
| "grad_norm": 2.28125, |
| "learning_rate": 4.8162433602912104e-05, |
| "loss": 0.2012, |
| "mean_token_accuracy": 0.940619957447052, |
| "num_tokens": 282699.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.5860805860805861, |
| "grad_norm": 2.078125, |
| "learning_rate": 4.9726148979630036e-05, |
| "loss": 0.0786, |
| "mean_token_accuracy": 0.9756275773048401, |
| "num_tokens": 292119.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6043956043956044, |
| "grad_norm": 1.40625, |
| "learning_rate": 5.128986435634795e-05, |
| "loss": 0.1985, |
| "mean_token_accuracy": 0.9421133875846863, |
| "num_tokens": 301406.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.6227106227106227, |
| "grad_norm": 1.5078125, |
| "learning_rate": 5.2853579733065885e-05, |
| "loss": 0.1154, |
| "mean_token_accuracy": 0.9618610620498658, |
| "num_tokens": 310481.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6410256410256411, |
| "grad_norm": 0.359375, |
| "learning_rate": 5.44172951097838e-05, |
| "loss": 0.1248, |
| "mean_token_accuracy": 0.9580595254898071, |
| "num_tokens": 319932.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.6593406593406593, |
| "grad_norm": 1.546875, |
| "learning_rate": 5.5981010486501734e-05, |
| "loss": 0.0783, |
| "mean_token_accuracy": 0.972282862663269, |
| "num_tokens": 329237.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.6776556776556777, |
| "grad_norm": 0.478515625, |
| "learning_rate": 5.754472586321966e-05, |
| "loss": 0.0932, |
| "mean_token_accuracy": 0.9683905124664307, |
| "num_tokens": 338362.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.6959706959706959, |
| "grad_norm": 1.9453125, |
| "learning_rate": 5.910844123993758e-05, |
| "loss": 0.0953, |
| "mean_token_accuracy": 0.9647938251495362, |
| "num_tokens": 347632.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "grad_norm": 1.0234375, |
| "learning_rate": 6.067215661665551e-05, |
| "loss": 0.1225, |
| "mean_token_accuracy": 0.9624135136604309, |
| "num_tokens": 357312.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.7326007326007326, |
| "grad_norm": 6.5, |
| "learning_rate": 6.223587199337343e-05, |
| "loss": 0.1022, |
| "mean_token_accuracy": 0.9669649362564087, |
| "num_tokens": 366372.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7509157509157509, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.379958737009136e-05, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.9620457410812377, |
| "num_tokens": 376261.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 2.328125, |
| "learning_rate": 6.536330274680927e-05, |
| "loss": 0.1534, |
| "mean_token_accuracy": 0.9578806042671204, |
| "num_tokens": 385931.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.7875457875457875, |
| "grad_norm": 1.6484375, |
| "learning_rate": 6.69270181235272e-05, |
| "loss": 0.1488, |
| "mean_token_accuracy": 0.9553104996681213, |
| "num_tokens": 395126.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.8058608058608059, |
| "grad_norm": 1.5625, |
| "learning_rate": 6.849073350024514e-05, |
| "loss": 0.2034, |
| "mean_token_accuracy": 0.9399782657623291, |
| "num_tokens": 404038.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.8241758241758241, |
| "grad_norm": 0.95703125, |
| "learning_rate": 7.005444887696306e-05, |
| "loss": 0.1123, |
| "mean_token_accuracy": 0.9649486184120178, |
| "num_tokens": 412865.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.8424908424908425, |
| "grad_norm": 12.125, |
| "learning_rate": 7.161816425368099e-05, |
| "loss": 0.2582, |
| "mean_token_accuracy": 0.9260981917381287, |
| "num_tokens": 421875.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.8608058608058609, |
| "grad_norm": 1.0078125, |
| "learning_rate": 7.31818796303989e-05, |
| "loss": 0.1237, |
| "mean_token_accuracy": 0.958947730064392, |
| "num_tokens": 430974.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.8791208791208791, |
| "grad_norm": 0.890625, |
| "learning_rate": 7.474559500711684e-05, |
| "loss": 0.094, |
| "mean_token_accuracy": 0.9669261336326599, |
| "num_tokens": 440593.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.8974358974358975, |
| "grad_norm": 1.5234375, |
| "learning_rate": 7.630931038383477e-05, |
| "loss": 0.1799, |
| "mean_token_accuracy": 0.9489495992660523, |
| "num_tokens": 449999.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.9157509157509157, |
| "grad_norm": 0.1806640625, |
| "learning_rate": 7.787302576055269e-05, |
| "loss": 0.0973, |
| "mean_token_accuracy": 0.9693061113357544, |
| "num_tokens": 458710.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.9340659340659341, |
| "grad_norm": 1.359375, |
| "learning_rate": 7.943674113727062e-05, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.9576280236244201, |
| "num_tokens": 468138.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 3.21875, |
| "learning_rate": 8.100045651398853e-05, |
| "loss": 0.2788, |
| "mean_token_accuracy": 0.9308066725730896, |
| "num_tokens": 478094.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.9706959706959707, |
| "grad_norm": 15.75, |
| "learning_rate": 8.256417189070647e-05, |
| "loss": 0.1394, |
| "mean_token_accuracy": 0.9597956895828247, |
| "num_tokens": 487236.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.989010989010989, |
| "grad_norm": 0.73046875, |
| "learning_rate": 8.41278872674244e-05, |
| "loss": 0.105, |
| "mean_token_accuracy": 0.9711844086647033, |
| "num_tokens": 496836.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.0073260073260073, |
| "grad_norm": 4.125, |
| "learning_rate": 8.537885369635508e-05, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9566316485404969, |
| "num_tokens": 505486.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.0256410256410255, |
| "grad_norm": 38.0, |
| "learning_rate": 8.537864816105374e-05, |
| "loss": 0.1003, |
| "mean_token_accuracy": 0.9712117314338684, |
| "num_tokens": 515232.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.043956043956044, |
| "grad_norm": 1.671875, |
| "learning_rate": 8.537814900572437e-05, |
| "loss": 0.1104, |
| "mean_token_accuracy": 0.9652750253677368, |
| "num_tokens": 523979.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.0622710622710623, |
| "grad_norm": 0.361328125, |
| "learning_rate": 8.537735623494464e-05, |
| "loss": 0.0756, |
| "mean_token_accuracy": 0.9749327659606933, |
| "num_tokens": 533573.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.0805860805860805, |
| "grad_norm": 3.5, |
| "learning_rate": 8.537626985598489e-05, |
| "loss": 0.5942, |
| "mean_token_accuracy": 0.8817975878715515, |
| "num_tokens": 543009.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.098901098901099, |
| "grad_norm": 1.5390625, |
| "learning_rate": 8.537488987880808e-05, |
| "loss": 0.1367, |
| "mean_token_accuracy": 0.9571704030036926, |
| "num_tokens": 552940.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.1172161172161172, |
| "grad_norm": 0.89453125, |
| "learning_rate": 8.537321631606968e-05, |
| "loss": 0.1217, |
| "mean_token_accuracy": 0.9623419761657714, |
| "num_tokens": 562364.0, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.1355311355311355, |
| "grad_norm": 1.578125, |
| "learning_rate": 8.537124918311761e-05, |
| "loss": 0.1608, |
| "mean_token_accuracy": 0.949942660331726, |
| "num_tokens": 571646.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.1538461538461537, |
| "grad_norm": 1.5, |
| "learning_rate": 8.536898849799202e-05, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.971439003944397, |
| "num_tokens": 581084.0, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.1721611721611722, |
| "grad_norm": 1.8046875, |
| "learning_rate": 8.53664342814252e-05, |
| "loss": 0.1247, |
| "mean_token_accuracy": 0.9593337297439575, |
| "num_tokens": 590812.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.1904761904761905, |
| "grad_norm": 1.2578125, |
| "learning_rate": 8.536358655684135e-05, |
| "loss": 0.1186, |
| "mean_token_accuracy": 0.957237160205841, |
| "num_tokens": 599646.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.2087912087912087, |
| "grad_norm": 0.5078125, |
| "learning_rate": 8.536044535035635e-05, |
| "loss": 0.1778, |
| "mean_token_accuracy": 0.9485100388526917, |
| "num_tokens": 608962.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.2271062271062272, |
| "grad_norm": 0.91796875, |
| "learning_rate": 8.535701069077756e-05, |
| "loss": 0.131, |
| "mean_token_accuracy": 0.9616786003112793, |
| "num_tokens": 617832.0, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.2454212454212454, |
| "grad_norm": 1.28125, |
| "learning_rate": 8.535328260960355e-05, |
| "loss": 0.1983, |
| "mean_token_accuracy": 0.9394309878349304, |
| "num_tokens": 626739.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.2637362637362637, |
| "grad_norm": 1.5234375, |
| "learning_rate": 8.534926114102375e-05, |
| "loss": 0.064, |
| "mean_token_accuracy": 0.9792219161987304, |
| "num_tokens": 636553.0, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.282051282051282, |
| "grad_norm": 0.73828125, |
| "learning_rate": 8.534494632191824e-05, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9512728333473206, |
| "num_tokens": 645322.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.3003663003663004, |
| "grad_norm": 0.302734375, |
| "learning_rate": 8.534033819185732e-05, |
| "loss": 0.1257, |
| "mean_token_accuracy": 0.9629031181335449, |
| "num_tokens": 654789.0, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.3186813186813187, |
| "grad_norm": 2.78125, |
| "learning_rate": 8.533543679310125e-05, |
| "loss": 0.1226, |
| "mean_token_accuracy": 0.962236201763153, |
| "num_tokens": 664164.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.3369963369963371, |
| "grad_norm": 0.1669921875, |
| "learning_rate": 8.533024217059969e-05, |
| "loss": 0.0789, |
| "mean_token_accuracy": 0.9735670685768127, |
| "num_tokens": 674019.0, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.3553113553113554, |
| "grad_norm": 6.84375, |
| "learning_rate": 8.53247543719915e-05, |
| "loss": 0.168, |
| "mean_token_accuracy": 0.9529448866844177, |
| "num_tokens": 683484.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.3736263736263736, |
| "grad_norm": 1.359375, |
| "learning_rate": 8.531897344760409e-05, |
| "loss": 0.1943, |
| "mean_token_accuracy": 0.9423548102378845, |
| "num_tokens": 691983.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.3919413919413919, |
| "grad_norm": 0.97265625, |
| "learning_rate": 8.531289945045318e-05, |
| "loss": 0.1563, |
| "mean_token_accuracy": 0.957783043384552, |
| "num_tokens": 701252.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.4102564102564101, |
| "grad_norm": 0.93359375, |
| "learning_rate": 8.530653243624211e-05, |
| "loss": 0.2077, |
| "mean_token_accuracy": 0.9389472723007202, |
| "num_tokens": 709727.0, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 11.875, |
| "learning_rate": 8.529987246336146e-05, |
| "loss": 0.0909, |
| "mean_token_accuracy": 0.9703719019889832, |
| "num_tokens": 719138.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.4468864468864469, |
| "grad_norm": 3.4375, |
| "learning_rate": 8.529291959288849e-05, |
| "loss": 0.0696, |
| "mean_token_accuracy": 0.9730043172836303, |
| "num_tokens": 728302.0, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.4652014652014653, |
| "grad_norm": 0.93359375, |
| "learning_rate": 8.528567388858653e-05, |
| "loss": 0.1001, |
| "mean_token_accuracy": 0.962419056892395, |
| "num_tokens": 737476.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.4835164835164836, |
| "grad_norm": 0.7109375, |
| "learning_rate": 8.527813541690442e-05, |
| "loss": 0.1422, |
| "mean_token_accuracy": 0.9559484243392944, |
| "num_tokens": 746604.0, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.5018315018315018, |
| "grad_norm": 0.62890625, |
| "learning_rate": 8.527030424697596e-05, |
| "loss": 0.1023, |
| "mean_token_accuracy": 0.9671313047409058, |
| "num_tokens": 756047.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.52014652014652, |
| "grad_norm": 0.54296875, |
| "learning_rate": 8.526218045061917e-05, |
| "loss": 0.1209, |
| "mean_token_accuracy": 0.9592770576477051, |
| "num_tokens": 764805.0, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.5384615384615383, |
| "grad_norm": 1.1796875, |
| "learning_rate": 8.525376410233573e-05, |
| "loss": 0.1895, |
| "mean_token_accuracy": 0.942843246459961, |
| "num_tokens": 773770.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.5567765567765568, |
| "grad_norm": 0.90625, |
| "learning_rate": 8.524505527931021e-05, |
| "loss": 0.1104, |
| "mean_token_accuracy": 0.9629818797111511, |
| "num_tokens": 782555.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.575091575091575, |
| "grad_norm": 1.953125, |
| "learning_rate": 8.523605406140945e-05, |
| "loss": 0.079, |
| "mean_token_accuracy": 0.9723729610443115, |
| "num_tokens": 791364.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.5934065934065935, |
| "grad_norm": 0.765625, |
| "learning_rate": 8.522676053118176e-05, |
| "loss": 0.1355, |
| "mean_token_accuracy": 0.9603265643119812, |
| "num_tokens": 801577.0, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.6117216117216118, |
| "grad_norm": 38.0, |
| "learning_rate": 8.521717477385618e-05, |
| "loss": 0.0925, |
| "mean_token_accuracy": 0.9714651226997375, |
| "num_tokens": 810680.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.63003663003663, |
| "grad_norm": 54.0, |
| "learning_rate": 8.520729687734172e-05, |
| "loss": 0.4008, |
| "mean_token_accuracy": 0.9063192009925842, |
| "num_tokens": 819733.0, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.6483516483516483, |
| "grad_norm": 17.0, |
| "learning_rate": 8.519712693222653e-05, |
| "loss": 0.2733, |
| "mean_token_accuracy": 0.9280066013336181, |
| "num_tokens": 828640.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.26171875, |
| "learning_rate": 8.518666503177708e-05, |
| "loss": 0.3508, |
| "mean_token_accuracy": 0.912005627155304, |
| "num_tokens": 837843.0, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.684981684981685, |
| "grad_norm": 0.73046875, |
| "learning_rate": 8.517591127193731e-05, |
| "loss": 0.0529, |
| "mean_token_accuracy": 0.9821884870529175, |
| "num_tokens": 847611.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.7032967032967035, |
| "grad_norm": 6.09375, |
| "learning_rate": 8.516486575132771e-05, |
| "loss": 0.1331, |
| "mean_token_accuracy": 0.959692919254303, |
| "num_tokens": 856321.0, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.7216117216117217, |
| "grad_norm": 5.0, |
| "learning_rate": 8.515352857124449e-05, |
| "loss": 0.0689, |
| "mean_token_accuracy": 0.9763989567756652, |
| "num_tokens": 865828.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.73992673992674, |
| "grad_norm": 1.5703125, |
| "learning_rate": 8.514189983565859e-05, |
| "loss": 0.0946, |
| "mean_token_accuracy": 0.9694816589355468, |
| "num_tokens": 875232.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.7582417582417582, |
| "grad_norm": 21.25, |
| "learning_rate": 8.512997965121474e-05, |
| "loss": 0.0833, |
| "mean_token_accuracy": 0.9721729278564453, |
| "num_tokens": 884274.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.7765567765567765, |
| "grad_norm": 0.7265625, |
| "learning_rate": 8.511776812723049e-05, |
| "loss": 0.0723, |
| "mean_token_accuracy": 0.9744561910629272, |
| "num_tokens": 893656.0, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.7948717948717947, |
| "grad_norm": 0.6015625, |
| "learning_rate": 8.510526537569522e-05, |
| "loss": 0.0605, |
| "mean_token_accuracy": 0.9765355348587036, |
| "num_tokens": 902461.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.8131868131868132, |
| "grad_norm": 1.6328125, |
| "learning_rate": 8.509247151126907e-05, |
| "loss": 0.097, |
| "mean_token_accuracy": 0.9699956893920898, |
| "num_tokens": 911366.0, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.8315018315018317, |
| "grad_norm": 0.5625, |
| "learning_rate": 8.507938665128194e-05, |
| "loss": 0.0759, |
| "mean_token_accuracy": 0.9745470285415649, |
| "num_tokens": 920856.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.84981684981685, |
| "grad_norm": 2.125, |
| "learning_rate": 8.506601091573238e-05, |
| "loss": 0.1981, |
| "mean_token_accuracy": 0.9415134191513062, |
| "num_tokens": 929641.0, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.8681318681318682, |
| "grad_norm": 0.5625, |
| "learning_rate": 8.505234442728651e-05, |
| "loss": 0.1232, |
| "mean_token_accuracy": 0.9627613186836242, |
| "num_tokens": 939594.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.8864468864468864, |
| "grad_norm": 35.5, |
| "learning_rate": 8.503838731127686e-05, |
| "loss": 0.1638, |
| "mean_token_accuracy": 0.9507665157318115, |
| "num_tokens": 948528.0, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.9047619047619047, |
| "grad_norm": 1.1640625, |
| "learning_rate": 8.502413969570129e-05, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9527613878250122, |
| "num_tokens": 957817.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.9230769230769231, |
| "grad_norm": 0.18359375, |
| "learning_rate": 8.500960171122171e-05, |
| "loss": 0.1136, |
| "mean_token_accuracy": 0.9619654774665832, |
| "num_tokens": 966584.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.9413919413919414, |
| "grad_norm": 2.75, |
| "learning_rate": 8.4994773491163e-05, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.9544906854629517, |
| "num_tokens": 975280.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.9597069597069599, |
| "grad_norm": 1.2734375, |
| "learning_rate": 8.497965517151172e-05, |
| "loss": 0.2298, |
| "mean_token_accuracy": 0.9354098320007325, |
| "num_tokens": 984056.0, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.978021978021978, |
| "grad_norm": 0.50390625, |
| "learning_rate": 8.49642468909148e-05, |
| "loss": 0.0629, |
| "mean_token_accuracy": 0.97831951379776, |
| "num_tokens": 993635.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.9963369963369964, |
| "grad_norm": 1.140625, |
| "learning_rate": 8.494854879067847e-05, |
| "loss": 0.1468, |
| "mean_token_accuracy": 0.9564722418785095, |
| "num_tokens": 1003151.0, |
| "step": 545 |
| }, |
| { |
| "epoch": 2.0146520146520146, |
| "grad_norm": 0.73046875, |
| "learning_rate": 8.493256101476675e-05, |
| "loss": 0.1093, |
| "mean_token_accuracy": 0.9693841338157654, |
| "num_tokens": 1011069.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.032967032967033, |
| "grad_norm": 0.5390625, |
| "learning_rate": 8.491628370980023e-05, |
| "loss": 0.1015, |
| "mean_token_accuracy": 0.9635228157043457, |
| "num_tokens": 1019386.0, |
| "step": 555 |
| }, |
| { |
| "epoch": 2.051282051282051, |
| "grad_norm": 0.23828125, |
| "learning_rate": 8.489971702505472e-05, |
| "loss": 0.105, |
| "mean_token_accuracy": 0.969475531578064, |
| "num_tokens": 1028915.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.06959706959707, |
| "grad_norm": 0.41796875, |
| "learning_rate": 8.488286111245985e-05, |
| "loss": 0.0659, |
| "mean_token_accuracy": 0.9784932613372803, |
| "num_tokens": 1038671.0, |
| "step": 565 |
| }, |
| { |
| "epoch": 2.087912087912088, |
| "grad_norm": 0.64453125, |
| "learning_rate": 8.486571612659775e-05, |
| "loss": 0.1141, |
| "mean_token_accuracy": 0.9647136211395264, |
| "num_tokens": 1048771.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.1062271062271063, |
| "grad_norm": 1.1015625, |
| "learning_rate": 8.484828222470152e-05, |
| "loss": 0.0762, |
| "mean_token_accuracy": 0.9740150094032287, |
| "num_tokens": 1058283.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 2.1245421245421245, |
| "grad_norm": 7.71875, |
| "learning_rate": 8.48305595666539e-05, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9484933137893676, |
| "num_tokens": 1067021.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.142857142857143, |
| "grad_norm": 0.21484375, |
| "learning_rate": 8.481254831498573e-05, |
| "loss": 0.0481, |
| "mean_token_accuracy": 0.9831829905509949, |
| "num_tokens": 1076203.0, |
| "step": 585 |
| }, |
| { |
| "epoch": 2.161172161172161, |
| "grad_norm": 0.88671875, |
| "learning_rate": 8.479424863487448e-05, |
| "loss": 0.0859, |
| "mean_token_accuracy": 0.9719721555709839, |
| "num_tokens": 1085924.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.1794871794871793, |
| "grad_norm": 0.1650390625, |
| "learning_rate": 8.477566069414271e-05, |
| "loss": 0.1872, |
| "mean_token_accuracy": 0.9515769362449646, |
| "num_tokens": 1095974.0, |
| "step": 595 |
| }, |
| { |
| "epoch": 2.197802197802198, |
| "grad_norm": 83.0, |
| "learning_rate": 8.475678466325665e-05, |
| "loss": 0.3268, |
| "mean_token_accuracy": 0.9221652507781982, |
| "num_tokens": 1105194.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.2161172161172162, |
| "grad_norm": 0.9765625, |
| "learning_rate": 8.473762071532443e-05, |
| "loss": 0.0572, |
| "mean_token_accuracy": 0.9784234523773193, |
| "num_tokens": 1113857.0, |
| "step": 605 |
| }, |
| { |
| "epoch": 2.2344322344322345, |
| "grad_norm": 0.6171875, |
| "learning_rate": 8.471816902609471e-05, |
| "loss": 0.0721, |
| "mean_token_accuracy": 0.975789201259613, |
| "num_tokens": 1123835.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.2527472527472527, |
| "grad_norm": 0.43359375, |
| "learning_rate": 8.46984297739549e-05, |
| "loss": 0.0894, |
| "mean_token_accuracy": 0.968826174736023, |
| "num_tokens": 1132971.0, |
| "step": 615 |
| }, |
| { |
| "epoch": 2.271062271062271, |
| "grad_norm": 0.3515625, |
| "learning_rate": 8.467840313992962e-05, |
| "loss": 0.044, |
| "mean_token_accuracy": 0.9843294978141784, |
| "num_tokens": 1142825.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.2893772893772892, |
| "grad_norm": 1.09375, |
| "learning_rate": 8.465808930767897e-05, |
| "loss": 0.1133, |
| "mean_token_accuracy": 0.9606971025466919, |
| "num_tokens": 1152228.0, |
| "step": 625 |
| }, |
| { |
| "epoch": 2.3076923076923075, |
| "grad_norm": 0.76171875, |
| "learning_rate": 8.463748846349694e-05, |
| "loss": 0.0924, |
| "mean_token_accuracy": 0.968487274646759, |
| "num_tokens": 1161178.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.326007326007326, |
| "grad_norm": 0.6875, |
| "learning_rate": 8.461660079630962e-05, |
| "loss": 0.1203, |
| "mean_token_accuracy": 0.9595974802970886, |
| "num_tokens": 1170764.0, |
| "step": 635 |
| }, |
| { |
| "epoch": 2.3443223443223444, |
| "grad_norm": 0.6171875, |
| "learning_rate": 8.45954264976735e-05, |
| "loss": 0.0458, |
| "mean_token_accuracy": 0.9816165566444397, |
| "num_tokens": 1180611.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.3626373626373627, |
| "grad_norm": 1.2734375, |
| "learning_rate": 8.457396576177369e-05, |
| "loss": 0.109, |
| "mean_token_accuracy": 0.9632077097892762, |
| "num_tokens": 1188843.0, |
| "step": 645 |
| }, |
| { |
| "epoch": 2.380952380952381, |
| "grad_norm": 0.5625, |
| "learning_rate": 8.455221878542219e-05, |
| "loss": 0.0454, |
| "mean_token_accuracy": 0.982891297340393, |
| "num_tokens": 1198270.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.399267399267399, |
| "grad_norm": 0.447265625, |
| "learning_rate": 8.453018576805604e-05, |
| "loss": 0.0807, |
| "mean_token_accuracy": 0.9707582116127014, |
| "num_tokens": 1207459.0, |
| "step": 655 |
| }, |
| { |
| "epoch": 2.4175824175824174, |
| "grad_norm": 0.796875, |
| "learning_rate": 8.450786691173547e-05, |
| "loss": 0.1203, |
| "mean_token_accuracy": 0.9592945575714111, |
| "num_tokens": 1217120.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.435897435897436, |
| "grad_norm": 0.8125, |
| "learning_rate": 8.448526242114215e-05, |
| "loss": 0.0962, |
| "mean_token_accuracy": 0.9632490515708924, |
| "num_tokens": 1226485.0, |
| "step": 665 |
| }, |
| { |
| "epoch": 2.4542124542124544, |
| "grad_norm": 306.0, |
| "learning_rate": 8.446237250357716e-05, |
| "loss": 0.2642, |
| "mean_token_accuracy": 0.9311501502990722, |
| "num_tokens": 1234436.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.4725274725274726, |
| "grad_norm": 0.6875, |
| "learning_rate": 8.443919736895923e-05, |
| "loss": 0.0925, |
| "mean_token_accuracy": 0.9679561376571655, |
| "num_tokens": 1243656.0, |
| "step": 675 |
| }, |
| { |
| "epoch": 2.490842490842491, |
| "grad_norm": 0.326171875, |
| "learning_rate": 8.441573722982275e-05, |
| "loss": 0.1172, |
| "mean_token_accuracy": 0.9645622253417969, |
| "num_tokens": 1252230.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.509157509157509, |
| "grad_norm": 4.59375, |
| "learning_rate": 8.439199230131578e-05, |
| "loss": 0.2658, |
| "mean_token_accuracy": 0.9416054487228394, |
| "num_tokens": 1261738.0, |
| "step": 685 |
| }, |
| { |
| "epoch": 2.5274725274725274, |
| "grad_norm": 16.25, |
| "learning_rate": 8.436796280119821e-05, |
| "loss": 0.1691, |
| "mean_token_accuracy": 0.959836196899414, |
| "num_tokens": 1270639.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.5457875457875456, |
| "grad_norm": 2.140625, |
| "learning_rate": 8.43436489498396e-05, |
| "loss": 0.1439, |
| "mean_token_accuracy": 0.9541051030158997, |
| "num_tokens": 1279566.0, |
| "step": 695 |
| }, |
| { |
| "epoch": 2.564102564102564, |
| "grad_norm": 184.0, |
| "learning_rate": 8.431905097021727e-05, |
| "loss": 0.0963, |
| "mean_token_accuracy": 0.9705996751785279, |
| "num_tokens": 1288968.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.5824175824175826, |
| "grad_norm": 0.984375, |
| "learning_rate": 8.429416908791423e-05, |
| "loss": 0.106, |
| "mean_token_accuracy": 0.9661186218261719, |
| "num_tokens": 1298605.0, |
| "step": 705 |
| }, |
| { |
| "epoch": 2.600732600732601, |
| "grad_norm": 0.84375, |
| "learning_rate": 8.426900353111708e-05, |
| "loss": 0.1213, |
| "mean_token_accuracy": 0.9659365892410279, |
| "num_tokens": 1306963.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.619047619047619, |
| "grad_norm": 0.310546875, |
| "learning_rate": 8.424355453061395e-05, |
| "loss": 0.11, |
| "mean_token_accuracy": 0.9625479221343994, |
| "num_tokens": 1315486.0, |
| "step": 715 |
| }, |
| { |
| "epoch": 2.6373626373626373, |
| "grad_norm": 0.7265625, |
| "learning_rate": 8.421782231979236e-05, |
| "loss": 0.095, |
| "mean_token_accuracy": 0.9687173247337342, |
| "num_tokens": 1325156.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.6556776556776556, |
| "grad_norm": 0.671875, |
| "learning_rate": 8.419180713463716e-05, |
| "loss": 0.0597, |
| "mean_token_accuracy": 0.9778618574142456, |
| "num_tokens": 1334588.0, |
| "step": 725 |
| }, |
| { |
| "epoch": 2.6739926739926743, |
| "grad_norm": 0.56640625, |
| "learning_rate": 8.416550921372818e-05, |
| "loss": 0.0961, |
| "mean_token_accuracy": 0.965964937210083, |
| "num_tokens": 1343414.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.6923076923076925, |
| "grad_norm": 1.609375, |
| "learning_rate": 8.413892879823828e-05, |
| "loss": 0.1179, |
| "mean_token_accuracy": 0.9629538536071778, |
| "num_tokens": 1353270.0, |
| "step": 735 |
| }, |
| { |
| "epoch": 2.7106227106227108, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 8.411206613193094e-05, |
| "loss": 0.0733, |
| "mean_token_accuracy": 0.9732595682144165, |
| "num_tokens": 1362123.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.728937728937729, |
| "grad_norm": 0.228515625, |
| "learning_rate": 8.408492146115815e-05, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.9595796465873718, |
| "num_tokens": 1371808.0, |
| "step": 745 |
| }, |
| { |
| "epoch": 2.7472527472527473, |
| "grad_norm": 0.185546875, |
| "learning_rate": 8.405749503485807e-05, |
| "loss": 0.0698, |
| "mean_token_accuracy": 0.9742272734642029, |
| "num_tokens": 1380865.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.7655677655677655, |
| "grad_norm": 1.078125, |
| "learning_rate": 8.402978710455282e-05, |
| "loss": 0.0939, |
| "mean_token_accuracy": 0.9644173741340637, |
| "num_tokens": 1389329.0, |
| "step": 755 |
| }, |
| { |
| "epoch": 2.7838827838827838, |
| "grad_norm": 0.67578125, |
| "learning_rate": 8.400179792434609e-05, |
| "loss": 0.0986, |
| "mean_token_accuracy": 0.9603239297866821, |
| "num_tokens": 1397560.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.802197802197802, |
| "grad_norm": 1.0859375, |
| "learning_rate": 8.397352775092089e-05, |
| "loss": 0.1284, |
| "mean_token_accuracy": 0.9567705154418945, |
| "num_tokens": 1406742.0, |
| "step": 765 |
| }, |
| { |
| "epoch": 2.8205128205128203, |
| "grad_norm": 0.1708984375, |
| "learning_rate": 8.394497684353717e-05, |
| "loss": 0.1174, |
| "mean_token_accuracy": 0.9635369896888732, |
| "num_tokens": 1416577.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.838827838827839, |
| "grad_norm": 1.09375, |
| "learning_rate": 8.391614546402936e-05, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.947747004032135, |
| "num_tokens": 1424952.0, |
| "step": 775 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 0.6171875, |
| "learning_rate": 8.388703387680416e-05, |
| "loss": 0.0449, |
| "mean_token_accuracy": 0.9851283431053162, |
| "num_tokens": 1434524.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.8754578754578755, |
| "grad_norm": 0.93359375, |
| "learning_rate": 8.385764234883788e-05, |
| "loss": 0.1116, |
| "mean_token_accuracy": 0.9607040166854859, |
| "num_tokens": 1443089.0, |
| "step": 785 |
| }, |
| { |
| "epoch": 2.8937728937728937, |
| "grad_norm": 0.59765625, |
| "learning_rate": 8.382797114967418e-05, |
| "loss": 0.0757, |
| "mean_token_accuracy": 0.9740386247634888, |
| "num_tokens": 1452624.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.912087912087912, |
| "grad_norm": 0.76953125, |
| "learning_rate": 8.379802055142151e-05, |
| "loss": 0.0641, |
| "mean_token_accuracy": 0.9792454838752747, |
| "num_tokens": 1462735.0, |
| "step": 795 |
| }, |
| { |
| "epoch": 2.9304029304029307, |
| "grad_norm": 0.8671875, |
| "learning_rate": 8.376779082875063e-05, |
| "loss": 0.0948, |
| "mean_token_accuracy": 0.9742291688919067, |
| "num_tokens": 1472169.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.948717948717949, |
| "grad_norm": 1.0, |
| "learning_rate": 8.37372822588921e-05, |
| "loss": 0.1112, |
| "mean_token_accuracy": 0.9607684016227722, |
| "num_tokens": 1481405.0, |
| "step": 805 |
| }, |
| { |
| "epoch": 2.967032967032967, |
| "grad_norm": 0.55859375, |
| "learning_rate": 8.370649512163369e-05, |
| "loss": 0.1755, |
| "mean_token_accuracy": 0.9431592702865601, |
| "num_tokens": 1490690.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.9853479853479854, |
| "grad_norm": 0.1259765625, |
| "learning_rate": 8.367542969931792e-05, |
| "loss": 0.0996, |
| "mean_token_accuracy": 0.9651033759117127, |
| "num_tokens": 1499659.0, |
| "step": 815 |
| }, |
| { |
| "epoch": 3.0036630036630036, |
| "grad_norm": 0.1181640625, |
| "learning_rate": 8.364408627683935e-05, |
| "loss": 0.0761, |
| "mean_token_accuracy": 0.9716196894645691, |
| "num_tokens": 1507916.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 3.021978021978022, |
| "grad_norm": 0.080078125, |
| "learning_rate": 8.361246514164205e-05, |
| "loss": 0.0558, |
| "mean_token_accuracy": 0.9798445224761962, |
| "num_tokens": 1517236.0, |
| "step": 825 |
| }, |
| { |
| "epoch": 3.04029304029304, |
| "grad_norm": 0.30078125, |
| "learning_rate": 8.358056658371692e-05, |
| "loss": 0.0628, |
| "mean_token_accuracy": 0.9788961172103882, |
| "num_tokens": 1526056.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 3.0586080586080584, |
| "grad_norm": 0.8359375, |
| "learning_rate": 8.35483908955991e-05, |
| "loss": 0.064, |
| "mean_token_accuracy": 0.9767962694168091, |
| "num_tokens": 1535105.0, |
| "step": 835 |
| }, |
| { |
| "epoch": 3.076923076923077, |
| "grad_norm": 0.71875, |
| "learning_rate": 8.351593837236514e-05, |
| "loss": 0.0672, |
| "mean_token_accuracy": 0.9740965247154236, |
| "num_tokens": 1543572.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 3.0952380952380953, |
| "grad_norm": 0.765625, |
| "learning_rate": 8.348320931163043e-05, |
| "loss": 0.1008, |
| "mean_token_accuracy": 0.962606143951416, |
| "num_tokens": 1553371.0, |
| "step": 845 |
| }, |
| { |
| "epoch": 3.1135531135531136, |
| "grad_norm": 1.4765625, |
| "learning_rate": 8.345020401354646e-05, |
| "loss": 0.0652, |
| "mean_token_accuracy": 0.9775573253631592, |
| "num_tokens": 1563374.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 3.131868131868132, |
| "grad_norm": 1.03125, |
| "learning_rate": 8.341692278079804e-05, |
| "loss": 0.0701, |
| "mean_token_accuracy": 0.9748265624046326, |
| "num_tokens": 1572311.0, |
| "step": 855 |
| }, |
| { |
| "epoch": 3.15018315018315, |
| "grad_norm": 0.447265625, |
| "learning_rate": 8.338336591860042e-05, |
| "loss": 0.0616, |
| "mean_token_accuracy": 0.9770539045333863, |
| "num_tokens": 1581662.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 3.1684981684981683, |
| "grad_norm": 0.8046875, |
| "learning_rate": 8.334953373469673e-05, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9684791564941406, |
| "num_tokens": 1590875.0, |
| "step": 865 |
| }, |
| { |
| "epoch": 3.186813186813187, |
| "grad_norm": 0.7265625, |
| "learning_rate": 8.331542653935491e-05, |
| "loss": 0.0618, |
| "mean_token_accuracy": 0.9756833434104919, |
| "num_tokens": 1599508.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 3.2051282051282053, |
| "grad_norm": 0.484375, |
| "learning_rate": 8.328104464536502e-05, |
| "loss": 0.0538, |
| "mean_token_accuracy": 0.9818952322006226, |
| "num_tokens": 1609055.0, |
| "step": 875 |
| }, |
| { |
| "epoch": 3.2234432234432235, |
| "grad_norm": 1.578125, |
| "learning_rate": 8.324638836803633e-05, |
| "loss": 0.066, |
| "mean_token_accuracy": 0.9749167203903198, |
| "num_tokens": 1618211.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 3.241758241758242, |
| "grad_norm": 0.482421875, |
| "learning_rate": 8.32114580251944e-05, |
| "loss": 0.0943, |
| "mean_token_accuracy": 0.9674638390541077, |
| "num_tokens": 1627192.0, |
| "step": 885 |
| }, |
| { |
| "epoch": 3.26007326007326, |
| "grad_norm": 0.09521484375, |
| "learning_rate": 8.317625393717823e-05, |
| "loss": 0.0424, |
| "mean_token_accuracy": 0.9843096375465393, |
| "num_tokens": 1636468.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 3.2783882783882783, |
| "grad_norm": 1.1953125, |
| "learning_rate": 8.314077642683719e-05, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.971860671043396, |
| "num_tokens": 1645884.0, |
| "step": 895 |
| }, |
| { |
| "epoch": 3.2967032967032965, |
| "grad_norm": 0.796875, |
| "learning_rate": 8.310502581952828e-05, |
| "loss": 0.0579, |
| "mean_token_accuracy": 0.9784857869148255, |
| "num_tokens": 1655337.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 3.315018315018315, |
| "grad_norm": 0.48828125, |
| "learning_rate": 8.306900244311288e-05, |
| "loss": 0.1142, |
| "mean_token_accuracy": 0.961796247959137, |
| "num_tokens": 1663976.0, |
| "step": 905 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 0.287109375, |
| "learning_rate": 8.303270662795399e-05, |
| "loss": 0.075, |
| "mean_token_accuracy": 0.9726075410842896, |
| "num_tokens": 1673433.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 3.3516483516483517, |
| "grad_norm": 0.7578125, |
| "learning_rate": 8.299613870691302e-05, |
| "loss": 0.0939, |
| "mean_token_accuracy": 0.9688026666641235, |
| "num_tokens": 1683030.0, |
| "step": 915 |
| }, |
| { |
| "epoch": 3.36996336996337, |
| "grad_norm": 0.373046875, |
| "learning_rate": 8.295929901534686e-05, |
| "loss": 0.0319, |
| "mean_token_accuracy": 0.9874111294746399, |
| "num_tokens": 1693029.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 3.3882783882783882, |
| "grad_norm": 0.2392578125, |
| "learning_rate": 8.29221878911047e-05, |
| "loss": 0.0532, |
| "mean_token_accuracy": 0.9800354242324829, |
| "num_tokens": 1703258.0, |
| "step": 925 |
| }, |
| { |
| "epoch": 3.4065934065934065, |
| "grad_norm": 0.73046875, |
| "learning_rate": 8.288480567452501e-05, |
| "loss": 0.1188, |
| "mean_token_accuracy": 0.9611821174621582, |
| "num_tokens": 1712754.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 3.4249084249084247, |
| "grad_norm": 0.99609375, |
| "learning_rate": 8.284715270843238e-05, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9708463668823242, |
| "num_tokens": 1721472.0, |
| "step": 935 |
| }, |
| { |
| "epoch": 3.4432234432234434, |
| "grad_norm": 10.3125, |
| "learning_rate": 8.280922933813442e-05, |
| "loss": 0.04, |
| "mean_token_accuracy": 0.9824108600616455, |
| "num_tokens": 1730959.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 3.4615384615384617, |
| "grad_norm": 0.6015625, |
| "learning_rate": 8.277103591141852e-05, |
| "loss": 0.0678, |
| "mean_token_accuracy": 0.9735846400260926, |
| "num_tokens": 1739674.0, |
| "step": 945 |
| }, |
| { |
| "epoch": 3.47985347985348, |
| "grad_norm": 0.39453125, |
| "learning_rate": 8.273257277854872e-05, |
| "loss": 0.0424, |
| "mean_token_accuracy": 0.9842739105224609, |
| "num_tokens": 1749137.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 3.498168498168498, |
| "grad_norm": 0.1630859375, |
| "learning_rate": 8.269384029226248e-05, |
| "loss": 0.0285, |
| "mean_token_accuracy": 0.9885275959968567, |
| "num_tokens": 1758530.0, |
| "step": 955 |
| }, |
| { |
| "epoch": 3.5164835164835164, |
| "grad_norm": 0.0927734375, |
| "learning_rate": 8.265483880776745e-05, |
| "loss": 0.0741, |
| "mean_token_accuracy": 0.9747227191925049, |
| "num_tokens": 1767672.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 3.5347985347985347, |
| "grad_norm": 0.44921875, |
| "learning_rate": 8.26155686827382e-05, |
| "loss": 0.068, |
| "mean_token_accuracy": 0.975150191783905, |
| "num_tokens": 1776694.0, |
| "step": 965 |
| }, |
| { |
| "epoch": 3.553113553113553, |
| "grad_norm": 2.25, |
| "learning_rate": 8.257603027731291e-05, |
| "loss": 0.0536, |
| "mean_token_accuracy": 0.9809759497642517, |
| "num_tokens": 1785904.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 3.571428571428571, |
| "grad_norm": 0.53125, |
| "learning_rate": 8.253622395409019e-05, |
| "loss": 0.0555, |
| "mean_token_accuracy": 0.9794698238372803, |
| "num_tokens": 1795028.0, |
| "step": 975 |
| }, |
| { |
| "epoch": 3.58974358974359, |
| "grad_norm": 0.9609375, |
| "learning_rate": 8.24961500781256e-05, |
| "loss": 0.1048, |
| "mean_token_accuracy": 0.9619524002075195, |
| "num_tokens": 1802957.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 3.608058608058608, |
| "grad_norm": 0.337890625, |
| "learning_rate": 8.24558090169284e-05, |
| "loss": 0.0801, |
| "mean_token_accuracy": 0.9719898581504822, |
| "num_tokens": 1811233.0, |
| "step": 985 |
| }, |
| { |
| "epoch": 3.6263736263736264, |
| "grad_norm": 0.30859375, |
| "learning_rate": 8.241520114045813e-05, |
| "loss": 0.0932, |
| "mean_token_accuracy": 0.9668406844139099, |
| "num_tokens": 1820206.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 3.6446886446886446, |
| "grad_norm": 0.291015625, |
| "learning_rate": 8.237432682112127e-05, |
| "loss": 0.0814, |
| "mean_token_accuracy": 0.968066930770874, |
| "num_tokens": 1828757.0, |
| "step": 995 |
| }, |
| { |
| "epoch": 3.663003663003663, |
| "grad_norm": 0.51171875, |
| "learning_rate": 8.233318643376773e-05, |
| "loss": 0.0786, |
| "mean_token_accuracy": 0.972130823135376, |
| "num_tokens": 1837693.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 3.6813186813186816, |
| "grad_norm": 0.138671875, |
| "learning_rate": 8.229178035568755e-05, |
| "loss": 0.0772, |
| "mean_token_accuracy": 0.9723419427871705, |
| "num_tokens": 1847020.0, |
| "step": 1005 |
| }, |
| { |
| "epoch": 3.6996336996337, |
| "grad_norm": 1.1796875, |
| "learning_rate": 8.225010896660734e-05, |
| "loss": 0.1051, |
| "mean_token_accuracy": 0.9616187572479248, |
| "num_tokens": 1855982.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 3.717948717948718, |
| "grad_norm": 0.44140625, |
| "learning_rate": 8.220817264868678e-05, |
| "loss": 0.0785, |
| "mean_token_accuracy": 0.9704046010971069, |
| "num_tokens": 1865186.0, |
| "step": 1015 |
| }, |
| { |
| "epoch": 3.7362637362637363, |
| "grad_norm": 0.82421875, |
| "learning_rate": 8.216597178651523e-05, |
| "loss": 0.0473, |
| "mean_token_accuracy": 0.9826258182525635, |
| "num_tokens": 1874733.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 3.7545787545787546, |
| "grad_norm": 1.078125, |
| "learning_rate": 8.212350676710807e-05, |
| "loss": 0.0746, |
| "mean_token_accuracy": 0.9718662738800049, |
| "num_tokens": 1884155.0, |
| "step": 1025 |
| }, |
| { |
| "epoch": 3.772893772893773, |
| "grad_norm": 0.58984375, |
| "learning_rate": 8.208077797990322e-05, |
| "loss": 0.0739, |
| "mean_token_accuracy": 0.9724728226661682, |
| "num_tokens": 1892962.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 3.791208791208791, |
| "grad_norm": 0.5234375, |
| "learning_rate": 8.203778581675761e-05, |
| "loss": 0.0665, |
| "mean_token_accuracy": 0.9769334554672241, |
| "num_tokens": 1902461.0, |
| "step": 1035 |
| }, |
| { |
| "epoch": 3.8095238095238093, |
| "grad_norm": 0.6640625, |
| "learning_rate": 8.199453067194351e-05, |
| "loss": 0.0952, |
| "mean_token_accuracy": 0.9705726265907287, |
| "num_tokens": 1911844.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 3.8278388278388276, |
| "grad_norm": 0.82421875, |
| "learning_rate": 8.195101294214486e-05, |
| "loss": 0.0615, |
| "mean_token_accuracy": 0.9792343139648437, |
| "num_tokens": 1921110.0, |
| "step": 1045 |
| }, |
| { |
| "epoch": 3.8461538461538463, |
| "grad_norm": 0.546875, |
| "learning_rate": 8.190723302645387e-05, |
| "loss": 0.0671, |
| "mean_token_accuracy": 0.9760551929473877, |
| "num_tokens": 1930834.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 3.8644688644688645, |
| "grad_norm": 0.70703125, |
| "learning_rate": 8.186319132636706e-05, |
| "loss": 0.0888, |
| "mean_token_accuracy": 0.9672855019569397, |
| "num_tokens": 1939564.0, |
| "step": 1055 |
| }, |
| { |
| "epoch": 3.8827838827838828, |
| "grad_norm": 0.125, |
| "learning_rate": 8.18188882457818e-05, |
| "loss": 0.0718, |
| "mean_token_accuracy": 0.9734614849090576, |
| "num_tokens": 1948652.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 3.901098901098901, |
| "grad_norm": 0.52734375, |
| "learning_rate": 8.177432419099249e-05, |
| "loss": 0.0496, |
| "mean_token_accuracy": 0.9841477632522583, |
| "num_tokens": 1958891.0, |
| "step": 1065 |
| }, |
| { |
| "epoch": 3.9194139194139193, |
| "grad_norm": 0.482421875, |
| "learning_rate": 8.172949957068689e-05, |
| "loss": 0.0773, |
| "mean_token_accuracy": 0.9700749635696411, |
| "num_tokens": 1968507.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 3.937728937728938, |
| "grad_norm": 0.90234375, |
| "learning_rate": 8.168441479594237e-05, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9697647333145142, |
| "num_tokens": 1977929.0, |
| "step": 1075 |
| }, |
| { |
| "epoch": 3.956043956043956, |
| "grad_norm": 0.63671875, |
| "learning_rate": 8.163907028022208e-05, |
| "loss": 0.0534, |
| "mean_token_accuracy": 0.9822108268737793, |
| "num_tokens": 1987374.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 3.9743589743589745, |
| "grad_norm": 0.490234375, |
| "learning_rate": 8.159346643937122e-05, |
| "loss": 0.0895, |
| "mean_token_accuracy": 0.9678827285766601, |
| "num_tokens": 1995512.0, |
| "step": 1085 |
| }, |
| { |
| "epoch": 3.9926739926739927, |
| "grad_norm": 1.046875, |
| "learning_rate": 8.154760369161322e-05, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.9745811820030212, |
| "num_tokens": 2005014.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 4.010989010989011, |
| "grad_norm": 5.25, |
| "learning_rate": 8.150148245754586e-05, |
| "loss": 0.0784, |
| "mean_token_accuracy": 0.9694916486740113, |
| "num_tokens": 2013958.0, |
| "step": 1095 |
| }, |
| { |
| "epoch": 4.029304029304029, |
| "grad_norm": 0.482421875, |
| "learning_rate": 8.145510316013748e-05, |
| "loss": 0.0379, |
| "mean_token_accuracy": 0.9864168405532837, |
| "num_tokens": 2023416.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 4.0476190476190474, |
| "grad_norm": 0.11328125, |
| "learning_rate": 8.140846622472304e-05, |
| "loss": 0.0336, |
| "mean_token_accuracy": 0.9863126277923584, |
| "num_tokens": 2032892.0, |
| "step": 1105 |
| }, |
| { |
| "epoch": 4.065934065934066, |
| "grad_norm": 0.1259765625, |
| "learning_rate": 8.13615720790003e-05, |
| "loss": 0.0572, |
| "mean_token_accuracy": 0.9810018301010132, |
| "num_tokens": 2042824.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 4.084249084249084, |
| "grad_norm": 1.078125, |
| "learning_rate": 8.131442115302573e-05, |
| "loss": 0.0579, |
| "mean_token_accuracy": 0.9789334416389466, |
| "num_tokens": 2052256.0, |
| "step": 1115 |
| }, |
| { |
| "epoch": 4.102564102564102, |
| "grad_norm": 2.3125, |
| "learning_rate": 8.12670138792108e-05, |
| "loss": 0.0372, |
| "mean_token_accuracy": 0.9866159200668335, |
| "num_tokens": 2061743.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 4.1208791208791204, |
| "grad_norm": 0.3828125, |
| "learning_rate": 8.121935069231779e-05, |
| "loss": 0.0484, |
| "mean_token_accuracy": 0.9815837264060974, |
| "num_tokens": 2069937.0, |
| "step": 1125 |
| }, |
| { |
| "epoch": 4.13919413919414, |
| "grad_norm": 1.015625, |
| "learning_rate": 8.1171432029456e-05, |
| "loss": 0.0687, |
| "mean_token_accuracy": 0.9753403425216675, |
| "num_tokens": 2079121.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 4.157509157509158, |
| "grad_norm": 0.77734375, |
| "learning_rate": 8.11232583300776e-05, |
| "loss": 0.06, |
| "mean_token_accuracy": 0.9811612010002136, |
| "num_tokens": 2088105.0, |
| "step": 1135 |
| }, |
| { |
| "epoch": 4.175824175824176, |
| "grad_norm": 1.5234375, |
| "learning_rate": 8.107483003597365e-05, |
| "loss": 0.0537, |
| "mean_token_accuracy": 0.9808408856391907, |
| "num_tokens": 2096831.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 4.194139194139194, |
| "grad_norm": 0.1767578125, |
| "learning_rate": 8.102614759127002e-05, |
| "loss": 0.0222, |
| "mean_token_accuracy": 0.9910707116127014, |
| "num_tokens": 2106634.0, |
| "step": 1145 |
| }, |
| { |
| "epoch": 4.212454212454213, |
| "grad_norm": 0.37109375, |
| "learning_rate": 8.097721144242338e-05, |
| "loss": 0.0617, |
| "mean_token_accuracy": 0.9770854115486145, |
| "num_tokens": 2116094.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 4.230769230769231, |
| "grad_norm": 0.1806640625, |
| "learning_rate": 8.092802203821708e-05, |
| "loss": 0.0256, |
| "mean_token_accuracy": 0.9892764806747436, |
| "num_tokens": 2125097.0, |
| "step": 1155 |
| }, |
| { |
| "epoch": 4.249084249084249, |
| "grad_norm": 0.130859375, |
| "learning_rate": 8.087857982975698e-05, |
| "loss": 0.0323, |
| "mean_token_accuracy": 0.9884976744651794, |
| "num_tokens": 2134122.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 4.267399267399267, |
| "grad_norm": 1.625, |
| "learning_rate": 8.082888527046738e-05, |
| "loss": 0.0549, |
| "mean_token_accuracy": 0.9823671579360962, |
| "num_tokens": 2142806.0, |
| "step": 1165 |
| }, |
| { |
| "epoch": 4.285714285714286, |
| "grad_norm": 0.703125, |
| "learning_rate": 8.077893881608685e-05, |
| "loss": 0.0772, |
| "mean_token_accuracy": 0.9735370635986328, |
| "num_tokens": 2151281.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 4.304029304029304, |
| "grad_norm": 0.97265625, |
| "learning_rate": 8.072874092466398e-05, |
| "loss": 0.065, |
| "mean_token_accuracy": 0.9773920774459839, |
| "num_tokens": 2160764.0, |
| "step": 1175 |
| }, |
| { |
| "epoch": 4.322344322344322, |
| "grad_norm": 0.69140625, |
| "learning_rate": 8.067829205655333e-05, |
| "loss": 0.0588, |
| "mean_token_accuracy": 0.9791547775268554, |
| "num_tokens": 2169484.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 4.34065934065934, |
| "grad_norm": 0.64453125, |
| "learning_rate": 8.062759267441103e-05, |
| "loss": 0.0444, |
| "mean_token_accuracy": 0.9826448798179627, |
| "num_tokens": 2178294.0, |
| "step": 1185 |
| }, |
| { |
| "epoch": 4.358974358974359, |
| "grad_norm": 0.09423828125, |
| "learning_rate": 8.057664324319065e-05, |
| "loss": 0.0673, |
| "mean_token_accuracy": 0.9759621739387512, |
| "num_tokens": 2187496.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 4.377289377289378, |
| "grad_norm": 1.2421875, |
| "learning_rate": 8.052544423013895e-05, |
| "loss": 0.0366, |
| "mean_token_accuracy": 0.9850521922111511, |
| "num_tokens": 2196704.0, |
| "step": 1195 |
| }, |
| { |
| "epoch": 4.395604395604396, |
| "grad_norm": 0.73828125, |
| "learning_rate": 8.047399610479149e-05, |
| "loss": 0.0487, |
| "mean_token_accuracy": 0.9822589874267578, |
| "num_tokens": 2205968.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 4.413919413919414, |
| "grad_norm": 0.9140625, |
| "learning_rate": 8.042229933896844e-05, |
| "loss": 0.0742, |
| "mean_token_accuracy": 0.9758718729019165, |
| "num_tokens": 2215463.0, |
| "step": 1205 |
| }, |
| { |
| "epoch": 4.4322344322344325, |
| "grad_norm": 0.5703125, |
| "learning_rate": 8.037035440677016e-05, |
| "loss": 0.0424, |
| "mean_token_accuracy": 0.9832551598548889, |
| "num_tokens": 2224409.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 4.450549450549451, |
| "grad_norm": 0.396484375, |
| "learning_rate": 8.03181617845729e-05, |
| "loss": 0.0614, |
| "mean_token_accuracy": 0.9775595307350159, |
| "num_tokens": 2233382.0, |
| "step": 1215 |
| }, |
| { |
| "epoch": 4.468864468864469, |
| "grad_norm": 0.671875, |
| "learning_rate": 8.026572195102447e-05, |
| "loss": 0.0516, |
| "mean_token_accuracy": 0.982862401008606, |
| "num_tokens": 2242990.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 4.487179487179487, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 8.021303538703972e-05, |
| "loss": 0.0649, |
| "mean_token_accuracy": 0.9760475039482117, |
| "num_tokens": 2251597.0, |
| "step": 1225 |
| }, |
| { |
| "epoch": 4.5054945054945055, |
| "grad_norm": 0.384765625, |
| "learning_rate": 8.01601025757963e-05, |
| "loss": 0.0872, |
| "mean_token_accuracy": 0.9712528467178345, |
| "num_tokens": 2260017.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 4.523809523809524, |
| "grad_norm": 0.1015625, |
| "learning_rate": 8.010692400273009e-05, |
| "loss": 0.0514, |
| "mean_token_accuracy": 0.9804089546203614, |
| "num_tokens": 2270253.0, |
| "step": 1235 |
| }, |
| { |
| "epoch": 4.542124542124542, |
| "grad_norm": 0.53515625, |
| "learning_rate": 8.00535001555308e-05, |
| "loss": 0.0392, |
| "mean_token_accuracy": 0.9841665983200073, |
| "num_tokens": 2279714.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 4.56043956043956, |
| "grad_norm": 0.68359375, |
| "learning_rate": 7.999983152413753e-05, |
| "loss": 0.0686, |
| "mean_token_accuracy": 0.9762724280357361, |
| "num_tokens": 2289630.0, |
| "step": 1245 |
| }, |
| { |
| "epoch": 4.5787545787545785, |
| "grad_norm": 1.515625, |
| "learning_rate": 7.994591860073424e-05, |
| "loss": 0.0641, |
| "mean_token_accuracy": 0.9772836685180664, |
| "num_tokens": 2297980.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 4.597069597069597, |
| "grad_norm": 3.890625, |
| "learning_rate": 7.989176187974522e-05, |
| "loss": 0.0768, |
| "mean_token_accuracy": 0.9789605379104614, |
| "num_tokens": 2307539.0, |
| "step": 1255 |
| }, |
| { |
| "epoch": 4.615384615384615, |
| "grad_norm": 22.25, |
| "learning_rate": 7.983736185783057e-05, |
| "loss": 0.1298, |
| "mean_token_accuracy": 0.9655901789665222, |
| "num_tokens": 2316208.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 4.633699633699633, |
| "grad_norm": 19.875, |
| "learning_rate": 7.97827190338817e-05, |
| "loss": 0.068, |
| "mean_token_accuracy": 0.977760374546051, |
| "num_tokens": 2325231.0, |
| "step": 1265 |
| }, |
| { |
| "epoch": 4.652014652014652, |
| "grad_norm": 0.404296875, |
| "learning_rate": 7.972783390901666e-05, |
| "loss": 0.0608, |
| "mean_token_accuracy": 0.9790961384773255, |
| "num_tokens": 2334422.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 4.670329670329671, |
| "grad_norm": 17.25, |
| "learning_rate": 7.967270698657563e-05, |
| "loss": 0.077, |
| "mean_token_accuracy": 0.9756144642829895, |
| "num_tokens": 2343150.0, |
| "step": 1275 |
| }, |
| { |
| "epoch": 4.688644688644689, |
| "grad_norm": 0.578125, |
| "learning_rate": 7.96173387721162e-05, |
| "loss": 0.0358, |
| "mean_token_accuracy": 0.9854479551315307, |
| "num_tokens": 2352172.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 4.706959706959707, |
| "grad_norm": 0.53125, |
| "learning_rate": 7.95617297734089e-05, |
| "loss": 0.0488, |
| "mean_token_accuracy": 0.9814175963401794, |
| "num_tokens": 2362355.0, |
| "step": 1285 |
| }, |
| { |
| "epoch": 4.725274725274725, |
| "grad_norm": 0.54296875, |
| "learning_rate": 7.950588050043236e-05, |
| "loss": 0.0309, |
| "mean_token_accuracy": 0.9877290248870849, |
| "num_tokens": 2372065.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 4.743589743589744, |
| "grad_norm": 0.37109375, |
| "learning_rate": 7.944979146536874e-05, |
| "loss": 0.0425, |
| "mean_token_accuracy": 0.9832926988601685, |
| "num_tokens": 2381430.0, |
| "step": 1295 |
| }, |
| { |
| "epoch": 4.761904761904762, |
| "grad_norm": 0.55859375, |
| "learning_rate": 7.939346318259904e-05, |
| "loss": 0.0294, |
| "mean_token_accuracy": 0.9892051696777344, |
| "num_tokens": 2390948.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 4.78021978021978, |
| "grad_norm": 0.734375, |
| "learning_rate": 7.933689616869828e-05, |
| "loss": 0.0499, |
| "mean_token_accuracy": 0.9788974165916443, |
| "num_tokens": 2399937.0, |
| "step": 1305 |
| }, |
| { |
| "epoch": 4.798534798534798, |
| "grad_norm": 1.2265625, |
| "learning_rate": 7.92800909424309e-05, |
| "loss": 0.059, |
| "mean_token_accuracy": 0.9804568767547608, |
| "num_tokens": 2408310.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 4.816849816849817, |
| "grad_norm": 0.53515625, |
| "learning_rate": 7.922304802474593e-05, |
| "loss": 0.0689, |
| "mean_token_accuracy": 0.9761590838432312, |
| "num_tokens": 2417776.0, |
| "step": 1315 |
| }, |
| { |
| "epoch": 4.835164835164835, |
| "grad_norm": 0.1728515625, |
| "learning_rate": 7.916576793877218e-05, |
| "loss": 0.096, |
| "mean_token_accuracy": 0.9676541090011597, |
| "num_tokens": 2427560.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 4.853479853479853, |
| "grad_norm": 0.142578125, |
| "learning_rate": 7.91082512098135e-05, |
| "loss": 0.0481, |
| "mean_token_accuracy": 0.9823224782943726, |
| "num_tokens": 2437579.0, |
| "step": 1325 |
| }, |
| { |
| "epoch": 4.871794871794872, |
| "grad_norm": 0.123046875, |
| "learning_rate": 7.905049836534396e-05, |
| "loss": 0.0371, |
| "mean_token_accuracy": 0.9855931043624878, |
| "num_tokens": 2446578.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 4.8901098901098905, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.8992509935003e-05, |
| "loss": 0.0589, |
| "mean_token_accuracy": 0.9781901359558105, |
| "num_tokens": 2455332.0, |
| "step": 1335 |
| }, |
| { |
| "epoch": 4.908424908424909, |
| "grad_norm": 0.25, |
| "learning_rate": 7.893428645059053e-05, |
| "loss": 0.0478, |
| "mean_token_accuracy": 0.9811420202255249, |
| "num_tokens": 2464469.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 4.926739926739927, |
| "grad_norm": 0.55859375, |
| "learning_rate": 7.887582844606212e-05, |
| "loss": 0.0416, |
| "mean_token_accuracy": 0.9845540761947632, |
| "num_tokens": 2474332.0, |
| "step": 1345 |
| }, |
| { |
| "epoch": 4.945054945054945, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 7.881713645752409e-05, |
| "loss": 0.0567, |
| "mean_token_accuracy": 0.977370023727417, |
| "num_tokens": 2483586.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 4.9633699633699635, |
| "grad_norm": 0.57421875, |
| "learning_rate": 7.875821102322853e-05, |
| "loss": 0.0489, |
| "mean_token_accuracy": 0.9813120841979981, |
| "num_tokens": 2492757.0, |
| "step": 1355 |
| }, |
| { |
| "epoch": 4.981684981684982, |
| "grad_norm": 0.158203125, |
| "learning_rate": 7.869905268356847e-05, |
| "loss": 0.0683, |
| "mean_token_accuracy": 0.9747755646705627, |
| "num_tokens": 2501516.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.41015625, |
| "learning_rate": 7.863966198107285e-05, |
| "loss": 0.0371, |
| "mean_token_accuracy": 0.9825921297073364, |
| "num_tokens": 2509780.0, |
| "step": 1365 |
| }, |
| { |
| "epoch": 5.018315018315018, |
| "grad_norm": 0.1923828125, |
| "learning_rate": 7.858003946040152e-05, |
| "loss": 0.0349, |
| "mean_token_accuracy": 0.9879397273063659, |
| "num_tokens": 2518456.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 5.0366300366300365, |
| "grad_norm": 0.38671875, |
| "learning_rate": 7.852018566834035e-05, |
| "loss": 0.0623, |
| "mean_token_accuracy": 0.9814428091049194, |
| "num_tokens": 2526696.0, |
| "step": 1375 |
| }, |
| { |
| "epoch": 5.054945054945055, |
| "grad_norm": 15.5625, |
| "learning_rate": 7.846010115379609e-05, |
| "loss": 0.0851, |
| "mean_token_accuracy": 0.9754502534866333, |
| "num_tokens": 2535458.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 5.073260073260073, |
| "grad_norm": 0.1337890625, |
| "learning_rate": 7.839978646779148e-05, |
| "loss": 0.0302, |
| "mean_token_accuracy": 0.9864336133003235, |
| "num_tokens": 2544932.0, |
| "step": 1385 |
| }, |
| { |
| "epoch": 5.091575091575091, |
| "grad_norm": 0.474609375, |
| "learning_rate": 7.833924216346e-05, |
| "loss": 0.044, |
| "mean_token_accuracy": 0.9824022054672241, |
| "num_tokens": 2553707.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 5.1098901098901095, |
| "grad_norm": 0.62890625, |
| "learning_rate": 7.827846879604103e-05, |
| "loss": 0.0476, |
| "mean_token_accuracy": 0.9825940251350402, |
| "num_tokens": 2562559.0, |
| "step": 1395 |
| }, |
| { |
| "epoch": 5.128205128205128, |
| "grad_norm": 0.3046875, |
| "learning_rate": 7.821746692287458e-05, |
| "loss": 0.0301, |
| "mean_token_accuracy": 0.9876471161842346, |
| "num_tokens": 2571694.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 5.146520146520147, |
| "grad_norm": 0.2236328125, |
| "learning_rate": 7.815623710339623e-05, |
| "loss": 0.041, |
| "mean_token_accuracy": 0.9862527489662171, |
| "num_tokens": 2580208.0, |
| "step": 1405 |
| }, |
| { |
| "epoch": 5.164835164835165, |
| "grad_norm": 0.96484375, |
| "learning_rate": 7.809477989913203e-05, |
| "loss": 0.0618, |
| "mean_token_accuracy": 0.9779723167419434, |
| "num_tokens": 2588156.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 5.183150183150183, |
| "grad_norm": 0.20703125, |
| "learning_rate": 7.803309587369332e-05, |
| "loss": 0.0303, |
| "mean_token_accuracy": 0.9863034844398498, |
| "num_tokens": 2596974.0, |
| "step": 1415 |
| }, |
| { |
| "epoch": 5.201465201465202, |
| "grad_norm": 0.65234375, |
| "learning_rate": 7.79711855927716e-05, |
| "loss": 0.0248, |
| "mean_token_accuracy": 0.9895213365554809, |
| "num_tokens": 2605921.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 5.21978021978022, |
| "grad_norm": 0.6171875, |
| "learning_rate": 7.790904962413324e-05, |
| "loss": 0.0588, |
| "mean_token_accuracy": 0.9793162941932678, |
| "num_tokens": 2615551.0, |
| "step": 1425 |
| }, |
| { |
| "epoch": 5.238095238095238, |
| "grad_norm": 0.61328125, |
| "learning_rate": 7.784668853761446e-05, |
| "loss": 0.0344, |
| "mean_token_accuracy": 0.987682557106018, |
| "num_tokens": 2624766.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 5.256410256410256, |
| "grad_norm": 0.138671875, |
| "learning_rate": 7.778410290511585e-05, |
| "loss": 0.0183, |
| "mean_token_accuracy": 0.9921578407287598, |
| "num_tokens": 2634654.0, |
| "step": 1435 |
| }, |
| { |
| "epoch": 5.274725274725275, |
| "grad_norm": 0.16796875, |
| "learning_rate": 7.772129330059739e-05, |
| "loss": 0.025, |
| "mean_token_accuracy": 0.991031551361084, |
| "num_tokens": 2644515.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 5.293040293040293, |
| "grad_norm": 0.126953125, |
| "learning_rate": 7.7658260300073e-05, |
| "loss": 0.0384, |
| "mean_token_accuracy": 0.9862215161323548, |
| "num_tokens": 2653798.0, |
| "step": 1445 |
| }, |
| { |
| "epoch": 5.311355311355311, |
| "grad_norm": 5.0, |
| "learning_rate": 7.759500448160529e-05, |
| "loss": 0.0418, |
| "mean_token_accuracy": 0.9845625400543213, |
| "num_tokens": 2662880.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 5.329670329670329, |
| "grad_norm": 12.0625, |
| "learning_rate": 7.753152642530036e-05, |
| "loss": 0.0456, |
| "mean_token_accuracy": 0.9851066589355468, |
| "num_tokens": 2671685.0, |
| "step": 1455 |
| }, |
| { |
| "epoch": 5.347985347985348, |
| "grad_norm": 0.953125, |
| "learning_rate": 7.746782671330237e-05, |
| "loss": 0.0476, |
| "mean_token_accuracy": 0.9837478876113892, |
| "num_tokens": 2680763.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 5.366300366300366, |
| "grad_norm": 0.1611328125, |
| "learning_rate": 7.740390592978824e-05, |
| "loss": 0.0462, |
| "mean_token_accuracy": 0.9852417230606079, |
| "num_tokens": 2690295.0, |
| "step": 1465 |
| }, |
| { |
| "epoch": 5.384615384615385, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.733976466096226e-05, |
| "loss": 0.0513, |
| "mean_token_accuracy": 0.9832407712936402, |
| "num_tokens": 2699104.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 5.402930402930403, |
| "grad_norm": 0.3984375, |
| "learning_rate": 7.727540349505082e-05, |
| "loss": 0.0598, |
| "mean_token_accuracy": 0.9803775191307068, |
| "num_tokens": 2708621.0, |
| "step": 1475 |
| }, |
| { |
| "epoch": 5.4212454212454215, |
| "grad_norm": 0.240234375, |
| "learning_rate": 7.721082302229688e-05, |
| "loss": 0.042, |
| "mean_token_accuracy": 0.9849236011505127, |
| "num_tokens": 2718857.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 5.43956043956044, |
| "grad_norm": 0.5078125, |
| "learning_rate": 7.714602383495464e-05, |
| "loss": 0.0365, |
| "mean_token_accuracy": 0.986870002746582, |
| "num_tokens": 2728231.0, |
| "step": 1485 |
| }, |
| { |
| "epoch": 5.457875457875458, |
| "grad_norm": 0.578125, |
| "learning_rate": 7.708100652728407e-05, |
| "loss": 0.076, |
| "mean_token_accuracy": 0.9744701385498047, |
| "num_tokens": 2737360.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 5.476190476190476, |
| "grad_norm": 1.421875, |
| "learning_rate": 7.70157716955455e-05, |
| "loss": 0.0334, |
| "mean_token_accuracy": 0.9870843052864074, |
| "num_tokens": 2746745.0, |
| "step": 1495 |
| }, |
| { |
| "epoch": 5.4945054945054945, |
| "grad_norm": 3.28125, |
| "learning_rate": 7.695031993799411e-05, |
| "loss": 0.0462, |
| "mean_token_accuracy": 0.982709014415741, |
| "num_tokens": 2756089.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 5.512820512820513, |
| "grad_norm": 0.2294921875, |
| "learning_rate": 7.688465185487447e-05, |
| "loss": 0.0319, |
| "mean_token_accuracy": 0.9880306243896484, |
| "num_tokens": 2766072.0, |
| "step": 1505 |
| }, |
| { |
| "epoch": 5.531135531135531, |
| "grad_norm": 0.390625, |
| "learning_rate": 7.681876804841504e-05, |
| "loss": 0.0392, |
| "mean_token_accuracy": 0.9848615050315856, |
| "num_tokens": 2775370.0, |
| "step": 1510 |
| }, |
| { |
| "epoch": 5.549450549450549, |
| "grad_norm": 0.8125, |
| "learning_rate": 7.675266912282259e-05, |
| "loss": 0.0376, |
| "mean_token_accuracy": 0.9833124279975891, |
| "num_tokens": 2784606.0, |
| "step": 1515 |
| }, |
| { |
| "epoch": 5.5677655677655675, |
| "grad_norm": 0.44140625, |
| "learning_rate": 7.668635568427677e-05, |
| "loss": 0.0412, |
| "mean_token_accuracy": 0.9833675742149353, |
| "num_tokens": 2794109.0, |
| "step": 1520 |
| }, |
| { |
| "epoch": 5.586080586080586, |
| "grad_norm": 8.375, |
| "learning_rate": 7.661982834092442e-05, |
| "loss": 0.0441, |
| "mean_token_accuracy": 0.9847989916801453, |
| "num_tokens": 2804339.0, |
| "step": 1525 |
| }, |
| { |
| "epoch": 5.604395604395604, |
| "grad_norm": 1.4609375, |
| "learning_rate": 7.65530877028741e-05, |
| "loss": 0.0445, |
| "mean_token_accuracy": 0.9856428861618042, |
| "num_tokens": 2813328.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 5.622710622710622, |
| "grad_norm": 0.93359375, |
| "learning_rate": 7.648613438219043e-05, |
| "loss": 0.0599, |
| "mean_token_accuracy": 0.9801060795783997, |
| "num_tokens": 2822846.0, |
| "step": 1535 |
| }, |
| { |
| "epoch": 5.641025641025641, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.64189689928885e-05, |
| "loss": 0.0703, |
| "mean_token_accuracy": 0.9766062736511231, |
| "num_tokens": 2832793.0, |
| "step": 1540 |
| }, |
| { |
| "epoch": 5.65934065934066, |
| "grad_norm": 4.40625, |
| "learning_rate": 7.635159215092825e-05, |
| "loss": 0.0482, |
| "mean_token_accuracy": 0.984825873374939, |
| "num_tokens": 2842566.0, |
| "step": 1545 |
| }, |
| { |
| "epoch": 5.677655677655678, |
| "grad_norm": 0.23828125, |
| "learning_rate": 7.62840044742088e-05, |
| "loss": 0.028, |
| "mean_token_accuracy": 0.9913866996765137, |
| "num_tokens": 2852366.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 5.695970695970696, |
| "grad_norm": 0.4921875, |
| "learning_rate": 7.621620658256279e-05, |
| "loss": 0.0232, |
| "mean_token_accuracy": 0.9910756468772888, |
| "num_tokens": 2861611.0, |
| "step": 1555 |
| }, |
| { |
| "epoch": 5.714285714285714, |
| "grad_norm": 0.55078125, |
| "learning_rate": 7.61481990977507e-05, |
| "loss": 0.0417, |
| "mean_token_accuracy": 0.9838123202323914, |
| "num_tokens": 2870193.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 5.732600732600733, |
| "grad_norm": 0.26953125, |
| "learning_rate": 7.607998264345515e-05, |
| "loss": 0.0427, |
| "mean_token_accuracy": 0.9834349632263184, |
| "num_tokens": 2879533.0, |
| "step": 1565 |
| }, |
| { |
| "epoch": 5.750915750915751, |
| "grad_norm": 0.09814453125, |
| "learning_rate": 7.601155784527516e-05, |
| "loss": 0.0395, |
| "mean_token_accuracy": 0.9865917205810547, |
| "num_tokens": 2888588.0, |
| "step": 1570 |
| }, |
| { |
| "epoch": 5.769230769230769, |
| "grad_norm": 0.63671875, |
| "learning_rate": 7.594292533072048e-05, |
| "loss": 0.0359, |
| "mean_token_accuracy": 0.9868963241577149, |
| "num_tokens": 2898690.0, |
| "step": 1575 |
| }, |
| { |
| "epoch": 5.787545787545787, |
| "grad_norm": 0.205078125, |
| "learning_rate": 7.587408572920568e-05, |
| "loss": 0.0284, |
| "mean_token_accuracy": 0.9881658792495728, |
| "num_tokens": 2908343.0, |
| "step": 1580 |
| }, |
| { |
| "epoch": 5.805860805860806, |
| "grad_norm": 0.3671875, |
| "learning_rate": 7.58050396720446e-05, |
| "loss": 0.0257, |
| "mean_token_accuracy": 0.990417754650116, |
| "num_tokens": 2918583.0, |
| "step": 1585 |
| }, |
| { |
| "epoch": 5.824175824175824, |
| "grad_norm": 0.470703125, |
| "learning_rate": 7.573578779244438e-05, |
| "loss": 0.0461, |
| "mean_token_accuracy": 0.9842254996299744, |
| "num_tokens": 2927267.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 5.842490842490842, |
| "grad_norm": 0.462890625, |
| "learning_rate": 7.566633072549971e-05, |
| "loss": 0.0422, |
| "mean_token_accuracy": 0.9848962306976319, |
| "num_tokens": 2937053.0, |
| "step": 1595 |
| }, |
| { |
| "epoch": 5.860805860805861, |
| "grad_norm": 0.388671875, |
| "learning_rate": 7.559666910818704e-05, |
| "loss": 0.0444, |
| "mean_token_accuracy": 0.9840786457061768, |
| "num_tokens": 2945426.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 5.8791208791208796, |
| "grad_norm": 0.431640625, |
| "learning_rate": 7.552680357935869e-05, |
| "loss": 0.04, |
| "mean_token_accuracy": 0.9841191053390503, |
| "num_tokens": 2953945.0, |
| "step": 1605 |
| }, |
| { |
| "epoch": 5.897435897435898, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 7.5456734779737e-05, |
| "loss": 0.0426, |
| "mean_token_accuracy": 0.9842138767242432, |
| "num_tokens": 2962787.0, |
| "step": 1610 |
| }, |
| { |
| "epoch": 5.915750915750916, |
| "grad_norm": 0.671875, |
| "learning_rate": 7.53864633519085e-05, |
| "loss": 0.045, |
| "mean_token_accuracy": 0.9817765593528748, |
| "num_tokens": 2971473.0, |
| "step": 1615 |
| }, |
| { |
| "epoch": 5.934065934065934, |
| "grad_norm": 0.75390625, |
| "learning_rate": 7.531598994031796e-05, |
| "loss": 0.0691, |
| "mean_token_accuracy": 0.9776899933815002, |
| "num_tokens": 2980183.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 5.9523809523809526, |
| "grad_norm": 0.474609375, |
| "learning_rate": 7.524531519126248e-05, |
| "loss": 0.0509, |
| "mean_token_accuracy": 0.9816537737846375, |
| "num_tokens": 2989666.0, |
| "step": 1625 |
| }, |
| { |
| "epoch": 5.970695970695971, |
| "grad_norm": 0.984375, |
| "learning_rate": 7.517443975288563e-05, |
| "loss": 0.0569, |
| "mean_token_accuracy": 0.9782140016555786, |
| "num_tokens": 2998079.0, |
| "step": 1630 |
| }, |
| { |
| "epoch": 5.989010989010989, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.510336427517143e-05, |
| "loss": 0.0628, |
| "mean_token_accuracy": 0.9775285959243775, |
| "num_tokens": 3007389.0, |
| "step": 1635 |
| }, |
| { |
| "epoch": 6.007326007326007, |
| "grad_norm": 0.3203125, |
| "learning_rate": 7.503208940993842e-05, |
| "loss": 0.0693, |
| "mean_token_accuracy": 0.9747921347618103, |
| "num_tokens": 3015549.0, |
| "step": 1640 |
| }, |
| { |
| "epoch": 6.0256410256410255, |
| "grad_norm": 0.69140625, |
| "learning_rate": 7.49606158108337e-05, |
| "loss": 0.0209, |
| "mean_token_accuracy": 0.9922340869903564, |
| "num_tokens": 3025486.0, |
| "step": 1645 |
| }, |
| { |
| "epoch": 6.043956043956044, |
| "grad_norm": 0.095703125, |
| "learning_rate": 7.488894413332689e-05, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9714855909347534, |
| "num_tokens": 3034529.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 6.062271062271062, |
| "grad_norm": 26.0, |
| "learning_rate": 7.481707503470417e-05, |
| "loss": 0.0312, |
| "mean_token_accuracy": 0.9905990958213806, |
| "num_tokens": 3043834.0, |
| "step": 1655 |
| }, |
| { |
| "epoch": 6.08058608058608, |
| "grad_norm": 1.6875, |
| "learning_rate": 7.474500917406223e-05, |
| "loss": 0.0467, |
| "mean_token_accuracy": 0.9854714870452881, |
| "num_tokens": 3053424.0, |
| "step": 1660 |
| }, |
| { |
| "epoch": 6.0989010989010985, |
| "grad_norm": 0.1240234375, |
| "learning_rate": 7.467274721230221e-05, |
| "loss": 0.0179, |
| "mean_token_accuracy": 0.9933658838272095, |
| "num_tokens": 3063201.0, |
| "step": 1665 |
| }, |
| { |
| "epoch": 6.117216117216117, |
| "grad_norm": 0.06640625, |
| "learning_rate": 7.460028981212365e-05, |
| "loss": 0.0242, |
| "mean_token_accuracy": 0.9913597822189331, |
| "num_tokens": 3072991.0, |
| "step": 1670 |
| }, |
| { |
| "epoch": 6.135531135531136, |
| "grad_norm": 0.921875, |
| "learning_rate": 7.452763763801842e-05, |
| "loss": 0.032, |
| "mean_token_accuracy": 0.9883728504180909, |
| "num_tokens": 3082543.0, |
| "step": 1675 |
| }, |
| { |
| "epoch": 6.153846153846154, |
| "grad_norm": 0.09619140625, |
| "learning_rate": 7.445479135626463e-05, |
| "loss": 0.0306, |
| "mean_token_accuracy": 0.9895648956298828, |
| "num_tokens": 3091715.0, |
| "step": 1680 |
| }, |
| { |
| "epoch": 6.172161172161172, |
| "grad_norm": 0.345703125, |
| "learning_rate": 7.43817516349205e-05, |
| "loss": 0.0357, |
| "mean_token_accuracy": 0.9860040664672851, |
| "num_tokens": 3100483.0, |
| "step": 1685 |
| }, |
| { |
| "epoch": 6.190476190476191, |
| "grad_norm": 0.255859375, |
| "learning_rate": 7.430851914381827e-05, |
| "loss": 0.0254, |
| "mean_token_accuracy": 0.9893843650817871, |
| "num_tokens": 3109988.0, |
| "step": 1690 |
| }, |
| { |
| "epoch": 6.208791208791209, |
| "grad_norm": 0.384765625, |
| "learning_rate": 7.423509455455799e-05, |
| "loss": 0.0415, |
| "mean_token_accuracy": 0.9854371070861816, |
| "num_tokens": 3119503.0, |
| "step": 1695 |
| }, |
| { |
| "epoch": 6.227106227106227, |
| "grad_norm": 0.095703125, |
| "learning_rate": 7.416147854050143e-05, |
| "loss": 0.0345, |
| "mean_token_accuracy": 0.9878103852272033, |
| "num_tokens": 3128641.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 6.245421245421245, |
| "grad_norm": 0.447265625, |
| "learning_rate": 7.408767177676586e-05, |
| "loss": 0.0222, |
| "mean_token_accuracy": 0.9916223526000977, |
| "num_tokens": 3137766.0, |
| "step": 1705 |
| }, |
| { |
| "epoch": 6.263736263736264, |
| "grad_norm": 1.140625, |
| "learning_rate": 7.40136749402179e-05, |
| "loss": 0.0295, |
| "mean_token_accuracy": 0.9880544662475585, |
| "num_tokens": 3146326.0, |
| "step": 1710 |
| }, |
| { |
| "epoch": 6.282051282051282, |
| "grad_norm": 0.09765625, |
| "learning_rate": 7.393948870946729e-05, |
| "loss": 0.0219, |
| "mean_token_accuracy": 0.9904427409172059, |
| "num_tokens": 3155485.0, |
| "step": 1715 |
| }, |
| { |
| "epoch": 6.3003663003663, |
| "grad_norm": 0.62890625, |
| "learning_rate": 7.386511376486061e-05, |
| "loss": 0.0399, |
| "mean_token_accuracy": 0.984571659564972, |
| "num_tokens": 3164816.0, |
| "step": 1720 |
| }, |
| { |
| "epoch": 6.318681318681318, |
| "grad_norm": 0.318359375, |
| "learning_rate": 7.37905507884752e-05, |
| "loss": 0.0201, |
| "mean_token_accuracy": 0.992521858215332, |
| "num_tokens": 3174718.0, |
| "step": 1725 |
| }, |
| { |
| "epoch": 6.336996336996337, |
| "grad_norm": 0.578125, |
| "learning_rate": 7.371580046411267e-05, |
| "loss": 0.0357, |
| "mean_token_accuracy": 0.986468493938446, |
| "num_tokens": 3183409.0, |
| "step": 1730 |
| }, |
| { |
| "epoch": 6.355311355311355, |
| "grad_norm": 0.39453125, |
| "learning_rate": 7.364086347729285e-05, |
| "loss": 0.0232, |
| "mean_token_accuracy": 0.9905255913734436, |
| "num_tokens": 3192371.0, |
| "step": 1735 |
| }, |
| { |
| "epoch": 6.373626373626374, |
| "grad_norm": 0.333984375, |
| "learning_rate": 7.356574051524742e-05, |
| "loss": 0.0307, |
| "mean_token_accuracy": 0.9887727737426758, |
| "num_tokens": 3201677.0, |
| "step": 1740 |
| }, |
| { |
| "epoch": 6.391941391941392, |
| "grad_norm": 0.703125, |
| "learning_rate": 7.349043226691354e-05, |
| "loss": 0.0274, |
| "mean_token_accuracy": 0.9895096063613892, |
| "num_tokens": 3211218.0, |
| "step": 1745 |
| }, |
| { |
| "epoch": 6.410256410256411, |
| "grad_norm": 4.03125, |
| "learning_rate": 7.341493942292763e-05, |
| "loss": 0.0364, |
| "mean_token_accuracy": 0.9867009520530701, |
| "num_tokens": 3219808.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 6.428571428571429, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.333926267561898e-05, |
| "loss": 0.0389, |
| "mean_token_accuracy": 0.9854157328605652, |
| "num_tokens": 3229195.0, |
| "step": 1755 |
| }, |
| { |
| "epoch": 6.446886446886447, |
| "grad_norm": 0.61328125, |
| "learning_rate": 7.326340271900346e-05, |
| "loss": 0.0479, |
| "mean_token_accuracy": 0.9841797947883606, |
| "num_tokens": 3237885.0, |
| "step": 1760 |
| }, |
| { |
| "epoch": 6.465201465201465, |
| "grad_norm": 0.95703125, |
| "learning_rate": 7.318736024877707e-05, |
| "loss": 0.04, |
| "mean_token_accuracy": 0.9877835512161255, |
| "num_tokens": 3247579.0, |
| "step": 1765 |
| }, |
| { |
| "epoch": 6.483516483516484, |
| "grad_norm": 0.1904296875, |
| "learning_rate": 7.31111359623096e-05, |
| "loss": 0.0299, |
| "mean_token_accuracy": 0.9874194860458374, |
| "num_tokens": 3256966.0, |
| "step": 1770 |
| }, |
| { |
| "epoch": 6.501831501831502, |
| "grad_norm": 0.99609375, |
| "learning_rate": 7.30347305586383e-05, |
| "loss": 0.0335, |
| "mean_token_accuracy": 0.9878918051719665, |
| "num_tokens": 3266350.0, |
| "step": 1775 |
| }, |
| { |
| "epoch": 6.52014652014652, |
| "grad_norm": 0.57421875, |
| "learning_rate": 7.295814473846134e-05, |
| "loss": 0.0326, |
| "mean_token_accuracy": 0.9886090993881226, |
| "num_tokens": 3274954.0, |
| "step": 1780 |
| }, |
| { |
| "epoch": 6.538461538461538, |
| "grad_norm": 0.55078125, |
| "learning_rate": 7.288137920413148e-05, |
| "loss": 0.0426, |
| "mean_token_accuracy": 0.9847822427749634, |
| "num_tokens": 3282952.0, |
| "step": 1785 |
| }, |
| { |
| "epoch": 6.556776556776557, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 7.280443465964961e-05, |
| "loss": 0.0197, |
| "mean_token_accuracy": 0.9919935941696167, |
| "num_tokens": 3292733.0, |
| "step": 1790 |
| }, |
| { |
| "epoch": 6.575091575091575, |
| "grad_norm": 0.365234375, |
| "learning_rate": 7.272731181065829e-05, |
| "loss": 0.0531, |
| "mean_token_accuracy": 0.9843693256378174, |
| "num_tokens": 3302237.0, |
| "step": 1795 |
| }, |
| { |
| "epoch": 6.593406593406593, |
| "grad_norm": 0.45703125, |
| "learning_rate": 7.265001136443525e-05, |
| "loss": 0.0221, |
| "mean_token_accuracy": 0.9909010767936707, |
| "num_tokens": 3311751.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 6.611721611721611, |
| "grad_norm": 0.275390625, |
| "learning_rate": 7.257253402988693e-05, |
| "loss": 0.032, |
| "mean_token_accuracy": 0.9890513896942139, |
| "num_tokens": 3320625.0, |
| "step": 1805 |
| }, |
| { |
| "epoch": 6.63003663003663, |
| "grad_norm": 0.48828125, |
| "learning_rate": 7.249488051754199e-05, |
| "loss": 0.0278, |
| "mean_token_accuracy": 0.989040732383728, |
| "num_tokens": 3329629.0, |
| "step": 1810 |
| }, |
| { |
| "epoch": 6.648351648351649, |
| "grad_norm": 0.158203125, |
| "learning_rate": 7.241705153954479e-05, |
| "loss": 0.0291, |
| "mean_token_accuracy": 0.988949990272522, |
| "num_tokens": 3338881.0, |
| "step": 1815 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 1.484375, |
| "learning_rate": 7.23390478096488e-05, |
| "loss": 0.0316, |
| "mean_token_accuracy": 0.9878135204315186, |
| "num_tokens": 3348032.0, |
| "step": 1820 |
| }, |
| { |
| "epoch": 6.684981684981685, |
| "grad_norm": 0.65234375, |
| "learning_rate": 7.226087004321018e-05, |
| "loss": 0.032, |
| "mean_token_accuracy": 0.9878805875778198, |
| "num_tokens": 3357685.0, |
| "step": 1825 |
| }, |
| { |
| "epoch": 6.7032967032967035, |
| "grad_norm": 0.51171875, |
| "learning_rate": 7.218251895718108e-05, |
| "loss": 0.0314, |
| "mean_token_accuracy": 0.9871991038322449, |
| "num_tokens": 3366081.0, |
| "step": 1830 |
| }, |
| { |
| "epoch": 6.721611721611722, |
| "grad_norm": 0.52734375, |
| "learning_rate": 7.210399527010315e-05, |
| "loss": 0.0371, |
| "mean_token_accuracy": 0.9859683156013489, |
| "num_tokens": 3375665.0, |
| "step": 1835 |
| }, |
| { |
| "epoch": 6.73992673992674, |
| "grad_norm": 0.6875, |
| "learning_rate": 7.202529970210093e-05, |
| "loss": 0.0437, |
| "mean_token_accuracy": 0.9847039103507995, |
| "num_tokens": 3385407.0, |
| "step": 1840 |
| }, |
| { |
| "epoch": 6.758241758241758, |
| "grad_norm": 0.9296875, |
| "learning_rate": 7.194643297487525e-05, |
| "loss": 0.0482, |
| "mean_token_accuracy": 0.9819490432739257, |
| "num_tokens": 3394276.0, |
| "step": 1845 |
| }, |
| { |
| "epoch": 6.7765567765567765, |
| "grad_norm": 0.46875, |
| "learning_rate": 7.186739581169659e-05, |
| "loss": 0.0768, |
| "mean_token_accuracy": 0.9809007167816162, |
| "num_tokens": 3403876.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 6.794871794871795, |
| "grad_norm": 0.921875, |
| "learning_rate": 7.178818893739847e-05, |
| "loss": 0.0345, |
| "mean_token_accuracy": 0.9873276352882385, |
| "num_tokens": 3413010.0, |
| "step": 1855 |
| }, |
| { |
| "epoch": 6.813186813186813, |
| "grad_norm": 0.3203125, |
| "learning_rate": 7.170881307837081e-05, |
| "loss": 0.0364, |
| "mean_token_accuracy": 0.9852291464805603, |
| "num_tokens": 3420921.0, |
| "step": 1860 |
| }, |
| { |
| "epoch": 6.831501831501831, |
| "grad_norm": 0.44921875, |
| "learning_rate": 7.162926896255323e-05, |
| "loss": 0.0379, |
| "mean_token_accuracy": 0.9871521234512329, |
| "num_tokens": 3429748.0, |
| "step": 1865 |
| }, |
| { |
| "epoch": 6.8498168498168495, |
| "grad_norm": 0.09765625, |
| "learning_rate": 7.154955731942842e-05, |
| "loss": 0.0338, |
| "mean_token_accuracy": 0.9871858716011047, |
| "num_tokens": 3438647.0, |
| "step": 1870 |
| }, |
| { |
| "epoch": 6.868131868131869, |
| "grad_norm": 0.111328125, |
| "learning_rate": 7.146967888001541e-05, |
| "loss": 0.0384, |
| "mean_token_accuracy": 0.9856713056564331, |
| "num_tokens": 3448087.0, |
| "step": 1875 |
| }, |
| { |
| "epoch": 6.886446886446887, |
| "grad_norm": 0.51171875, |
| "learning_rate": 7.138963437686289e-05, |
| "loss": 0.0423, |
| "mean_token_accuracy": 0.9847253203392029, |
| "num_tokens": 3457095.0, |
| "step": 1880 |
| }, |
| { |
| "epoch": 6.904761904761905, |
| "grad_norm": 0.09521484375, |
| "learning_rate": 7.13094245440425e-05, |
| "loss": 0.0335, |
| "mean_token_accuracy": 0.9881868481636047, |
| "num_tokens": 3466170.0, |
| "step": 1885 |
| }, |
| { |
| "epoch": 6.923076923076923, |
| "grad_norm": 0.255859375, |
| "learning_rate": 7.122905011714206e-05, |
| "loss": 0.0331, |
| "mean_token_accuracy": 0.987188744544983, |
| "num_tokens": 3475299.0, |
| "step": 1890 |
| }, |
| { |
| "epoch": 6.941391941391942, |
| "grad_norm": 0.365234375, |
| "learning_rate": 7.114851183325886e-05, |
| "loss": 0.0412, |
| "mean_token_accuracy": 0.984969186782837, |
| "num_tokens": 3485021.0, |
| "step": 1895 |
| }, |
| { |
| "epoch": 6.95970695970696, |
| "grad_norm": 0.609375, |
| "learning_rate": 7.10678104309929e-05, |
| "loss": 0.0348, |
| "mean_token_accuracy": 0.9886750221252442, |
| "num_tokens": 3493774.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 6.978021978021978, |
| "grad_norm": 0.703125, |
| "learning_rate": 7.098694665044011e-05, |
| "loss": 0.0339, |
| "mean_token_accuracy": 0.9876073241233826, |
| "num_tokens": 3503382.0, |
| "step": 1905 |
| }, |
| { |
| "epoch": 6.996336996336996, |
| "grad_norm": 1.0078125, |
| "learning_rate": 7.090592123318553e-05, |
| "loss": 0.0437, |
| "mean_token_accuracy": 0.9858802318572998, |
| "num_tokens": 3512668.0, |
| "step": 1910 |
| }, |
| { |
| "epoch": 7.014652014652015, |
| "grad_norm": 0.1640625, |
| "learning_rate": 7.082473492229653e-05, |
| "loss": 0.0192, |
| "mean_token_accuracy": 0.9916712999343872, |
| "num_tokens": 3520969.0, |
| "step": 1915 |
| }, |
| { |
| "epoch": 7.032967032967033, |
| "grad_norm": 0.52734375, |
| "learning_rate": 7.074338846231605e-05, |
| "loss": 0.0239, |
| "mean_token_accuracy": 0.9903509378433227, |
| "num_tokens": 3529196.0, |
| "step": 1920 |
| }, |
| { |
| "epoch": 7.051282051282051, |
| "grad_norm": 0.84765625, |
| "learning_rate": 7.066188259925569e-05, |
| "loss": 0.0569, |
| "mean_token_accuracy": 0.9826701760292054, |
| "num_tokens": 3538654.0, |
| "step": 1925 |
| }, |
| { |
| "epoch": 7.069597069597069, |
| "grad_norm": 0.373046875, |
| "learning_rate": 7.05802180805889e-05, |
| "loss": 0.0255, |
| "mean_token_accuracy": 0.9906298637390136, |
| "num_tokens": 3547981.0, |
| "step": 1930 |
| }, |
| { |
| "epoch": 7.087912087912088, |
| "grad_norm": 0.94921875, |
| "learning_rate": 7.049839565524414e-05, |
| "loss": 0.0212, |
| "mean_token_accuracy": 0.9925713777542114, |
| "num_tokens": 3557721.0, |
| "step": 1935 |
| }, |
| { |
| "epoch": 7.106227106227106, |
| "grad_norm": 1.0, |
| "learning_rate": 7.041641607359798e-05, |
| "loss": 0.024, |
| "mean_token_accuracy": 0.991481339931488, |
| "num_tokens": 3566312.0, |
| "step": 1940 |
| }, |
| { |
| "epoch": 7.124542124542124, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 7.033428008746831e-05, |
| "loss": 0.0199, |
| "mean_token_accuracy": 0.9931520938873291, |
| "num_tokens": 3575508.0, |
| "step": 1945 |
| }, |
| { |
| "epoch": 7.142857142857143, |
| "grad_norm": 0.21484375, |
| "learning_rate": 7.025198845010726e-05, |
| "loss": 0.016, |
| "mean_token_accuracy": 0.9932388305664063, |
| "num_tokens": 3584603.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 7.1611721611721615, |
| "grad_norm": 11.4375, |
| "learning_rate": 7.016954191619448e-05, |
| "loss": 0.0315, |
| "mean_token_accuracy": 0.9889041304588317, |
| "num_tokens": 3593828.0, |
| "step": 1955 |
| }, |
| { |
| "epoch": 7.17948717948718, |
| "grad_norm": 1.15625, |
| "learning_rate": 7.008694124183013e-05, |
| "loss": 0.0246, |
| "mean_token_accuracy": 0.9922425985336304, |
| "num_tokens": 3602200.0, |
| "step": 1960 |
| }, |
| { |
| "epoch": 7.197802197802198, |
| "grad_norm": 0.326171875, |
| "learning_rate": 7.000418718452799e-05, |
| "loss": 0.0145, |
| "mean_token_accuracy": 0.993871533870697, |
| "num_tokens": 3611380.0, |
| "step": 1965 |
| }, |
| { |
| "epoch": 7.216117216117216, |
| "grad_norm": 0.5078125, |
| "learning_rate": 6.992128050320839e-05, |
| "loss": 0.024, |
| "mean_token_accuracy": 0.9902616381645203, |
| "num_tokens": 3621064.0, |
| "step": 1970 |
| }, |
| { |
| "epoch": 7.2344322344322345, |
| "grad_norm": 0.07763671875, |
| "learning_rate": 6.983822195819146e-05, |
| "loss": 0.0157, |
| "mean_token_accuracy": 0.9932525634765625, |
| "num_tokens": 3630093.0, |
| "step": 1975 |
| }, |
| { |
| "epoch": 7.252747252747253, |
| "grad_norm": 0.2353515625, |
| "learning_rate": 6.975501231118994e-05, |
| "loss": 0.0236, |
| "mean_token_accuracy": 0.9911327123641968, |
| "num_tokens": 3639168.0, |
| "step": 1980 |
| }, |
| { |
| "epoch": 7.271062271062271, |
| "grad_norm": 0.5859375, |
| "learning_rate": 6.967165232530237e-05, |
| "loss": 0.0216, |
| "mean_token_accuracy": 0.991173791885376, |
| "num_tokens": 3647754.0, |
| "step": 1985 |
| }, |
| { |
| "epoch": 7.289377289377289, |
| "grad_norm": 0.2080078125, |
| "learning_rate": 6.958814276500599e-05, |
| "loss": 0.0162, |
| "mean_token_accuracy": 0.9926367402076721, |
| "num_tokens": 3657191.0, |
| "step": 1990 |
| }, |
| { |
| "epoch": 7.3076923076923075, |
| "grad_norm": 0.1357421875, |
| "learning_rate": 6.950448439614973e-05, |
| "loss": 0.0163, |
| "mean_token_accuracy": 0.9930072546005249, |
| "num_tokens": 3667054.0, |
| "step": 1995 |
| }, |
| { |
| "epoch": 7.326007326007326, |
| "grad_norm": 0.384765625, |
| "learning_rate": 6.942067798594726e-05, |
| "loss": 0.0211, |
| "mean_token_accuracy": 0.9917723655700683, |
| "num_tokens": 3677025.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 7.344322344322344, |
| "grad_norm": 2.3125, |
| "learning_rate": 6.933672430296986e-05, |
| "loss": 0.0389, |
| "mean_token_accuracy": 0.9862228393554687, |
| "num_tokens": 3685112.0, |
| "step": 2005 |
| }, |
| { |
| "epoch": 7.362637362637362, |
| "grad_norm": 0.126953125, |
| "learning_rate": 6.925262411713945e-05, |
| "loss": 0.0225, |
| "mean_token_accuracy": 0.9903972864151, |
| "num_tokens": 3694862.0, |
| "step": 2010 |
| }, |
| { |
| "epoch": 7.380952380952381, |
| "grad_norm": 0.26171875, |
| "learning_rate": 6.916837819972149e-05, |
| "loss": 0.0274, |
| "mean_token_accuracy": 0.9918236613273621, |
| "num_tokens": 3704805.0, |
| "step": 2015 |
| }, |
| { |
| "epoch": 7.3992673992674, |
| "grad_norm": 0.08349609375, |
| "learning_rate": 6.908398732331793e-05, |
| "loss": 0.017, |
| "mean_token_accuracy": 0.9925737380981445, |
| "num_tokens": 3714304.0, |
| "step": 2020 |
| }, |
| { |
| "epoch": 7.417582417582418, |
| "grad_norm": 0.375, |
| "learning_rate": 6.899945226186005e-05, |
| "loss": 0.0219, |
| "mean_token_accuracy": 0.9916564226150513, |
| "num_tokens": 3723305.0, |
| "step": 2025 |
| }, |
| { |
| "epoch": 7.435897435897436, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.89147737906015e-05, |
| "loss": 0.0269, |
| "mean_token_accuracy": 0.9908839344978333, |
| "num_tokens": 3732073.0, |
| "step": 2030 |
| }, |
| { |
| "epoch": 7.454212454212454, |
| "grad_norm": 0.091796875, |
| "learning_rate": 6.882995268611106e-05, |
| "loss": 0.0226, |
| "mean_token_accuracy": 0.9915896058082581, |
| "num_tokens": 3741071.0, |
| "step": 2035 |
| }, |
| { |
| "epoch": 7.472527472527473, |
| "grad_norm": 0.62890625, |
| "learning_rate": 6.874498972626559e-05, |
| "loss": 0.0204, |
| "mean_token_accuracy": 0.9924831748008728, |
| "num_tokens": 3750420.0, |
| "step": 2040 |
| }, |
| { |
| "epoch": 7.490842490842491, |
| "grad_norm": 0.15625, |
| "learning_rate": 6.865988569024286e-05, |
| "loss": 0.03, |
| "mean_token_accuracy": 0.9895938873291016, |
| "num_tokens": 3760153.0, |
| "step": 2045 |
| }, |
| { |
| "epoch": 7.509157509157509, |
| "grad_norm": 0.416015625, |
| "learning_rate": 6.857464135851444e-05, |
| "loss": 0.0304, |
| "mean_token_accuracy": 0.9892897367477417, |
| "num_tokens": 3768898.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 7.527472527472527, |
| "grad_norm": 0.2578125, |
| "learning_rate": 6.848925751283853e-05, |
| "loss": 0.0203, |
| "mean_token_accuracy": 0.9921239137649536, |
| "num_tokens": 3778718.0, |
| "step": 2055 |
| }, |
| { |
| "epoch": 7.545787545787546, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 6.840373493625274e-05, |
| "loss": 0.0203, |
| "mean_token_accuracy": 0.9922136068344116, |
| "num_tokens": 3788022.0, |
| "step": 2060 |
| }, |
| { |
| "epoch": 7.564102564102564, |
| "grad_norm": 0.69921875, |
| "learning_rate": 6.831807441306698e-05, |
| "loss": 0.036, |
| "mean_token_accuracy": 0.988727355003357, |
| "num_tokens": 3797131.0, |
| "step": 2065 |
| }, |
| { |
| "epoch": 7.582417582417582, |
| "grad_norm": 0.263671875, |
| "learning_rate": 6.823227672885628e-05, |
| "loss": 0.0319, |
| "mean_token_accuracy": 0.9886006474494934, |
| "num_tokens": 3806894.0, |
| "step": 2070 |
| }, |
| { |
| "epoch": 7.6007326007326, |
| "grad_norm": 0.435546875, |
| "learning_rate": 6.814634267045346e-05, |
| "loss": 0.0245, |
| "mean_token_accuracy": 0.9917014598846435, |
| "num_tokens": 3815606.0, |
| "step": 2075 |
| }, |
| { |
| "epoch": 7.619047619047619, |
| "grad_norm": 0.26171875, |
| "learning_rate": 6.806027302594206e-05, |
| "loss": 0.0308, |
| "mean_token_accuracy": 0.988996148109436, |
| "num_tokens": 3824337.0, |
| "step": 2080 |
| }, |
| { |
| "epoch": 7.637362637362637, |
| "grad_norm": 0.1240234375, |
| "learning_rate": 6.797406858464905e-05, |
| "loss": 0.0289, |
| "mean_token_accuracy": 0.9898035883903503, |
| "num_tokens": 3833957.0, |
| "step": 2085 |
| }, |
| { |
| "epoch": 7.655677655677656, |
| "grad_norm": 0.369140625, |
| "learning_rate": 6.788773013713758e-05, |
| "loss": 0.0265, |
| "mean_token_accuracy": 0.9909451246261597, |
| "num_tokens": 3843114.0, |
| "step": 2090 |
| }, |
| { |
| "epoch": 7.673992673992674, |
| "grad_norm": 0.62109375, |
| "learning_rate": 6.780125847519971e-05, |
| "loss": 0.0206, |
| "mean_token_accuracy": 0.9923561453819275, |
| "num_tokens": 3852882.0, |
| "step": 2095 |
| }, |
| { |
| "epoch": 7.6923076923076925, |
| "grad_norm": 0.3984375, |
| "learning_rate": 6.771465439184927e-05, |
| "loss": 0.026, |
| "mean_token_accuracy": 0.9902096509933471, |
| "num_tokens": 3862419.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 7.710622710622711, |
| "grad_norm": 0.69921875, |
| "learning_rate": 6.762791868131442e-05, |
| "loss": 0.0226, |
| "mean_token_accuracy": 0.9907670021057129, |
| "num_tokens": 3871716.0, |
| "step": 2105 |
| }, |
| { |
| "epoch": 7.728937728937729, |
| "grad_norm": 0.10888671875, |
| "learning_rate": 6.754105213903045e-05, |
| "loss": 0.023, |
| "mean_token_accuracy": 0.9909697294235229, |
| "num_tokens": 3880542.0, |
| "step": 2110 |
| }, |
| { |
| "epoch": 7.747252747252747, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 6.745405556163253e-05, |
| "loss": 0.0346, |
| "mean_token_accuracy": 0.9859986186027527, |
| "num_tokens": 3889484.0, |
| "step": 2115 |
| }, |
| { |
| "epoch": 7.7655677655677655, |
| "grad_norm": 0.37890625, |
| "learning_rate": 6.736692974694833e-05, |
| "loss": 0.022, |
| "mean_token_accuracy": 0.9915480494499207, |
| "num_tokens": 3898636.0, |
| "step": 2120 |
| }, |
| { |
| "epoch": 7.783882783882784, |
| "grad_norm": 0.41796875, |
| "learning_rate": 6.727967549399072e-05, |
| "loss": 0.0253, |
| "mean_token_accuracy": 0.9902077794075013, |
| "num_tokens": 3907808.0, |
| "step": 2125 |
| }, |
| { |
| "epoch": 7.802197802197802, |
| "grad_norm": 0.1796875, |
| "learning_rate": 6.719229360295044e-05, |
| "loss": 0.0352, |
| "mean_token_accuracy": 0.9878880977630615, |
| "num_tokens": 3916667.0, |
| "step": 2130 |
| }, |
| { |
| "epoch": 7.82051282051282, |
| "grad_norm": 0.26171875, |
| "learning_rate": 6.710478487518882e-05, |
| "loss": 0.0247, |
| "mean_token_accuracy": 0.9887702345848084, |
| "num_tokens": 3926085.0, |
| "step": 2135 |
| }, |
| { |
| "epoch": 7.8388278388278385, |
| "grad_norm": 0.5703125, |
| "learning_rate": 6.701715011323034e-05, |
| "loss": 0.0318, |
| "mean_token_accuracy": 0.9896463632583619, |
| "num_tokens": 3934900.0, |
| "step": 2140 |
| }, |
| { |
| "epoch": 7.857142857142857, |
| "grad_norm": 0.09326171875, |
| "learning_rate": 6.692939012075532e-05, |
| "loss": 0.0124, |
| "mean_token_accuracy": 0.9943976402282715, |
| "num_tokens": 3944656.0, |
| "step": 2145 |
| }, |
| { |
| "epoch": 7.875457875457876, |
| "grad_norm": 0.291015625, |
| "learning_rate": 6.684150570259256e-05, |
| "loss": 0.0216, |
| "mean_token_accuracy": 0.9906636476516724, |
| "num_tokens": 3954257.0, |
| "step": 2150 |
| }, |
| { |
| "epoch": 7.893772893772894, |
| "grad_norm": 0.3984375, |
| "learning_rate": 6.675349766471193e-05, |
| "loss": 0.0266, |
| "mean_token_accuracy": 0.9893643379211425, |
| "num_tokens": 3962549.0, |
| "step": 2155 |
| }, |
| { |
| "epoch": 7.912087912087912, |
| "grad_norm": 0.3125, |
| "learning_rate": 6.6665366814217e-05, |
| "loss": 0.0203, |
| "mean_token_accuracy": 0.9919966578483581, |
| "num_tokens": 3971970.0, |
| "step": 2160 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 5460, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.623835599352627e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|