{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 76.92307692307692, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.038461538461538464, "grad_norm": 0.0, "learning_rate": 4.9980000000000006e-05, "loss": 1.7957, "mean_token_accuracy": 0.6374512881040573, "num_tokens": 17597.0, "step": 5 }, { "epoch": 0.07692307692307693, "grad_norm": 0.0, "learning_rate": 4.9955e-05, "loss": 1.7508, "mean_token_accuracy": 0.6454170644283295, "num_tokens": 35641.0, "step": 10 }, { "epoch": 0.11538461538461539, "grad_norm": 0.0, "learning_rate": 4.9930000000000005e-05, "loss": 1.6757, "mean_token_accuracy": 0.6618378311395645, "num_tokens": 54379.0, "step": 15 }, { "epoch": 0.15384615384615385, "grad_norm": 0.0, "learning_rate": 4.9905000000000004e-05, "loss": 1.7597, "mean_token_accuracy": 0.6484396398067475, "num_tokens": 71860.0, "step": 20 }, { "epoch": 0.19230769230769232, "grad_norm": 0.0, "learning_rate": 4.9880000000000004e-05, "loss": 1.669, "mean_token_accuracy": 0.6693991690874099, "num_tokens": 90064.0, "step": 25 }, { "epoch": 0.23076923076923078, "grad_norm": 0.0, "learning_rate": 4.9855e-05, "loss": 1.7247, "mean_token_accuracy": 0.6533850133419037, "num_tokens": 108700.0, "step": 30 }, { "epoch": 0.2692307692307692, "grad_norm": 0.0, "learning_rate": 4.983e-05, "loss": 1.7422, "mean_token_accuracy": 0.6492581188678741, "num_tokens": 126746.0, "step": 35 }, { "epoch": 0.3076923076923077, "grad_norm": 0.0, "learning_rate": 4.9805e-05, "loss": 1.8267, "mean_token_accuracy": 0.6348982483148575, "num_tokens": 144478.0, "step": 40 }, { "epoch": 0.34615384615384615, "grad_norm": 0.0, "learning_rate": 4.978e-05, "loss": 1.7499, "mean_token_accuracy": 0.6513226598501205, "num_tokens": 163267.0, "step": 45 }, { "epoch": 0.38461538461538464, "grad_norm": 0.0, "learning_rate": 4.9755e-05, "loss": 1.8227, "mean_token_accuracy": 0.6310899078845977, "num_tokens": 181224.0, "step": 50 }, { "epoch": 0.4230769230769231, "grad_norm": 0.0, "learning_rate": 4.973000000000001e-05, "loss": 1.7641, "mean_token_accuracy": 0.6496782422065734, "num_tokens": 199090.0, "step": 55 }, { "epoch": 0.46153846153846156, "grad_norm": 0.0, "learning_rate": 4.9705e-05, "loss": 1.7405, "mean_token_accuracy": 0.6453521817922592, "num_tokens": 217701.0, "step": 60 }, { "epoch": 0.5, "grad_norm": 0.0, "learning_rate": 4.9680000000000005e-05, "loss": 1.7214, "mean_token_accuracy": 0.658767506480217, "num_tokens": 236196.0, "step": 65 }, { "epoch": 0.5384615384615384, "grad_norm": 0.0, "learning_rate": 4.9655000000000005e-05, "loss": 1.7138, "mean_token_accuracy": 0.6556680232286454, "num_tokens": 254323.0, "step": 70 }, { "epoch": 0.5769230769230769, "grad_norm": 0.0, "learning_rate": 4.9630000000000004e-05, "loss": 1.6761, "mean_token_accuracy": 0.6626079142093658, "num_tokens": 273788.0, "step": 75 }, { "epoch": 0.6153846153846154, "grad_norm": 0.0, "learning_rate": 4.9605000000000004e-05, "loss": 1.7992, "mean_token_accuracy": 0.6399475276470185, "num_tokens": 291226.0, "step": 80 }, { "epoch": 0.6538461538461539, "grad_norm": 0.0, "learning_rate": 4.958e-05, "loss": 1.7466, "mean_token_accuracy": 0.645823523402214, "num_tokens": 309131.0, "step": 85 }, { "epoch": 0.6923076923076923, "grad_norm": 0.0, "learning_rate": 4.9555e-05, "loss": 1.6855, "mean_token_accuracy": 0.6573584616184235, "num_tokens": 328164.0, "step": 90 }, { "epoch": 0.7307692307692307, "grad_norm": 0.0, "learning_rate": 4.953e-05, "loss": 1.7877, "mean_token_accuracy": 0.636717626452446, "num_tokens": 346359.0, "step": 95 }, { "epoch": 0.7692307692307693, "grad_norm": 0.0, "learning_rate": 4.9505e-05, "loss": 1.6851, "mean_token_accuracy": 0.6630265355110169, "num_tokens": 364985.0, "step": 100 }, { "epoch": 0.8076923076923077, "grad_norm": 0.0, "learning_rate": 4.948000000000001e-05, "loss": 1.7477, "mean_token_accuracy": 0.6559244930744171, "num_tokens": 382893.0, "step": 105 }, { "epoch": 0.8461538461538461, "grad_norm": 0.0, "learning_rate": 4.9455e-05, "loss": 1.7006, "mean_token_accuracy": 0.6579539209604264, "num_tokens": 401596.0, "step": 110 }, { "epoch": 0.8846153846153846, "grad_norm": 0.0, "learning_rate": 4.9430000000000006e-05, "loss": 1.7604, "mean_token_accuracy": 0.6474329262971878, "num_tokens": 419595.0, "step": 115 }, { "epoch": 0.9230769230769231, "grad_norm": 0.0, "learning_rate": 4.9405e-05, "loss": 1.761, "mean_token_accuracy": 0.644348555803299, "num_tokens": 437633.0, "step": 120 }, { "epoch": 0.9615384615384616, "grad_norm": 0.0, "learning_rate": 4.9380000000000005e-05, "loss": 1.7521, "mean_token_accuracy": 0.6455009877681732, "num_tokens": 455648.0, "step": 125 }, { "epoch": 1.0, "grad_norm": 0.0, "learning_rate": 4.9355000000000004e-05, "loss": 1.7483, "mean_token_accuracy": 0.6529789954423905, "num_tokens": 474105.0, "step": 130 }, { "epoch": 1.0384615384615385, "grad_norm": 0.0, "learning_rate": 4.9330000000000004e-05, "loss": 1.7308, "mean_token_accuracy": 0.6532562792301178, "num_tokens": 492861.0, "step": 135 }, { "epoch": 1.0769230769230769, "grad_norm": 0.0, "learning_rate": 4.9305e-05, "loss": 1.7815, "mean_token_accuracy": 0.6435200899839402, "num_tokens": 510988.0, "step": 140 }, { "epoch": 1.1153846153846154, "grad_norm": 0.0, "learning_rate": 4.928e-05, "loss": 1.8093, "mean_token_accuracy": 0.6363381177186966, "num_tokens": 528455.0, "step": 145 }, { "epoch": 1.1538461538461537, "grad_norm": 0.0, "learning_rate": 4.9255e-05, "loss": 1.7437, "mean_token_accuracy": 0.651939743757248, "num_tokens": 546439.0, "step": 150 }, { "epoch": 1.1923076923076923, "grad_norm": 0.0, "learning_rate": 4.923e-05, "loss": 1.7413, "mean_token_accuracy": 0.6526101022958756, "num_tokens": 564750.0, "step": 155 }, { "epoch": 1.2307692307692308, "grad_norm": 0.0, "learning_rate": 4.9205e-05, "loss": 1.7649, "mean_token_accuracy": 0.6482356518507004, "num_tokens": 582901.0, "step": 160 }, { "epoch": 1.2692307692307692, "grad_norm": 0.0, "learning_rate": 4.918000000000001e-05, "loss": 1.7267, "mean_token_accuracy": 0.6545612186193466, "num_tokens": 601222.0, "step": 165 }, { "epoch": 1.3076923076923077, "grad_norm": 0.0, "learning_rate": 4.9155e-05, "loss": 1.7107, "mean_token_accuracy": 0.6566696882247924, "num_tokens": 619157.0, "step": 170 }, { "epoch": 1.3461538461538463, "grad_norm": 0.0, "learning_rate": 4.9130000000000006e-05, "loss": 1.6747, "mean_token_accuracy": 0.6630686283111572, "num_tokens": 638896.0, "step": 175 }, { "epoch": 1.3846153846153846, "grad_norm": 0.0, "learning_rate": 4.9105e-05, "loss": 1.7635, "mean_token_accuracy": 0.6433518409729004, "num_tokens": 656329.0, "step": 180 }, { "epoch": 1.4230769230769231, "grad_norm": 0.0, "learning_rate": 4.9080000000000004e-05, "loss": 1.759, "mean_token_accuracy": 0.6476027637720108, "num_tokens": 674377.0, "step": 185 }, { "epoch": 1.4615384615384617, "grad_norm": 0.0, "learning_rate": 4.9055000000000004e-05, "loss": 1.7288, "mean_token_accuracy": 0.6561032950878143, "num_tokens": 692477.0, "step": 190 }, { "epoch": 1.5, "grad_norm": 0.0, "learning_rate": 4.903e-05, "loss": 1.6869, "mean_token_accuracy": 0.6576298862695694, "num_tokens": 711509.0, "step": 195 }, { "epoch": 1.5384615384615383, "grad_norm": 0.0, "learning_rate": 4.9005e-05, "loss": 1.8134, "mean_token_accuracy": 0.6296403974294662, "num_tokens": 728738.0, "step": 200 }, { "epoch": 1.5769230769230769, "grad_norm": 0.0, "learning_rate": 4.898e-05, "loss": 1.7075, "mean_token_accuracy": 0.6586553543806076, "num_tokens": 747160.0, "step": 205 }, { "epoch": 1.6153846153846154, "grad_norm": 0.0, "learning_rate": 4.8955e-05, "loss": 1.7835, "mean_token_accuracy": 0.6415642440319062, "num_tokens": 765489.0, "step": 210 }, { "epoch": 1.6538461538461537, "grad_norm": 0.0, "learning_rate": 4.893e-05, "loss": 1.6952, "mean_token_accuracy": 0.6612388670444489, "num_tokens": 783973.0, "step": 215 }, { "epoch": 1.6923076923076923, "grad_norm": 0.0, "learning_rate": 4.8905e-05, "loss": 1.7186, "mean_token_accuracy": 0.6559958964586258, "num_tokens": 802295.0, "step": 220 }, { "epoch": 1.7307692307692308, "grad_norm": 0.0, "learning_rate": 4.8880000000000006e-05, "loss": 1.7534, "mean_token_accuracy": 0.647523045539856, "num_tokens": 820482.0, "step": 225 }, { "epoch": 1.7692307692307692, "grad_norm": 0.0, "learning_rate": 4.8855e-05, "loss": 1.7675, "mean_token_accuracy": 0.6462311297655106, "num_tokens": 838437.0, "step": 230 }, { "epoch": 1.8076923076923077, "grad_norm": 0.0, "learning_rate": 4.8830000000000005e-05, "loss": 1.753, "mean_token_accuracy": 0.6491482555866241, "num_tokens": 856808.0, "step": 235 }, { "epoch": 1.8461538461538463, "grad_norm": 0.0, "learning_rate": 4.8805e-05, "loss": 1.756, "mean_token_accuracy": 0.6505926042795181, "num_tokens": 874848.0, "step": 240 }, { "epoch": 1.8846153846153846, "grad_norm": 0.0, "learning_rate": 4.8780000000000004e-05, "loss": 1.7598, "mean_token_accuracy": 0.6512235313653946, "num_tokens": 893014.0, "step": 245 }, { "epoch": 1.9230769230769231, "grad_norm": 0.0, "learning_rate": 4.8755e-05, "loss": 1.748, "mean_token_accuracy": 0.6448923200368881, "num_tokens": 911604.0, "step": 250 }, { "epoch": 1.9615384615384617, "grad_norm": 0.0, "learning_rate": 4.873e-05, "loss": 1.7326, "mean_token_accuracy": 0.6498657464981079, "num_tokens": 929944.0, "step": 255 }, { "epoch": 2.0, "grad_norm": 0.0, "learning_rate": 4.8705e-05, "loss": 1.6613, "mean_token_accuracy": 0.6733784437179565, "num_tokens": 948210.0, "step": 260 }, { "epoch": 2.0384615384615383, "grad_norm": 0.0, "learning_rate": 4.868e-05, "loss": 1.74, "mean_token_accuracy": 0.6532588005065918, "num_tokens": 966435.0, "step": 265 }, { "epoch": 2.076923076923077, "grad_norm": 0.0, "learning_rate": 4.8655e-05, "loss": 1.6645, "mean_token_accuracy": 0.6623793184757233, "num_tokens": 985701.0, "step": 270 }, { "epoch": 2.1153846153846154, "grad_norm": 0.0, "learning_rate": 4.863e-05, "loss": 1.7274, "mean_token_accuracy": 0.6545837461948395, "num_tokens": 1004078.0, "step": 275 }, { "epoch": 2.1538461538461537, "grad_norm": 0.0, "learning_rate": 4.8605e-05, "loss": 1.7044, "mean_token_accuracy": 0.6571849673986435, "num_tokens": 1023219.0, "step": 280 }, { "epoch": 2.1923076923076925, "grad_norm": 0.0, "learning_rate": 4.8580000000000006e-05, "loss": 1.7217, "mean_token_accuracy": 0.6511914372444153, "num_tokens": 1041512.0, "step": 285 }, { "epoch": 2.230769230769231, "grad_norm": 0.0, "learning_rate": 4.8555e-05, "loss": 1.7571, "mean_token_accuracy": 0.6483366042375565, "num_tokens": 1059633.0, "step": 290 }, { "epoch": 2.269230769230769, "grad_norm": 0.0, "learning_rate": 4.8530000000000005e-05, "loss": 1.6675, "mean_token_accuracy": 0.6683109492063523, "num_tokens": 1078768.0, "step": 295 }, { "epoch": 2.3076923076923075, "grad_norm": 0.0, "learning_rate": 4.8505e-05, "loss": 1.7856, "mean_token_accuracy": 0.6457105249166488, "num_tokens": 1096433.0, "step": 300 }, { "epoch": 2.3461538461538463, "grad_norm": 0.0, "learning_rate": 4.8480000000000003e-05, "loss": 1.7599, "mean_token_accuracy": 0.6529647082090377, "num_tokens": 1114140.0, "step": 305 }, { "epoch": 2.3846153846153846, "grad_norm": 0.0, "learning_rate": 4.8455e-05, "loss": 1.7591, "mean_token_accuracy": 0.6483295440673829, "num_tokens": 1132366.0, "step": 310 }, { "epoch": 2.423076923076923, "grad_norm": 0.0, "learning_rate": 4.843e-05, "loss": 1.6774, "mean_token_accuracy": 0.660743135213852, "num_tokens": 1151832.0, "step": 315 }, { "epoch": 2.4615384615384617, "grad_norm": 0.0, "learning_rate": 4.8405e-05, "loss": 1.8422, "mean_token_accuracy": 0.6320727407932282, "num_tokens": 1168795.0, "step": 320 }, { "epoch": 2.5, "grad_norm": 0.0, "learning_rate": 4.838e-05, "loss": 1.7624, "mean_token_accuracy": 0.6471054494380951, "num_tokens": 1186567.0, "step": 325 }, { "epoch": 2.5384615384615383, "grad_norm": 0.0, "learning_rate": 4.8355e-05, "loss": 1.7518, "mean_token_accuracy": 0.6520386338233948, "num_tokens": 1204773.0, "step": 330 }, { "epoch": 2.5769230769230766, "grad_norm": 0.0, "learning_rate": 4.833e-05, "loss": 1.8146, "mean_token_accuracy": 0.63519766330719, "num_tokens": 1222245.0, "step": 335 }, { "epoch": 2.6153846153846154, "grad_norm": 0.0, "learning_rate": 4.8305e-05, "loss": 1.7304, "mean_token_accuracy": 0.6472829192876816, "num_tokens": 1240309.0, "step": 340 }, { "epoch": 2.6538461538461537, "grad_norm": 0.0, "learning_rate": 4.8280000000000005e-05, "loss": 1.7343, "mean_token_accuracy": 0.6456574499607086, "num_tokens": 1258875.0, "step": 345 }, { "epoch": 2.6923076923076925, "grad_norm": 0.0, "learning_rate": 4.8255e-05, "loss": 1.7172, "mean_token_accuracy": 0.6524255007505417, "num_tokens": 1277367.0, "step": 350 }, { "epoch": 2.730769230769231, "grad_norm": 0.0, "learning_rate": 4.8230000000000004e-05, "loss": 1.6652, "mean_token_accuracy": 0.6617098182439805, "num_tokens": 1296866.0, "step": 355 }, { "epoch": 2.769230769230769, "grad_norm": 0.0, "learning_rate": 4.8205000000000003e-05, "loss": 1.6834, "mean_token_accuracy": 0.662646809220314, "num_tokens": 1315690.0, "step": 360 }, { "epoch": 2.8076923076923075, "grad_norm": 0.0, "learning_rate": 4.818e-05, "loss": 1.7963, "mean_token_accuracy": 0.6397953987121582, "num_tokens": 1333332.0, "step": 365 }, { "epoch": 2.8461538461538463, "grad_norm": 0.0, "learning_rate": 4.8155e-05, "loss": 1.7771, "mean_token_accuracy": 0.6447630435228348, "num_tokens": 1351095.0, "step": 370 }, { "epoch": 2.8846153846153846, "grad_norm": 0.0, "learning_rate": 4.813e-05, "loss": 1.7453, "mean_token_accuracy": 0.6486737489700317, "num_tokens": 1369302.0, "step": 375 }, { "epoch": 2.9230769230769234, "grad_norm": 0.0, "learning_rate": 4.8105e-05, "loss": 1.7446, "mean_token_accuracy": 0.6489187270402909, "num_tokens": 1387456.0, "step": 380 }, { "epoch": 2.9615384615384617, "grad_norm": 0.0, "learning_rate": 4.808e-05, "loss": 1.7578, "mean_token_accuracy": 0.6475874185562134, "num_tokens": 1404820.0, "step": 385 }, { "epoch": 3.0, "grad_norm": 0.0, "learning_rate": 4.8055e-05, "loss": 1.8244, "mean_token_accuracy": 0.636838635802269, "num_tokens": 1422315.0, "step": 390 }, { "epoch": 3.0384615384615383, "grad_norm": 0.0, "learning_rate": 4.8030000000000006e-05, "loss": 1.7325, "mean_token_accuracy": 0.6527264744043351, "num_tokens": 1440658.0, "step": 395 }, { "epoch": 3.076923076923077, "grad_norm": 0.0, "learning_rate": 4.8005e-05, "loss": 1.7647, "mean_token_accuracy": 0.6484371930360794, "num_tokens": 1458704.0, "step": 400 }, { "epoch": 3.1153846153846154, "grad_norm": 0.0, "learning_rate": 4.7980000000000005e-05, "loss": 1.7741, "mean_token_accuracy": 0.6491596013307571, "num_tokens": 1476178.0, "step": 405 }, { "epoch": 3.1538461538461537, "grad_norm": 0.0, "learning_rate": 4.7955e-05, "loss": 1.7011, "mean_token_accuracy": 0.6573891878128052, "num_tokens": 1494800.0, "step": 410 }, { "epoch": 3.1923076923076925, "grad_norm": 0.0, "learning_rate": 4.7930000000000004e-05, "loss": 1.8141, "mean_token_accuracy": 0.6418175488710404, "num_tokens": 1512686.0, "step": 415 }, { "epoch": 3.230769230769231, "grad_norm": 0.0, "learning_rate": 4.7905e-05, "loss": 1.7422, "mean_token_accuracy": 0.6511821269989013, "num_tokens": 1530549.0, "step": 420 }, { "epoch": 3.269230769230769, "grad_norm": 0.0, "learning_rate": 4.788e-05, "loss": 1.7655, "mean_token_accuracy": 0.6506276488304138, "num_tokens": 1548502.0, "step": 425 }, { "epoch": 3.3076923076923075, "grad_norm": 0.0, "learning_rate": 4.7855e-05, "loss": 1.6594, "mean_token_accuracy": 0.6622169464826584, "num_tokens": 1568042.0, "step": 430 }, { "epoch": 3.3461538461538463, "grad_norm": 0.0, "learning_rate": 4.783e-05, "loss": 1.6894, "mean_token_accuracy": 0.6561817824840546, "num_tokens": 1586468.0, "step": 435 }, { "epoch": 3.3846153846153846, "grad_norm": 0.0, "learning_rate": 4.7805e-05, "loss": 1.6675, "mean_token_accuracy": 0.6636441439390183, "num_tokens": 1605918.0, "step": 440 }, { "epoch": 3.423076923076923, "grad_norm": 0.0, "learning_rate": 4.778e-05, "loss": 1.7238, "mean_token_accuracy": 0.6505128175020218, "num_tokens": 1624275.0, "step": 445 }, { "epoch": 3.4615384615384617, "grad_norm": 0.0, "learning_rate": 4.7755e-05, "loss": 1.684, "mean_token_accuracy": 0.6628347337245941, "num_tokens": 1642694.0, "step": 450 }, { "epoch": 3.5, "grad_norm": 0.0, "learning_rate": 4.7730000000000005e-05, "loss": 1.7789, "mean_token_accuracy": 0.6409151911735534, "num_tokens": 1660710.0, "step": 455 }, { "epoch": 3.5384615384615383, "grad_norm": 0.0, "learning_rate": 4.7705e-05, "loss": 1.8119, "mean_token_accuracy": 0.6357834041118622, "num_tokens": 1678569.0, "step": 460 }, { "epoch": 3.5769230769230766, "grad_norm": 0.0, "learning_rate": 4.7680000000000004e-05, "loss": 1.7048, "mean_token_accuracy": 0.6561109334230423, "num_tokens": 1697511.0, "step": 465 }, { "epoch": 3.6153846153846154, "grad_norm": 0.0, "learning_rate": 4.7655e-05, "loss": 1.6402, "mean_token_accuracy": 0.6671018153429031, "num_tokens": 1717113.0, "step": 470 }, { "epoch": 3.6538461538461537, "grad_norm": 0.0, "learning_rate": 4.763e-05, "loss": 1.7797, "mean_token_accuracy": 0.6456864833831787, "num_tokens": 1735083.0, "step": 475 }, { "epoch": 3.6923076923076925, "grad_norm": 0.0, "learning_rate": 4.7605e-05, "loss": 1.83, "mean_token_accuracy": 0.6383150666952133, "num_tokens": 1752374.0, "step": 480 }, { "epoch": 3.730769230769231, "grad_norm": 0.0, "learning_rate": 4.758e-05, "loss": 1.7517, "mean_token_accuracy": 0.6486301571130753, "num_tokens": 1770507.0, "step": 485 }, { "epoch": 3.769230769230769, "grad_norm": 0.0, "learning_rate": 4.7555e-05, "loss": 1.786, "mean_token_accuracy": 0.6404696077108383, "num_tokens": 1788454.0, "step": 490 }, { "epoch": 3.8076923076923075, "grad_norm": 0.0, "learning_rate": 4.753e-05, "loss": 1.7256, "mean_token_accuracy": 0.6483594387769699, "num_tokens": 1807208.0, "step": 495 }, { "epoch": 3.8461538461538463, "grad_norm": 0.0, "learning_rate": 4.7505e-05, "loss": 1.6791, "mean_token_accuracy": 0.6660710662603379, "num_tokens": 1826017.0, "step": 500 }, { "epoch": 3.8846153846153846, "grad_norm": 0.0, "learning_rate": 4.748e-05, "loss": 1.7516, "mean_token_accuracy": 0.6497161477804184, "num_tokens": 1843251.0, "step": 505 }, { "epoch": 3.9230769230769234, "grad_norm": 0.0, "learning_rate": 4.7455000000000006e-05, "loss": 1.7586, "mean_token_accuracy": 0.6480034649372101, "num_tokens": 1861334.0, "step": 510 }, { "epoch": 3.9615384615384617, "grad_norm": 0.0, "learning_rate": 4.7430000000000005e-05, "loss": 1.7783, "mean_token_accuracy": 0.6420908480882644, "num_tokens": 1879077.0, "step": 515 }, { "epoch": 4.0, "grad_norm": 0.0, "learning_rate": 4.7405000000000004e-05, "loss": 1.7984, "mean_token_accuracy": 0.6410065919160843, "num_tokens": 1896420.0, "step": 520 }, { "epoch": 4.038461538461538, "grad_norm": 0.0, "learning_rate": 4.7380000000000004e-05, "loss": 1.7327, "mean_token_accuracy": 0.651974669098854, "num_tokens": 1915163.0, "step": 525 }, { "epoch": 4.076923076923077, "grad_norm": 0.0, "learning_rate": 4.7355e-05, "loss": 1.8077, "mean_token_accuracy": 0.6375770300626755, "num_tokens": 1932820.0, "step": 530 }, { "epoch": 4.115384615384615, "grad_norm": 0.0, "learning_rate": 4.733e-05, "loss": 1.7939, "mean_token_accuracy": 0.635331529378891, "num_tokens": 1951154.0, "step": 535 }, { "epoch": 4.153846153846154, "grad_norm": 0.0, "learning_rate": 4.7305e-05, "loss": 1.7697, "mean_token_accuracy": 0.6442851930856704, "num_tokens": 1968989.0, "step": 540 }, { "epoch": 4.1923076923076925, "grad_norm": 0.0, "learning_rate": 4.728e-05, "loss": 1.7535, "mean_token_accuracy": 0.6471328794956207, "num_tokens": 1987280.0, "step": 545 }, { "epoch": 4.230769230769231, "grad_norm": 0.0, "learning_rate": 4.725500000000001e-05, "loss": 1.7691, "mean_token_accuracy": 0.6456631302833558, "num_tokens": 2005118.0, "step": 550 }, { "epoch": 4.269230769230769, "grad_norm": 0.0, "learning_rate": 4.723e-05, "loss": 1.6754, "mean_token_accuracy": 0.6634574323892594, "num_tokens": 2024373.0, "step": 555 }, { "epoch": 4.3076923076923075, "grad_norm": 0.0, "learning_rate": 4.7205000000000006e-05, "loss": 1.7345, "mean_token_accuracy": 0.6540951490402221, "num_tokens": 2042349.0, "step": 560 }, { "epoch": 4.346153846153846, "grad_norm": 0.0, "learning_rate": 4.718e-05, "loss": 1.7127, "mean_token_accuracy": 0.6528139978647232, "num_tokens": 2060824.0, "step": 565 }, { "epoch": 4.384615384615385, "grad_norm": 0.0, "learning_rate": 4.7155000000000005e-05, "loss": 1.7282, "mean_token_accuracy": 0.6538840979337692, "num_tokens": 2079449.0, "step": 570 }, { "epoch": 4.423076923076923, "grad_norm": 0.0, "learning_rate": 4.7130000000000004e-05, "loss": 1.719, "mean_token_accuracy": 0.6608097106218338, "num_tokens": 2097502.0, "step": 575 }, { "epoch": 4.461538461538462, "grad_norm": 0.0, "learning_rate": 4.7105000000000004e-05, "loss": 1.6963, "mean_token_accuracy": 0.6655160158872604, "num_tokens": 2115911.0, "step": 580 }, { "epoch": 4.5, "grad_norm": 0.0, "learning_rate": 4.708e-05, "loss": 1.7936, "mean_token_accuracy": 0.635148537158966, "num_tokens": 2133543.0, "step": 585 }, { "epoch": 4.538461538461538, "grad_norm": 0.0, "learning_rate": 4.7055e-05, "loss": 1.7272, "mean_token_accuracy": 0.6549283742904664, "num_tokens": 2151767.0, "step": 590 }, { "epoch": 4.576923076923077, "grad_norm": 0.0, "learning_rate": 4.703e-05, "loss": 1.7624, "mean_token_accuracy": 0.6483795702457428, "num_tokens": 2169614.0, "step": 595 }, { "epoch": 4.615384615384615, "grad_norm": 0.0, "learning_rate": 4.7005e-05, "loss": 1.7502, "mean_token_accuracy": 0.6509936511516571, "num_tokens": 2187439.0, "step": 600 }, { "epoch": 4.653846153846154, "grad_norm": 0.0, "learning_rate": 4.698e-05, "loss": 1.712, "mean_token_accuracy": 0.6584094524383545, "num_tokens": 2206096.0, "step": 605 }, { "epoch": 4.6923076923076925, "grad_norm": 0.0, "learning_rate": 4.695500000000001e-05, "loss": 1.6726, "mean_token_accuracy": 0.6688409745693207, "num_tokens": 2225085.0, "step": 610 }, { "epoch": 4.730769230769231, "grad_norm": 0.0, "learning_rate": 4.693e-05, "loss": 1.6953, "mean_token_accuracy": 0.6592785984277725, "num_tokens": 2243907.0, "step": 615 }, { "epoch": 4.769230769230769, "grad_norm": 0.0, "learning_rate": 4.6905000000000006e-05, "loss": 1.7659, "mean_token_accuracy": 0.6406325221061706, "num_tokens": 2261272.0, "step": 620 }, { "epoch": 4.8076923076923075, "grad_norm": 0.0, "learning_rate": 4.688e-05, "loss": 1.7167, "mean_token_accuracy": 0.6534736841917038, "num_tokens": 2279807.0, "step": 625 }, { "epoch": 4.846153846153846, "grad_norm": 0.0, "learning_rate": 4.6855000000000005e-05, "loss": 1.8479, "mean_token_accuracy": 0.6247967123985291, "num_tokens": 2297161.0, "step": 630 }, { "epoch": 4.884615384615385, "grad_norm": 0.0, "learning_rate": 4.6830000000000004e-05, "loss": 1.8183, "mean_token_accuracy": 0.6385220259428024, "num_tokens": 2314811.0, "step": 635 }, { "epoch": 4.923076923076923, "grad_norm": 0.0, "learning_rate": 4.6805e-05, "loss": 1.6373, "mean_token_accuracy": 0.6711235165596008, "num_tokens": 2333816.0, "step": 640 }, { "epoch": 4.961538461538462, "grad_norm": 0.0, "learning_rate": 4.678e-05, "loss": 1.7512, "mean_token_accuracy": 0.647821244597435, "num_tokens": 2352150.0, "step": 645 }, { "epoch": 5.0, "grad_norm": 0.0, "learning_rate": 4.6755e-05, "loss": 1.7376, "mean_token_accuracy": 0.6522481203079223, "num_tokens": 2370525.0, "step": 650 }, { "epoch": 5.038461538461538, "grad_norm": 0.0, "learning_rate": 4.673e-05, "loss": 1.7032, "mean_token_accuracy": 0.66204434633255, "num_tokens": 2388858.0, "step": 655 }, { "epoch": 5.076923076923077, "grad_norm": 0.0, "learning_rate": 4.670500000000001e-05, "loss": 1.6985, "mean_token_accuracy": 0.6554162800312042, "num_tokens": 2408143.0, "step": 660 }, { "epoch": 5.115384615384615, "grad_norm": 0.0, "learning_rate": 4.668e-05, "loss": 1.7786, "mean_token_accuracy": 0.645899361371994, "num_tokens": 2425600.0, "step": 665 }, { "epoch": 5.153846153846154, "grad_norm": 0.0, "learning_rate": 4.6655000000000006e-05, "loss": 1.7416, "mean_token_accuracy": 0.6496942937374115, "num_tokens": 2443792.0, "step": 670 }, { "epoch": 5.1923076923076925, "grad_norm": 0.0, "learning_rate": 4.663e-05, "loss": 1.7879, "mean_token_accuracy": 0.6398061364889145, "num_tokens": 2461179.0, "step": 675 }, { "epoch": 5.230769230769231, "grad_norm": 0.0, "learning_rate": 4.6605000000000005e-05, "loss": 1.7348, "mean_token_accuracy": 0.642570036649704, "num_tokens": 2480028.0, "step": 680 }, { "epoch": 5.269230769230769, "grad_norm": 0.0, "learning_rate": 4.6580000000000005e-05, "loss": 1.7696, "mean_token_accuracy": 0.6482695400714874, "num_tokens": 2497710.0, "step": 685 }, { "epoch": 5.3076923076923075, "grad_norm": 0.0, "learning_rate": 4.6555000000000004e-05, "loss": 1.7291, "mean_token_accuracy": 0.655092054605484, "num_tokens": 2515935.0, "step": 690 }, { "epoch": 5.346153846153846, "grad_norm": 0.0, "learning_rate": 4.6530000000000003e-05, "loss": 1.7177, "mean_token_accuracy": 0.6567189127206803, "num_tokens": 2534143.0, "step": 695 }, { "epoch": 5.384615384615385, "grad_norm": 0.0, "learning_rate": 4.6505e-05, "loss": 1.771, "mean_token_accuracy": 0.6448938339948654, "num_tokens": 2552295.0, "step": 700 }, { "epoch": 5.423076923076923, "grad_norm": 0.0, "learning_rate": 4.648e-05, "loss": 1.7079, "mean_token_accuracy": 0.6543023705482482, "num_tokens": 2571330.0, "step": 705 }, { "epoch": 5.461538461538462, "grad_norm": 0.0, "learning_rate": 4.6455e-05, "loss": 1.6662, "mean_token_accuracy": 0.6654859989881515, "num_tokens": 2590405.0, "step": 710 }, { "epoch": 5.5, "grad_norm": 0.0, "learning_rate": 4.643e-05, "loss": 1.7794, "mean_token_accuracy": 0.6464732140302658, "num_tokens": 2607594.0, "step": 715 }, { "epoch": 5.538461538461538, "grad_norm": 0.0, "learning_rate": 4.640500000000001e-05, "loss": 1.7341, "mean_token_accuracy": 0.6509989589452744, "num_tokens": 2625915.0, "step": 720 }, { "epoch": 5.576923076923077, "grad_norm": 0.0, "learning_rate": 4.638e-05, "loss": 1.6809, "mean_token_accuracy": 0.6608490258455276, "num_tokens": 2644604.0, "step": 725 }, { "epoch": 5.615384615384615, "grad_norm": 0.0, "learning_rate": 4.6355000000000006e-05, "loss": 1.7797, "mean_token_accuracy": 0.6429726481437683, "num_tokens": 2662473.0, "step": 730 }, { "epoch": 5.653846153846154, "grad_norm": 0.0, "learning_rate": 4.633e-05, "loss": 1.7277, "mean_token_accuracy": 0.6474452137947082, "num_tokens": 2681136.0, "step": 735 }, { "epoch": 5.6923076923076925, "grad_norm": 0.0, "learning_rate": 4.6305000000000005e-05, "loss": 1.7779, "mean_token_accuracy": 0.6477038502693176, "num_tokens": 2698512.0, "step": 740 }, { "epoch": 5.730769230769231, "grad_norm": 0.0, "learning_rate": 4.6280000000000004e-05, "loss": 1.7583, "mean_token_accuracy": 0.6484469920396805, "num_tokens": 2716683.0, "step": 745 }, { "epoch": 5.769230769230769, "grad_norm": 0.0, "learning_rate": 4.6255000000000004e-05, "loss": 1.7656, "mean_token_accuracy": 0.6475028216838836, "num_tokens": 2734221.0, "step": 750 }, { "epoch": 5.8076923076923075, "grad_norm": 0.0, "learning_rate": 4.623e-05, "loss": 1.7389, "mean_token_accuracy": 0.6497486799955368, "num_tokens": 2752696.0, "step": 755 }, { "epoch": 5.846153846153846, "grad_norm": 0.0, "learning_rate": 4.6205e-05, "loss": 1.751, "mean_token_accuracy": 0.6450219005346298, "num_tokens": 2771304.0, "step": 760 }, { "epoch": 5.884615384615385, "grad_norm": 0.0, "learning_rate": 4.618e-05, "loss": 1.6985, "mean_token_accuracy": 0.655203464627266, "num_tokens": 2789693.0, "step": 765 }, { "epoch": 5.923076923076923, "grad_norm": 0.0, "learning_rate": 4.6155e-05, "loss": 1.7607, "mean_token_accuracy": 0.6439933836460113, "num_tokens": 2808244.0, "step": 770 }, { "epoch": 5.961538461538462, "grad_norm": 0.0, "learning_rate": 4.613e-05, "loss": 1.7215, "mean_token_accuracy": 0.6554672598838807, "num_tokens": 2826964.0, "step": 775 }, { "epoch": 6.0, "grad_norm": 0.0, "learning_rate": 4.610500000000001e-05, "loss": 1.8066, "mean_token_accuracy": 0.6397966831922531, "num_tokens": 2844630.0, "step": 780 }, { "epoch": 6.038461538461538, "grad_norm": 0.0, "learning_rate": 4.608e-05, "loss": 1.7057, "mean_token_accuracy": 0.6596891850233078, "num_tokens": 2863298.0, "step": 785 }, { "epoch": 6.076923076923077, "grad_norm": 0.0, "learning_rate": 4.6055000000000005e-05, "loss": 1.7406, "mean_token_accuracy": 0.6515646994113922, "num_tokens": 2880896.0, "step": 790 }, { "epoch": 6.115384615384615, "grad_norm": 0.0, "learning_rate": 4.603e-05, "loss": 1.7509, "mean_token_accuracy": 0.6432402580976486, "num_tokens": 2899195.0, "step": 795 }, { "epoch": 6.153846153846154, "grad_norm": 0.0, "learning_rate": 4.6005000000000004e-05, "loss": 1.7547, "mean_token_accuracy": 0.6499321848154068, "num_tokens": 2916949.0, "step": 800 }, { "epoch": 6.1923076923076925, "grad_norm": 0.0, "learning_rate": 4.5980000000000004e-05, "loss": 1.7903, "mean_token_accuracy": 0.6418056339025497, "num_tokens": 2935129.0, "step": 805 }, { "epoch": 6.230769230769231, "grad_norm": 0.0, "learning_rate": 4.5955e-05, "loss": 1.7601, "mean_token_accuracy": 0.64379281103611, "num_tokens": 2953207.0, "step": 810 }, { "epoch": 6.269230769230769, "grad_norm": 0.0, "learning_rate": 4.593e-05, "loss": 1.7694, "mean_token_accuracy": 0.6440808981657028, "num_tokens": 2971018.0, "step": 815 }, { "epoch": 6.3076923076923075, "grad_norm": 0.0, "learning_rate": 4.5905e-05, "loss": 1.6246, "mean_token_accuracy": 0.6741865813732147, "num_tokens": 2990306.0, "step": 820 }, { "epoch": 6.346153846153846, "grad_norm": 0.0, "learning_rate": 4.588e-05, "loss": 1.7449, "mean_token_accuracy": 0.6494547843933105, "num_tokens": 3008827.0, "step": 825 }, { "epoch": 6.384615384615385, "grad_norm": 0.0, "learning_rate": 4.5855e-05, "loss": 1.716, "mean_token_accuracy": 0.6559219211339951, "num_tokens": 3027465.0, "step": 830 }, { "epoch": 6.423076923076923, "grad_norm": 0.0, "learning_rate": 4.583e-05, "loss": 1.7343, "mean_token_accuracy": 0.6473995357751846, "num_tokens": 3046088.0, "step": 835 }, { "epoch": 6.461538461538462, "grad_norm": 0.0, "learning_rate": 4.5805000000000006e-05, "loss": 1.7288, "mean_token_accuracy": 0.653452581167221, "num_tokens": 3064572.0, "step": 840 }, { "epoch": 6.5, "grad_norm": 0.0, "learning_rate": 4.578e-05, "loss": 1.7867, "mean_token_accuracy": 0.6452230870723724, "num_tokens": 3081959.0, "step": 845 }, { "epoch": 6.538461538461538, "grad_norm": 0.0, "learning_rate": 4.5755000000000005e-05, "loss": 1.7674, "mean_token_accuracy": 0.6463764935731888, "num_tokens": 3100123.0, "step": 850 }, { "epoch": 6.576923076923077, "grad_norm": 0.0, "learning_rate": 4.573e-05, "loss": 1.7434, "mean_token_accuracy": 0.6475829243659973, "num_tokens": 3117886.0, "step": 855 }, { "epoch": 6.615384615384615, "grad_norm": 0.0, "learning_rate": 4.5705000000000004e-05, "loss": 1.6876, "mean_token_accuracy": 0.6611418306827546, "num_tokens": 3136818.0, "step": 860 }, { "epoch": 6.653846153846154, "grad_norm": 0.0, "learning_rate": 4.568e-05, "loss": 1.6901, "mean_token_accuracy": 0.6592308074235916, "num_tokens": 3156269.0, "step": 865 }, { "epoch": 6.6923076923076925, "grad_norm": 0.0, "learning_rate": 4.5655e-05, "loss": 1.7783, "mean_token_accuracy": 0.6404279589653015, "num_tokens": 3174097.0, "step": 870 }, { "epoch": 6.730769230769231, "grad_norm": 0.0, "learning_rate": 4.563e-05, "loss": 1.7347, "mean_token_accuracy": 0.649324044585228, "num_tokens": 3192152.0, "step": 875 }, { "epoch": 6.769230769230769, "grad_norm": 0.0, "learning_rate": 4.5605e-05, "loss": 1.7318, "mean_token_accuracy": 0.6562680333852768, "num_tokens": 3210441.0, "step": 880 }, { "epoch": 6.8076923076923075, "grad_norm": 0.0, "learning_rate": 4.558e-05, "loss": 1.754, "mean_token_accuracy": 0.6517583757638932, "num_tokens": 3228565.0, "step": 885 }, { "epoch": 6.846153846153846, "grad_norm": 0.0, "learning_rate": 4.5555e-05, "loss": 1.719, "mean_token_accuracy": 0.6558358192443847, "num_tokens": 3246686.0, "step": 890 }, { "epoch": 6.884615384615385, "grad_norm": 0.0, "learning_rate": 4.553e-05, "loss": 1.7535, "mean_token_accuracy": 0.6434608608484268, "num_tokens": 3264528.0, "step": 895 }, { "epoch": 6.923076923076923, "grad_norm": 0.0, "learning_rate": 4.5505000000000006e-05, "loss": 1.7823, "mean_token_accuracy": 0.6435014694929123, "num_tokens": 3282692.0, "step": 900 }, { "epoch": 6.961538461538462, "grad_norm": 0.0, "learning_rate": 4.548e-05, "loss": 1.8069, "mean_token_accuracy": 0.6398796170949936, "num_tokens": 3300093.0, "step": 905 }, { "epoch": 7.0, "grad_norm": 0.0, "learning_rate": 4.5455000000000004e-05, "loss": 1.7239, "mean_token_accuracy": 0.6568648606538773, "num_tokens": 3318735.0, "step": 910 }, { "epoch": 7.038461538461538, "grad_norm": 0.0, "learning_rate": 4.543e-05, "loss": 1.6958, "mean_token_accuracy": 0.657955089211464, "num_tokens": 3337799.0, "step": 915 }, { "epoch": 7.076923076923077, "grad_norm": 0.0, "learning_rate": 4.5405e-05, "loss": 1.704, "mean_token_accuracy": 0.6572122246026992, "num_tokens": 3356448.0, "step": 920 }, { "epoch": 7.115384615384615, "grad_norm": 0.0, "learning_rate": 4.538e-05, "loss": 1.7811, "mean_token_accuracy": 0.6435384958982467, "num_tokens": 3374165.0, "step": 925 }, { "epoch": 7.153846153846154, "grad_norm": 0.0, "learning_rate": 4.5355e-05, "loss": 1.7932, "mean_token_accuracy": 0.643210482597351, "num_tokens": 3391722.0, "step": 930 }, { "epoch": 7.1923076923076925, "grad_norm": 0.0, "learning_rate": 4.533e-05, "loss": 1.7659, "mean_token_accuracy": 0.6444118946790696, "num_tokens": 3409413.0, "step": 935 }, { "epoch": 7.230769230769231, "grad_norm": 0.0, "learning_rate": 4.5305e-05, "loss": 1.6323, "mean_token_accuracy": 0.672014307975769, "num_tokens": 3428160.0, "step": 940 }, { "epoch": 7.269230769230769, "grad_norm": 0.0, "learning_rate": 4.528e-05, "loss": 1.7143, "mean_token_accuracy": 0.6540875136852264, "num_tokens": 3447098.0, "step": 945 }, { "epoch": 7.3076923076923075, "grad_norm": 0.0, "learning_rate": 4.5255000000000006e-05, "loss": 1.7804, "mean_token_accuracy": 0.6401316851377488, "num_tokens": 3465037.0, "step": 950 }, { "epoch": 7.346153846153846, "grad_norm": 0.0, "learning_rate": 4.523e-05, "loss": 1.7542, "mean_token_accuracy": 0.6487213045358657, "num_tokens": 3483393.0, "step": 955 }, { "epoch": 7.384615384615385, "grad_norm": 0.0, "learning_rate": 4.5205000000000005e-05, "loss": 1.7863, "mean_token_accuracy": 0.6442375689744949, "num_tokens": 3500954.0, "step": 960 }, { "epoch": 7.423076923076923, "grad_norm": 0.0, "learning_rate": 4.518e-05, "loss": 1.807, "mean_token_accuracy": 0.6401415497064591, "num_tokens": 3519001.0, "step": 965 }, { "epoch": 7.461538461538462, "grad_norm": 0.0, "learning_rate": 4.5155000000000004e-05, "loss": 1.7395, "mean_token_accuracy": 0.6544391065835953, "num_tokens": 3537230.0, "step": 970 }, { "epoch": 7.5, "grad_norm": 0.0, "learning_rate": 4.513e-05, "loss": 1.7505, "mean_token_accuracy": 0.6461744338274003, "num_tokens": 3555470.0, "step": 975 }, { "epoch": 7.538461538461538, "grad_norm": 0.0, "learning_rate": 4.5105e-05, "loss": 1.7005, "mean_token_accuracy": 0.6609607338905334, "num_tokens": 3574088.0, "step": 980 }, { "epoch": 7.576923076923077, "grad_norm": 0.0, "learning_rate": 4.508e-05, "loss": 1.7752, "mean_token_accuracy": 0.6407621741294861, "num_tokens": 3592361.0, "step": 985 }, { "epoch": 7.615384615384615, "grad_norm": 0.0, "learning_rate": 4.5055e-05, "loss": 1.7903, "mean_token_accuracy": 0.6438402742147445, "num_tokens": 3610243.0, "step": 990 }, { "epoch": 7.653846153846154, "grad_norm": 0.0, "learning_rate": 4.503e-05, "loss": 1.7926, "mean_token_accuracy": 0.6445695728063583, "num_tokens": 3627858.0, "step": 995 }, { "epoch": 7.6923076923076925, "grad_norm": 0.0, "learning_rate": 4.5005e-05, "loss": 1.679, "mean_token_accuracy": 0.6652253627777099, "num_tokens": 3646594.0, "step": 1000 }, { "epoch": 7.730769230769231, "grad_norm": 0.0, "learning_rate": 4.498e-05, "loss": 1.8077, "mean_token_accuracy": 0.6364081174135208, "num_tokens": 3664583.0, "step": 1005 }, { "epoch": 7.769230769230769, "grad_norm": 0.0, "learning_rate": 4.4955000000000006e-05, "loss": 1.7589, "mean_token_accuracy": 0.6449447929859161, "num_tokens": 3682471.0, "step": 1010 }, { "epoch": 7.8076923076923075, "grad_norm": 0.0, "learning_rate": 4.493e-05, "loss": 1.6992, "mean_token_accuracy": 0.6603027373552323, "num_tokens": 3701373.0, "step": 1015 }, { "epoch": 7.846153846153846, "grad_norm": 0.0, "learning_rate": 4.4905000000000005e-05, "loss": 1.6505, "mean_token_accuracy": 0.6682931154966354, "num_tokens": 3720280.0, "step": 1020 }, { "epoch": 7.884615384615385, "grad_norm": 0.0, "learning_rate": 4.488e-05, "loss": 1.7144, "mean_token_accuracy": 0.6550876766443252, "num_tokens": 3738885.0, "step": 1025 }, { "epoch": 7.923076923076923, "grad_norm": 0.0, "learning_rate": 4.4855e-05, "loss": 1.6457, "mean_token_accuracy": 0.6694303750991821, "num_tokens": 3757638.0, "step": 1030 }, { "epoch": 7.961538461538462, "grad_norm": 0.0, "learning_rate": 4.483e-05, "loss": 1.8101, "mean_token_accuracy": 0.6355878323316574, "num_tokens": 3775161.0, "step": 1035 }, { "epoch": 8.0, "grad_norm": 0.0, "learning_rate": 4.4805e-05, "loss": 1.7585, "mean_token_accuracy": 0.6487736940383911, "num_tokens": 3792840.0, "step": 1040 }, { "epoch": 8.038461538461538, "grad_norm": 0.0, "learning_rate": 4.478e-05, "loss": 1.7554, "mean_token_accuracy": 0.6465101599693298, "num_tokens": 3810513.0, "step": 1045 }, { "epoch": 8.076923076923077, "grad_norm": 0.0, "learning_rate": 4.4755e-05, "loss": 1.7955, "mean_token_accuracy": 0.6376824468374253, "num_tokens": 3828269.0, "step": 1050 }, { "epoch": 8.115384615384615, "grad_norm": 0.0, "learning_rate": 4.473e-05, "loss": 1.7313, "mean_token_accuracy": 0.6513699233531952, "num_tokens": 3846785.0, "step": 1055 }, { "epoch": 8.153846153846153, "grad_norm": 0.0, "learning_rate": 4.4705e-05, "loss": 1.779, "mean_token_accuracy": 0.6457870662212372, "num_tokens": 3864589.0, "step": 1060 }, { "epoch": 8.192307692307692, "grad_norm": 0.0, "learning_rate": 4.468e-05, "loss": 1.7206, "mean_token_accuracy": 0.655270129442215, "num_tokens": 3882774.0, "step": 1065 }, { "epoch": 8.23076923076923, "grad_norm": 0.0, "learning_rate": 4.4655000000000005e-05, "loss": 1.6979, "mean_token_accuracy": 0.6581394881010055, "num_tokens": 3901162.0, "step": 1070 }, { "epoch": 8.26923076923077, "grad_norm": 0.0, "learning_rate": 4.463e-05, "loss": 1.7762, "mean_token_accuracy": 0.6421261847019195, "num_tokens": 3919194.0, "step": 1075 }, { "epoch": 8.307692307692308, "grad_norm": 0.0, "learning_rate": 4.4605000000000004e-05, "loss": 1.7165, "mean_token_accuracy": 0.6560532629489899, "num_tokens": 3937378.0, "step": 1080 }, { "epoch": 8.346153846153847, "grad_norm": 0.0, "learning_rate": 4.458e-05, "loss": 1.7614, "mean_token_accuracy": 0.6535450726747513, "num_tokens": 3955373.0, "step": 1085 }, { "epoch": 8.384615384615385, "grad_norm": 0.0, "learning_rate": 4.4555e-05, "loss": 1.6481, "mean_token_accuracy": 0.6688576519489289, "num_tokens": 3974246.0, "step": 1090 }, { "epoch": 8.423076923076923, "grad_norm": 0.0, "learning_rate": 4.453e-05, "loss": 1.6684, "mean_token_accuracy": 0.6643936514854432, "num_tokens": 3992871.0, "step": 1095 }, { "epoch": 8.461538461538462, "grad_norm": 0.0, "learning_rate": 4.4505e-05, "loss": 1.7571, "mean_token_accuracy": 0.6398821622133255, "num_tokens": 4011349.0, "step": 1100 }, { "epoch": 8.5, "grad_norm": 0.0, "learning_rate": 4.448e-05, "loss": 1.784, "mean_token_accuracy": 0.6396127253770828, "num_tokens": 4029422.0, "step": 1105 }, { "epoch": 8.538461538461538, "grad_norm": 0.0, "learning_rate": 4.4455e-05, "loss": 1.7972, "mean_token_accuracy": 0.6439882755279541, "num_tokens": 4047081.0, "step": 1110 }, { "epoch": 8.576923076923077, "grad_norm": 0.0, "learning_rate": 4.443e-05, "loss": 1.8174, "mean_token_accuracy": 0.6325599551200867, "num_tokens": 4065118.0, "step": 1115 }, { "epoch": 8.615384615384615, "grad_norm": 0.0, "learning_rate": 4.4405e-05, "loss": 1.7934, "mean_token_accuracy": 0.6423143684864044, "num_tokens": 4083327.0, "step": 1120 }, { "epoch": 8.653846153846153, "grad_norm": 0.0, "learning_rate": 4.438e-05, "loss": 1.746, "mean_token_accuracy": 0.646085262298584, "num_tokens": 4101731.0, "step": 1125 }, { "epoch": 8.692307692307692, "grad_norm": 0.0, "learning_rate": 4.4355000000000005e-05, "loss": 1.7223, "mean_token_accuracy": 0.6499712377786636, "num_tokens": 4120160.0, "step": 1130 }, { "epoch": 8.73076923076923, "grad_norm": 0.0, "learning_rate": 4.4330000000000004e-05, "loss": 1.7363, "mean_token_accuracy": 0.6554156959056854, "num_tokens": 4138405.0, "step": 1135 }, { "epoch": 8.76923076923077, "grad_norm": 0.0, "learning_rate": 4.4305000000000004e-05, "loss": 1.6461, "mean_token_accuracy": 0.6685263246297837, "num_tokens": 4157763.0, "step": 1140 }, { "epoch": 8.807692307692308, "grad_norm": 0.0, "learning_rate": 4.428e-05, "loss": 1.7541, "mean_token_accuracy": 0.6515760749578476, "num_tokens": 4176205.0, "step": 1145 }, { "epoch": 8.846153846153847, "grad_norm": 0.0, "learning_rate": 4.4255e-05, "loss": 1.6547, "mean_token_accuracy": 0.6698046773672104, "num_tokens": 4195030.0, "step": 1150 }, { "epoch": 8.884615384615385, "grad_norm": 0.0, "learning_rate": 4.423e-05, "loss": 1.7975, "mean_token_accuracy": 0.638244041800499, "num_tokens": 4212955.0, "step": 1155 }, { "epoch": 8.923076923076923, "grad_norm": 0.0, "learning_rate": 4.4205e-05, "loss": 1.7643, "mean_token_accuracy": 0.6474886476993561, "num_tokens": 4230913.0, "step": 1160 }, { "epoch": 8.961538461538462, "grad_norm": 0.0, "learning_rate": 4.418000000000001e-05, "loss": 1.763, "mean_token_accuracy": 0.6492164492607116, "num_tokens": 4248472.0, "step": 1165 }, { "epoch": 9.0, "grad_norm": 0.0, "learning_rate": 4.4155e-05, "loss": 1.6968, "mean_token_accuracy": 0.661212831735611, "num_tokens": 4266945.0, "step": 1170 }, { "epoch": 9.038461538461538, "grad_norm": 0.0, "learning_rate": 4.4130000000000006e-05, "loss": 1.6492, "mean_token_accuracy": 0.6669099152088165, "num_tokens": 4285875.0, "step": 1175 }, { "epoch": 9.076923076923077, "grad_norm": 0.0, "learning_rate": 4.4105e-05, "loss": 1.7311, "mean_token_accuracy": 0.6541598349809646, "num_tokens": 4304365.0, "step": 1180 }, { "epoch": 9.115384615384615, "grad_norm": 0.0, "learning_rate": 4.4080000000000005e-05, "loss": 1.6924, "mean_token_accuracy": 0.6604053825139999, "num_tokens": 4323536.0, "step": 1185 }, { "epoch": 9.153846153846153, "grad_norm": 0.0, "learning_rate": 4.4055000000000004e-05, "loss": 1.8026, "mean_token_accuracy": 0.6396621257066727, "num_tokens": 4341032.0, "step": 1190 }, { "epoch": 9.192307692307692, "grad_norm": 0.0, "learning_rate": 4.4030000000000004e-05, "loss": 1.7641, "mean_token_accuracy": 0.644762921333313, "num_tokens": 4358653.0, "step": 1195 }, { "epoch": 9.23076923076923, "grad_norm": 0.0, "learning_rate": 4.4005e-05, "loss": 1.8077, "mean_token_accuracy": 0.6364401876926422, "num_tokens": 4376472.0, "step": 1200 }, { "epoch": 9.26923076923077, "grad_norm": 0.0, "learning_rate": 4.398e-05, "loss": 1.7567, "mean_token_accuracy": 0.6473460882902146, "num_tokens": 4394898.0, "step": 1205 }, { "epoch": 9.307692307692308, "grad_norm": 0.0, "learning_rate": 4.3955e-05, "loss": 1.6926, "mean_token_accuracy": 0.6541574269533157, "num_tokens": 4413886.0, "step": 1210 }, { "epoch": 9.346153846153847, "grad_norm": 0.0, "learning_rate": 4.393e-05, "loss": 1.6822, "mean_token_accuracy": 0.6630573570728302, "num_tokens": 4432735.0, "step": 1215 }, { "epoch": 9.384615384615385, "grad_norm": 0.0, "learning_rate": 4.3905e-05, "loss": 1.7395, "mean_token_accuracy": 0.6511468648910522, "num_tokens": 4450737.0, "step": 1220 }, { "epoch": 9.423076923076923, "grad_norm": 0.0, "learning_rate": 4.388000000000001e-05, "loss": 1.7185, "mean_token_accuracy": 0.6551208138465882, "num_tokens": 4469277.0, "step": 1225 }, { "epoch": 9.461538461538462, "grad_norm": 0.0, "learning_rate": 4.3855e-05, "loss": 1.7301, "mean_token_accuracy": 0.6529223084449768, "num_tokens": 4487471.0, "step": 1230 }, { "epoch": 9.5, "grad_norm": 0.0, "learning_rate": 4.3830000000000006e-05, "loss": 1.7984, "mean_token_accuracy": 0.6414839684963226, "num_tokens": 4504876.0, "step": 1235 }, { "epoch": 9.538461538461538, "grad_norm": 0.0, "learning_rate": 4.3805000000000005e-05, "loss": 1.7501, "mean_token_accuracy": 0.650190019607544, "num_tokens": 4522905.0, "step": 1240 }, { "epoch": 9.576923076923077, "grad_norm": 0.0, "learning_rate": 4.3780000000000004e-05, "loss": 1.7429, "mean_token_accuracy": 0.6522337943315506, "num_tokens": 4541124.0, "step": 1245 }, { "epoch": 9.615384615384615, "grad_norm": 0.0, "learning_rate": 4.3755000000000004e-05, "loss": 1.6859, "mean_token_accuracy": 0.6608754128217698, "num_tokens": 4560264.0, "step": 1250 }, { "epoch": 9.653846153846153, "grad_norm": 0.0, "learning_rate": 4.373e-05, "loss": 1.6919, "mean_token_accuracy": 0.6609419882297516, "num_tokens": 4579633.0, "step": 1255 }, { "epoch": 9.692307692307692, "grad_norm": 0.0, "learning_rate": 4.3705e-05, "loss": 1.7671, "mean_token_accuracy": 0.6452909797430039, "num_tokens": 4597291.0, "step": 1260 }, { "epoch": 9.73076923076923, "grad_norm": 0.0, "learning_rate": 4.368e-05, "loss": 1.7981, "mean_token_accuracy": 0.6409468352794647, "num_tokens": 4615469.0, "step": 1265 }, { "epoch": 9.76923076923077, "grad_norm": 0.0, "learning_rate": 4.3655e-05, "loss": 1.7488, "mean_token_accuracy": 0.6545175701379776, "num_tokens": 4633258.0, "step": 1270 }, { "epoch": 9.807692307692308, "grad_norm": 0.0, "learning_rate": 4.363000000000001e-05, "loss": 1.7886, "mean_token_accuracy": 0.6439413070678711, "num_tokens": 4650771.0, "step": 1275 }, { "epoch": 9.846153846153847, "grad_norm": 0.0, "learning_rate": 4.3605e-05, "loss": 1.6981, "mean_token_accuracy": 0.658900797367096, "num_tokens": 4669522.0, "step": 1280 }, { "epoch": 9.884615384615385, "grad_norm": 0.0, "learning_rate": 4.3580000000000006e-05, "loss": 1.778, "mean_token_accuracy": 0.6451369762420655, "num_tokens": 4687769.0, "step": 1285 }, { "epoch": 9.923076923076923, "grad_norm": 0.0, "learning_rate": 4.3555e-05, "loss": 1.733, "mean_token_accuracy": 0.6531021893024445, "num_tokens": 4705563.0, "step": 1290 }, { "epoch": 9.961538461538462, "grad_norm": 0.0, "learning_rate": 4.3530000000000005e-05, "loss": 1.7758, "mean_token_accuracy": 0.6376697093248367, "num_tokens": 4723178.0, "step": 1295 }, { "epoch": 10.0, "grad_norm": 0.0, "learning_rate": 4.3505000000000004e-05, "loss": 1.7658, "mean_token_accuracy": 0.6424418658018112, "num_tokens": 4741050.0, "step": 1300 }, { "epoch": 10.038461538461538, "grad_norm": 0.0, "learning_rate": 4.3480000000000004e-05, "loss": 1.7156, "mean_token_accuracy": 0.6535384625196456, "num_tokens": 4759671.0, "step": 1305 }, { "epoch": 10.076923076923077, "grad_norm": 0.0, "learning_rate": 4.3455e-05, "loss": 1.7894, "mean_token_accuracy": 0.6466272503137589, "num_tokens": 4777543.0, "step": 1310 }, { "epoch": 10.115384615384615, "grad_norm": 0.0, "learning_rate": 4.343e-05, "loss": 1.7418, "mean_token_accuracy": 0.6525053322315216, "num_tokens": 4795767.0, "step": 1315 }, { "epoch": 10.153846153846153, "grad_norm": 0.0, "learning_rate": 4.3405e-05, "loss": 1.8019, "mean_token_accuracy": 0.641847088932991, "num_tokens": 4812856.0, "step": 1320 }, { "epoch": 10.192307692307692, "grad_norm": 0.0, "learning_rate": 4.338e-05, "loss": 1.713, "mean_token_accuracy": 0.6589140474796296, "num_tokens": 4831209.0, "step": 1325 }, { "epoch": 10.23076923076923, "grad_norm": 0.0, "learning_rate": 4.3355e-05, "loss": 1.7601, "mean_token_accuracy": 0.6469725668430328, "num_tokens": 4849251.0, "step": 1330 }, { "epoch": 10.26923076923077, "grad_norm": 0.0, "learning_rate": 4.333000000000001e-05, "loss": 1.7413, "mean_token_accuracy": 0.6520713210105896, "num_tokens": 4867548.0, "step": 1335 }, { "epoch": 10.307692307692308, "grad_norm": 0.0, "learning_rate": 4.3305e-05, "loss": 1.7205, "mean_token_accuracy": 0.650857400894165, "num_tokens": 4885905.0, "step": 1340 }, { "epoch": 10.346153846153847, "grad_norm": 0.0, "learning_rate": 4.3280000000000006e-05, "loss": 1.7016, "mean_token_accuracy": 0.6605552464723587, "num_tokens": 4904774.0, "step": 1345 }, { "epoch": 10.384615384615385, "grad_norm": 0.0, "learning_rate": 4.3255e-05, "loss": 1.6779, "mean_token_accuracy": 0.6666248500347137, "num_tokens": 4923475.0, "step": 1350 }, { "epoch": 10.423076923076923, "grad_norm": 0.0, "learning_rate": 4.3230000000000005e-05, "loss": 1.7332, "mean_token_accuracy": 0.6488200217485428, "num_tokens": 4942112.0, "step": 1355 }, { "epoch": 10.461538461538462, "grad_norm": 0.0, "learning_rate": 4.3205000000000004e-05, "loss": 1.7457, "mean_token_accuracy": 0.6494636863470078, "num_tokens": 4960202.0, "step": 1360 }, { "epoch": 10.5, "grad_norm": 0.0, "learning_rate": 4.318e-05, "loss": 1.7592, "mean_token_accuracy": 0.6439968854188919, "num_tokens": 4978469.0, "step": 1365 }, { "epoch": 10.538461538461538, "grad_norm": 0.0, "learning_rate": 4.3155e-05, "loss": 1.7408, "mean_token_accuracy": 0.6498746544122695, "num_tokens": 4997586.0, "step": 1370 }, { "epoch": 10.576923076923077, "grad_norm": 0.0, "learning_rate": 4.313e-05, "loss": 1.7511, "mean_token_accuracy": 0.647632023692131, "num_tokens": 5015966.0, "step": 1375 }, { "epoch": 10.615384615384615, "grad_norm": 0.0, "learning_rate": 4.3105e-05, "loss": 1.7565, "mean_token_accuracy": 0.6496962130069732, "num_tokens": 5033532.0, "step": 1380 }, { "epoch": 10.653846153846153, "grad_norm": 0.0, "learning_rate": 4.308e-05, "loss": 1.7098, "mean_token_accuracy": 0.6550800025463104, "num_tokens": 5052100.0, "step": 1385 }, { "epoch": 10.692307692307692, "grad_norm": 0.0, "learning_rate": 4.3055e-05, "loss": 1.7241, "mean_token_accuracy": 0.6496223241090775, "num_tokens": 5070966.0, "step": 1390 }, { "epoch": 10.73076923076923, "grad_norm": 0.0, "learning_rate": 4.3030000000000006e-05, "loss": 1.7204, "mean_token_accuracy": 0.6540403842926026, "num_tokens": 5089950.0, "step": 1395 }, { "epoch": 10.76923076923077, "grad_norm": 0.0, "learning_rate": 4.3005e-05, "loss": 1.7644, "mean_token_accuracy": 0.6442721575498581, "num_tokens": 5107960.0, "step": 1400 }, { "epoch": 10.807692307692308, "grad_norm": 0.0, "learning_rate": 4.2980000000000005e-05, "loss": 1.7286, "mean_token_accuracy": 0.6496645718812942, "num_tokens": 5126317.0, "step": 1405 }, { "epoch": 10.846153846153847, "grad_norm": 0.0, "learning_rate": 4.2955e-05, "loss": 1.7809, "mean_token_accuracy": 0.6493318378925323, "num_tokens": 5143608.0, "step": 1410 }, { "epoch": 10.884615384615385, "grad_norm": 0.0, "learning_rate": 4.2930000000000004e-05, "loss": 1.7568, "mean_token_accuracy": 0.6470446825027466, "num_tokens": 5161679.0, "step": 1415 }, { "epoch": 10.923076923076923, "grad_norm": 0.0, "learning_rate": 4.2905000000000003e-05, "loss": 1.7742, "mean_token_accuracy": 0.6468683093786239, "num_tokens": 5179282.0, "step": 1420 }, { "epoch": 10.961538461538462, "grad_norm": 0.0, "learning_rate": 4.288e-05, "loss": 1.6856, "mean_token_accuracy": 0.6599813520908355, "num_tokens": 5197538.0, "step": 1425 }, { "epoch": 11.0, "grad_norm": 0.0, "learning_rate": 4.2855e-05, "loss": 1.7956, "mean_token_accuracy": 0.642606571316719, "num_tokens": 5215155.0, "step": 1430 }, { "epoch": 11.038461538461538, "grad_norm": 0.0, "learning_rate": 4.283e-05, "loss": 1.7845, "mean_token_accuracy": 0.6424670696258545, "num_tokens": 5232784.0, "step": 1435 }, { "epoch": 11.076923076923077, "grad_norm": 0.0, "learning_rate": 4.2805e-05, "loss": 1.7199, "mean_token_accuracy": 0.6572603791952133, "num_tokens": 5250874.0, "step": 1440 }, { "epoch": 11.115384615384615, "grad_norm": 0.0, "learning_rate": 4.278e-05, "loss": 1.6408, "mean_token_accuracy": 0.6750534921884537, "num_tokens": 5270343.0, "step": 1445 }, { "epoch": 11.153846153846153, "grad_norm": 0.0, "learning_rate": 4.2755e-05, "loss": 1.7426, "mean_token_accuracy": 0.6530660510063171, "num_tokens": 5287928.0, "step": 1450 }, { "epoch": 11.192307692307692, "grad_norm": 0.0, "learning_rate": 4.2730000000000006e-05, "loss": 1.7704, "mean_token_accuracy": 0.6498903334140778, "num_tokens": 5305899.0, "step": 1455 }, { "epoch": 11.23076923076923, "grad_norm": 0.0, "learning_rate": 4.2705e-05, "loss": 1.8094, "mean_token_accuracy": 0.6360325515270233, "num_tokens": 5323814.0, "step": 1460 }, { "epoch": 11.26923076923077, "grad_norm": 0.0, "learning_rate": 4.2680000000000005e-05, "loss": 1.7991, "mean_token_accuracy": 0.6365718424320221, "num_tokens": 5341443.0, "step": 1465 }, { "epoch": 11.307692307692308, "grad_norm": 0.0, "learning_rate": 4.2655e-05, "loss": 1.6735, "mean_token_accuracy": 0.6595575481653213, "num_tokens": 5361091.0, "step": 1470 }, { "epoch": 11.346153846153847, "grad_norm": 0.0, "learning_rate": 4.2630000000000004e-05, "loss": 1.8037, "mean_token_accuracy": 0.639024817943573, "num_tokens": 5378623.0, "step": 1475 }, { "epoch": 11.384615384615385, "grad_norm": 0.0, "learning_rate": 4.2605e-05, "loss": 1.7528, "mean_token_accuracy": 0.6455299586057663, "num_tokens": 5397097.0, "step": 1480 }, { "epoch": 11.423076923076923, "grad_norm": 0.0, "learning_rate": 4.258e-05, "loss": 1.7405, "mean_token_accuracy": 0.6521054923534393, "num_tokens": 5415277.0, "step": 1485 }, { "epoch": 11.461538461538462, "grad_norm": 0.0, "learning_rate": 4.2555e-05, "loss": 1.7168, "mean_token_accuracy": 0.6546613723039627, "num_tokens": 5433912.0, "step": 1490 }, { "epoch": 11.5, "grad_norm": 0.0, "learning_rate": 4.253e-05, "loss": 1.6738, "mean_token_accuracy": 0.6659036576747894, "num_tokens": 5453181.0, "step": 1495 }, { "epoch": 11.538461538461538, "grad_norm": 0.0, "learning_rate": 4.2505e-05, "loss": 1.7129, "mean_token_accuracy": 0.6556811064481736, "num_tokens": 5472123.0, "step": 1500 }, { "epoch": 11.576923076923077, "grad_norm": 0.0, "learning_rate": 4.248e-05, "loss": 1.7642, "mean_token_accuracy": 0.6464402437210083, "num_tokens": 5489385.0, "step": 1505 }, { "epoch": 11.615384615384615, "grad_norm": 0.0, "learning_rate": 4.2455e-05, "loss": 1.7674, "mean_token_accuracy": 0.6475972950458526, "num_tokens": 5506841.0, "step": 1510 }, { "epoch": 11.653846153846153, "grad_norm": 0.0, "learning_rate": 4.2430000000000005e-05, "loss": 1.7238, "mean_token_accuracy": 0.6543399661779403, "num_tokens": 5525059.0, "step": 1515 }, { "epoch": 11.692307692307692, "grad_norm": 0.0, "learning_rate": 4.2405e-05, "loss": 1.7736, "mean_token_accuracy": 0.6412011951208114, "num_tokens": 5542986.0, "step": 1520 }, { "epoch": 11.73076923076923, "grad_norm": 0.0, "learning_rate": 4.2380000000000004e-05, "loss": 1.7368, "mean_token_accuracy": 0.6486877024173736, "num_tokens": 5560991.0, "step": 1525 }, { "epoch": 11.76923076923077, "grad_norm": 0.0, "learning_rate": 4.2355000000000004e-05, "loss": 1.7506, "mean_token_accuracy": 0.6509437531232833, "num_tokens": 5579072.0, "step": 1530 }, { "epoch": 11.807692307692308, "grad_norm": 0.0, "learning_rate": 4.233e-05, "loss": 1.7534, "mean_token_accuracy": 0.6474777191877366, "num_tokens": 5597799.0, "step": 1535 }, { "epoch": 11.846153846153847, "grad_norm": 0.0, "learning_rate": 4.2305e-05, "loss": 1.7265, "mean_token_accuracy": 0.6489449918270112, "num_tokens": 5616381.0, "step": 1540 }, { "epoch": 11.884615384615385, "grad_norm": 0.0, "learning_rate": 4.228e-05, "loss": 1.7094, "mean_token_accuracy": 0.6595662713050843, "num_tokens": 5635297.0, "step": 1545 }, { "epoch": 11.923076923076923, "grad_norm": 0.0, "learning_rate": 4.2255e-05, "loss": 1.7149, "mean_token_accuracy": 0.6520039916038514, "num_tokens": 5653729.0, "step": 1550 }, { "epoch": 11.961538461538462, "grad_norm": 0.0, "learning_rate": 4.223e-05, "loss": 1.7762, "mean_token_accuracy": 0.6417809814214707, "num_tokens": 5671517.0, "step": 1555 }, { "epoch": 12.0, "grad_norm": 0.0, "learning_rate": 4.2205e-05, "loss": 1.7578, "mean_token_accuracy": 0.6531469017267227, "num_tokens": 5689260.0, "step": 1560 }, { "epoch": 12.038461538461538, "grad_norm": 0.0, "learning_rate": 4.2180000000000006e-05, "loss": 1.7791, "mean_token_accuracy": 0.6447166264057159, "num_tokens": 5706593.0, "step": 1565 }, { "epoch": 12.076923076923077, "grad_norm": 0.0, "learning_rate": 4.2155e-05, "loss": 1.7705, "mean_token_accuracy": 0.6480188548564911, "num_tokens": 5724377.0, "step": 1570 }, { "epoch": 12.115384615384615, "grad_norm": 0.0, "learning_rate": 4.2130000000000005e-05, "loss": 1.7288, "mean_token_accuracy": 0.653497377038002, "num_tokens": 5742391.0, "step": 1575 }, { "epoch": 12.153846153846153, "grad_norm": 0.0, "learning_rate": 4.2105e-05, "loss": 1.8046, "mean_token_accuracy": 0.6407176822423934, "num_tokens": 5759857.0, "step": 1580 }, { "epoch": 12.192307692307692, "grad_norm": 0.0, "learning_rate": 4.2080000000000004e-05, "loss": 1.8206, "mean_token_accuracy": 0.6390235781669616, "num_tokens": 5777131.0, "step": 1585 }, { "epoch": 12.23076923076923, "grad_norm": 0.0, "learning_rate": 4.2055e-05, "loss": 1.7858, "mean_token_accuracy": 0.6427943378686904, "num_tokens": 5795346.0, "step": 1590 }, { "epoch": 12.26923076923077, "grad_norm": 0.0, "learning_rate": 4.203e-05, "loss": 1.7938, "mean_token_accuracy": 0.6385728657245636, "num_tokens": 5812776.0, "step": 1595 }, { "epoch": 12.307692307692308, "grad_norm": 0.0, "learning_rate": 4.2005e-05, "loss": 1.7442, "mean_token_accuracy": 0.6511879444122315, "num_tokens": 5831416.0, "step": 1600 }, { "epoch": 12.346153846153847, "grad_norm": 0.0, "learning_rate": 4.198e-05, "loss": 1.7307, "mean_token_accuracy": 0.6532302528619767, "num_tokens": 5850125.0, "step": 1605 }, { "epoch": 12.384615384615385, "grad_norm": 0.0, "learning_rate": 4.1955e-05, "loss": 1.7729, "mean_token_accuracy": 0.6425417333841323, "num_tokens": 5868205.0, "step": 1610 }, { "epoch": 12.423076923076923, "grad_norm": 0.0, "learning_rate": 4.193e-05, "loss": 1.6777, "mean_token_accuracy": 0.6608704894781112, "num_tokens": 5887309.0, "step": 1615 }, { "epoch": 12.461538461538462, "grad_norm": 0.0, "learning_rate": 4.1905e-05, "loss": 1.7262, "mean_token_accuracy": 0.651836696267128, "num_tokens": 5906272.0, "step": 1620 }, { "epoch": 12.5, "grad_norm": 0.0, "learning_rate": 4.1880000000000006e-05, "loss": 1.7005, "mean_token_accuracy": 0.6608695417642594, "num_tokens": 5924826.0, "step": 1625 }, { "epoch": 12.538461538461538, "grad_norm": 0.0, "learning_rate": 4.1855e-05, "loss": 1.6497, "mean_token_accuracy": 0.6752032458782196, "num_tokens": 5944267.0, "step": 1630 }, { "epoch": 12.576923076923077, "grad_norm": 0.0, "learning_rate": 4.1830000000000004e-05, "loss": 1.7544, "mean_token_accuracy": 0.6387367337942124, "num_tokens": 5962881.0, "step": 1635 }, { "epoch": 12.615384615384615, "grad_norm": 0.0, "learning_rate": 4.1805e-05, "loss": 1.7518, "mean_token_accuracy": 0.6540467470884324, "num_tokens": 5980352.0, "step": 1640 }, { "epoch": 12.653846153846153, "grad_norm": 0.0, "learning_rate": 4.178e-05, "loss": 1.6636, "mean_token_accuracy": 0.6685753583908081, "num_tokens": 5998878.0, "step": 1645 }, { "epoch": 12.692307692307692, "grad_norm": 0.0, "learning_rate": 4.1755e-05, "loss": 1.7515, "mean_token_accuracy": 0.6498158782720566, "num_tokens": 6016722.0, "step": 1650 }, { "epoch": 12.73076923076923, "grad_norm": 0.0, "learning_rate": 4.173e-05, "loss": 1.6561, "mean_token_accuracy": 0.6655506461858749, "num_tokens": 6035815.0, "step": 1655 }, { "epoch": 12.76923076923077, "grad_norm": 0.0, "learning_rate": 4.1705e-05, "loss": 1.7659, "mean_token_accuracy": 0.6420869499444961, "num_tokens": 6053674.0, "step": 1660 }, { "epoch": 12.807692307692308, "grad_norm": 0.0, "learning_rate": 4.168e-05, "loss": 1.7457, "mean_token_accuracy": 0.6514944612979889, "num_tokens": 6071380.0, "step": 1665 }, { "epoch": 12.846153846153847, "grad_norm": 0.0, "learning_rate": 4.1655e-05, "loss": 1.7276, "mean_token_accuracy": 0.6513452738523483, "num_tokens": 6090024.0, "step": 1670 }, { "epoch": 12.884615384615385, "grad_norm": 0.0, "learning_rate": 4.163e-05, "loss": 1.7536, "mean_token_accuracy": 0.6504515618085861, "num_tokens": 6108154.0, "step": 1675 }, { "epoch": 12.923076923076923, "grad_norm": 0.0, "learning_rate": 4.1605e-05, "loss": 1.7592, "mean_token_accuracy": 0.6449838757514954, "num_tokens": 6126273.0, "step": 1680 }, { "epoch": 12.961538461538462, "grad_norm": 0.0, "learning_rate": 4.1580000000000005e-05, "loss": 1.6927, "mean_token_accuracy": 0.6574580699205399, "num_tokens": 6144991.0, "step": 1685 }, { "epoch": 13.0, "grad_norm": 0.0, "learning_rate": 4.1555e-05, "loss": 1.7501, "mean_token_accuracy": 0.6433569133281708, "num_tokens": 6163365.0, "step": 1690 }, { "epoch": 13.038461538461538, "grad_norm": 0.0, "learning_rate": 4.1530000000000004e-05, "loss": 1.7762, "mean_token_accuracy": 0.6436060070991516, "num_tokens": 6181008.0, "step": 1695 }, { "epoch": 13.076923076923077, "grad_norm": 0.0, "learning_rate": 4.1504999999999996e-05, "loss": 1.8095, "mean_token_accuracy": 0.6394443988800049, "num_tokens": 6198379.0, "step": 1700 }, { "epoch": 13.115384615384615, "grad_norm": 0.0, "learning_rate": 4.148e-05, "loss": 1.7128, "mean_token_accuracy": 0.6577008962631226, "num_tokens": 6217116.0, "step": 1705 }, { "epoch": 13.153846153846153, "grad_norm": 0.0, "learning_rate": 4.1455e-05, "loss": 1.7454, "mean_token_accuracy": 0.6536228835582734, "num_tokens": 6235654.0, "step": 1710 }, { "epoch": 13.192307692307692, "grad_norm": 0.0, "learning_rate": 4.143e-05, "loss": 1.8063, "mean_token_accuracy": 0.6444395124912262, "num_tokens": 6253216.0, "step": 1715 }, { "epoch": 13.23076923076923, "grad_norm": 0.0, "learning_rate": 4.1405e-05, "loss": 1.7436, "mean_token_accuracy": 0.6505606740713119, "num_tokens": 6270726.0, "step": 1720 }, { "epoch": 13.26923076923077, "grad_norm": 0.0, "learning_rate": 4.138e-05, "loss": 1.7738, "mean_token_accuracy": 0.6452360033988953, "num_tokens": 6287830.0, "step": 1725 }, { "epoch": 13.307692307692308, "grad_norm": 0.0, "learning_rate": 4.1355e-05, "loss": 1.6774, "mean_token_accuracy": 0.6625390321016311, "num_tokens": 6306547.0, "step": 1730 }, { "epoch": 13.346153846153847, "grad_norm": 0.0, "learning_rate": 4.133e-05, "loss": 1.6985, "mean_token_accuracy": 0.6607826173305511, "num_tokens": 6325192.0, "step": 1735 }, { "epoch": 13.384615384615385, "grad_norm": 0.0, "learning_rate": 4.1305e-05, "loss": 1.7158, "mean_token_accuracy": 0.6505321115255356, "num_tokens": 6344059.0, "step": 1740 }, { "epoch": 13.423076923076923, "grad_norm": 0.0, "learning_rate": 4.1280000000000005e-05, "loss": 1.7164, "mean_token_accuracy": 0.652134558558464, "num_tokens": 6362364.0, "step": 1745 }, { "epoch": 13.461538461538462, "grad_norm": 0.0, "learning_rate": 4.1255e-05, "loss": 1.7474, "mean_token_accuracy": 0.6487911343574524, "num_tokens": 6380492.0, "step": 1750 }, { "epoch": 13.5, "grad_norm": 0.0, "learning_rate": 4.123e-05, "loss": 1.7241, "mean_token_accuracy": 0.6540584862232208, "num_tokens": 6399116.0, "step": 1755 }, { "epoch": 13.538461538461538, "grad_norm": 0.0, "learning_rate": 4.1205e-05, "loss": 1.6792, "mean_token_accuracy": 0.6668368667364121, "num_tokens": 6417790.0, "step": 1760 }, { "epoch": 13.576923076923077, "grad_norm": 0.0, "learning_rate": 4.118e-05, "loss": 1.7109, "mean_token_accuracy": 0.651064109802246, "num_tokens": 6436016.0, "step": 1765 }, { "epoch": 13.615384615384615, "grad_norm": 0.0, "learning_rate": 4.1155e-05, "loss": 1.7844, "mean_token_accuracy": 0.6407437533140182, "num_tokens": 6453594.0, "step": 1770 }, { "epoch": 13.653846153846153, "grad_norm": 0.0, "learning_rate": 4.113e-05, "loss": 1.7622, "mean_token_accuracy": 0.642905455827713, "num_tokens": 6471529.0, "step": 1775 }, { "epoch": 13.692307692307692, "grad_norm": 0.0, "learning_rate": 4.110500000000001e-05, "loss": 1.7422, "mean_token_accuracy": 0.6522348582744598, "num_tokens": 6489689.0, "step": 1780 }, { "epoch": 13.73076923076923, "grad_norm": 0.0, "learning_rate": 4.108e-05, "loss": 1.764, "mean_token_accuracy": 0.6411303877830505, "num_tokens": 6508258.0, "step": 1785 }, { "epoch": 13.76923076923077, "grad_norm": 0.0, "learning_rate": 4.1055000000000006e-05, "loss": 1.733, "mean_token_accuracy": 0.6555010586977005, "num_tokens": 6526939.0, "step": 1790 }, { "epoch": 13.807692307692308, "grad_norm": 0.0, "learning_rate": 4.103e-05, "loss": 1.7877, "mean_token_accuracy": 0.6444415628910065, "num_tokens": 6544917.0, "step": 1795 }, { "epoch": 13.846153846153847, "grad_norm": 0.0, "learning_rate": 4.1005000000000005e-05, "loss": 1.6953, "mean_token_accuracy": 0.6559826284646988, "num_tokens": 6563694.0, "step": 1800 }, { "epoch": 13.884615384615385, "grad_norm": 0.0, "learning_rate": 4.0980000000000004e-05, "loss": 1.7663, "mean_token_accuracy": 0.6459847629070282, "num_tokens": 6581448.0, "step": 1805 }, { "epoch": 13.923076923076923, "grad_norm": 0.0, "learning_rate": 4.0955000000000003e-05, "loss": 1.7447, "mean_token_accuracy": 0.6494243443012238, "num_tokens": 6600126.0, "step": 1810 }, { "epoch": 13.961538461538462, "grad_norm": 0.0, "learning_rate": 4.093e-05, "loss": 1.7329, "mean_token_accuracy": 0.6540934264659881, "num_tokens": 6618707.0, "step": 1815 }, { "epoch": 14.0, "grad_norm": 0.0, "learning_rate": 4.0905e-05, "loss": 1.7251, "mean_token_accuracy": 0.6513684660196304, "num_tokens": 6637470.0, "step": 1820 }, { "epoch": 14.038461538461538, "grad_norm": 0.0, "learning_rate": 4.088e-05, "loss": 1.6674, "mean_token_accuracy": 0.656906321644783, "num_tokens": 6657126.0, "step": 1825 }, { "epoch": 14.076923076923077, "grad_norm": 0.0, "learning_rate": 4.0855e-05, "loss": 1.7146, "mean_token_accuracy": 0.6526720136404037, "num_tokens": 6675746.0, "step": 1830 }, { "epoch": 14.115384615384615, "grad_norm": 0.0, "learning_rate": 4.083e-05, "loss": 1.6588, "mean_token_accuracy": 0.663616544008255, "num_tokens": 6695210.0, "step": 1835 }, { "epoch": 14.153846153846153, "grad_norm": 0.0, "learning_rate": 4.0805000000000007e-05, "loss": 1.7722, "mean_token_accuracy": 0.6440047711133957, "num_tokens": 6713034.0, "step": 1840 }, { "epoch": 14.192307692307692, "grad_norm": 0.0, "learning_rate": 4.078e-05, "loss": 1.7022, "mean_token_accuracy": 0.6584948360919952, "num_tokens": 6731872.0, "step": 1845 }, { "epoch": 14.23076923076923, "grad_norm": 0.0, "learning_rate": 4.0755000000000005e-05, "loss": 1.7393, "mean_token_accuracy": 0.6503253132104874, "num_tokens": 6750060.0, "step": 1850 }, { "epoch": 14.26923076923077, "grad_norm": 0.0, "learning_rate": 4.0730000000000005e-05, "loss": 1.7848, "mean_token_accuracy": 0.6464446872472763, "num_tokens": 6767696.0, "step": 1855 }, { "epoch": 14.307692307692308, "grad_norm": 0.0, "learning_rate": 4.0705000000000004e-05, "loss": 1.7344, "mean_token_accuracy": 0.6560231685638428, "num_tokens": 6785928.0, "step": 1860 }, { "epoch": 14.346153846153847, "grad_norm": 0.0, "learning_rate": 4.0680000000000004e-05, "loss": 1.7642, "mean_token_accuracy": 0.6425097972154618, "num_tokens": 6804181.0, "step": 1865 }, { "epoch": 14.384615384615385, "grad_norm": 0.0, "learning_rate": 4.0655e-05, "loss": 1.7797, "mean_token_accuracy": 0.6466529428958893, "num_tokens": 6821867.0, "step": 1870 }, { "epoch": 14.423076923076923, "grad_norm": 0.0, "learning_rate": 4.063e-05, "loss": 1.8023, "mean_token_accuracy": 0.6425188779830933, "num_tokens": 6839252.0, "step": 1875 }, { "epoch": 14.461538461538462, "grad_norm": 0.0, "learning_rate": 4.0605e-05, "loss": 1.7618, "mean_token_accuracy": 0.6461942523717881, "num_tokens": 6857208.0, "step": 1880 }, { "epoch": 14.5, "grad_norm": 0.0, "learning_rate": 4.058e-05, "loss": 1.7463, "mean_token_accuracy": 0.6480780661106109, "num_tokens": 6875091.0, "step": 1885 }, { "epoch": 14.538461538461538, "grad_norm": 0.0, "learning_rate": 4.055500000000001e-05, "loss": 1.745, "mean_token_accuracy": 0.6500824391841888, "num_tokens": 6893171.0, "step": 1890 }, { "epoch": 14.576923076923077, "grad_norm": 0.0, "learning_rate": 4.053e-05, "loss": 1.6749, "mean_token_accuracy": 0.6628648489713669, "num_tokens": 6911806.0, "step": 1895 }, { "epoch": 14.615384615384615, "grad_norm": 0.0, "learning_rate": 4.0505000000000006e-05, "loss": 1.7282, "mean_token_accuracy": 0.6511125653982163, "num_tokens": 6930030.0, "step": 1900 }, { "epoch": 14.653846153846153, "grad_norm": 0.0, "learning_rate": 4.048e-05, "loss": 1.738, "mean_token_accuracy": 0.6520918369293213, "num_tokens": 6948110.0, "step": 1905 }, { "epoch": 14.692307692307692, "grad_norm": 0.0, "learning_rate": 4.0455000000000005e-05, "loss": 1.764, "mean_token_accuracy": 0.6516493141651154, "num_tokens": 6966319.0, "step": 1910 }, { "epoch": 14.73076923076923, "grad_norm": 0.0, "learning_rate": 4.0430000000000004e-05, "loss": 1.7763, "mean_token_accuracy": 0.6418019294738769, "num_tokens": 6984403.0, "step": 1915 }, { "epoch": 14.76923076923077, "grad_norm": 0.0, "learning_rate": 4.0405000000000004e-05, "loss": 1.8325, "mean_token_accuracy": 0.6336044192314148, "num_tokens": 7001378.0, "step": 1920 }, { "epoch": 14.807692307692308, "grad_norm": 0.0, "learning_rate": 4.038e-05, "loss": 1.7307, "mean_token_accuracy": 0.6568962961435318, "num_tokens": 7019494.0, "step": 1925 }, { "epoch": 14.846153846153847, "grad_norm": 0.0, "learning_rate": 4.0355e-05, "loss": 1.7421, "mean_token_accuracy": 0.6525963991880417, "num_tokens": 7037790.0, "step": 1930 }, { "epoch": 14.884615384615385, "grad_norm": 0.0, "learning_rate": 4.033e-05, "loss": 1.7614, "mean_token_accuracy": 0.6488069415092468, "num_tokens": 7055472.0, "step": 1935 }, { "epoch": 14.923076923076923, "grad_norm": 0.0, "learning_rate": 4.0305e-05, "loss": 1.7207, "mean_token_accuracy": 0.6501289933919907, "num_tokens": 7074488.0, "step": 1940 }, { "epoch": 14.961538461538462, "grad_norm": 0.0, "learning_rate": 4.028e-05, "loss": 1.7577, "mean_token_accuracy": 0.6519861459732056, "num_tokens": 7092711.0, "step": 1945 }, { "epoch": 15.0, "grad_norm": 0.0, "learning_rate": 4.025500000000001e-05, "loss": 1.6968, "mean_token_accuracy": 0.6624333202838898, "num_tokens": 7111575.0, "step": 1950 }, { "epoch": 15.038461538461538, "grad_norm": 0.0, "learning_rate": 4.023e-05, "loss": 1.7382, "mean_token_accuracy": 0.6494399189949036, "num_tokens": 7130012.0, "step": 1955 }, { "epoch": 15.076923076923077, "grad_norm": 0.0, "learning_rate": 4.0205000000000006e-05, "loss": 1.7673, "mean_token_accuracy": 0.6474148839712143, "num_tokens": 7147764.0, "step": 1960 }, { "epoch": 15.115384615384615, "grad_norm": 0.0, "learning_rate": 4.018e-05, "loss": 1.7608, "mean_token_accuracy": 0.6430147528648377, "num_tokens": 7165688.0, "step": 1965 }, { "epoch": 15.153846153846153, "grad_norm": 0.0, "learning_rate": 4.0155000000000004e-05, "loss": 1.768, "mean_token_accuracy": 0.6483392864465714, "num_tokens": 7183723.0, "step": 1970 }, { "epoch": 15.192307692307692, "grad_norm": 0.0, "learning_rate": 4.0130000000000004e-05, "loss": 1.7712, "mean_token_accuracy": 0.6451886177062989, "num_tokens": 7200954.0, "step": 1975 }, { "epoch": 15.23076923076923, "grad_norm": 0.0, "learning_rate": 4.0105e-05, "loss": 1.7852, "mean_token_accuracy": 0.6437583029270172, "num_tokens": 7218897.0, "step": 1980 }, { "epoch": 15.26923076923077, "grad_norm": 0.0, "learning_rate": 4.008e-05, "loss": 1.7186, "mean_token_accuracy": 0.6547761648893357, "num_tokens": 7236557.0, "step": 1985 }, { "epoch": 15.307692307692308, "grad_norm": 0.0, "learning_rate": 4.0055e-05, "loss": 1.7277, "mean_token_accuracy": 0.6466970533132553, "num_tokens": 7255192.0, "step": 1990 }, { "epoch": 15.346153846153847, "grad_norm": 0.0, "learning_rate": 4.003e-05, "loss": 1.817, "mean_token_accuracy": 0.6392653495073318, "num_tokens": 7272870.0, "step": 1995 }, { "epoch": 15.384615384615385, "grad_norm": 0.0, "learning_rate": 4.0005e-05, "loss": 1.7256, "mean_token_accuracy": 0.6513147324323654, "num_tokens": 7291754.0, "step": 2000 }, { "epoch": 15.423076923076923, "grad_norm": 0.0, "learning_rate": 3.998e-05, "loss": 1.7946, "mean_token_accuracy": 0.6427055537700653, "num_tokens": 7309133.0, "step": 2005 }, { "epoch": 15.461538461538462, "grad_norm": 0.0, "learning_rate": 3.9955000000000006e-05, "loss": 1.7461, "mean_token_accuracy": 0.6472333878278732, "num_tokens": 7328009.0, "step": 2010 }, { "epoch": 15.5, "grad_norm": 0.0, "learning_rate": 3.993e-05, "loss": 1.713, "mean_token_accuracy": 0.6539407223463058, "num_tokens": 7346790.0, "step": 2015 }, { "epoch": 15.538461538461538, "grad_norm": 0.0, "learning_rate": 3.9905000000000005e-05, "loss": 1.7869, "mean_token_accuracy": 0.6394775450229645, "num_tokens": 7364128.0, "step": 2020 }, { "epoch": 15.576923076923077, "grad_norm": 0.0, "learning_rate": 3.988e-05, "loss": 1.7062, "mean_token_accuracy": 0.6575020164251327, "num_tokens": 7382926.0, "step": 2025 }, { "epoch": 15.615384615384615, "grad_norm": 0.0, "learning_rate": 3.9855000000000004e-05, "loss": 1.6781, "mean_token_accuracy": 0.6595021516084671, "num_tokens": 7402054.0, "step": 2030 }, { "epoch": 15.653846153846153, "grad_norm": 0.0, "learning_rate": 3.983e-05, "loss": 1.7171, "mean_token_accuracy": 0.6517434686422348, "num_tokens": 7420489.0, "step": 2035 }, { "epoch": 15.692307692307692, "grad_norm": 0.0, "learning_rate": 3.9805e-05, "loss": 1.7047, "mean_token_accuracy": 0.6589993953704834, "num_tokens": 7438755.0, "step": 2040 }, { "epoch": 15.73076923076923, "grad_norm": 0.0, "learning_rate": 3.978e-05, "loss": 1.741, "mean_token_accuracy": 0.6524647355079651, "num_tokens": 7456837.0, "step": 2045 }, { "epoch": 15.76923076923077, "grad_norm": 0.0, "learning_rate": 3.9755e-05, "loss": 1.8081, "mean_token_accuracy": 0.6410402476787567, "num_tokens": 7473972.0, "step": 2050 }, { "epoch": 15.807692307692308, "grad_norm": 0.0, "learning_rate": 3.973e-05, "loss": 1.6997, "mean_token_accuracy": 0.6587257742881775, "num_tokens": 7493430.0, "step": 2055 }, { "epoch": 15.846153846153847, "grad_norm": 0.0, "learning_rate": 3.9705e-05, "loss": 1.7487, "mean_token_accuracy": 0.648233824968338, "num_tokens": 7511123.0, "step": 2060 }, { "epoch": 15.884615384615385, "grad_norm": 0.0, "learning_rate": 3.968e-05, "loss": 1.6579, "mean_token_accuracy": 0.6663879871368408, "num_tokens": 7530727.0, "step": 2065 }, { "epoch": 15.923076923076923, "grad_norm": 0.0, "learning_rate": 3.9655000000000006e-05, "loss": 1.7077, "mean_token_accuracy": 0.6618927121162415, "num_tokens": 7549141.0, "step": 2070 }, { "epoch": 15.961538461538462, "grad_norm": 0.0, "learning_rate": 3.963e-05, "loss": 1.6903, "mean_token_accuracy": 0.6613986879587174, "num_tokens": 7567995.0, "step": 2075 }, { "epoch": 16.0, "grad_norm": 0.0, "learning_rate": 3.9605000000000005e-05, "loss": 1.7907, "mean_token_accuracy": 0.6406063586473465, "num_tokens": 7585680.0, "step": 2080 }, { "epoch": 16.03846153846154, "grad_norm": 0.0, "learning_rate": 3.958e-05, "loss": 1.7458, "mean_token_accuracy": 0.6477556735277176, "num_tokens": 7604192.0, "step": 2085 }, { "epoch": 16.076923076923077, "grad_norm": 0.0, "learning_rate": 3.9555e-05, "loss": 1.67, "mean_token_accuracy": 0.6667103677988052, "num_tokens": 7622647.0, "step": 2090 }, { "epoch": 16.115384615384617, "grad_norm": 0.0, "learning_rate": 3.953e-05, "loss": 1.7433, "mean_token_accuracy": 0.652290990948677, "num_tokens": 7640570.0, "step": 2095 }, { "epoch": 16.153846153846153, "grad_norm": 0.0, "learning_rate": 3.9505e-05, "loss": 1.6936, "mean_token_accuracy": 0.6566350907087326, "num_tokens": 7658920.0, "step": 2100 }, { "epoch": 16.192307692307693, "grad_norm": 0.0, "learning_rate": 3.948e-05, "loss": 1.8121, "mean_token_accuracy": 0.6355432808399201, "num_tokens": 7676608.0, "step": 2105 }, { "epoch": 16.23076923076923, "grad_norm": 0.0, "learning_rate": 3.9455e-05, "loss": 1.7127, "mean_token_accuracy": 0.65393525660038, "num_tokens": 7695148.0, "step": 2110 }, { "epoch": 16.26923076923077, "grad_norm": 0.0, "learning_rate": 3.943e-05, "loss": 1.7343, "mean_token_accuracy": 0.6496871590614319, "num_tokens": 7713440.0, "step": 2115 }, { "epoch": 16.307692307692307, "grad_norm": 0.0, "learning_rate": 3.9405e-05, "loss": 1.7737, "mean_token_accuracy": 0.6332459360361099, "num_tokens": 7731634.0, "step": 2120 }, { "epoch": 16.346153846153847, "grad_norm": 0.0, "learning_rate": 3.938e-05, "loss": 1.7469, "mean_token_accuracy": 0.6482865303754807, "num_tokens": 7749952.0, "step": 2125 }, { "epoch": 16.384615384615383, "grad_norm": 0.0, "learning_rate": 3.9355000000000005e-05, "loss": 1.7426, "mean_token_accuracy": 0.6485352158546448, "num_tokens": 7768389.0, "step": 2130 }, { "epoch": 16.423076923076923, "grad_norm": 0.0, "learning_rate": 3.933e-05, "loss": 1.6815, "mean_token_accuracy": 0.6569307982921601, "num_tokens": 7787655.0, "step": 2135 }, { "epoch": 16.46153846153846, "grad_norm": 0.0, "learning_rate": 3.9305000000000004e-05, "loss": 1.7333, "mean_token_accuracy": 0.6505914777517319, "num_tokens": 7805096.0, "step": 2140 }, { "epoch": 16.5, "grad_norm": 0.0, "learning_rate": 3.9280000000000003e-05, "loss": 1.7297, "mean_token_accuracy": 0.6588895499706269, "num_tokens": 7823436.0, "step": 2145 }, { "epoch": 16.53846153846154, "grad_norm": 0.0, "learning_rate": 3.9255e-05, "loss": 1.6984, "mean_token_accuracy": 0.6600228071212768, "num_tokens": 7842178.0, "step": 2150 }, { "epoch": 16.576923076923077, "grad_norm": 0.0, "learning_rate": 3.923e-05, "loss": 1.7287, "mean_token_accuracy": 0.6550074636936187, "num_tokens": 7860260.0, "step": 2155 }, { "epoch": 16.615384615384617, "grad_norm": 0.0, "learning_rate": 3.9205e-05, "loss": 1.7764, "mean_token_accuracy": 0.6480809897184372, "num_tokens": 7877839.0, "step": 2160 }, { "epoch": 16.653846153846153, "grad_norm": 0.0, "learning_rate": 3.918e-05, "loss": 1.66, "mean_token_accuracy": 0.6657153934240341, "num_tokens": 7896808.0, "step": 2165 }, { "epoch": 16.692307692307693, "grad_norm": 0.0, "learning_rate": 3.9155e-05, "loss": 1.6743, "mean_token_accuracy": 0.6643134742975235, "num_tokens": 7915776.0, "step": 2170 }, { "epoch": 16.73076923076923, "grad_norm": 0.0, "learning_rate": 3.913e-05, "loss": 1.7643, "mean_token_accuracy": 0.6498584270477294, "num_tokens": 7933578.0, "step": 2175 }, { "epoch": 16.76923076923077, "grad_norm": 0.0, "learning_rate": 3.9105000000000006e-05, "loss": 1.8256, "mean_token_accuracy": 0.638142678141594, "num_tokens": 7950816.0, "step": 2180 }, { "epoch": 16.807692307692307, "grad_norm": 0.0, "learning_rate": 3.908e-05, "loss": 1.7457, "mean_token_accuracy": 0.6490698546171189, "num_tokens": 7969482.0, "step": 2185 }, { "epoch": 16.846153846153847, "grad_norm": 0.0, "learning_rate": 3.9055000000000005e-05, "loss": 1.7675, "mean_token_accuracy": 0.6475545018911362, "num_tokens": 7987748.0, "step": 2190 }, { "epoch": 16.884615384615383, "grad_norm": 0.0, "learning_rate": 3.903e-05, "loss": 1.7839, "mean_token_accuracy": 0.639941617846489, "num_tokens": 8005598.0, "step": 2195 }, { "epoch": 16.923076923076923, "grad_norm": 0.0, "learning_rate": 3.9005000000000003e-05, "loss": 1.7498, "mean_token_accuracy": 0.6494361340999604, "num_tokens": 8023661.0, "step": 2200 }, { "epoch": 16.96153846153846, "grad_norm": 0.0, "learning_rate": 3.898e-05, "loss": 1.7785, "mean_token_accuracy": 0.6451682835817337, "num_tokens": 8042112.0, "step": 2205 }, { "epoch": 17.0, "grad_norm": 0.0, "learning_rate": 3.8955e-05, "loss": 1.7905, "mean_token_accuracy": 0.6438369482755661, "num_tokens": 8059785.0, "step": 2210 }, { "epoch": 17.03846153846154, "grad_norm": 0.0, "learning_rate": 3.893e-05, "loss": 1.6973, "mean_token_accuracy": 0.6598532944917679, "num_tokens": 8078152.0, "step": 2215 }, { "epoch": 17.076923076923077, "grad_norm": 0.0, "learning_rate": 3.8905e-05, "loss": 1.7373, "mean_token_accuracy": 0.6510017067193985, "num_tokens": 8096780.0, "step": 2220 }, { "epoch": 17.115384615384617, "grad_norm": 0.0, "learning_rate": 3.888e-05, "loss": 1.7209, "mean_token_accuracy": 0.6596309930086136, "num_tokens": 8114668.0, "step": 2225 }, { "epoch": 17.153846153846153, "grad_norm": 0.0, "learning_rate": 3.8855e-05, "loss": 1.6939, "mean_token_accuracy": 0.6576298594474792, "num_tokens": 8133880.0, "step": 2230 }, { "epoch": 17.192307692307693, "grad_norm": 0.0, "learning_rate": 3.883e-05, "loss": 1.78, "mean_token_accuracy": 0.6426860511302948, "num_tokens": 8151540.0, "step": 2235 }, { "epoch": 17.23076923076923, "grad_norm": 0.0, "learning_rate": 3.8805000000000005e-05, "loss": 1.7545, "mean_token_accuracy": 0.6476239621639251, "num_tokens": 8170218.0, "step": 2240 }, { "epoch": 17.26923076923077, "grad_norm": 0.0, "learning_rate": 3.878e-05, "loss": 1.7575, "mean_token_accuracy": 0.6513453900814057, "num_tokens": 8188203.0, "step": 2245 }, { "epoch": 17.307692307692307, "grad_norm": 0.0, "learning_rate": 3.8755000000000004e-05, "loss": 1.794, "mean_token_accuracy": 0.6376719444990158, "num_tokens": 8206190.0, "step": 2250 }, { "epoch": 17.346153846153847, "grad_norm": 0.0, "learning_rate": 3.873e-05, "loss": 1.7844, "mean_token_accuracy": 0.6456444442272187, "num_tokens": 8224005.0, "step": 2255 }, { "epoch": 17.384615384615383, "grad_norm": 0.0, "learning_rate": 3.8705e-05, "loss": 1.7172, "mean_token_accuracy": 0.6534038037061691, "num_tokens": 8242421.0, "step": 2260 }, { "epoch": 17.423076923076923, "grad_norm": 0.0, "learning_rate": 3.868e-05, "loss": 1.6841, "mean_token_accuracy": 0.659113222360611, "num_tokens": 8261508.0, "step": 2265 }, { "epoch": 17.46153846153846, "grad_norm": 0.0, "learning_rate": 3.8655e-05, "loss": 1.7875, "mean_token_accuracy": 0.6438043415546417, "num_tokens": 8279321.0, "step": 2270 }, { "epoch": 17.5, "grad_norm": 0.0, "learning_rate": 3.863e-05, "loss": 1.7198, "mean_token_accuracy": 0.6564956575632095, "num_tokens": 8298176.0, "step": 2275 }, { "epoch": 17.53846153846154, "grad_norm": 0.0, "learning_rate": 3.8605e-05, "loss": 1.7513, "mean_token_accuracy": 0.6444843232631683, "num_tokens": 8316630.0, "step": 2280 }, { "epoch": 17.576923076923077, "grad_norm": 0.0, "learning_rate": 3.858e-05, "loss": 1.6235, "mean_token_accuracy": 0.6704483658075333, "num_tokens": 8335880.0, "step": 2285 }, { "epoch": 17.615384615384617, "grad_norm": 0.0, "learning_rate": 3.8555e-05, "loss": 1.7432, "mean_token_accuracy": 0.6476155072450638, "num_tokens": 8354016.0, "step": 2290 }, { "epoch": 17.653846153846153, "grad_norm": 0.0, "learning_rate": 3.853e-05, "loss": 1.7454, "mean_token_accuracy": 0.6534331560134887, "num_tokens": 8371707.0, "step": 2295 }, { "epoch": 17.692307692307693, "grad_norm": 0.0, "learning_rate": 3.8505000000000005e-05, "loss": 1.7394, "mean_token_accuracy": 0.6537044525146485, "num_tokens": 8389636.0, "step": 2300 }, { "epoch": 17.73076923076923, "grad_norm": 0.0, "learning_rate": 3.848e-05, "loss": 1.7573, "mean_token_accuracy": 0.6542636543512345, "num_tokens": 8407513.0, "step": 2305 }, { "epoch": 17.76923076923077, "grad_norm": 0.0, "learning_rate": 3.8455000000000004e-05, "loss": 1.7613, "mean_token_accuracy": 0.641442459821701, "num_tokens": 8425390.0, "step": 2310 }, { "epoch": 17.807692307692307, "grad_norm": 0.0, "learning_rate": 3.8429999999999996e-05, "loss": 1.7467, "mean_token_accuracy": 0.6489367455244064, "num_tokens": 8443863.0, "step": 2315 }, { "epoch": 17.846153846153847, "grad_norm": 0.0, "learning_rate": 3.8405e-05, "loss": 1.8034, "mean_token_accuracy": 0.6354307472705841, "num_tokens": 8461483.0, "step": 2320 }, { "epoch": 17.884615384615383, "grad_norm": 0.0, "learning_rate": 3.838e-05, "loss": 1.7575, "mean_token_accuracy": 0.6472324639558792, "num_tokens": 8479006.0, "step": 2325 }, { "epoch": 17.923076923076923, "grad_norm": 0.0, "learning_rate": 3.8355e-05, "loss": 1.7741, "mean_token_accuracy": 0.646085774898529, "num_tokens": 8496986.0, "step": 2330 }, { "epoch": 17.96153846153846, "grad_norm": 0.0, "learning_rate": 3.833e-05, "loss": 1.7343, "mean_token_accuracy": 0.6486167579889297, "num_tokens": 8515399.0, "step": 2335 }, { "epoch": 18.0, "grad_norm": 0.0, "learning_rate": 3.8305e-05, "loss": 1.7085, "mean_token_accuracy": 0.658332034945488, "num_tokens": 8533890.0, "step": 2340 }, { "epoch": 18.03846153846154, "grad_norm": 0.0, "learning_rate": 3.828e-05, "loss": 1.7182, "mean_token_accuracy": 0.6543307691812515, "num_tokens": 8552119.0, "step": 2345 }, { "epoch": 18.076923076923077, "grad_norm": 0.0, "learning_rate": 3.8255e-05, "loss": 1.721, "mean_token_accuracy": 0.6576748192310333, "num_tokens": 8570610.0, "step": 2350 }, { "epoch": 18.115384615384617, "grad_norm": 0.0, "learning_rate": 3.823e-05, "loss": 1.7423, "mean_token_accuracy": 0.6538129627704621, "num_tokens": 8588510.0, "step": 2355 }, { "epoch": 18.153846153846153, "grad_norm": 0.0, "learning_rate": 3.8205000000000004e-05, "loss": 1.7196, "mean_token_accuracy": 0.6549855172634125, "num_tokens": 8606513.0, "step": 2360 }, { "epoch": 18.192307692307693, "grad_norm": 0.0, "learning_rate": 3.818e-05, "loss": 1.7534, "mean_token_accuracy": 0.6542291522026062, "num_tokens": 8624369.0, "step": 2365 }, { "epoch": 18.23076923076923, "grad_norm": 0.0, "learning_rate": 3.8155e-05, "loss": 1.696, "mean_token_accuracy": 0.6589814841747283, "num_tokens": 8643122.0, "step": 2370 }, { "epoch": 18.26923076923077, "grad_norm": 0.0, "learning_rate": 3.8129999999999996e-05, "loss": 1.7568, "mean_token_accuracy": 0.6480594784021377, "num_tokens": 8660823.0, "step": 2375 }, { "epoch": 18.307692307692307, "grad_norm": 0.0, "learning_rate": 3.8105e-05, "loss": 1.7531, "mean_token_accuracy": 0.6460558891296386, "num_tokens": 8678813.0, "step": 2380 }, { "epoch": 18.346153846153847, "grad_norm": 0.0, "learning_rate": 3.808e-05, "loss": 1.7844, "mean_token_accuracy": 0.6402776807546615, "num_tokens": 8696912.0, "step": 2385 }, { "epoch": 18.384615384615383, "grad_norm": 0.0, "learning_rate": 3.8055e-05, "loss": 1.8192, "mean_token_accuracy": 0.6349106699228286, "num_tokens": 8714150.0, "step": 2390 }, { "epoch": 18.423076923076923, "grad_norm": 0.0, "learning_rate": 3.803000000000001e-05, "loss": 1.7805, "mean_token_accuracy": 0.6420513331890106, "num_tokens": 8732388.0, "step": 2395 }, { "epoch": 18.46153846153846, "grad_norm": 0.0, "learning_rate": 3.8005e-05, "loss": 1.7583, "mean_token_accuracy": 0.6472231477499009, "num_tokens": 8750733.0, "step": 2400 }, { "epoch": 18.5, "grad_norm": 0.0, "learning_rate": 3.7980000000000006e-05, "loss": 1.822, "mean_token_accuracy": 0.6343230813741684, "num_tokens": 8768670.0, "step": 2405 }, { "epoch": 18.53846153846154, "grad_norm": 0.0, "learning_rate": 3.7955e-05, "loss": 1.7632, "mean_token_accuracy": 0.6466387540102005, "num_tokens": 8786506.0, "step": 2410 }, { "epoch": 18.576923076923077, "grad_norm": 0.0, "learning_rate": 3.7930000000000004e-05, "loss": 1.7275, "mean_token_accuracy": 0.6533702045679093, "num_tokens": 8804613.0, "step": 2415 }, { "epoch": 18.615384615384617, "grad_norm": 0.0, "learning_rate": 3.7905000000000004e-05, "loss": 1.6716, "mean_token_accuracy": 0.6637963593006134, "num_tokens": 8823585.0, "step": 2420 }, { "epoch": 18.653846153846153, "grad_norm": 0.0, "learning_rate": 3.788e-05, "loss": 1.7647, "mean_token_accuracy": 0.648153355717659, "num_tokens": 8841533.0, "step": 2425 }, { "epoch": 18.692307692307693, "grad_norm": 0.0, "learning_rate": 3.7855e-05, "loss": 1.7309, "mean_token_accuracy": 0.6484656900167465, "num_tokens": 8860614.0, "step": 2430 }, { "epoch": 18.73076923076923, "grad_norm": 0.0, "learning_rate": 3.783e-05, "loss": 1.775, "mean_token_accuracy": 0.6378404378890992, "num_tokens": 8879255.0, "step": 2435 }, { "epoch": 18.76923076923077, "grad_norm": 0.0, "learning_rate": 3.7805e-05, "loss": 1.7502, "mean_token_accuracy": 0.6478937238454818, "num_tokens": 8896994.0, "step": 2440 }, { "epoch": 18.807692307692307, "grad_norm": 0.0, "learning_rate": 3.778000000000001e-05, "loss": 1.7486, "mean_token_accuracy": 0.6547796040773392, "num_tokens": 8915326.0, "step": 2445 }, { "epoch": 18.846153846153847, "grad_norm": 0.0, "learning_rate": 3.7755e-05, "loss": 1.6805, "mean_token_accuracy": 0.6655582249164581, "num_tokens": 8933916.0, "step": 2450 }, { "epoch": 18.884615384615383, "grad_norm": 0.0, "learning_rate": 3.7730000000000006e-05, "loss": 1.7298, "mean_token_accuracy": 0.6513529330492019, "num_tokens": 8952047.0, "step": 2455 }, { "epoch": 18.923076923076923, "grad_norm": 0.0, "learning_rate": 3.7705e-05, "loss": 1.6774, "mean_token_accuracy": 0.6640833884477615, "num_tokens": 8970281.0, "step": 2460 }, { "epoch": 18.96153846153846, "grad_norm": 0.0, "learning_rate": 3.7680000000000005e-05, "loss": 1.6164, "mean_token_accuracy": 0.6714823067188262, "num_tokens": 8989804.0, "step": 2465 }, { "epoch": 19.0, "grad_norm": 0.0, "learning_rate": 3.7655000000000005e-05, "loss": 1.8069, "mean_token_accuracy": 0.634514644742012, "num_tokens": 9007995.0, "step": 2470 }, { "epoch": 19.03846153846154, "grad_norm": 0.0, "learning_rate": 3.7630000000000004e-05, "loss": 1.7192, "mean_token_accuracy": 0.6563860654830933, "num_tokens": 9026162.0, "step": 2475 }, { "epoch": 19.076923076923077, "grad_norm": 0.0, "learning_rate": 3.7605e-05, "loss": 1.7035, "mean_token_accuracy": 0.6546828061342239, "num_tokens": 9044982.0, "step": 2480 }, { "epoch": 19.115384615384617, "grad_norm": 0.0, "learning_rate": 3.758e-05, "loss": 1.7711, "mean_token_accuracy": 0.6455140680074691, "num_tokens": 9062948.0, "step": 2485 }, { "epoch": 19.153846153846153, "grad_norm": 0.0, "learning_rate": 3.7555e-05, "loss": 1.7543, "mean_token_accuracy": 0.6509835839271545, "num_tokens": 9080800.0, "step": 2490 }, { "epoch": 19.192307692307693, "grad_norm": 0.0, "learning_rate": 3.753e-05, "loss": 1.749, "mean_token_accuracy": 0.6524321138858795, "num_tokens": 9099027.0, "step": 2495 }, { "epoch": 19.23076923076923, "grad_norm": 0.0, "learning_rate": 3.7505e-05, "loss": 1.7379, "mean_token_accuracy": 0.6502025574445724, "num_tokens": 9117027.0, "step": 2500 }, { "epoch": 19.26923076923077, "grad_norm": 0.0, "learning_rate": 3.748000000000001e-05, "loss": 1.7259, "mean_token_accuracy": 0.6557818174362182, "num_tokens": 9135276.0, "step": 2505 }, { "epoch": 19.307692307692307, "grad_norm": 0.0, "learning_rate": 3.7455e-05, "loss": 1.6784, "mean_token_accuracy": 0.6623701632022858, "num_tokens": 9154044.0, "step": 2510 }, { "epoch": 19.346153846153847, "grad_norm": 0.0, "learning_rate": 3.7430000000000006e-05, "loss": 1.7474, "mean_token_accuracy": 0.6527136623859405, "num_tokens": 9172242.0, "step": 2515 }, { "epoch": 19.384615384615383, "grad_norm": 0.0, "learning_rate": 3.7405e-05, "loss": 1.7387, "mean_token_accuracy": 0.647335684299469, "num_tokens": 9190090.0, "step": 2520 }, { "epoch": 19.423076923076923, "grad_norm": 0.0, "learning_rate": 3.7380000000000005e-05, "loss": 1.824, "mean_token_accuracy": 0.6351722240447998, "num_tokens": 9207627.0, "step": 2525 }, { "epoch": 19.46153846153846, "grad_norm": 0.0, "learning_rate": 3.7355000000000004e-05, "loss": 1.8204, "mean_token_accuracy": 0.6388478010892868, "num_tokens": 9224815.0, "step": 2530 }, { "epoch": 19.5, "grad_norm": 0.0, "learning_rate": 3.7330000000000003e-05, "loss": 1.6962, "mean_token_accuracy": 0.6541598588228226, "num_tokens": 9244225.0, "step": 2535 }, { "epoch": 19.53846153846154, "grad_norm": 0.0, "learning_rate": 3.7305e-05, "loss": 1.7581, "mean_token_accuracy": 0.6506271123886108, "num_tokens": 9262267.0, "step": 2540 }, { "epoch": 19.576923076923077, "grad_norm": 0.0, "learning_rate": 3.728e-05, "loss": 1.7494, "mean_token_accuracy": 0.6438189834356308, "num_tokens": 9280916.0, "step": 2545 }, { "epoch": 19.615384615384617, "grad_norm": 0.0, "learning_rate": 3.7255e-05, "loss": 1.7621, "mean_token_accuracy": 0.6443462133407593, "num_tokens": 9299236.0, "step": 2550 }, { "epoch": 19.653846153846153, "grad_norm": 0.0, "learning_rate": 3.723e-05, "loss": 1.7594, "mean_token_accuracy": 0.6466510325670243, "num_tokens": 9317391.0, "step": 2555 }, { "epoch": 19.692307692307693, "grad_norm": 0.0, "learning_rate": 3.7205e-05, "loss": 1.6792, "mean_token_accuracy": 0.6581139594316483, "num_tokens": 9336860.0, "step": 2560 }, { "epoch": 19.73076923076923, "grad_norm": 0.0, "learning_rate": 3.7180000000000007e-05, "loss": 1.7497, "mean_token_accuracy": 0.6539162546396255, "num_tokens": 9354743.0, "step": 2565 }, { "epoch": 19.76923076923077, "grad_norm": 0.0, "learning_rate": 3.7155e-05, "loss": 1.7125, "mean_token_accuracy": 0.6545383244752884, "num_tokens": 9373351.0, "step": 2570 }, { "epoch": 19.807692307692307, "grad_norm": 0.0, "learning_rate": 3.7130000000000005e-05, "loss": 1.7363, "mean_token_accuracy": 0.6508906304836273, "num_tokens": 9391237.0, "step": 2575 }, { "epoch": 19.846153846153847, "grad_norm": 0.0, "learning_rate": 3.7105e-05, "loss": 1.6926, "mean_token_accuracy": 0.6579229325056076, "num_tokens": 9409658.0, "step": 2580 }, { "epoch": 19.884615384615383, "grad_norm": 0.0, "learning_rate": 3.7080000000000004e-05, "loss": 1.7471, "mean_token_accuracy": 0.6523754239082337, "num_tokens": 9427268.0, "step": 2585 }, { "epoch": 19.923076923076923, "grad_norm": 0.0, "learning_rate": 3.7055000000000004e-05, "loss": 1.7494, "mean_token_accuracy": 0.6500475823879241, "num_tokens": 9445373.0, "step": 2590 }, { "epoch": 19.96153846153846, "grad_norm": 0.0, "learning_rate": 3.703e-05, "loss": 1.763, "mean_token_accuracy": 0.6419122099876404, "num_tokens": 9463346.0, "step": 2595 }, { "epoch": 20.0, "grad_norm": 0.0, "learning_rate": 3.7005e-05, "loss": 1.7377, "mean_token_accuracy": 0.6534196078777313, "num_tokens": 9482100.0, "step": 2600 }, { "epoch": 20.03846153846154, "grad_norm": 0.0, "learning_rate": 3.698e-05, "loss": 1.7222, "mean_token_accuracy": 0.6527847439050675, "num_tokens": 9500755.0, "step": 2605 }, { "epoch": 20.076923076923077, "grad_norm": 0.0, "learning_rate": 3.6955e-05, "loss": 1.8368, "mean_token_accuracy": 0.6314696133136749, "num_tokens": 9518008.0, "step": 2610 }, { "epoch": 20.115384615384617, "grad_norm": 0.0, "learning_rate": 3.693e-05, "loss": 1.7166, "mean_token_accuracy": 0.656936526298523, "num_tokens": 9536514.0, "step": 2615 }, { "epoch": 20.153846153846153, "grad_norm": 0.0, "learning_rate": 3.6905e-05, "loss": 1.7997, "mean_token_accuracy": 0.6421493351459503, "num_tokens": 9554049.0, "step": 2620 }, { "epoch": 20.192307692307693, "grad_norm": 0.0, "learning_rate": 3.6880000000000006e-05, "loss": 1.7075, "mean_token_accuracy": 0.6625353038311005, "num_tokens": 9572125.0, "step": 2625 }, { "epoch": 20.23076923076923, "grad_norm": 0.0, "learning_rate": 3.6855e-05, "loss": 1.7367, "mean_token_accuracy": 0.6516992777585984, "num_tokens": 9590370.0, "step": 2630 }, { "epoch": 20.26923076923077, "grad_norm": 0.0, "learning_rate": 3.6830000000000005e-05, "loss": 1.7493, "mean_token_accuracy": 0.6508445411920547, "num_tokens": 9608387.0, "step": 2635 }, { "epoch": 20.307692307692307, "grad_norm": 0.0, "learning_rate": 3.6805e-05, "loss": 1.754, "mean_token_accuracy": 0.6476915091276169, "num_tokens": 9626593.0, "step": 2640 }, { "epoch": 20.346153846153847, "grad_norm": 0.0, "learning_rate": 3.6780000000000004e-05, "loss": 1.7698, "mean_token_accuracy": 0.6473258316516877, "num_tokens": 9644649.0, "step": 2645 }, { "epoch": 20.384615384615383, "grad_norm": 0.0, "learning_rate": 3.6755e-05, "loss": 1.7206, "mean_token_accuracy": 0.6517573058605194, "num_tokens": 9663339.0, "step": 2650 }, { "epoch": 20.423076923076923, "grad_norm": 0.0, "learning_rate": 3.673e-05, "loss": 1.8082, "mean_token_accuracy": 0.6340971022844315, "num_tokens": 9681407.0, "step": 2655 }, { "epoch": 20.46153846153846, "grad_norm": 0.0, "learning_rate": 3.6705e-05, "loss": 1.76, "mean_token_accuracy": 0.6423662751913071, "num_tokens": 9698951.0, "step": 2660 }, { "epoch": 20.5, "grad_norm": 0.0, "learning_rate": 3.668e-05, "loss": 1.7479, "mean_token_accuracy": 0.6455884456634522, "num_tokens": 9716798.0, "step": 2665 }, { "epoch": 20.53846153846154, "grad_norm": 0.0, "learning_rate": 3.6655e-05, "loss": 1.6967, "mean_token_accuracy": 0.6592382907867431, "num_tokens": 9735601.0, "step": 2670 }, { "epoch": 20.576923076923077, "grad_norm": 0.0, "learning_rate": 3.663e-05, "loss": 1.7629, "mean_token_accuracy": 0.6453653991222381, "num_tokens": 9753360.0, "step": 2675 }, { "epoch": 20.615384615384617, "grad_norm": 0.0, "learning_rate": 3.6605e-05, "loss": 1.7007, "mean_token_accuracy": 0.660412722826004, "num_tokens": 9771848.0, "step": 2680 }, { "epoch": 20.653846153846153, "grad_norm": 0.0, "learning_rate": 3.6580000000000006e-05, "loss": 1.7466, "mean_token_accuracy": 0.6478160947561264, "num_tokens": 9789801.0, "step": 2685 }, { "epoch": 20.692307692307693, "grad_norm": 0.0, "learning_rate": 3.6555e-05, "loss": 1.7047, "mean_token_accuracy": 0.6545427888631821, "num_tokens": 9808227.0, "step": 2690 }, { "epoch": 20.73076923076923, "grad_norm": 0.0, "learning_rate": 3.6530000000000004e-05, "loss": 1.7437, "mean_token_accuracy": 0.6440727055072785, "num_tokens": 9826666.0, "step": 2695 }, { "epoch": 20.76923076923077, "grad_norm": 0.0, "learning_rate": 3.6505e-05, "loss": 1.6243, "mean_token_accuracy": 0.6773371279239655, "num_tokens": 9845929.0, "step": 2700 }, { "epoch": 20.807692307692307, "grad_norm": 0.0, "learning_rate": 3.648e-05, "loss": 1.7548, "mean_token_accuracy": 0.6442576110363006, "num_tokens": 9864387.0, "step": 2705 }, { "epoch": 20.846153846153847, "grad_norm": 0.0, "learning_rate": 3.6455e-05, "loss": 1.7623, "mean_token_accuracy": 0.6505689769983292, "num_tokens": 9881953.0, "step": 2710 }, { "epoch": 20.884615384615383, "grad_norm": 0.0, "learning_rate": 3.643e-05, "loss": 1.7447, "mean_token_accuracy": 0.650253239274025, "num_tokens": 9900170.0, "step": 2715 }, { "epoch": 20.923076923076923, "grad_norm": 0.0, "learning_rate": 3.6405e-05, "loss": 1.7563, "mean_token_accuracy": 0.6442853361368179, "num_tokens": 9918308.0, "step": 2720 }, { "epoch": 20.96153846153846, "grad_norm": 0.0, "learning_rate": 3.638e-05, "loss": 1.7022, "mean_token_accuracy": 0.6568427920341492, "num_tokens": 9937402.0, "step": 2725 }, { "epoch": 21.0, "grad_norm": 0.0, "learning_rate": 3.6355e-05, "loss": 1.7716, "mean_token_accuracy": 0.6476004511117935, "num_tokens": 9956205.0, "step": 2730 }, { "epoch": 21.03846153846154, "grad_norm": 0.0, "learning_rate": 3.6330000000000006e-05, "loss": 1.7709, "mean_token_accuracy": 0.6501421749591827, "num_tokens": 9973863.0, "step": 2735 }, { "epoch": 21.076923076923077, "grad_norm": 0.0, "learning_rate": 3.6305e-05, "loss": 1.7418, "mean_token_accuracy": 0.6446869194507598, "num_tokens": 9991767.0, "step": 2740 }, { "epoch": 21.115384615384617, "grad_norm": 0.0, "learning_rate": 3.6280000000000005e-05, "loss": 1.7733, "mean_token_accuracy": 0.6450621813535691, "num_tokens": 10009914.0, "step": 2745 }, { "epoch": 21.153846153846153, "grad_norm": 0.0, "learning_rate": 3.6255e-05, "loss": 1.744, "mean_token_accuracy": 0.6536423653364182, "num_tokens": 10027897.0, "step": 2750 }, { "epoch": 21.192307692307693, "grad_norm": 0.0, "learning_rate": 3.6230000000000004e-05, "loss": 1.7402, "mean_token_accuracy": 0.6494254291057586, "num_tokens": 10046965.0, "step": 2755 }, { "epoch": 21.23076923076923, "grad_norm": 0.0, "learning_rate": 3.6205e-05, "loss": 1.758, "mean_token_accuracy": 0.6502227276563645, "num_tokens": 10065041.0, "step": 2760 }, { "epoch": 21.26923076923077, "grad_norm": 0.0, "learning_rate": 3.618e-05, "loss": 1.805, "mean_token_accuracy": 0.6402064859867096, "num_tokens": 10082396.0, "step": 2765 }, { "epoch": 21.307692307692307, "grad_norm": 0.0, "learning_rate": 3.6155e-05, "loss": 1.7417, "mean_token_accuracy": 0.6509008765220642, "num_tokens": 10100522.0, "step": 2770 }, { "epoch": 21.346153846153847, "grad_norm": 0.0, "learning_rate": 3.613e-05, "loss": 1.7474, "mean_token_accuracy": 0.6551948994398117, "num_tokens": 10118325.0, "step": 2775 }, { "epoch": 21.384615384615383, "grad_norm": 0.0, "learning_rate": 3.6105e-05, "loss": 1.6998, "mean_token_accuracy": 0.6569103240966797, "num_tokens": 10137261.0, "step": 2780 }, { "epoch": 21.423076923076923, "grad_norm": 0.0, "learning_rate": 3.608e-05, "loss": 1.7573, "mean_token_accuracy": 0.6484821021556855, "num_tokens": 10155636.0, "step": 2785 }, { "epoch": 21.46153846153846, "grad_norm": 0.0, "learning_rate": 3.6055e-05, "loss": 1.7989, "mean_token_accuracy": 0.6373539805412293, "num_tokens": 10173181.0, "step": 2790 }, { "epoch": 21.5, "grad_norm": 0.0, "learning_rate": 3.6030000000000006e-05, "loss": 1.7508, "mean_token_accuracy": 0.6406475752592087, "num_tokens": 10191767.0, "step": 2795 }, { "epoch": 21.53846153846154, "grad_norm": 0.0, "learning_rate": 3.6005e-05, "loss": 1.7051, "mean_token_accuracy": 0.6602169930934906, "num_tokens": 10210631.0, "step": 2800 }, { "epoch": 21.576923076923077, "grad_norm": 0.0, "learning_rate": 3.5980000000000004e-05, "loss": 1.6777, "mean_token_accuracy": 0.6621813416481018, "num_tokens": 10229408.0, "step": 2805 }, { "epoch": 21.615384615384617, "grad_norm": 0.0, "learning_rate": 3.5955e-05, "loss": 1.7332, "mean_token_accuracy": 0.6526377439498902, "num_tokens": 10247835.0, "step": 2810 }, { "epoch": 21.653846153846153, "grad_norm": 0.0, "learning_rate": 3.593e-05, "loss": 1.7106, "mean_token_accuracy": 0.6549345046281815, "num_tokens": 10266504.0, "step": 2815 }, { "epoch": 21.692307692307693, "grad_norm": 0.0, "learning_rate": 3.5905e-05, "loss": 1.7201, "mean_token_accuracy": 0.6562414228916168, "num_tokens": 10284706.0, "step": 2820 }, { "epoch": 21.73076923076923, "grad_norm": 0.0, "learning_rate": 3.588e-05, "loss": 1.7226, "mean_token_accuracy": 0.6576842248439789, "num_tokens": 10302875.0, "step": 2825 }, { "epoch": 21.76923076923077, "grad_norm": 0.0, "learning_rate": 3.5855e-05, "loss": 1.6721, "mean_token_accuracy": 0.6636721462011337, "num_tokens": 10321523.0, "step": 2830 }, { "epoch": 21.807692307692307, "grad_norm": 0.0, "learning_rate": 3.583e-05, "loss": 1.7523, "mean_token_accuracy": 0.6470031648874283, "num_tokens": 10339297.0, "step": 2835 }, { "epoch": 21.846153846153847, "grad_norm": 0.0, "learning_rate": 3.5805e-05, "loss": 1.7419, "mean_token_accuracy": 0.6513529509305954, "num_tokens": 10357839.0, "step": 2840 }, { "epoch": 21.884615384615383, "grad_norm": 0.0, "learning_rate": 3.578e-05, "loss": 1.7623, "mean_token_accuracy": 0.6404614865779876, "num_tokens": 10375584.0, "step": 2845 }, { "epoch": 21.923076923076923, "grad_norm": 0.0, "learning_rate": 3.5755e-05, "loss": 1.737, "mean_token_accuracy": 0.6452168464660645, "num_tokens": 10394319.0, "step": 2850 }, { "epoch": 21.96153846153846, "grad_norm": 0.0, "learning_rate": 3.5730000000000005e-05, "loss": 1.6929, "mean_token_accuracy": 0.6594237118959427, "num_tokens": 10412963.0, "step": 2855 }, { "epoch": 22.0, "grad_norm": 0.0, "learning_rate": 3.5705e-05, "loss": 1.786, "mean_token_accuracy": 0.6423673510551453, "num_tokens": 10430310.0, "step": 2860 }, { "epoch": 22.03846153846154, "grad_norm": 0.0, "learning_rate": 3.5680000000000004e-05, "loss": 1.7693, "mean_token_accuracy": 0.6458660453557968, "num_tokens": 10448139.0, "step": 2865 }, { "epoch": 22.076923076923077, "grad_norm": 0.0, "learning_rate": 3.5654999999999997e-05, "loss": 1.7493, "mean_token_accuracy": 0.6492699027061463, "num_tokens": 10466020.0, "step": 2870 }, { "epoch": 22.115384615384617, "grad_norm": 0.0, "learning_rate": 3.563e-05, "loss": 1.7669, "mean_token_accuracy": 0.6457598328590393, "num_tokens": 10483856.0, "step": 2875 }, { "epoch": 22.153846153846153, "grad_norm": 0.0, "learning_rate": 3.5605e-05, "loss": 1.6918, "mean_token_accuracy": 0.6608535885810852, "num_tokens": 10502391.0, "step": 2880 }, { "epoch": 22.192307692307693, "grad_norm": 0.0, "learning_rate": 3.558e-05, "loss": 1.7783, "mean_token_accuracy": 0.6480051964521408, "num_tokens": 10520184.0, "step": 2885 }, { "epoch": 22.23076923076923, "grad_norm": 0.0, "learning_rate": 3.5555e-05, "loss": 1.7946, "mean_token_accuracy": 0.6426310211420059, "num_tokens": 10537625.0, "step": 2890 }, { "epoch": 22.26923076923077, "grad_norm": 0.0, "learning_rate": 3.553e-05, "loss": 1.8393, "mean_token_accuracy": 0.6324712634086609, "num_tokens": 10554726.0, "step": 2895 }, { "epoch": 22.307692307692307, "grad_norm": 0.0, "learning_rate": 3.5505e-05, "loss": 1.7677, "mean_token_accuracy": 0.6496516615152359, "num_tokens": 10572379.0, "step": 2900 }, { "epoch": 22.346153846153847, "grad_norm": 0.0, "learning_rate": 3.548e-05, "loss": 1.7381, "mean_token_accuracy": 0.6457451343536377, "num_tokens": 10591449.0, "step": 2905 }, { "epoch": 22.384615384615383, "grad_norm": 0.0, "learning_rate": 3.5455e-05, "loss": 1.6608, "mean_token_accuracy": 0.6606644958257675, "num_tokens": 10611295.0, "step": 2910 }, { "epoch": 22.423076923076923, "grad_norm": 0.0, "learning_rate": 3.5430000000000005e-05, "loss": 1.6849, "mean_token_accuracy": 0.6587489366531372, "num_tokens": 10629917.0, "step": 2915 }, { "epoch": 22.46153846153846, "grad_norm": 0.0, "learning_rate": 3.5405e-05, "loss": 1.7484, "mean_token_accuracy": 0.6465423703193665, "num_tokens": 10647956.0, "step": 2920 }, { "epoch": 22.5, "grad_norm": 0.0, "learning_rate": 3.5380000000000003e-05, "loss": 1.7415, "mean_token_accuracy": 0.6542441755533218, "num_tokens": 10666004.0, "step": 2925 }, { "epoch": 22.53846153846154, "grad_norm": 0.0, "learning_rate": 3.5354999999999996e-05, "loss": 1.7098, "mean_token_accuracy": 0.654433760046959, "num_tokens": 10684772.0, "step": 2930 }, { "epoch": 22.576923076923077, "grad_norm": 0.0, "learning_rate": 3.533e-05, "loss": 1.8088, "mean_token_accuracy": 0.63780497610569, "num_tokens": 10702520.0, "step": 2935 }, { "epoch": 22.615384615384617, "grad_norm": 0.0, "learning_rate": 3.5305e-05, "loss": 1.7072, "mean_token_accuracy": 0.659371867775917, "num_tokens": 10721196.0, "step": 2940 }, { "epoch": 22.653846153846153, "grad_norm": 0.0, "learning_rate": 3.528e-05, "loss": 1.7689, "mean_token_accuracy": 0.647321754693985, "num_tokens": 10738660.0, "step": 2945 }, { "epoch": 22.692307692307693, "grad_norm": 0.0, "learning_rate": 3.5255e-05, "loss": 1.7366, "mean_token_accuracy": 0.6515435039997101, "num_tokens": 10757158.0, "step": 2950 }, { "epoch": 22.73076923076923, "grad_norm": 0.0, "learning_rate": 3.523e-05, "loss": 1.7007, "mean_token_accuracy": 0.6563295513391495, "num_tokens": 10775636.0, "step": 2955 }, { "epoch": 22.76923076923077, "grad_norm": 0.0, "learning_rate": 3.5205e-05, "loss": 1.7159, "mean_token_accuracy": 0.6521429270505905, "num_tokens": 10794398.0, "step": 2960 }, { "epoch": 22.807692307692307, "grad_norm": 0.0, "learning_rate": 3.518e-05, "loss": 1.667, "mean_token_accuracy": 0.6644465386867523, "num_tokens": 10813475.0, "step": 2965 }, { "epoch": 22.846153846153847, "grad_norm": 0.0, "learning_rate": 3.5155e-05, "loss": 1.7503, "mean_token_accuracy": 0.6510158330202103, "num_tokens": 10831125.0, "step": 2970 }, { "epoch": 22.884615384615383, "grad_norm": 0.0, "learning_rate": 3.5130000000000004e-05, "loss": 1.7005, "mean_token_accuracy": 0.662487056851387, "num_tokens": 10849387.0, "step": 2975 }, { "epoch": 22.923076923076923, "grad_norm": 0.0, "learning_rate": 3.5105e-05, "loss": 1.7911, "mean_token_accuracy": 0.6427533149719238, "num_tokens": 10867773.0, "step": 2980 }, { "epoch": 22.96153846153846, "grad_norm": 0.0, "learning_rate": 3.508e-05, "loss": 1.7629, "mean_token_accuracy": 0.6471167922019958, "num_tokens": 10885829.0, "step": 2985 }, { "epoch": 23.0, "grad_norm": 0.0, "learning_rate": 3.5055e-05, "loss": 1.7349, "mean_token_accuracy": 0.65148167014122, "num_tokens": 10904415.0, "step": 2990 }, { "epoch": 23.03846153846154, "grad_norm": 0.0, "learning_rate": 3.503e-05, "loss": 1.7296, "mean_token_accuracy": 0.6501170575618744, "num_tokens": 10922228.0, "step": 2995 }, { "epoch": 23.076923076923077, "grad_norm": 0.0, "learning_rate": 3.5005e-05, "loss": 1.7787, "mean_token_accuracy": 0.6366728276014328, "num_tokens": 10940239.0, "step": 3000 }, { "epoch": 23.115384615384617, "grad_norm": 0.0, "learning_rate": 3.498e-05, "loss": 1.7362, "mean_token_accuracy": 0.6528029084205628, "num_tokens": 10958391.0, "step": 3005 }, { "epoch": 23.153846153846153, "grad_norm": 0.0, "learning_rate": 3.495500000000001e-05, "loss": 1.7442, "mean_token_accuracy": 0.6534345805644989, "num_tokens": 10976257.0, "step": 3010 }, { "epoch": 23.192307692307693, "grad_norm": 0.0, "learning_rate": 3.493e-05, "loss": 1.7464, "mean_token_accuracy": 0.6423383712768554, "num_tokens": 10995098.0, "step": 3015 }, { "epoch": 23.23076923076923, "grad_norm": 0.0, "learning_rate": 3.4905000000000005e-05, "loss": 1.7095, "mean_token_accuracy": 0.6603679537773133, "num_tokens": 11013198.0, "step": 3020 }, { "epoch": 23.26923076923077, "grad_norm": 0.0, "learning_rate": 3.4880000000000005e-05, "loss": 1.6675, "mean_token_accuracy": 0.6619115889072418, "num_tokens": 11032529.0, "step": 3025 }, { "epoch": 23.307692307692307, "grad_norm": 0.0, "learning_rate": 3.4855000000000004e-05, "loss": 1.7215, "mean_token_accuracy": 0.6535635858774185, "num_tokens": 11051447.0, "step": 3030 }, { "epoch": 23.346153846153847, "grad_norm": 0.0, "learning_rate": 3.4830000000000004e-05, "loss": 1.7505, "mean_token_accuracy": 0.6513790518045426, "num_tokens": 11069725.0, "step": 3035 }, { "epoch": 23.384615384615383, "grad_norm": 0.0, "learning_rate": 3.4805e-05, "loss": 1.7062, "mean_token_accuracy": 0.660124909877777, "num_tokens": 11088473.0, "step": 3040 }, { "epoch": 23.423076923076923, "grad_norm": 0.0, "learning_rate": 3.478e-05, "loss": 1.7365, "mean_token_accuracy": 0.6506242454051971, "num_tokens": 11106701.0, "step": 3045 }, { "epoch": 23.46153846153846, "grad_norm": 0.0, "learning_rate": 3.4755e-05, "loss": 1.747, "mean_token_accuracy": 0.6486080348491668, "num_tokens": 11124657.0, "step": 3050 }, { "epoch": 23.5, "grad_norm": 0.0, "learning_rate": 3.473e-05, "loss": 1.7576, "mean_token_accuracy": 0.6459647357463837, "num_tokens": 11142612.0, "step": 3055 }, { "epoch": 23.53846153846154, "grad_norm": 0.0, "learning_rate": 3.470500000000001e-05, "loss": 1.7698, "mean_token_accuracy": 0.6444110184907913, "num_tokens": 11160572.0, "step": 3060 }, { "epoch": 23.576923076923077, "grad_norm": 0.0, "learning_rate": 3.468e-05, "loss": 1.7671, "mean_token_accuracy": 0.6471373438835144, "num_tokens": 11178814.0, "step": 3065 }, { "epoch": 23.615384615384617, "grad_norm": 0.0, "learning_rate": 3.4655000000000006e-05, "loss": 1.7604, "mean_token_accuracy": 0.650464779138565, "num_tokens": 11196788.0, "step": 3070 }, { "epoch": 23.653846153846153, "grad_norm": 0.0, "learning_rate": 3.463e-05, "loss": 1.7563, "mean_token_accuracy": 0.6483624398708343, "num_tokens": 11214549.0, "step": 3075 }, { "epoch": 23.692307692307693, "grad_norm": 0.0, "learning_rate": 3.4605000000000005e-05, "loss": 1.6635, "mean_token_accuracy": 0.6658029884099961, "num_tokens": 11233473.0, "step": 3080 }, { "epoch": 23.73076923076923, "grad_norm": 0.0, "learning_rate": 3.4580000000000004e-05, "loss": 1.8208, "mean_token_accuracy": 0.6367295324802399, "num_tokens": 11250907.0, "step": 3085 }, { "epoch": 23.76923076923077, "grad_norm": 0.0, "learning_rate": 3.4555000000000004e-05, "loss": 1.7758, "mean_token_accuracy": 0.6431278616189957, "num_tokens": 11268887.0, "step": 3090 }, { "epoch": 23.807692307692307, "grad_norm": 0.0, "learning_rate": 3.453e-05, "loss": 1.6927, "mean_token_accuracy": 0.6590600997209549, "num_tokens": 11287728.0, "step": 3095 }, { "epoch": 23.846153846153847, "grad_norm": 0.0, "learning_rate": 3.4505e-05, "loss": 1.6928, "mean_token_accuracy": 0.6589304953813553, "num_tokens": 11306559.0, "step": 3100 }, { "epoch": 23.884615384615383, "grad_norm": 0.0, "learning_rate": 3.448e-05, "loss": 1.7559, "mean_token_accuracy": 0.647815215587616, "num_tokens": 11324582.0, "step": 3105 }, { "epoch": 23.923076923076923, "grad_norm": 0.0, "learning_rate": 3.4455e-05, "loss": 1.7362, "mean_token_accuracy": 0.6531584694981575, "num_tokens": 11343111.0, "step": 3110 }, { "epoch": 23.96153846153846, "grad_norm": 0.0, "learning_rate": 3.443e-05, "loss": 1.7569, "mean_token_accuracy": 0.6495847851037979, "num_tokens": 11361415.0, "step": 3115 }, { "epoch": 24.0, "grad_norm": 0.0, "learning_rate": 3.440500000000001e-05, "loss": 1.8106, "mean_token_accuracy": 0.6377548843622207, "num_tokens": 11378520.0, "step": 3120 }, { "epoch": 24.03846153846154, "grad_norm": 0.0, "learning_rate": 3.438e-05, "loss": 1.7197, "mean_token_accuracy": 0.6562131911516189, "num_tokens": 11397124.0, "step": 3125 }, { "epoch": 24.076923076923077, "grad_norm": 0.0, "learning_rate": 3.4355000000000006e-05, "loss": 1.7154, "mean_token_accuracy": 0.6540484815835953, "num_tokens": 11415431.0, "step": 3130 }, { "epoch": 24.115384615384617, "grad_norm": 0.0, "learning_rate": 3.433e-05, "loss": 1.7791, "mean_token_accuracy": 0.6443376511335372, "num_tokens": 11433445.0, "step": 3135 }, { "epoch": 24.153846153846153, "grad_norm": 0.0, "learning_rate": 3.4305000000000004e-05, "loss": 1.7501, "mean_token_accuracy": 0.6464499235153198, "num_tokens": 11451759.0, "step": 3140 }, { "epoch": 24.192307692307693, "grad_norm": 0.0, "learning_rate": 3.4280000000000004e-05, "loss": 1.7251, "mean_token_accuracy": 0.6546833992004395, "num_tokens": 11470163.0, "step": 3145 }, { "epoch": 24.23076923076923, "grad_norm": 0.0, "learning_rate": 3.4255e-05, "loss": 1.7204, "mean_token_accuracy": 0.6526606291532516, "num_tokens": 11488656.0, "step": 3150 }, { "epoch": 24.26923076923077, "grad_norm": 0.0, "learning_rate": 3.423e-05, "loss": 1.8343, "mean_token_accuracy": 0.6308804154396057, "num_tokens": 11505937.0, "step": 3155 }, { "epoch": 24.307692307692307, "grad_norm": 0.0, "learning_rate": 3.4205e-05, "loss": 1.7237, "mean_token_accuracy": 0.6544101685285568, "num_tokens": 11524471.0, "step": 3160 }, { "epoch": 24.346153846153847, "grad_norm": 0.0, "learning_rate": 3.418e-05, "loss": 1.7619, "mean_token_accuracy": 0.6497153013944625, "num_tokens": 11542302.0, "step": 3165 }, { "epoch": 24.384615384615383, "grad_norm": 0.0, "learning_rate": 3.4155e-05, "loss": 1.7937, "mean_token_accuracy": 0.6424742221832276, "num_tokens": 11560367.0, "step": 3170 }, { "epoch": 24.423076923076923, "grad_norm": 0.0, "learning_rate": 3.413e-05, "loss": 1.7211, "mean_token_accuracy": 0.6575414210557937, "num_tokens": 11578662.0, "step": 3175 }, { "epoch": 24.46153846153846, "grad_norm": 0.0, "learning_rate": 3.4105000000000006e-05, "loss": 1.7301, "mean_token_accuracy": 0.6548975586891175, "num_tokens": 11597057.0, "step": 3180 }, { "epoch": 24.5, "grad_norm": 0.0, "learning_rate": 3.408e-05, "loss": 1.8287, "mean_token_accuracy": 0.6332879096269608, "num_tokens": 11614420.0, "step": 3185 }, { "epoch": 24.53846153846154, "grad_norm": 0.0, "learning_rate": 3.4055000000000005e-05, "loss": 1.6783, "mean_token_accuracy": 0.6583501130342484, "num_tokens": 11633485.0, "step": 3190 }, { "epoch": 24.576923076923077, "grad_norm": 0.0, "learning_rate": 3.403e-05, "loss": 1.7974, "mean_token_accuracy": 0.6381134271621705, "num_tokens": 11651379.0, "step": 3195 }, { "epoch": 24.615384615384617, "grad_norm": 0.0, "learning_rate": 3.4005000000000004e-05, "loss": 1.7224, "mean_token_accuracy": 0.6523280829191208, "num_tokens": 11669811.0, "step": 3200 }, { "epoch": 24.653846153846153, "grad_norm": 0.0, "learning_rate": 3.398e-05, "loss": 1.71, "mean_token_accuracy": 0.6583969056606293, "num_tokens": 11687820.0, "step": 3205 }, { "epoch": 24.692307692307693, "grad_norm": 0.0, "learning_rate": 3.3955e-05, "loss": 1.7925, "mean_token_accuracy": 0.636654618382454, "num_tokens": 11705825.0, "step": 3210 }, { "epoch": 24.73076923076923, "grad_norm": 0.0, "learning_rate": 3.393e-05, "loss": 1.7292, "mean_token_accuracy": 0.6524395048618317, "num_tokens": 11723774.0, "step": 3215 }, { "epoch": 24.76923076923077, "grad_norm": 0.0, "learning_rate": 3.3905e-05, "loss": 1.7275, "mean_token_accuracy": 0.6529559135437012, "num_tokens": 11741969.0, "step": 3220 }, { "epoch": 24.807692307692307, "grad_norm": 0.0, "learning_rate": 3.388e-05, "loss": 1.7514, "mean_token_accuracy": 0.6474628210067749, "num_tokens": 11759857.0, "step": 3225 }, { "epoch": 24.846153846153847, "grad_norm": 0.0, "learning_rate": 3.3855e-05, "loss": 1.696, "mean_token_accuracy": 0.6536848098039627, "num_tokens": 11778672.0, "step": 3230 }, { "epoch": 24.884615384615383, "grad_norm": 0.0, "learning_rate": 3.383e-05, "loss": 1.6997, "mean_token_accuracy": 0.6613029778003693, "num_tokens": 11797888.0, "step": 3235 }, { "epoch": 24.923076923076923, "grad_norm": 0.0, "learning_rate": 3.3805000000000006e-05, "loss": 1.6657, "mean_token_accuracy": 0.6732263565063477, "num_tokens": 11816590.0, "step": 3240 }, { "epoch": 24.96153846153846, "grad_norm": 0.0, "learning_rate": 3.378e-05, "loss": 1.8309, "mean_token_accuracy": 0.634924265742302, "num_tokens": 11833413.0, "step": 3245 }, { "epoch": 25.0, "grad_norm": 0.0, "learning_rate": 3.3755000000000005e-05, "loss": 1.6911, "mean_token_accuracy": 0.6547602891921998, "num_tokens": 11852625.0, "step": 3250 }, { "epoch": 25.03846153846154, "grad_norm": 0.0, "learning_rate": 3.373e-05, "loss": 1.7518, "mean_token_accuracy": 0.6456693172454834, "num_tokens": 11870498.0, "step": 3255 }, { "epoch": 25.076923076923077, "grad_norm": 0.0, "learning_rate": 3.3705000000000003e-05, "loss": 1.734, "mean_token_accuracy": 0.6480823725461959, "num_tokens": 11888806.0, "step": 3260 }, { "epoch": 25.115384615384617, "grad_norm": 0.0, "learning_rate": 3.368e-05, "loss": 1.7296, "mean_token_accuracy": 0.6544734388589859, "num_tokens": 11907007.0, "step": 3265 }, { "epoch": 25.153846153846153, "grad_norm": 0.0, "learning_rate": 3.3655e-05, "loss": 1.7, "mean_token_accuracy": 0.661970067024231, "num_tokens": 11925855.0, "step": 3270 }, { "epoch": 25.192307692307693, "grad_norm": 0.0, "learning_rate": 3.363e-05, "loss": 1.7531, "mean_token_accuracy": 0.6502303391695022, "num_tokens": 11944217.0, "step": 3275 }, { "epoch": 25.23076923076923, "grad_norm": 0.0, "learning_rate": 3.3605e-05, "loss": 1.7867, "mean_token_accuracy": 0.6456191658973693, "num_tokens": 11961900.0, "step": 3280 }, { "epoch": 25.26923076923077, "grad_norm": 0.0, "learning_rate": 3.358e-05, "loss": 1.7118, "mean_token_accuracy": 0.6574157625436783, "num_tokens": 11980478.0, "step": 3285 }, { "epoch": 25.307692307692307, "grad_norm": 0.0, "learning_rate": 3.3555e-05, "loss": 1.752, "mean_token_accuracy": 0.6447003424167633, "num_tokens": 11998539.0, "step": 3290 }, { "epoch": 25.346153846153847, "grad_norm": 0.0, "learning_rate": 3.353e-05, "loss": 1.7875, "mean_token_accuracy": 0.6442469358444214, "num_tokens": 12016433.0, "step": 3295 }, { "epoch": 25.384615384615383, "grad_norm": 0.0, "learning_rate": 3.3505000000000005e-05, "loss": 1.7792, "mean_token_accuracy": 0.6442890495061875, "num_tokens": 12034074.0, "step": 3300 }, { "epoch": 25.423076923076923, "grad_norm": 0.0, "learning_rate": 3.348e-05, "loss": 1.7701, "mean_token_accuracy": 0.6484797149896622, "num_tokens": 12051813.0, "step": 3305 }, { "epoch": 25.46153846153846, "grad_norm": 0.0, "learning_rate": 3.3455000000000004e-05, "loss": 1.8468, "mean_token_accuracy": 0.632660773396492, "num_tokens": 12068538.0, "step": 3310 }, { "epoch": 25.5, "grad_norm": 0.0, "learning_rate": 3.3430000000000003e-05, "loss": 1.6571, "mean_token_accuracy": 0.6714397668838501, "num_tokens": 12087276.0, "step": 3315 }, { "epoch": 25.53846153846154, "grad_norm": 0.0, "learning_rate": 3.3405e-05, "loss": 1.7512, "mean_token_accuracy": 0.6499413967132568, "num_tokens": 12105620.0, "step": 3320 }, { "epoch": 25.576923076923077, "grad_norm": 0.0, "learning_rate": 3.338e-05, "loss": 1.7522, "mean_token_accuracy": 0.6516470074653625, "num_tokens": 12123634.0, "step": 3325 }, { "epoch": 25.615384615384617, "grad_norm": 0.0, "learning_rate": 3.3355e-05, "loss": 1.6871, "mean_token_accuracy": 0.6579142987728119, "num_tokens": 12142945.0, "step": 3330 }, { "epoch": 25.653846153846153, "grad_norm": 0.0, "learning_rate": 3.333e-05, "loss": 1.6238, "mean_token_accuracy": 0.6688612520694732, "num_tokens": 12162962.0, "step": 3335 }, { "epoch": 25.692307692307693, "grad_norm": 0.0, "learning_rate": 3.3305e-05, "loss": 1.8432, "mean_token_accuracy": 0.6307004660367965, "num_tokens": 12180342.0, "step": 3340 }, { "epoch": 25.73076923076923, "grad_norm": 0.0, "learning_rate": 3.328e-05, "loss": 1.7708, "mean_token_accuracy": 0.6448087066411972, "num_tokens": 12198282.0, "step": 3345 }, { "epoch": 25.76923076923077, "grad_norm": 0.0, "learning_rate": 3.3255000000000006e-05, "loss": 1.7765, "mean_token_accuracy": 0.6405314236879349, "num_tokens": 12215896.0, "step": 3350 }, { "epoch": 25.807692307692307, "grad_norm": 0.0, "learning_rate": 3.323e-05, "loss": 1.7424, "mean_token_accuracy": 0.6420567870140076, "num_tokens": 12234723.0, "step": 3355 }, { "epoch": 25.846153846153847, "grad_norm": 0.0, "learning_rate": 3.3205000000000005e-05, "loss": 1.6715, "mean_token_accuracy": 0.6686410069465637, "num_tokens": 12253662.0, "step": 3360 }, { "epoch": 25.884615384615383, "grad_norm": 0.0, "learning_rate": 3.318e-05, "loss": 1.7287, "mean_token_accuracy": 0.653440797328949, "num_tokens": 12271662.0, "step": 3365 }, { "epoch": 25.923076923076923, "grad_norm": 0.0, "learning_rate": 3.3155000000000004e-05, "loss": 1.7753, "mean_token_accuracy": 0.6468665450811386, "num_tokens": 12289329.0, "step": 3370 }, { "epoch": 25.96153846153846, "grad_norm": 0.0, "learning_rate": 3.313e-05, "loss": 1.764, "mean_token_accuracy": 0.6430284142494201, "num_tokens": 12307193.0, "step": 3375 }, { "epoch": 26.0, "grad_norm": 0.0, "learning_rate": 3.3105e-05, "loss": 1.6559, "mean_token_accuracy": 0.6661966532468796, "num_tokens": 12326730.0, "step": 3380 }, { "epoch": 26.03846153846154, "grad_norm": 0.0, "learning_rate": 3.308e-05, "loss": 1.7868, "mean_token_accuracy": 0.6423007547855377, "num_tokens": 12344701.0, "step": 3385 }, { "epoch": 26.076923076923077, "grad_norm": 0.0, "learning_rate": 3.3055e-05, "loss": 1.7293, "mean_token_accuracy": 0.6523230075836182, "num_tokens": 12362936.0, "step": 3390 }, { "epoch": 26.115384615384617, "grad_norm": 0.0, "learning_rate": 3.303e-05, "loss": 1.7103, "mean_token_accuracy": 0.6539131790399552, "num_tokens": 12381863.0, "step": 3395 }, { "epoch": 26.153846153846153, "grad_norm": 0.0, "learning_rate": 3.3005e-05, "loss": 1.765, "mean_token_accuracy": 0.6502564907073974, "num_tokens": 12399802.0, "step": 3400 }, { "epoch": 26.192307692307693, "grad_norm": 0.0, "learning_rate": 3.298e-05, "loss": 1.7704, "mean_token_accuracy": 0.6401029378175735, "num_tokens": 12417656.0, "step": 3405 }, { "epoch": 26.23076923076923, "grad_norm": 0.0, "learning_rate": 3.2955000000000006e-05, "loss": 1.6876, "mean_token_accuracy": 0.6598240792751312, "num_tokens": 12436367.0, "step": 3410 }, { "epoch": 26.26923076923077, "grad_norm": 0.0, "learning_rate": 3.293e-05, "loss": 1.7437, "mean_token_accuracy": 0.6541768878698349, "num_tokens": 12454372.0, "step": 3415 }, { "epoch": 26.307692307692307, "grad_norm": 0.0, "learning_rate": 3.2905000000000004e-05, "loss": 1.7304, "mean_token_accuracy": 0.6505888134241105, "num_tokens": 12472148.0, "step": 3420 }, { "epoch": 26.346153846153847, "grad_norm": 0.0, "learning_rate": 3.288e-05, "loss": 1.7421, "mean_token_accuracy": 0.6458885490894317, "num_tokens": 12491225.0, "step": 3425 }, { "epoch": 26.384615384615383, "grad_norm": 0.0, "learning_rate": 3.2855e-05, "loss": 1.7056, "mean_token_accuracy": 0.6600462704896927, "num_tokens": 12510093.0, "step": 3430 }, { "epoch": 26.423076923076923, "grad_norm": 0.0, "learning_rate": 3.283e-05, "loss": 1.7006, "mean_token_accuracy": 0.6607132166624069, "num_tokens": 12529121.0, "step": 3435 }, { "epoch": 26.46153846153846, "grad_norm": 0.0, "learning_rate": 3.2805e-05, "loss": 1.7692, "mean_token_accuracy": 0.6444113343954087, "num_tokens": 12546721.0, "step": 3440 }, { "epoch": 26.5, "grad_norm": 0.0, "learning_rate": 3.278e-05, "loss": 1.7363, "mean_token_accuracy": 0.6494855612516404, "num_tokens": 12564745.0, "step": 3445 }, { "epoch": 26.53846153846154, "grad_norm": 0.0, "learning_rate": 3.2755e-05, "loss": 1.7475, "mean_token_accuracy": 0.6525735288858414, "num_tokens": 12583018.0, "step": 3450 }, { "epoch": 26.576923076923077, "grad_norm": 0.0, "learning_rate": 3.273e-05, "loss": 1.7463, "mean_token_accuracy": 0.6488917529582977, "num_tokens": 12600537.0, "step": 3455 }, { "epoch": 26.615384615384617, "grad_norm": 0.0, "learning_rate": 3.2705e-05, "loss": 1.7124, "mean_token_accuracy": 0.6540741354227066, "num_tokens": 12619514.0, "step": 3460 }, { "epoch": 26.653846153846153, "grad_norm": 0.0, "learning_rate": 3.268e-05, "loss": 1.7322, "mean_token_accuracy": 0.6489483207464218, "num_tokens": 12637542.0, "step": 3465 }, { "epoch": 26.692307692307693, "grad_norm": 0.0, "learning_rate": 3.2655000000000005e-05, "loss": 1.6812, "mean_token_accuracy": 0.6622325748205184, "num_tokens": 12656523.0, "step": 3470 }, { "epoch": 26.73076923076923, "grad_norm": 0.0, "learning_rate": 3.263e-05, "loss": 1.7123, "mean_token_accuracy": 0.654995933175087, "num_tokens": 12675577.0, "step": 3475 }, { "epoch": 26.76923076923077, "grad_norm": 0.0, "learning_rate": 3.2605000000000004e-05, "loss": 1.7554, "mean_token_accuracy": 0.6486168265342712, "num_tokens": 12693239.0, "step": 3480 }, { "epoch": 26.807692307692307, "grad_norm": 0.0, "learning_rate": 3.2579999999999996e-05, "loss": 1.6658, "mean_token_accuracy": 0.6676429510116577, "num_tokens": 12712177.0, "step": 3485 }, { "epoch": 26.846153846153847, "grad_norm": 0.0, "learning_rate": 3.2555e-05, "loss": 1.789, "mean_token_accuracy": 0.6442992717027665, "num_tokens": 12729550.0, "step": 3490 }, { "epoch": 26.884615384615383, "grad_norm": 0.0, "learning_rate": 3.253e-05, "loss": 1.7931, "mean_token_accuracy": 0.6460472613573074, "num_tokens": 12747450.0, "step": 3495 }, { "epoch": 26.923076923076923, "grad_norm": 0.0, "learning_rate": 3.2505e-05, "loss": 1.7583, "mean_token_accuracy": 0.6475391507148742, "num_tokens": 12765847.0, "step": 3500 }, { "epoch": 26.96153846153846, "grad_norm": 0.0, "learning_rate": 3.248e-05, "loss": 1.7987, "mean_token_accuracy": 0.6450091898441315, "num_tokens": 12783426.0, "step": 3505 }, { "epoch": 27.0, "grad_norm": 0.0, "learning_rate": 3.2455e-05, "loss": 1.8088, "mean_token_accuracy": 0.6360502719879151, "num_tokens": 12800835.0, "step": 3510 }, { "epoch": 27.03846153846154, "grad_norm": 0.0, "learning_rate": 3.243e-05, "loss": 1.6903, "mean_token_accuracy": 0.6623231947422028, "num_tokens": 12819454.0, "step": 3515 }, { "epoch": 27.076923076923077, "grad_norm": 0.0, "learning_rate": 3.2405e-05, "loss": 1.7798, "mean_token_accuracy": 0.6476831644773483, "num_tokens": 12837657.0, "step": 3520 }, { "epoch": 27.115384615384617, "grad_norm": 0.0, "learning_rate": 3.238e-05, "loss": 1.7351, "mean_token_accuracy": 0.6554979294538498, "num_tokens": 12855426.0, "step": 3525 }, { "epoch": 27.153846153846153, "grad_norm": 0.0, "learning_rate": 3.2355000000000004e-05, "loss": 1.6828, "mean_token_accuracy": 0.661261597275734, "num_tokens": 12873752.0, "step": 3530 }, { "epoch": 27.192307692307693, "grad_norm": 0.0, "learning_rate": 3.233e-05, "loss": 1.769, "mean_token_accuracy": 0.6420103222131729, "num_tokens": 12891459.0, "step": 3535 }, { "epoch": 27.23076923076923, "grad_norm": 0.0, "learning_rate": 3.2305e-05, "loss": 1.7324, "mean_token_accuracy": 0.6513606965541839, "num_tokens": 12910141.0, "step": 3540 }, { "epoch": 27.26923076923077, "grad_norm": 0.0, "learning_rate": 3.2279999999999996e-05, "loss": 1.7847, "mean_token_accuracy": 0.6459431618452072, "num_tokens": 12927928.0, "step": 3545 }, { "epoch": 27.307692307692307, "grad_norm": 0.0, "learning_rate": 3.2255e-05, "loss": 1.745, "mean_token_accuracy": 0.6481869876384735, "num_tokens": 12945933.0, "step": 3550 }, { "epoch": 27.346153846153847, "grad_norm": 0.0, "learning_rate": 3.223e-05, "loss": 1.7767, "mean_token_accuracy": 0.6434827774763108, "num_tokens": 12963760.0, "step": 3555 }, { "epoch": 27.384615384615383, "grad_norm": 0.0, "learning_rate": 3.2205e-05, "loss": 1.7383, "mean_token_accuracy": 0.6473019540309906, "num_tokens": 12982243.0, "step": 3560 }, { "epoch": 27.423076923076923, "grad_norm": 0.0, "learning_rate": 3.218e-05, "loss": 1.7827, "mean_token_accuracy": 0.6399312824010849, "num_tokens": 13000474.0, "step": 3565 }, { "epoch": 27.46153846153846, "grad_norm": 0.0, "learning_rate": 3.2155e-05, "loss": 1.7396, "mean_token_accuracy": 0.6523656696081161, "num_tokens": 13019062.0, "step": 3570 }, { "epoch": 27.5, "grad_norm": 0.0, "learning_rate": 3.213e-05, "loss": 1.7927, "mean_token_accuracy": 0.6346830785274505, "num_tokens": 13036934.0, "step": 3575 }, { "epoch": 27.53846153846154, "grad_norm": 0.0, "learning_rate": 3.2105e-05, "loss": 1.7951, "mean_token_accuracy": 0.6394420713186264, "num_tokens": 13054793.0, "step": 3580 }, { "epoch": 27.576923076923077, "grad_norm": 0.0, "learning_rate": 3.208e-05, "loss": 1.771, "mean_token_accuracy": 0.6480452120304108, "num_tokens": 13072483.0, "step": 3585 }, { "epoch": 27.615384615384617, "grad_norm": 0.0, "learning_rate": 3.2055000000000004e-05, "loss": 1.614, "mean_token_accuracy": 0.6677554935216904, "num_tokens": 13092135.0, "step": 3590 }, { "epoch": 27.653846153846153, "grad_norm": 0.0, "learning_rate": 3.2029999999999997e-05, "loss": 1.7139, "mean_token_accuracy": 0.6528209656476974, "num_tokens": 13110720.0, "step": 3595 }, { "epoch": 27.692307692307693, "grad_norm": 0.0, "learning_rate": 3.2005e-05, "loss": 1.6911, "mean_token_accuracy": 0.6594361692667008, "num_tokens": 13128968.0, "step": 3600 }, { "epoch": 27.73076923076923, "grad_norm": 0.0, "learning_rate": 3.198e-05, "loss": 1.6991, "mean_token_accuracy": 0.6626447290182114, "num_tokens": 13147908.0, "step": 3605 }, { "epoch": 27.76923076923077, "grad_norm": 0.0, "learning_rate": 3.1955e-05, "loss": 1.7859, "mean_token_accuracy": 0.6439048200845718, "num_tokens": 13165594.0, "step": 3610 }, { "epoch": 27.807692307692307, "grad_norm": 0.0, "learning_rate": 3.193e-05, "loss": 1.6987, "mean_token_accuracy": 0.6610940158367157, "num_tokens": 13183929.0, "step": 3615 }, { "epoch": 27.846153846153847, "grad_norm": 0.0, "learning_rate": 3.1905e-05, "loss": 1.8453, "mean_token_accuracy": 0.6301412254571914, "num_tokens": 13200878.0, "step": 3620 }, { "epoch": 27.884615384615383, "grad_norm": 0.0, "learning_rate": 3.188e-05, "loss": 1.7052, "mean_token_accuracy": 0.6592441231012345, "num_tokens": 13219674.0, "step": 3625 }, { "epoch": 27.923076923076923, "grad_norm": 0.0, "learning_rate": 3.1855e-05, "loss": 1.7817, "mean_token_accuracy": 0.6449559360742569, "num_tokens": 13238217.0, "step": 3630 }, { "epoch": 27.96153846153846, "grad_norm": 0.0, "learning_rate": 3.1830000000000005e-05, "loss": 1.7167, "mean_token_accuracy": 0.6618435323238373, "num_tokens": 13256740.0, "step": 3635 }, { "epoch": 28.0, "grad_norm": 0.0, "learning_rate": 3.1805000000000005e-05, "loss": 1.723, "mean_token_accuracy": 0.6585237294435501, "num_tokens": 13274940.0, "step": 3640 }, { "epoch": 28.03846153846154, "grad_norm": 0.0, "learning_rate": 3.1780000000000004e-05, "loss": 1.7343, "mean_token_accuracy": 0.6495951384305954, "num_tokens": 13293159.0, "step": 3645 }, { "epoch": 28.076923076923077, "grad_norm": 0.0, "learning_rate": 3.1755000000000003e-05, "loss": 1.7448, "mean_token_accuracy": 0.6506305515766144, "num_tokens": 13310902.0, "step": 3650 }, { "epoch": 28.115384615384617, "grad_norm": 0.0, "learning_rate": 3.173e-05, "loss": 1.7557, "mean_token_accuracy": 0.6501852482557297, "num_tokens": 13329266.0, "step": 3655 }, { "epoch": 28.153846153846153, "grad_norm": 0.0, "learning_rate": 3.1705e-05, "loss": 1.7799, "mean_token_accuracy": 0.6411935687065125, "num_tokens": 13346803.0, "step": 3660 }, { "epoch": 28.192307692307693, "grad_norm": 0.0, "learning_rate": 3.168e-05, "loss": 1.6598, "mean_token_accuracy": 0.6662024825811386, "num_tokens": 13366046.0, "step": 3665 }, { "epoch": 28.23076923076923, "grad_norm": 0.0, "learning_rate": 3.1655e-05, "loss": 1.7028, "mean_token_accuracy": 0.6530542701482773, "num_tokens": 13384925.0, "step": 3670 }, { "epoch": 28.26923076923077, "grad_norm": 0.0, "learning_rate": 3.163000000000001e-05, "loss": 1.77, "mean_token_accuracy": 0.6439873456954956, "num_tokens": 13402589.0, "step": 3675 }, { "epoch": 28.307692307692307, "grad_norm": 0.0, "learning_rate": 3.1605e-05, "loss": 1.7013, "mean_token_accuracy": 0.6571666300296783, "num_tokens": 13421123.0, "step": 3680 }, { "epoch": 28.346153846153847, "grad_norm": 0.0, "learning_rate": 3.1580000000000006e-05, "loss": 1.7788, "mean_token_accuracy": 0.6432044029235839, "num_tokens": 13439431.0, "step": 3685 }, { "epoch": 28.384615384615383, "grad_norm": 0.0, "learning_rate": 3.1555e-05, "loss": 1.7207, "mean_token_accuracy": 0.6578160703182221, "num_tokens": 13457841.0, "step": 3690 }, { "epoch": 28.423076923076923, "grad_norm": 0.0, "learning_rate": 3.1530000000000005e-05, "loss": 1.7291, "mean_token_accuracy": 0.6534811556339264, "num_tokens": 13476489.0, "step": 3695 }, { "epoch": 28.46153846153846, "grad_norm": 0.0, "learning_rate": 3.1505000000000004e-05, "loss": 1.7142, "mean_token_accuracy": 0.6544462114572525, "num_tokens": 13494620.0, "step": 3700 }, { "epoch": 28.5, "grad_norm": 0.0, "learning_rate": 3.1480000000000004e-05, "loss": 1.7707, "mean_token_accuracy": 0.6433985292911529, "num_tokens": 13512210.0, "step": 3705 }, { "epoch": 28.53846153846154, "grad_norm": 0.0, "learning_rate": 3.1455e-05, "loss": 1.7642, "mean_token_accuracy": 0.6466316103935241, "num_tokens": 13529925.0, "step": 3710 }, { "epoch": 28.576923076923077, "grad_norm": 0.0, "learning_rate": 3.143e-05, "loss": 1.6938, "mean_token_accuracy": 0.663212212920189, "num_tokens": 13548438.0, "step": 3715 }, { "epoch": 28.615384615384617, "grad_norm": 0.0, "learning_rate": 3.1405e-05, "loss": 1.7596, "mean_token_accuracy": 0.6495319962501526, "num_tokens": 13566473.0, "step": 3720 }, { "epoch": 28.653846153846153, "grad_norm": 0.0, "learning_rate": 3.138e-05, "loss": 1.7426, "mean_token_accuracy": 0.6494800269603729, "num_tokens": 13584724.0, "step": 3725 }, { "epoch": 28.692307692307693, "grad_norm": 0.0, "learning_rate": 3.1355e-05, "loss": 1.7341, "mean_token_accuracy": 0.6527552962303161, "num_tokens": 13603304.0, "step": 3730 }, { "epoch": 28.73076923076923, "grad_norm": 0.0, "learning_rate": 3.133000000000001e-05, "loss": 1.7222, "mean_token_accuracy": 0.6519874900579452, "num_tokens": 13621839.0, "step": 3735 }, { "epoch": 28.76923076923077, "grad_norm": 0.0, "learning_rate": 3.1305e-05, "loss": 1.6808, "mean_token_accuracy": 0.6618104815483093, "num_tokens": 13640943.0, "step": 3740 }, { "epoch": 28.807692307692307, "grad_norm": 0.0, "learning_rate": 3.1280000000000005e-05, "loss": 1.8519, "mean_token_accuracy": 0.6293092161417008, "num_tokens": 13658372.0, "step": 3745 }, { "epoch": 28.846153846153847, "grad_norm": 0.0, "learning_rate": 3.1255e-05, "loss": 1.7723, "mean_token_accuracy": 0.6427496522665024, "num_tokens": 13676355.0, "step": 3750 }, { "epoch": 28.884615384615383, "grad_norm": 0.0, "learning_rate": 3.1230000000000004e-05, "loss": 1.7997, "mean_token_accuracy": 0.6400014579296112, "num_tokens": 13693987.0, "step": 3755 }, { "epoch": 28.923076923076923, "grad_norm": 0.0, "learning_rate": 3.1205000000000004e-05, "loss": 1.698, "mean_token_accuracy": 0.6581391155719757, "num_tokens": 13712942.0, "step": 3760 }, { "epoch": 28.96153846153846, "grad_norm": 0.0, "learning_rate": 3.118e-05, "loss": 1.7421, "mean_token_accuracy": 0.655744206905365, "num_tokens": 13731216.0, "step": 3765 }, { "epoch": 29.0, "grad_norm": 0.0, "learning_rate": 3.1155e-05, "loss": 1.7639, "mean_token_accuracy": 0.6510610312223435, "num_tokens": 13749045.0, "step": 3770 }, { "epoch": 29.03846153846154, "grad_norm": 0.0, "learning_rate": 3.113e-05, "loss": 1.6943, "mean_token_accuracy": 0.658639770746231, "num_tokens": 13767612.0, "step": 3775 }, { "epoch": 29.076923076923077, "grad_norm": 0.0, "learning_rate": 3.1105e-05, "loss": 1.7511, "mean_token_accuracy": 0.645610973238945, "num_tokens": 13785732.0, "step": 3780 }, { "epoch": 29.115384615384617, "grad_norm": 0.0, "learning_rate": 3.108e-05, "loss": 1.7157, "mean_token_accuracy": 0.6572709828615189, "num_tokens": 13804149.0, "step": 3785 }, { "epoch": 29.153846153846153, "grad_norm": 0.0, "learning_rate": 3.1055e-05, "loss": 1.7041, "mean_token_accuracy": 0.6564319849014282, "num_tokens": 13822922.0, "step": 3790 }, { "epoch": 29.192307692307693, "grad_norm": 0.0, "learning_rate": 3.1030000000000006e-05, "loss": 1.7008, "mean_token_accuracy": 0.6606447160243988, "num_tokens": 13840985.0, "step": 3795 }, { "epoch": 29.23076923076923, "grad_norm": 0.0, "learning_rate": 3.1005e-05, "loss": 1.7041, "mean_token_accuracy": 0.6543154448270798, "num_tokens": 13859478.0, "step": 3800 }, { "epoch": 29.26923076923077, "grad_norm": 0.0, "learning_rate": 3.0980000000000005e-05, "loss": 1.743, "mean_token_accuracy": 0.6480233430862427, "num_tokens": 13877895.0, "step": 3805 }, { "epoch": 29.307692307692307, "grad_norm": 0.0, "learning_rate": 3.0955e-05, "loss": 1.6918, "mean_token_accuracy": 0.6589089602231979, "num_tokens": 13896941.0, "step": 3810 }, { "epoch": 29.346153846153847, "grad_norm": 0.0, "learning_rate": 3.0930000000000004e-05, "loss": 1.7573, "mean_token_accuracy": 0.64599030315876, "num_tokens": 13915563.0, "step": 3815 }, { "epoch": 29.384615384615383, "grad_norm": 0.0, "learning_rate": 3.0905e-05, "loss": 1.7548, "mean_token_accuracy": 0.6496599853038788, "num_tokens": 13933531.0, "step": 3820 }, { "epoch": 29.423076923076923, "grad_norm": 0.0, "learning_rate": 3.088e-05, "loss": 1.7523, "mean_token_accuracy": 0.6545566976070404, "num_tokens": 13951615.0, "step": 3825 }, { "epoch": 29.46153846153846, "grad_norm": 0.0, "learning_rate": 3.0855e-05, "loss": 1.8148, "mean_token_accuracy": 0.6359264075756073, "num_tokens": 13969103.0, "step": 3830 }, { "epoch": 29.5, "grad_norm": 0.0, "learning_rate": 3.083e-05, "loss": 1.7199, "mean_token_accuracy": 0.6523585051298142, "num_tokens": 13987413.0, "step": 3835 }, { "epoch": 29.53846153846154, "grad_norm": 0.0, "learning_rate": 3.0805e-05, "loss": 1.8455, "mean_token_accuracy": 0.6313353180885315, "num_tokens": 14004497.0, "step": 3840 }, { "epoch": 29.576923076923077, "grad_norm": 0.0, "learning_rate": 3.078e-05, "loss": 1.6469, "mean_token_accuracy": 0.6732534170150757, "num_tokens": 14023651.0, "step": 3845 }, { "epoch": 29.615384615384617, "grad_norm": 0.0, "learning_rate": 3.0755e-05, "loss": 1.7665, "mean_token_accuracy": 0.6485314697027207, "num_tokens": 14041204.0, "step": 3850 }, { "epoch": 29.653846153846153, "grad_norm": 0.0, "learning_rate": 3.0730000000000006e-05, "loss": 1.7689, "mean_token_accuracy": 0.6438219338655472, "num_tokens": 14058902.0, "step": 3855 }, { "epoch": 29.692307692307693, "grad_norm": 0.0, "learning_rate": 3.0705e-05, "loss": 1.7326, "mean_token_accuracy": 0.6497067272663116, "num_tokens": 14077655.0, "step": 3860 }, { "epoch": 29.73076923076923, "grad_norm": 0.0, "learning_rate": 3.0680000000000004e-05, "loss": 1.6854, "mean_token_accuracy": 0.6631190478801727, "num_tokens": 14096779.0, "step": 3865 }, { "epoch": 29.76923076923077, "grad_norm": 0.0, "learning_rate": 3.0655e-05, "loss": 1.7354, "mean_token_accuracy": 0.6524263739585876, "num_tokens": 14114433.0, "step": 3870 }, { "epoch": 29.807692307692307, "grad_norm": 0.0, "learning_rate": 3.063e-05, "loss": 1.7631, "mean_token_accuracy": 0.6457651436328888, "num_tokens": 14132433.0, "step": 3875 }, { "epoch": 29.846153846153847, "grad_norm": 0.0, "learning_rate": 3.0605e-05, "loss": 1.7735, "mean_token_accuracy": 0.6389037370681763, "num_tokens": 14150572.0, "step": 3880 }, { "epoch": 29.884615384615383, "grad_norm": 0.0, "learning_rate": 3.058e-05, "loss": 1.786, "mean_token_accuracy": 0.6433692693710327, "num_tokens": 14168515.0, "step": 3885 }, { "epoch": 29.923076923076923, "grad_norm": 0.0, "learning_rate": 3.0555e-05, "loss": 1.8269, "mean_token_accuracy": 0.6312504172325134, "num_tokens": 14185905.0, "step": 3890 }, { "epoch": 29.96153846153846, "grad_norm": 0.0, "learning_rate": 3.053e-05, "loss": 1.7589, "mean_token_accuracy": 0.6496811449527741, "num_tokens": 14203685.0, "step": 3895 }, { "epoch": 30.0, "grad_norm": 0.0, "learning_rate": 3.0505e-05, "loss": 1.6956, "mean_token_accuracy": 0.6565341949462891, "num_tokens": 14223150.0, "step": 3900 }, { "epoch": 30.03846153846154, "grad_norm": 0.0, "learning_rate": 3.0480000000000003e-05, "loss": 1.7284, "mean_token_accuracy": 0.6489845454692841, "num_tokens": 14241996.0, "step": 3905 }, { "epoch": 30.076923076923077, "grad_norm": 0.0, "learning_rate": 3.0455e-05, "loss": 1.6848, "mean_token_accuracy": 0.6612500458955765, "num_tokens": 14260761.0, "step": 3910 }, { "epoch": 30.115384615384617, "grad_norm": 0.0, "learning_rate": 3.0430000000000002e-05, "loss": 1.7619, "mean_token_accuracy": 0.6461090564727783, "num_tokens": 14278765.0, "step": 3915 }, { "epoch": 30.153846153846153, "grad_norm": 0.0, "learning_rate": 3.0405e-05, "loss": 1.7833, "mean_token_accuracy": 0.6493801504373551, "num_tokens": 14296313.0, "step": 3920 }, { "epoch": 30.192307692307693, "grad_norm": 0.0, "learning_rate": 3.0380000000000004e-05, "loss": 1.7384, "mean_token_accuracy": 0.6529599964618683, "num_tokens": 14314228.0, "step": 3925 }, { "epoch": 30.23076923076923, "grad_norm": 0.0, "learning_rate": 3.0355e-05, "loss": 1.736, "mean_token_accuracy": 0.6538343548774719, "num_tokens": 14332966.0, "step": 3930 }, { "epoch": 30.26923076923077, "grad_norm": 0.0, "learning_rate": 3.0330000000000003e-05, "loss": 1.7853, "mean_token_accuracy": 0.6408940821886062, "num_tokens": 14350343.0, "step": 3935 }, { "epoch": 30.307692307692307, "grad_norm": 0.0, "learning_rate": 3.0305e-05, "loss": 1.6214, "mean_token_accuracy": 0.6724200338125229, "num_tokens": 14369664.0, "step": 3940 }, { "epoch": 30.346153846153847, "grad_norm": 0.0, "learning_rate": 3.028e-05, "loss": 1.7748, "mean_token_accuracy": 0.6431576639413834, "num_tokens": 14388275.0, "step": 3945 }, { "epoch": 30.384615384615383, "grad_norm": 0.0, "learning_rate": 3.0255e-05, "loss": 1.7376, "mean_token_accuracy": 0.6552271574735642, "num_tokens": 14406379.0, "step": 3950 }, { "epoch": 30.423076923076923, "grad_norm": 0.0, "learning_rate": 3.0230000000000004e-05, "loss": 1.7319, "mean_token_accuracy": 0.6521833807229995, "num_tokens": 14424792.0, "step": 3955 }, { "epoch": 30.46153846153846, "grad_norm": 0.0, "learning_rate": 3.0205e-05, "loss": 1.68, "mean_token_accuracy": 0.6599725693464279, "num_tokens": 14443837.0, "step": 3960 }, { "epoch": 30.5, "grad_norm": 0.0, "learning_rate": 3.0180000000000002e-05, "loss": 1.8198, "mean_token_accuracy": 0.6380291700363159, "num_tokens": 14461559.0, "step": 3965 }, { "epoch": 30.53846153846154, "grad_norm": 0.0, "learning_rate": 3.0155e-05, "loss": 1.7347, "mean_token_accuracy": 0.6505941689014435, "num_tokens": 14479670.0, "step": 3970 }, { "epoch": 30.576923076923077, "grad_norm": 0.0, "learning_rate": 3.013e-05, "loss": 1.7947, "mean_token_accuracy": 0.6404198050498963, "num_tokens": 14497368.0, "step": 3975 }, { "epoch": 30.615384615384617, "grad_norm": 0.0, "learning_rate": 3.0105e-05, "loss": 1.7593, "mean_token_accuracy": 0.644034668803215, "num_tokens": 14515634.0, "step": 3980 }, { "epoch": 30.653846153846153, "grad_norm": 0.0, "learning_rate": 3.0080000000000003e-05, "loss": 1.6789, "mean_token_accuracy": 0.6613237649202347, "num_tokens": 14534370.0, "step": 3985 }, { "epoch": 30.692307692307693, "grad_norm": 0.0, "learning_rate": 3.0055e-05, "loss": 1.7542, "mean_token_accuracy": 0.645700478553772, "num_tokens": 14552442.0, "step": 3990 }, { "epoch": 30.73076923076923, "grad_norm": 0.0, "learning_rate": 3.0030000000000002e-05, "loss": 1.8189, "mean_token_accuracy": 0.6313192546367645, "num_tokens": 14569871.0, "step": 3995 }, { "epoch": 30.76923076923077, "grad_norm": 0.0, "learning_rate": 3.0004999999999998e-05, "loss": 1.6918, "mean_token_accuracy": 0.6627134591341018, "num_tokens": 14588659.0, "step": 4000 }, { "epoch": 30.807692307692307, "grad_norm": 0.0, "learning_rate": 2.998e-05, "loss": 1.7661, "mean_token_accuracy": 0.6492778122425079, "num_tokens": 14606860.0, "step": 4005 }, { "epoch": 30.846153846153847, "grad_norm": 0.0, "learning_rate": 2.9955e-05, "loss": 1.771, "mean_token_accuracy": 0.6434893846511841, "num_tokens": 14624861.0, "step": 4010 }, { "epoch": 30.884615384615383, "grad_norm": 0.0, "learning_rate": 2.9930000000000003e-05, "loss": 1.6634, "mean_token_accuracy": 0.6664941430091857, "num_tokens": 14643597.0, "step": 4015 }, { "epoch": 30.923076923076923, "grad_norm": 0.0, "learning_rate": 2.9905e-05, "loss": 1.7814, "mean_token_accuracy": 0.6440500050783158, "num_tokens": 14661469.0, "step": 4020 }, { "epoch": 30.96153846153846, "grad_norm": 0.0, "learning_rate": 2.9880000000000002e-05, "loss": 1.7175, "mean_token_accuracy": 0.6576481223106384, "num_tokens": 14679555.0, "step": 4025 }, { "epoch": 31.0, "grad_norm": 0.0, "learning_rate": 2.9855e-05, "loss": 1.7706, "mean_token_accuracy": 0.6471602261066437, "num_tokens": 14697255.0, "step": 4030 }, { "epoch": 31.03846153846154, "grad_norm": 0.0, "learning_rate": 2.9830000000000004e-05, "loss": 1.7567, "mean_token_accuracy": 0.6573283642530441, "num_tokens": 14715135.0, "step": 4035 }, { "epoch": 31.076923076923077, "grad_norm": 0.0, "learning_rate": 2.9805e-05, "loss": 1.649, "mean_token_accuracy": 0.6654754310846329, "num_tokens": 14734385.0, "step": 4040 }, { "epoch": 31.115384615384617, "grad_norm": 0.0, "learning_rate": 2.9780000000000003e-05, "loss": 1.7331, "mean_token_accuracy": 0.6555071473121643, "num_tokens": 14752659.0, "step": 4045 }, { "epoch": 31.153846153846153, "grad_norm": 0.0, "learning_rate": 2.9755e-05, "loss": 1.7449, "mean_token_accuracy": 0.6513841986656189, "num_tokens": 14770947.0, "step": 4050 }, { "epoch": 31.192307692307693, "grad_norm": 0.0, "learning_rate": 2.973e-05, "loss": 1.8619, "mean_token_accuracy": 0.6317902356386185, "num_tokens": 14787485.0, "step": 4055 }, { "epoch": 31.23076923076923, "grad_norm": 0.0, "learning_rate": 2.9705e-05, "loss": 1.7178, "mean_token_accuracy": 0.6520997792482376, "num_tokens": 14806506.0, "step": 4060 }, { "epoch": 31.26923076923077, "grad_norm": 0.0, "learning_rate": 2.9680000000000004e-05, "loss": 1.749, "mean_token_accuracy": 0.6490054994821548, "num_tokens": 14824796.0, "step": 4065 }, { "epoch": 31.307692307692307, "grad_norm": 0.0, "learning_rate": 2.9655e-05, "loss": 1.7548, "mean_token_accuracy": 0.6469120174646378, "num_tokens": 14842766.0, "step": 4070 }, { "epoch": 31.346153846153847, "grad_norm": 0.0, "learning_rate": 2.9630000000000003e-05, "loss": 1.7272, "mean_token_accuracy": 0.6515872448682785, "num_tokens": 14861282.0, "step": 4075 }, { "epoch": 31.384615384615383, "grad_norm": 0.0, "learning_rate": 2.9605e-05, "loss": 1.6966, "mean_token_accuracy": 0.6624060094356536, "num_tokens": 14879361.0, "step": 4080 }, { "epoch": 31.423076923076923, "grad_norm": 0.0, "learning_rate": 2.958e-05, "loss": 1.735, "mean_token_accuracy": 0.6479102671146393, "num_tokens": 14897412.0, "step": 4085 }, { "epoch": 31.46153846153846, "grad_norm": 0.0, "learning_rate": 2.9555e-05, "loss": 1.7283, "mean_token_accuracy": 0.6495876848697663, "num_tokens": 14915712.0, "step": 4090 }, { "epoch": 31.5, "grad_norm": 0.0, "learning_rate": 2.9530000000000004e-05, "loss": 1.7165, "mean_token_accuracy": 0.6608868330717087, "num_tokens": 14933929.0, "step": 4095 }, { "epoch": 31.53846153846154, "grad_norm": 0.0, "learning_rate": 2.9505e-05, "loss": 1.7196, "mean_token_accuracy": 0.6539542943239212, "num_tokens": 14952726.0, "step": 4100 }, { "epoch": 31.576923076923077, "grad_norm": 0.0, "learning_rate": 2.9480000000000002e-05, "loss": 1.7294, "mean_token_accuracy": 0.6499251574277878, "num_tokens": 14971349.0, "step": 4105 }, { "epoch": 31.615384615384617, "grad_norm": 0.0, "learning_rate": 2.9455e-05, "loss": 1.7617, "mean_token_accuracy": 0.6461383640766144, "num_tokens": 14989504.0, "step": 4110 }, { "epoch": 31.653846153846153, "grad_norm": 0.0, "learning_rate": 2.943e-05, "loss": 1.83, "mean_token_accuracy": 0.6284946590662003, "num_tokens": 15006774.0, "step": 4115 }, { "epoch": 31.692307692307693, "grad_norm": 0.0, "learning_rate": 2.9405e-05, "loss": 1.7556, "mean_token_accuracy": 0.646930256485939, "num_tokens": 15025098.0, "step": 4120 }, { "epoch": 31.73076923076923, "grad_norm": 0.0, "learning_rate": 2.9380000000000003e-05, "loss": 1.7464, "mean_token_accuracy": 0.6496236711740494, "num_tokens": 15043402.0, "step": 4125 }, { "epoch": 31.76923076923077, "grad_norm": 0.0, "learning_rate": 2.9355e-05, "loss": 1.7743, "mean_token_accuracy": 0.6452437072992325, "num_tokens": 15061031.0, "step": 4130 }, { "epoch": 31.807692307692307, "grad_norm": 0.0, "learning_rate": 2.9330000000000002e-05, "loss": 1.7517, "mean_token_accuracy": 0.6447321742773056, "num_tokens": 15079238.0, "step": 4135 }, { "epoch": 31.846153846153847, "grad_norm": 0.0, "learning_rate": 2.9304999999999998e-05, "loss": 1.6618, "mean_token_accuracy": 0.6659055262804031, "num_tokens": 15098047.0, "step": 4140 }, { "epoch": 31.884615384615383, "grad_norm": 0.0, "learning_rate": 2.928e-05, "loss": 1.7187, "mean_token_accuracy": 0.6543934553861618, "num_tokens": 15116367.0, "step": 4145 }, { "epoch": 31.923076923076923, "grad_norm": 0.0, "learning_rate": 2.9255e-05, "loss": 1.7361, "mean_token_accuracy": 0.6480780899524688, "num_tokens": 15135132.0, "step": 4150 }, { "epoch": 31.96153846153846, "grad_norm": 0.0, "learning_rate": 2.9230000000000003e-05, "loss": 1.7548, "mean_token_accuracy": 0.6506118953227997, "num_tokens": 15152655.0, "step": 4155 }, { "epoch": 32.0, "grad_norm": 0.0, "learning_rate": 2.9205e-05, "loss": 1.7498, "mean_token_accuracy": 0.6500423729419709, "num_tokens": 15171360.0, "step": 4160 }, { "epoch": 32.03846153846154, "grad_norm": 0.0, "learning_rate": 2.9180000000000002e-05, "loss": 1.7394, "mean_token_accuracy": 0.6556807518005371, "num_tokens": 15189772.0, "step": 4165 }, { "epoch": 32.07692307692308, "grad_norm": 0.0, "learning_rate": 2.9154999999999998e-05, "loss": 1.6884, "mean_token_accuracy": 0.6571320801973343, "num_tokens": 15208356.0, "step": 4170 }, { "epoch": 32.11538461538461, "grad_norm": 0.0, "learning_rate": 2.913e-05, "loss": 1.7331, "mean_token_accuracy": 0.6546037912368774, "num_tokens": 15225973.0, "step": 4175 }, { "epoch": 32.15384615384615, "grad_norm": 0.0, "learning_rate": 2.9105e-05, "loss": 1.7188, "mean_token_accuracy": 0.6566130489110946, "num_tokens": 15244498.0, "step": 4180 }, { "epoch": 32.19230769230769, "grad_norm": 0.0, "learning_rate": 2.9080000000000003e-05, "loss": 1.6926, "mean_token_accuracy": 0.6630068510770798, "num_tokens": 15263404.0, "step": 4185 }, { "epoch": 32.23076923076923, "grad_norm": 0.0, "learning_rate": 2.9055e-05, "loss": 1.8139, "mean_token_accuracy": 0.6341224372386932, "num_tokens": 15281071.0, "step": 4190 }, { "epoch": 32.26923076923077, "grad_norm": 0.0, "learning_rate": 2.903e-05, "loss": 1.7784, "mean_token_accuracy": 0.6433619648218155, "num_tokens": 15299529.0, "step": 4195 }, { "epoch": 32.30769230769231, "grad_norm": 0.0, "learning_rate": 2.9004999999999998e-05, "loss": 1.7868, "mean_token_accuracy": 0.642947056889534, "num_tokens": 15317120.0, "step": 4200 }, { "epoch": 32.34615384615385, "grad_norm": 0.0, "learning_rate": 2.898e-05, "loss": 1.7569, "mean_token_accuracy": 0.648267537355423, "num_tokens": 15335733.0, "step": 4205 }, { "epoch": 32.38461538461539, "grad_norm": 0.0, "learning_rate": 2.8955e-05, "loss": 1.6314, "mean_token_accuracy": 0.6678825885057449, "num_tokens": 15355227.0, "step": 4210 }, { "epoch": 32.42307692307692, "grad_norm": 0.0, "learning_rate": 2.8930000000000003e-05, "loss": 1.7637, "mean_token_accuracy": 0.6434868663549423, "num_tokens": 15373422.0, "step": 4215 }, { "epoch": 32.46153846153846, "grad_norm": 0.0, "learning_rate": 2.8905e-05, "loss": 1.7577, "mean_token_accuracy": 0.6474568367004394, "num_tokens": 15391589.0, "step": 4220 }, { "epoch": 32.5, "grad_norm": 0.0, "learning_rate": 2.888e-05, "loss": 1.7877, "mean_token_accuracy": 0.637940239906311, "num_tokens": 15409383.0, "step": 4225 }, { "epoch": 32.53846153846154, "grad_norm": 0.0, "learning_rate": 2.8854999999999997e-05, "loss": 1.6901, "mean_token_accuracy": 0.6617184728384018, "num_tokens": 15428238.0, "step": 4230 }, { "epoch": 32.57692307692308, "grad_norm": 0.0, "learning_rate": 2.883e-05, "loss": 1.6847, "mean_token_accuracy": 0.660372993350029, "num_tokens": 15446651.0, "step": 4235 }, { "epoch": 32.61538461538461, "grad_norm": 0.0, "learning_rate": 2.8805e-05, "loss": 1.8002, "mean_token_accuracy": 0.634194228053093, "num_tokens": 15464761.0, "step": 4240 }, { "epoch": 32.65384615384615, "grad_norm": 0.0, "learning_rate": 2.8780000000000002e-05, "loss": 1.7199, "mean_token_accuracy": 0.6551119357347488, "num_tokens": 15483259.0, "step": 4245 }, { "epoch": 32.69230769230769, "grad_norm": 0.0, "learning_rate": 2.8754999999999998e-05, "loss": 1.718, "mean_token_accuracy": 0.6517602205276489, "num_tokens": 15502102.0, "step": 4250 }, { "epoch": 32.73076923076923, "grad_norm": 0.0, "learning_rate": 2.873e-05, "loss": 1.7375, "mean_token_accuracy": 0.6485500365495682, "num_tokens": 15520198.0, "step": 4255 }, { "epoch": 32.76923076923077, "grad_norm": 0.0, "learning_rate": 2.8705000000000004e-05, "loss": 1.7885, "mean_token_accuracy": 0.641147717833519, "num_tokens": 15538006.0, "step": 4260 }, { "epoch": 32.80769230769231, "grad_norm": 0.0, "learning_rate": 2.868e-05, "loss": 1.6918, "mean_token_accuracy": 0.662942835688591, "num_tokens": 15555826.0, "step": 4265 }, { "epoch": 32.84615384615385, "grad_norm": 0.0, "learning_rate": 2.8655000000000003e-05, "loss": 1.7834, "mean_token_accuracy": 0.6468859910964966, "num_tokens": 15573396.0, "step": 4270 }, { "epoch": 32.88461538461539, "grad_norm": 0.0, "learning_rate": 2.8630000000000002e-05, "loss": 1.7352, "mean_token_accuracy": 0.654278576374054, "num_tokens": 15592035.0, "step": 4275 }, { "epoch": 32.92307692307692, "grad_norm": 0.0, "learning_rate": 2.8605000000000005e-05, "loss": 1.7305, "mean_token_accuracy": 0.6594868719577789, "num_tokens": 15610531.0, "step": 4280 }, { "epoch": 32.96153846153846, "grad_norm": 0.0, "learning_rate": 2.858e-05, "loss": 1.8267, "mean_token_accuracy": 0.6349624991416931, "num_tokens": 15627607.0, "step": 4285 }, { "epoch": 33.0, "grad_norm": 0.0, "learning_rate": 2.8555000000000004e-05, "loss": 1.743, "mean_token_accuracy": 0.6526326090097427, "num_tokens": 15645465.0, "step": 4290 }, { "epoch": 33.03846153846154, "grad_norm": 0.0, "learning_rate": 2.853e-05, "loss": 1.7299, "mean_token_accuracy": 0.6534471154212952, "num_tokens": 15663771.0, "step": 4295 }, { "epoch": 33.07692307692308, "grad_norm": 0.0, "learning_rate": 2.8505000000000002e-05, "loss": 1.7432, "mean_token_accuracy": 0.6503149479627609, "num_tokens": 15682167.0, "step": 4300 }, { "epoch": 33.11538461538461, "grad_norm": 0.0, "learning_rate": 2.8480000000000002e-05, "loss": 1.7905, "mean_token_accuracy": 0.6410009592771531, "num_tokens": 15699294.0, "step": 4305 }, { "epoch": 33.15384615384615, "grad_norm": 0.0, "learning_rate": 2.8455000000000005e-05, "loss": 1.7323, "mean_token_accuracy": 0.6517443567514419, "num_tokens": 15718054.0, "step": 4310 }, { "epoch": 33.19230769230769, "grad_norm": 0.0, "learning_rate": 2.843e-05, "loss": 1.7484, "mean_token_accuracy": 0.6458596080541611, "num_tokens": 15736292.0, "step": 4315 }, { "epoch": 33.23076923076923, "grad_norm": 0.0, "learning_rate": 2.8405000000000003e-05, "loss": 1.715, "mean_token_accuracy": 0.6541249454021454, "num_tokens": 15755018.0, "step": 4320 }, { "epoch": 33.26923076923077, "grad_norm": 0.0, "learning_rate": 2.8380000000000003e-05, "loss": 1.6412, "mean_token_accuracy": 0.665298554301262, "num_tokens": 15774342.0, "step": 4325 }, { "epoch": 33.30769230769231, "grad_norm": 0.0, "learning_rate": 2.8355000000000002e-05, "loss": 1.7192, "mean_token_accuracy": 0.6550895661115647, "num_tokens": 15792876.0, "step": 4330 }, { "epoch": 33.34615384615385, "grad_norm": 0.0, "learning_rate": 2.833e-05, "loss": 1.6975, "mean_token_accuracy": 0.6590204060077667, "num_tokens": 15811378.0, "step": 4335 }, { "epoch": 33.38461538461539, "grad_norm": 0.0, "learning_rate": 2.8305000000000004e-05, "loss": 1.7203, "mean_token_accuracy": 0.6554653912782669, "num_tokens": 15829656.0, "step": 4340 }, { "epoch": 33.42307692307692, "grad_norm": 0.0, "learning_rate": 2.828e-05, "loss": 1.7643, "mean_token_accuracy": 0.6414321035146713, "num_tokens": 15847703.0, "step": 4345 }, { "epoch": 33.46153846153846, "grad_norm": 0.0, "learning_rate": 2.8255000000000003e-05, "loss": 1.6511, "mean_token_accuracy": 0.667979073524475, "num_tokens": 15867134.0, "step": 4350 }, { "epoch": 33.5, "grad_norm": 0.0, "learning_rate": 2.8230000000000002e-05, "loss": 1.7647, "mean_token_accuracy": 0.6481159001588821, "num_tokens": 15885242.0, "step": 4355 }, { "epoch": 33.53846153846154, "grad_norm": 0.0, "learning_rate": 2.8205000000000005e-05, "loss": 1.8012, "mean_token_accuracy": 0.6423881828784943, "num_tokens": 15902772.0, "step": 4360 }, { "epoch": 33.57692307692308, "grad_norm": 0.0, "learning_rate": 2.818e-05, "loss": 1.7589, "mean_token_accuracy": 0.6487757861614227, "num_tokens": 15921138.0, "step": 4365 }, { "epoch": 33.61538461538461, "grad_norm": 0.0, "learning_rate": 2.8155000000000004e-05, "loss": 1.7236, "mean_token_accuracy": 0.6539973527193069, "num_tokens": 15939494.0, "step": 4370 }, { "epoch": 33.65384615384615, "grad_norm": 0.0, "learning_rate": 2.813e-05, "loss": 1.7232, "mean_token_accuracy": 0.649171131849289, "num_tokens": 15958194.0, "step": 4375 }, { "epoch": 33.69230769230769, "grad_norm": 0.0, "learning_rate": 2.8105000000000003e-05, "loss": 1.7117, "mean_token_accuracy": 0.6549121081829071, "num_tokens": 15976927.0, "step": 4380 }, { "epoch": 33.73076923076923, "grad_norm": 0.0, "learning_rate": 2.8080000000000002e-05, "loss": 1.8074, "mean_token_accuracy": 0.6450524151325225, "num_tokens": 15994035.0, "step": 4385 }, { "epoch": 33.76923076923077, "grad_norm": 0.0, "learning_rate": 2.8055000000000005e-05, "loss": 1.7169, "mean_token_accuracy": 0.6550534397363663, "num_tokens": 16012525.0, "step": 4390 }, { "epoch": 33.80769230769231, "grad_norm": 0.0, "learning_rate": 2.803e-05, "loss": 1.8682, "mean_token_accuracy": 0.6267781972885131, "num_tokens": 16029494.0, "step": 4395 }, { "epoch": 33.84615384615385, "grad_norm": 0.0, "learning_rate": 2.8005000000000004e-05, "loss": 1.7787, "mean_token_accuracy": 0.6424099951982498, "num_tokens": 16047811.0, "step": 4400 }, { "epoch": 33.88461538461539, "grad_norm": 0.0, "learning_rate": 2.798e-05, "loss": 1.737, "mean_token_accuracy": 0.6514830589294434, "num_tokens": 16065304.0, "step": 4405 }, { "epoch": 33.92307692307692, "grad_norm": 0.0, "learning_rate": 2.7955000000000003e-05, "loss": 1.7418, "mean_token_accuracy": 0.652796396613121, "num_tokens": 16083292.0, "step": 4410 }, { "epoch": 33.96153846153846, "grad_norm": 0.0, "learning_rate": 2.7930000000000002e-05, "loss": 1.7136, "mean_token_accuracy": 0.654367059469223, "num_tokens": 16101807.0, "step": 4415 }, { "epoch": 34.0, "grad_norm": 0.0, "learning_rate": 2.7905000000000005e-05, "loss": 1.7799, "mean_token_accuracy": 0.644026267528534, "num_tokens": 16119570.0, "step": 4420 }, { "epoch": 34.03846153846154, "grad_norm": 0.0, "learning_rate": 2.788e-05, "loss": 1.7598, "mean_token_accuracy": 0.6437584400177002, "num_tokens": 16137834.0, "step": 4425 }, { "epoch": 34.07692307692308, "grad_norm": 0.0, "learning_rate": 2.7855000000000004e-05, "loss": 1.6805, "mean_token_accuracy": 0.6666524916887283, "num_tokens": 16156663.0, "step": 4430 }, { "epoch": 34.11538461538461, "grad_norm": 0.0, "learning_rate": 2.783e-05, "loss": 1.6866, "mean_token_accuracy": 0.6584878832101821, "num_tokens": 16175788.0, "step": 4435 }, { "epoch": 34.15384615384615, "grad_norm": 0.0, "learning_rate": 2.7805000000000002e-05, "loss": 1.7121, "mean_token_accuracy": 0.6575721591711045, "num_tokens": 16194451.0, "step": 4440 }, { "epoch": 34.19230769230769, "grad_norm": 0.0, "learning_rate": 2.778e-05, "loss": 1.751, "mean_token_accuracy": 0.6504880547523498, "num_tokens": 16212789.0, "step": 4445 }, { "epoch": 34.23076923076923, "grad_norm": 0.0, "learning_rate": 2.7755000000000004e-05, "loss": 1.8498, "mean_token_accuracy": 0.6282713234424591, "num_tokens": 16230046.0, "step": 4450 }, { "epoch": 34.26923076923077, "grad_norm": 0.0, "learning_rate": 2.773e-05, "loss": 1.75, "mean_token_accuracy": 0.6498936504125595, "num_tokens": 16247898.0, "step": 4455 }, { "epoch": 34.30769230769231, "grad_norm": 0.0, "learning_rate": 2.7705000000000003e-05, "loss": 1.7994, "mean_token_accuracy": 0.6431075096130371, "num_tokens": 16265073.0, "step": 4460 }, { "epoch": 34.34615384615385, "grad_norm": 0.0, "learning_rate": 2.768e-05, "loss": 1.6605, "mean_token_accuracy": 0.6640267461538315, "num_tokens": 16283741.0, "step": 4465 }, { "epoch": 34.38461538461539, "grad_norm": 0.0, "learning_rate": 2.7655000000000002e-05, "loss": 1.7335, "mean_token_accuracy": 0.6550962030887604, "num_tokens": 16301795.0, "step": 4470 }, { "epoch": 34.42307692307692, "grad_norm": 0.0, "learning_rate": 2.763e-05, "loss": 1.7215, "mean_token_accuracy": 0.6581933110952377, "num_tokens": 16320399.0, "step": 4475 }, { "epoch": 34.46153846153846, "grad_norm": 0.0, "learning_rate": 2.7605000000000004e-05, "loss": 1.7164, "mean_token_accuracy": 0.6563991248607636, "num_tokens": 16338805.0, "step": 4480 }, { "epoch": 34.5, "grad_norm": 0.0, "learning_rate": 2.758e-05, "loss": 1.718, "mean_token_accuracy": 0.6518977075815201, "num_tokens": 16357675.0, "step": 4485 }, { "epoch": 34.53846153846154, "grad_norm": 0.0, "learning_rate": 2.7555000000000003e-05, "loss": 1.7261, "mean_token_accuracy": 0.6472561746835709, "num_tokens": 16376357.0, "step": 4490 }, { "epoch": 34.57692307692308, "grad_norm": 0.0, "learning_rate": 2.753e-05, "loss": 1.7656, "mean_token_accuracy": 0.6474886327981949, "num_tokens": 16394142.0, "step": 4495 }, { "epoch": 34.61538461538461, "grad_norm": 0.0, "learning_rate": 2.7505000000000002e-05, "loss": 1.7131, "mean_token_accuracy": 0.6552159339189529, "num_tokens": 16412619.0, "step": 4500 }, { "epoch": 34.65384615384615, "grad_norm": 0.0, "learning_rate": 2.748e-05, "loss": 1.8421, "mean_token_accuracy": 0.6334175139665603, "num_tokens": 16429807.0, "step": 4505 }, { "epoch": 34.69230769230769, "grad_norm": 0.0, "learning_rate": 2.7455000000000004e-05, "loss": 1.7888, "mean_token_accuracy": 0.6399717807769776, "num_tokens": 16447829.0, "step": 4510 }, { "epoch": 34.73076923076923, "grad_norm": 0.0, "learning_rate": 2.743e-05, "loss": 1.6926, "mean_token_accuracy": 0.6622339367866517, "num_tokens": 16466534.0, "step": 4515 }, { "epoch": 34.76923076923077, "grad_norm": 0.0, "learning_rate": 2.7405000000000003e-05, "loss": 1.7331, "mean_token_accuracy": 0.6530486464500427, "num_tokens": 16484278.0, "step": 4520 }, { "epoch": 34.80769230769231, "grad_norm": 0.0, "learning_rate": 2.738e-05, "loss": 1.7237, "mean_token_accuracy": 0.6497306555509568, "num_tokens": 16502702.0, "step": 4525 }, { "epoch": 34.84615384615385, "grad_norm": 0.0, "learning_rate": 2.7355e-05, "loss": 1.6833, "mean_token_accuracy": 0.6642057120800018, "num_tokens": 16521906.0, "step": 4530 }, { "epoch": 34.88461538461539, "grad_norm": 0.0, "learning_rate": 2.733e-05, "loss": 1.8207, "mean_token_accuracy": 0.6355377316474915, "num_tokens": 16539170.0, "step": 4535 }, { "epoch": 34.92307692307692, "grad_norm": 0.0, "learning_rate": 2.7305000000000004e-05, "loss": 1.7629, "mean_token_accuracy": 0.6472138792276383, "num_tokens": 16556927.0, "step": 4540 }, { "epoch": 34.96153846153846, "grad_norm": 0.0, "learning_rate": 2.728e-05, "loss": 1.7334, "mean_token_accuracy": 0.6492251008749008, "num_tokens": 16575521.0, "step": 4545 }, { "epoch": 35.0, "grad_norm": 0.0, "learning_rate": 2.7255000000000002e-05, "loss": 1.7436, "mean_token_accuracy": 0.6502075582742691, "num_tokens": 16593675.0, "step": 4550 }, { "epoch": 35.03846153846154, "grad_norm": 0.0, "learning_rate": 2.723e-05, "loss": 1.7475, "mean_token_accuracy": 0.6530224084854126, "num_tokens": 16612033.0, "step": 4555 }, { "epoch": 35.07692307692308, "grad_norm": 0.0, "learning_rate": 2.7205e-05, "loss": 1.7608, "mean_token_accuracy": 0.6467309683561325, "num_tokens": 16630046.0, "step": 4560 }, { "epoch": 35.11538461538461, "grad_norm": 0.0, "learning_rate": 2.718e-05, "loss": 1.6749, "mean_token_accuracy": 0.6611016511917114, "num_tokens": 16648965.0, "step": 4565 }, { "epoch": 35.15384615384615, "grad_norm": 0.0, "learning_rate": 2.7155000000000003e-05, "loss": 1.6678, "mean_token_accuracy": 0.6659534931182861, "num_tokens": 16668179.0, "step": 4570 }, { "epoch": 35.19230769230769, "grad_norm": 0.0, "learning_rate": 2.713e-05, "loss": 1.7469, "mean_token_accuracy": 0.6515377521514892, "num_tokens": 16686202.0, "step": 4575 }, { "epoch": 35.23076923076923, "grad_norm": 0.0, "learning_rate": 2.7105000000000002e-05, "loss": 1.758, "mean_token_accuracy": 0.6510407030582428, "num_tokens": 16704566.0, "step": 4580 }, { "epoch": 35.26923076923077, "grad_norm": 0.0, "learning_rate": 2.7079999999999998e-05, "loss": 1.7879, "mean_token_accuracy": 0.6449306428432464, "num_tokens": 16721669.0, "step": 4585 }, { "epoch": 35.30769230769231, "grad_norm": 0.0, "learning_rate": 2.7055e-05, "loss": 1.7399, "mean_token_accuracy": 0.6478658020496368, "num_tokens": 16739999.0, "step": 4590 }, { "epoch": 35.34615384615385, "grad_norm": 0.0, "learning_rate": 2.703e-05, "loss": 1.7316, "mean_token_accuracy": 0.647733348608017, "num_tokens": 16758454.0, "step": 4595 }, { "epoch": 35.38461538461539, "grad_norm": 0.0, "learning_rate": 2.7005000000000003e-05, "loss": 1.7179, "mean_token_accuracy": 0.6545091897249222, "num_tokens": 16776775.0, "step": 4600 }, { "epoch": 35.42307692307692, "grad_norm": 0.0, "learning_rate": 2.698e-05, "loss": 1.7509, "mean_token_accuracy": 0.6458514422178269, "num_tokens": 16795006.0, "step": 4605 }, { "epoch": 35.46153846153846, "grad_norm": 0.0, "learning_rate": 2.6955000000000002e-05, "loss": 1.8091, "mean_token_accuracy": 0.6382217675447464, "num_tokens": 16812133.0, "step": 4610 }, { "epoch": 35.5, "grad_norm": 0.0, "learning_rate": 2.693e-05, "loss": 1.7595, "mean_token_accuracy": 0.6477942883968353, "num_tokens": 16829745.0, "step": 4615 }, { "epoch": 35.53846153846154, "grad_norm": 0.0, "learning_rate": 2.6905e-05, "loss": 1.7319, "mean_token_accuracy": 0.6578234076499939, "num_tokens": 16848720.0, "step": 4620 }, { "epoch": 35.57692307692308, "grad_norm": 0.0, "learning_rate": 2.688e-05, "loss": 1.6978, "mean_token_accuracy": 0.653961855173111, "num_tokens": 16867266.0, "step": 4625 }, { "epoch": 35.61538461538461, "grad_norm": 0.0, "learning_rate": 2.6855000000000003e-05, "loss": 1.7421, "mean_token_accuracy": 0.6451707929372787, "num_tokens": 16885165.0, "step": 4630 }, { "epoch": 35.65384615384615, "grad_norm": 0.0, "learning_rate": 2.683e-05, "loss": 1.6848, "mean_token_accuracy": 0.660153791308403, "num_tokens": 16904043.0, "step": 4635 }, { "epoch": 35.69230769230769, "grad_norm": 0.0, "learning_rate": 2.6805000000000002e-05, "loss": 1.7564, "mean_token_accuracy": 0.6473292171955108, "num_tokens": 16921974.0, "step": 4640 }, { "epoch": 35.73076923076923, "grad_norm": 0.0, "learning_rate": 2.678e-05, "loss": 1.758, "mean_token_accuracy": 0.6488713294267654, "num_tokens": 16939451.0, "step": 4645 }, { "epoch": 35.76923076923077, "grad_norm": 0.0, "learning_rate": 2.6755000000000004e-05, "loss": 1.7267, "mean_token_accuracy": 0.653421950340271, "num_tokens": 16958067.0, "step": 4650 }, { "epoch": 35.80769230769231, "grad_norm": 0.0, "learning_rate": 2.673e-05, "loss": 1.7234, "mean_token_accuracy": 0.6553035944700241, "num_tokens": 16976533.0, "step": 4655 }, { "epoch": 35.84615384615385, "grad_norm": 0.0, "learning_rate": 2.6705000000000003e-05, "loss": 1.7566, "mean_token_accuracy": 0.6490527987480164, "num_tokens": 16994749.0, "step": 4660 }, { "epoch": 35.88461538461539, "grad_norm": 0.0, "learning_rate": 2.668e-05, "loss": 1.7622, "mean_token_accuracy": 0.6474152833223343, "num_tokens": 17012449.0, "step": 4665 }, { "epoch": 35.92307692307692, "grad_norm": 0.0, "learning_rate": 2.6655e-05, "loss": 1.746, "mean_token_accuracy": 0.644548413157463, "num_tokens": 17031126.0, "step": 4670 }, { "epoch": 35.96153846153846, "grad_norm": 0.0, "learning_rate": 2.663e-05, "loss": 1.7045, "mean_token_accuracy": 0.6576229840517044, "num_tokens": 17050110.0, "step": 4675 }, { "epoch": 36.0, "grad_norm": 0.0, "learning_rate": 2.6605000000000004e-05, "loss": 1.8172, "mean_token_accuracy": 0.6355714172124862, "num_tokens": 17067780.0, "step": 4680 }, { "epoch": 36.03846153846154, "grad_norm": 0.0, "learning_rate": 2.658e-05, "loss": 1.8764, "mean_token_accuracy": 0.6272640764713288, "num_tokens": 17084927.0, "step": 4685 }, { "epoch": 36.07692307692308, "grad_norm": 0.0, "learning_rate": 2.6555000000000002e-05, "loss": 1.754, "mean_token_accuracy": 0.6480523020029068, "num_tokens": 17102872.0, "step": 4690 }, { "epoch": 36.11538461538461, "grad_norm": 0.0, "learning_rate": 2.653e-05, "loss": 1.733, "mean_token_accuracy": 0.6500560373067856, "num_tokens": 17120645.0, "step": 4695 }, { "epoch": 36.15384615384615, "grad_norm": 0.0, "learning_rate": 2.6505e-05, "loss": 1.7193, "mean_token_accuracy": 0.6540112674236298, "num_tokens": 17139263.0, "step": 4700 }, { "epoch": 36.19230769230769, "grad_norm": 0.0, "learning_rate": 2.648e-05, "loss": 1.6614, "mean_token_accuracy": 0.6684265941381454, "num_tokens": 17158319.0, "step": 4705 }, { "epoch": 36.23076923076923, "grad_norm": 0.0, "learning_rate": 2.6455000000000003e-05, "loss": 1.7377, "mean_token_accuracy": 0.6504531025886535, "num_tokens": 17176907.0, "step": 4710 }, { "epoch": 36.26923076923077, "grad_norm": 0.0, "learning_rate": 2.643e-05, "loss": 1.751, "mean_token_accuracy": 0.6483948498964309, "num_tokens": 17195007.0, "step": 4715 }, { "epoch": 36.30769230769231, "grad_norm": 0.0, "learning_rate": 2.6405000000000002e-05, "loss": 1.6963, "mean_token_accuracy": 0.6590126276016235, "num_tokens": 17213782.0, "step": 4720 }, { "epoch": 36.34615384615385, "grad_norm": 0.0, "learning_rate": 2.6379999999999998e-05, "loss": 1.7659, "mean_token_accuracy": 0.6419188559055329, "num_tokens": 17231953.0, "step": 4725 }, { "epoch": 36.38461538461539, "grad_norm": 0.0, "learning_rate": 2.6355e-05, "loss": 1.7443, "mean_token_accuracy": 0.6517761409282684, "num_tokens": 17250069.0, "step": 4730 }, { "epoch": 36.42307692307692, "grad_norm": 0.0, "learning_rate": 2.633e-05, "loss": 1.7795, "mean_token_accuracy": 0.63814417719841, "num_tokens": 17267793.0, "step": 4735 }, { "epoch": 36.46153846153846, "grad_norm": 0.0, "learning_rate": 2.6305000000000003e-05, "loss": 1.7513, "mean_token_accuracy": 0.6479008972644806, "num_tokens": 17285956.0, "step": 4740 }, { "epoch": 36.5, "grad_norm": 0.0, "learning_rate": 2.628e-05, "loss": 1.7722, "mean_token_accuracy": 0.6436032384634018, "num_tokens": 17303770.0, "step": 4745 }, { "epoch": 36.53846153846154, "grad_norm": 0.0, "learning_rate": 2.6255000000000002e-05, "loss": 1.7904, "mean_token_accuracy": 0.6443703442811965, "num_tokens": 17321120.0, "step": 4750 }, { "epoch": 36.57692307692308, "grad_norm": 0.0, "learning_rate": 2.6229999999999998e-05, "loss": 1.7634, "mean_token_accuracy": 0.6405331075191498, "num_tokens": 17338787.0, "step": 4755 }, { "epoch": 36.61538461538461, "grad_norm": 0.0, "learning_rate": 2.6205e-05, "loss": 1.6876, "mean_token_accuracy": 0.6604384332895279, "num_tokens": 17356798.0, "step": 4760 }, { "epoch": 36.65384615384615, "grad_norm": 0.0, "learning_rate": 2.618e-05, "loss": 1.7414, "mean_token_accuracy": 0.6457662045955658, "num_tokens": 17375411.0, "step": 4765 }, { "epoch": 36.69230769230769, "grad_norm": 0.0, "learning_rate": 2.6155000000000003e-05, "loss": 1.7765, "mean_token_accuracy": 0.6472298890352249, "num_tokens": 17392790.0, "step": 4770 }, { "epoch": 36.73076923076923, "grad_norm": 0.0, "learning_rate": 2.613e-05, "loss": 1.7773, "mean_token_accuracy": 0.6443253785371781, "num_tokens": 17411334.0, "step": 4775 }, { "epoch": 36.76923076923077, "grad_norm": 0.0, "learning_rate": 2.6105e-05, "loss": 1.6968, "mean_token_accuracy": 0.6606687515974045, "num_tokens": 17429786.0, "step": 4780 }, { "epoch": 36.80769230769231, "grad_norm": 0.0, "learning_rate": 2.6079999999999998e-05, "loss": 1.6995, "mean_token_accuracy": 0.6591976523399353, "num_tokens": 17448074.0, "step": 4785 }, { "epoch": 36.84615384615385, "grad_norm": 0.0, "learning_rate": 2.6055e-05, "loss": 1.7353, "mean_token_accuracy": 0.6497946470975876, "num_tokens": 17466677.0, "step": 4790 }, { "epoch": 36.88461538461539, "grad_norm": 0.0, "learning_rate": 2.603e-05, "loss": 1.7532, "mean_token_accuracy": 0.6486219108104706, "num_tokens": 17485904.0, "step": 4795 }, { "epoch": 36.92307692307692, "grad_norm": 0.0, "learning_rate": 2.6005000000000003e-05, "loss": 1.7336, "mean_token_accuracy": 0.6513243049383164, "num_tokens": 17503928.0, "step": 4800 }, { "epoch": 36.96153846153846, "grad_norm": 0.0, "learning_rate": 2.598e-05, "loss": 1.7178, "mean_token_accuracy": 0.6609677881002426, "num_tokens": 17522780.0, "step": 4805 }, { "epoch": 37.0, "grad_norm": 0.0, "learning_rate": 2.5955e-05, "loss": 1.6678, "mean_token_accuracy": 0.6652211785316468, "num_tokens": 17541885.0, "step": 4810 }, { "epoch": 37.03846153846154, "grad_norm": 0.0, "learning_rate": 2.5929999999999997e-05, "loss": 1.7167, "mean_token_accuracy": 0.6583490580320358, "num_tokens": 17560348.0, "step": 4815 }, { "epoch": 37.07692307692308, "grad_norm": 0.0, "learning_rate": 2.5905e-05, "loss": 1.7755, "mean_token_accuracy": 0.6465405881404876, "num_tokens": 17578412.0, "step": 4820 }, { "epoch": 37.11538461538461, "grad_norm": 0.0, "learning_rate": 2.588e-05, "loss": 1.667, "mean_token_accuracy": 0.6637176364660263, "num_tokens": 17597158.0, "step": 4825 }, { "epoch": 37.15384615384615, "grad_norm": 0.0, "learning_rate": 2.5855000000000002e-05, "loss": 1.6931, "mean_token_accuracy": 0.6589970797300339, "num_tokens": 17616052.0, "step": 4830 }, { "epoch": 37.19230769230769, "grad_norm": 0.0, "learning_rate": 2.583e-05, "loss": 1.7732, "mean_token_accuracy": 0.6425283402204514, "num_tokens": 17633798.0, "step": 4835 }, { "epoch": 37.23076923076923, "grad_norm": 0.0, "learning_rate": 2.5805e-05, "loss": 1.7705, "mean_token_accuracy": 0.6458623111248016, "num_tokens": 17651657.0, "step": 4840 }, { "epoch": 37.26923076923077, "grad_norm": 0.0, "learning_rate": 2.5779999999999997e-05, "loss": 1.7854, "mean_token_accuracy": 0.6410915940999985, "num_tokens": 17669527.0, "step": 4845 }, { "epoch": 37.30769230769231, "grad_norm": 0.0, "learning_rate": 2.5755e-05, "loss": 1.7162, "mean_token_accuracy": 0.6470267534255981, "num_tokens": 17688130.0, "step": 4850 }, { "epoch": 37.34615384615385, "grad_norm": 0.0, "learning_rate": 2.573e-05, "loss": 1.7409, "mean_token_accuracy": 0.6474620312452316, "num_tokens": 17706679.0, "step": 4855 }, { "epoch": 37.38461538461539, "grad_norm": 0.0, "learning_rate": 2.5705000000000002e-05, "loss": 1.7944, "mean_token_accuracy": 0.6404423266649246, "num_tokens": 17724627.0, "step": 4860 }, { "epoch": 37.42307692307692, "grad_norm": 0.0, "learning_rate": 2.5679999999999998e-05, "loss": 1.7607, "mean_token_accuracy": 0.649592487514019, "num_tokens": 17742357.0, "step": 4865 }, { "epoch": 37.46153846153846, "grad_norm": 0.0, "learning_rate": 2.5655e-05, "loss": 1.7116, "mean_token_accuracy": 0.6577517062425613, "num_tokens": 17761170.0, "step": 4870 }, { "epoch": 37.5, "grad_norm": 0.0, "learning_rate": 2.5629999999999997e-05, "loss": 1.7832, "mean_token_accuracy": 0.6425192266702652, "num_tokens": 17778997.0, "step": 4875 }, { "epoch": 37.53846153846154, "grad_norm": 0.0, "learning_rate": 2.5605e-05, "loss": 1.7689, "mean_token_accuracy": 0.6469813168048859, "num_tokens": 17796705.0, "step": 4880 }, { "epoch": 37.57692307692308, "grad_norm": 0.0, "learning_rate": 2.5580000000000002e-05, "loss": 1.7045, "mean_token_accuracy": 0.6609047919511795, "num_tokens": 17815465.0, "step": 4885 }, { "epoch": 37.61538461538461, "grad_norm": 0.0, "learning_rate": 2.5555000000000002e-05, "loss": 1.7773, "mean_token_accuracy": 0.641741332411766, "num_tokens": 17833805.0, "step": 4890 }, { "epoch": 37.65384615384615, "grad_norm": 0.0, "learning_rate": 2.5530000000000005e-05, "loss": 1.7906, "mean_token_accuracy": 0.6406534284353256, "num_tokens": 17851648.0, "step": 4895 }, { "epoch": 37.69230769230769, "grad_norm": 0.0, "learning_rate": 2.5505e-05, "loss": 1.7484, "mean_token_accuracy": 0.650345778465271, "num_tokens": 17869722.0, "step": 4900 }, { "epoch": 37.73076923076923, "grad_norm": 0.0, "learning_rate": 2.5480000000000003e-05, "loss": 1.6724, "mean_token_accuracy": 0.661380472779274, "num_tokens": 17888580.0, "step": 4905 }, { "epoch": 37.76923076923077, "grad_norm": 0.0, "learning_rate": 2.5455e-05, "loss": 1.7188, "mean_token_accuracy": 0.6543963342905045, "num_tokens": 17907123.0, "step": 4910 }, { "epoch": 37.80769230769231, "grad_norm": 0.0, "learning_rate": 2.5430000000000002e-05, "loss": 1.7679, "mean_token_accuracy": 0.6461872398853302, "num_tokens": 17924870.0, "step": 4915 }, { "epoch": 37.84615384615385, "grad_norm": 0.0, "learning_rate": 2.5405e-05, "loss": 1.7487, "mean_token_accuracy": 0.6498663574457169, "num_tokens": 17942295.0, "step": 4920 }, { "epoch": 37.88461538461539, "grad_norm": 0.0, "learning_rate": 2.5380000000000004e-05, "loss": 1.7363, "mean_token_accuracy": 0.6530733823776245, "num_tokens": 17960859.0, "step": 4925 }, { "epoch": 37.92307692307692, "grad_norm": 0.0, "learning_rate": 2.5355e-05, "loss": 1.6919, "mean_token_accuracy": 0.6597955763339997, "num_tokens": 17979359.0, "step": 4930 }, { "epoch": 37.96153846153846, "grad_norm": 0.0, "learning_rate": 2.5330000000000003e-05, "loss": 1.7047, "mean_token_accuracy": 0.6595474272966385, "num_tokens": 17997968.0, "step": 4935 }, { "epoch": 38.0, "grad_norm": 0.0, "learning_rate": 2.5305000000000003e-05, "loss": 1.7505, "mean_token_accuracy": 0.6506702989339829, "num_tokens": 18015990.0, "step": 4940 }, { "epoch": 38.03846153846154, "grad_norm": 0.0, "learning_rate": 2.5280000000000005e-05, "loss": 1.7561, "mean_token_accuracy": 0.6481132537126542, "num_tokens": 18034580.0, "step": 4945 }, { "epoch": 38.07692307692308, "grad_norm": 0.0, "learning_rate": 2.5255e-05, "loss": 1.7617, "mean_token_accuracy": 0.6524750530719757, "num_tokens": 18052519.0, "step": 4950 }, { "epoch": 38.11538461538461, "grad_norm": 0.0, "learning_rate": 2.5230000000000004e-05, "loss": 1.7571, "mean_token_accuracy": 0.6480519741773605, "num_tokens": 18070400.0, "step": 4955 }, { "epoch": 38.15384615384615, "grad_norm": 0.0, "learning_rate": 2.5205e-05, "loss": 1.6892, "mean_token_accuracy": 0.6499431103467941, "num_tokens": 18089731.0, "step": 4960 }, { "epoch": 38.19230769230769, "grad_norm": 0.0, "learning_rate": 2.5180000000000003e-05, "loss": 1.7363, "mean_token_accuracy": 0.652138090133667, "num_tokens": 18107784.0, "step": 4965 }, { "epoch": 38.23076923076923, "grad_norm": 0.0, "learning_rate": 2.5155000000000002e-05, "loss": 1.6622, "mean_token_accuracy": 0.6630479246377945, "num_tokens": 18126899.0, "step": 4970 }, { "epoch": 38.26923076923077, "grad_norm": 0.0, "learning_rate": 2.5130000000000005e-05, "loss": 1.6931, "mean_token_accuracy": 0.6576372981071472, "num_tokens": 18145576.0, "step": 4975 }, { "epoch": 38.30769230769231, "grad_norm": 0.0, "learning_rate": 2.5105e-05, "loss": 1.7881, "mean_token_accuracy": 0.6391725331544876, "num_tokens": 18162910.0, "step": 4980 }, { "epoch": 38.34615384615385, "grad_norm": 0.0, "learning_rate": 2.5080000000000004e-05, "loss": 1.7223, "mean_token_accuracy": 0.656527328491211, "num_tokens": 18181680.0, "step": 4985 }, { "epoch": 38.38461538461539, "grad_norm": 0.0, "learning_rate": 2.5055e-05, "loss": 1.7574, "mean_token_accuracy": 0.6486213862895965, "num_tokens": 18199714.0, "step": 4990 }, { "epoch": 38.42307692307692, "grad_norm": 0.0, "learning_rate": 2.5030000000000003e-05, "loss": 1.7484, "mean_token_accuracy": 0.6542579799890518, "num_tokens": 18217074.0, "step": 4995 }, { "epoch": 38.46153846153846, "grad_norm": 0.0, "learning_rate": 2.5005000000000002e-05, "loss": 1.7554, "mean_token_accuracy": 0.642228040099144, "num_tokens": 18235182.0, "step": 5000 }, { "epoch": 38.5, "grad_norm": 0.0, "learning_rate": 2.498e-05, "loss": 1.7976, "mean_token_accuracy": 0.6354389518499375, "num_tokens": 18252871.0, "step": 5005 }, { "epoch": 38.53846153846154, "grad_norm": 0.0, "learning_rate": 2.4955e-05, "loss": 1.7793, "mean_token_accuracy": 0.6435254126787185, "num_tokens": 18270734.0, "step": 5010 }, { "epoch": 38.57692307692308, "grad_norm": 0.0, "learning_rate": 2.493e-05, "loss": 1.7659, "mean_token_accuracy": 0.6490629822015762, "num_tokens": 18288300.0, "step": 5015 }, { "epoch": 38.61538461538461, "grad_norm": 0.0, "learning_rate": 2.4905e-05, "loss": 1.7339, "mean_token_accuracy": 0.6582322597503663, "num_tokens": 18306234.0, "step": 5020 }, { "epoch": 38.65384615384615, "grad_norm": 0.0, "learning_rate": 2.488e-05, "loss": 1.6772, "mean_token_accuracy": 0.6635013937950134, "num_tokens": 18325571.0, "step": 5025 }, { "epoch": 38.69230769230769, "grad_norm": 0.0, "learning_rate": 2.4855000000000002e-05, "loss": 1.7387, "mean_token_accuracy": 0.6517604231834412, "num_tokens": 18344113.0, "step": 5030 }, { "epoch": 38.73076923076923, "grad_norm": 0.0, "learning_rate": 2.483e-05, "loss": 1.6998, "mean_token_accuracy": 0.6589053481817245, "num_tokens": 18362753.0, "step": 5035 }, { "epoch": 38.76923076923077, "grad_norm": 0.0, "learning_rate": 2.4805e-05, "loss": 1.7194, "mean_token_accuracy": 0.654488143324852, "num_tokens": 18381290.0, "step": 5040 }, { "epoch": 38.80769230769231, "grad_norm": 0.0, "learning_rate": 2.478e-05, "loss": 1.7901, "mean_token_accuracy": 0.6406182050704956, "num_tokens": 18398780.0, "step": 5045 }, { "epoch": 38.84615384615385, "grad_norm": 0.0, "learning_rate": 2.4755e-05, "loss": 1.7715, "mean_token_accuracy": 0.6461145460605622, "num_tokens": 18416605.0, "step": 5050 }, { "epoch": 38.88461538461539, "grad_norm": 0.0, "learning_rate": 2.473e-05, "loss": 1.7961, "mean_token_accuracy": 0.6413571745157242, "num_tokens": 18434448.0, "step": 5055 }, { "epoch": 38.92307692307692, "grad_norm": 0.0, "learning_rate": 2.4705e-05, "loss": 1.7132, "mean_token_accuracy": 0.6524721384048462, "num_tokens": 18453025.0, "step": 5060 }, { "epoch": 38.96153846153846, "grad_norm": 0.0, "learning_rate": 2.468e-05, "loss": 1.7789, "mean_token_accuracy": 0.6459534347057343, "num_tokens": 18470888.0, "step": 5065 }, { "epoch": 39.0, "grad_norm": 0.0, "learning_rate": 2.4655e-05, "loss": 1.6775, "mean_token_accuracy": 0.6582199364900589, "num_tokens": 18490095.0, "step": 5070 }, { "epoch": 39.03846153846154, "grad_norm": 0.0, "learning_rate": 2.463e-05, "loss": 1.7668, "mean_token_accuracy": 0.6507688373327255, "num_tokens": 18507912.0, "step": 5075 }, { "epoch": 39.07692307692308, "grad_norm": 0.0, "learning_rate": 2.4605e-05, "loss": 1.7214, "mean_token_accuracy": 0.650596770644188, "num_tokens": 18526534.0, "step": 5080 }, { "epoch": 39.11538461538461, "grad_norm": 0.0, "learning_rate": 2.4580000000000002e-05, "loss": 1.7074, "mean_token_accuracy": 0.6542585700750351, "num_tokens": 18544958.0, "step": 5085 }, { "epoch": 39.15384615384615, "grad_norm": 0.0, "learning_rate": 2.4555e-05, "loss": 1.7504, "mean_token_accuracy": 0.6499609500169754, "num_tokens": 18562850.0, "step": 5090 }, { "epoch": 39.19230769230769, "grad_norm": 0.0, "learning_rate": 2.453e-05, "loss": 1.7253, "mean_token_accuracy": 0.6546373665332794, "num_tokens": 18581257.0, "step": 5095 }, { "epoch": 39.23076923076923, "grad_norm": 0.0, "learning_rate": 2.4505e-05, "loss": 1.7325, "mean_token_accuracy": 0.6537098169326783, "num_tokens": 18599186.0, "step": 5100 }, { "epoch": 39.26923076923077, "grad_norm": 0.0, "learning_rate": 2.448e-05, "loss": 1.817, "mean_token_accuracy": 0.6370946735143661, "num_tokens": 18616131.0, "step": 5105 }, { "epoch": 39.30769230769231, "grad_norm": 0.0, "learning_rate": 2.4455e-05, "loss": 1.6847, "mean_token_accuracy": 0.6567441165447235, "num_tokens": 18635229.0, "step": 5110 }, { "epoch": 39.34615384615385, "grad_norm": 0.0, "learning_rate": 2.443e-05, "loss": 1.6926, "mean_token_accuracy": 0.6508637487888336, "num_tokens": 18654889.0, "step": 5115 }, { "epoch": 39.38461538461539, "grad_norm": 0.0, "learning_rate": 2.4405e-05, "loss": 1.7283, "mean_token_accuracy": 0.6576891779899597, "num_tokens": 18672794.0, "step": 5120 }, { "epoch": 39.42307692307692, "grad_norm": 0.0, "learning_rate": 2.438e-05, "loss": 1.671, "mean_token_accuracy": 0.6607532262802124, "num_tokens": 18692648.0, "step": 5125 }, { "epoch": 39.46153846153846, "grad_norm": 0.0, "learning_rate": 2.4355e-05, "loss": 1.7717, "mean_token_accuracy": 0.6461830377578736, "num_tokens": 18710735.0, "step": 5130 }, { "epoch": 39.5, "grad_norm": 0.0, "learning_rate": 2.433e-05, "loss": 1.7019, "mean_token_accuracy": 0.657170632481575, "num_tokens": 18729163.0, "step": 5135 }, { "epoch": 39.53846153846154, "grad_norm": 0.0, "learning_rate": 2.4305e-05, "loss": 1.7227, "mean_token_accuracy": 0.6536213010549545, "num_tokens": 18747486.0, "step": 5140 }, { "epoch": 39.57692307692308, "grad_norm": 0.0, "learning_rate": 2.428e-05, "loss": 1.7075, "mean_token_accuracy": 0.6612475931644439, "num_tokens": 18766026.0, "step": 5145 }, { "epoch": 39.61538461538461, "grad_norm": 0.0, "learning_rate": 2.4255e-05, "loss": 1.6588, "mean_token_accuracy": 0.6661661475896835, "num_tokens": 18785291.0, "step": 5150 }, { "epoch": 39.65384615384615, "grad_norm": 0.0, "learning_rate": 2.423e-05, "loss": 1.7663, "mean_token_accuracy": 0.6468746066093445, "num_tokens": 18803265.0, "step": 5155 }, { "epoch": 39.69230769230769, "grad_norm": 0.0, "learning_rate": 2.4205e-05, "loss": 1.7807, "mean_token_accuracy": 0.6388770550489425, "num_tokens": 18821103.0, "step": 5160 }, { "epoch": 39.73076923076923, "grad_norm": 0.0, "learning_rate": 2.418e-05, "loss": 1.8602, "mean_token_accuracy": 0.6244511514902115, "num_tokens": 18838884.0, "step": 5165 }, { "epoch": 39.76923076923077, "grad_norm": 0.0, "learning_rate": 2.4154999999999998e-05, "loss": 1.7841, "mean_token_accuracy": 0.6422644227743148, "num_tokens": 18856713.0, "step": 5170 }, { "epoch": 39.80769230769231, "grad_norm": 0.0, "learning_rate": 2.413e-05, "loss": 1.7265, "mean_token_accuracy": 0.6520240396261215, "num_tokens": 18874743.0, "step": 5175 }, { "epoch": 39.84615384615385, "grad_norm": 0.0, "learning_rate": 2.4105e-05, "loss": 1.8197, "mean_token_accuracy": 0.6371575862169265, "num_tokens": 18891917.0, "step": 5180 }, { "epoch": 39.88461538461539, "grad_norm": 0.0, "learning_rate": 2.408e-05, "loss": 1.7642, "mean_token_accuracy": 0.6434910297393799, "num_tokens": 18909826.0, "step": 5185 }, { "epoch": 39.92307692307692, "grad_norm": 0.0, "learning_rate": 2.4055000000000003e-05, "loss": 1.7481, "mean_token_accuracy": 0.6525659114122391, "num_tokens": 18928215.0, "step": 5190 }, { "epoch": 39.96153846153846, "grad_norm": 0.0, "learning_rate": 2.4030000000000002e-05, "loss": 1.705, "mean_token_accuracy": 0.6562006801366806, "num_tokens": 18946708.0, "step": 5195 }, { "epoch": 40.0, "grad_norm": 0.0, "learning_rate": 2.4005e-05, "loss": 1.7572, "mean_token_accuracy": 0.6535427123308182, "num_tokens": 18964200.0, "step": 5200 }, { "epoch": 40.03846153846154, "grad_norm": 0.0, "learning_rate": 2.398e-05, "loss": 1.6565, "mean_token_accuracy": 0.6654837459325791, "num_tokens": 18982763.0, "step": 5205 }, { "epoch": 40.07692307692308, "grad_norm": 0.0, "learning_rate": 2.3955000000000004e-05, "loss": 1.7472, "mean_token_accuracy": 0.6510910272598267, "num_tokens": 19000451.0, "step": 5210 }, { "epoch": 40.11538461538461, "grad_norm": 0.0, "learning_rate": 2.3930000000000003e-05, "loss": 1.7149, "mean_token_accuracy": 0.6571291297674179, "num_tokens": 19018812.0, "step": 5215 }, { "epoch": 40.15384615384615, "grad_norm": 0.0, "learning_rate": 2.3905000000000002e-05, "loss": 1.7426, "mean_token_accuracy": 0.6531620174646378, "num_tokens": 19036736.0, "step": 5220 }, { "epoch": 40.19230769230769, "grad_norm": 0.0, "learning_rate": 2.3880000000000002e-05, "loss": 1.7962, "mean_token_accuracy": 0.6403325825929642, "num_tokens": 19054193.0, "step": 5225 }, { "epoch": 40.23076923076923, "grad_norm": 0.0, "learning_rate": 2.3855e-05, "loss": 1.8235, "mean_token_accuracy": 0.634826073050499, "num_tokens": 19071438.0, "step": 5230 }, { "epoch": 40.26923076923077, "grad_norm": 0.0, "learning_rate": 2.3830000000000004e-05, "loss": 1.7791, "mean_token_accuracy": 0.641825932264328, "num_tokens": 19089581.0, "step": 5235 }, { "epoch": 40.30769230769231, "grad_norm": 0.0, "learning_rate": 2.3805000000000003e-05, "loss": 1.7551, "mean_token_accuracy": 0.651043924689293, "num_tokens": 19107710.0, "step": 5240 }, { "epoch": 40.34615384615385, "grad_norm": 0.0, "learning_rate": 2.3780000000000003e-05, "loss": 1.7209, "mean_token_accuracy": 0.6562368720769882, "num_tokens": 19125352.0, "step": 5245 }, { "epoch": 40.38461538461539, "grad_norm": 0.0, "learning_rate": 2.3755000000000002e-05, "loss": 1.7579, "mean_token_accuracy": 0.6454941630363464, "num_tokens": 19142899.0, "step": 5250 }, { "epoch": 40.42307692307692, "grad_norm": 0.0, "learning_rate": 2.373e-05, "loss": 1.7409, "mean_token_accuracy": 0.6469170600175858, "num_tokens": 19161441.0, "step": 5255 }, { "epoch": 40.46153846153846, "grad_norm": 0.0, "learning_rate": 2.3705e-05, "loss": 1.7284, "mean_token_accuracy": 0.6500023663043976, "num_tokens": 19179916.0, "step": 5260 }, { "epoch": 40.5, "grad_norm": 0.0, "learning_rate": 2.3680000000000004e-05, "loss": 1.7584, "mean_token_accuracy": 0.6516165018081665, "num_tokens": 19197579.0, "step": 5265 }, { "epoch": 40.53846153846154, "grad_norm": 0.0, "learning_rate": 2.3655000000000003e-05, "loss": 1.6656, "mean_token_accuracy": 0.6603842228651047, "num_tokens": 19217073.0, "step": 5270 }, { "epoch": 40.57692307692308, "grad_norm": 0.0, "learning_rate": 2.3630000000000002e-05, "loss": 1.7213, "mean_token_accuracy": 0.6537799417972565, "num_tokens": 19235609.0, "step": 5275 }, { "epoch": 40.61538461538461, "grad_norm": 0.0, "learning_rate": 2.3605000000000002e-05, "loss": 1.6618, "mean_token_accuracy": 0.664988386631012, "num_tokens": 19254738.0, "step": 5280 }, { "epoch": 40.65384615384615, "grad_norm": 0.0, "learning_rate": 2.358e-05, "loss": 1.7453, "mean_token_accuracy": 0.6468230724334717, "num_tokens": 19273467.0, "step": 5285 }, { "epoch": 40.69230769230769, "grad_norm": 0.0, "learning_rate": 2.3555e-05, "loss": 1.7231, "mean_token_accuracy": 0.6521205425262451, "num_tokens": 19292138.0, "step": 5290 }, { "epoch": 40.73076923076923, "grad_norm": 0.0, "learning_rate": 2.3530000000000003e-05, "loss": 1.7605, "mean_token_accuracy": 0.652343025803566, "num_tokens": 19310319.0, "step": 5295 }, { "epoch": 40.76923076923077, "grad_norm": 0.0, "learning_rate": 2.3505000000000003e-05, "loss": 1.7581, "mean_token_accuracy": 0.6503111094236373, "num_tokens": 19328485.0, "step": 5300 }, { "epoch": 40.80769230769231, "grad_norm": 0.0, "learning_rate": 2.3480000000000002e-05, "loss": 1.8109, "mean_token_accuracy": 0.6344301998615265, "num_tokens": 19346027.0, "step": 5305 }, { "epoch": 40.84615384615385, "grad_norm": 0.0, "learning_rate": 2.3455e-05, "loss": 1.7801, "mean_token_accuracy": 0.6480203717947006, "num_tokens": 19364659.0, "step": 5310 }, { "epoch": 40.88461538461539, "grad_norm": 0.0, "learning_rate": 2.343e-05, "loss": 1.7486, "mean_token_accuracy": 0.6470009624958039, "num_tokens": 19383460.0, "step": 5315 }, { "epoch": 40.92307692307692, "grad_norm": 0.0, "learning_rate": 2.3405e-05, "loss": 1.6961, "mean_token_accuracy": 0.6605125904083252, "num_tokens": 19401413.0, "step": 5320 }, { "epoch": 40.96153846153846, "grad_norm": 0.0, "learning_rate": 2.3380000000000003e-05, "loss": 1.7979, "mean_token_accuracy": 0.6404047667980194, "num_tokens": 19419042.0, "step": 5325 }, { "epoch": 41.0, "grad_norm": 0.0, "learning_rate": 2.3355000000000003e-05, "loss": 1.6808, "mean_token_accuracy": 0.6540407299995422, "num_tokens": 19438305.0, "step": 5330 }, { "epoch": 41.03846153846154, "grad_norm": 0.0, "learning_rate": 2.3330000000000002e-05, "loss": 1.7928, "mean_token_accuracy": 0.6409297794103622, "num_tokens": 19456136.0, "step": 5335 }, { "epoch": 41.07692307692308, "grad_norm": 0.0, "learning_rate": 2.3305e-05, "loss": 1.6898, "mean_token_accuracy": 0.6670808821916581, "num_tokens": 19474660.0, "step": 5340 }, { "epoch": 41.11538461538461, "grad_norm": 0.0, "learning_rate": 2.328e-05, "loss": 1.7063, "mean_token_accuracy": 0.6591937988996506, "num_tokens": 19493457.0, "step": 5345 }, { "epoch": 41.15384615384615, "grad_norm": 0.0, "learning_rate": 2.3255e-05, "loss": 1.7488, "mean_token_accuracy": 0.6491418421268463, "num_tokens": 19512018.0, "step": 5350 }, { "epoch": 41.19230769230769, "grad_norm": 0.0, "learning_rate": 2.3230000000000003e-05, "loss": 1.757, "mean_token_accuracy": 0.6469480991363525, "num_tokens": 19529637.0, "step": 5355 }, { "epoch": 41.23076923076923, "grad_norm": 0.0, "learning_rate": 2.3205000000000002e-05, "loss": 1.7163, "mean_token_accuracy": 0.6563492953777313, "num_tokens": 19548075.0, "step": 5360 }, { "epoch": 41.26923076923077, "grad_norm": 0.0, "learning_rate": 2.318e-05, "loss": 1.8052, "mean_token_accuracy": 0.6417810708284378, "num_tokens": 19565579.0, "step": 5365 }, { "epoch": 41.30769230769231, "grad_norm": 0.0, "learning_rate": 2.3155e-05, "loss": 1.703, "mean_token_accuracy": 0.6617189884185791, "num_tokens": 19584102.0, "step": 5370 }, { "epoch": 41.34615384615385, "grad_norm": 0.0, "learning_rate": 2.313e-05, "loss": 1.6773, "mean_token_accuracy": 0.6627373963594436, "num_tokens": 19602734.0, "step": 5375 }, { "epoch": 41.38461538461539, "grad_norm": 0.0, "learning_rate": 2.3105000000000003e-05, "loss": 1.7655, "mean_token_accuracy": 0.6403976768255234, "num_tokens": 19620846.0, "step": 5380 }, { "epoch": 41.42307692307692, "grad_norm": 0.0, "learning_rate": 2.3080000000000003e-05, "loss": 1.7217, "mean_token_accuracy": 0.6528929799795151, "num_tokens": 19639574.0, "step": 5385 }, { "epoch": 41.46153846153846, "grad_norm": 0.0, "learning_rate": 2.3055000000000002e-05, "loss": 1.8022, "mean_token_accuracy": 0.641742405295372, "num_tokens": 19657464.0, "step": 5390 }, { "epoch": 41.5, "grad_norm": 0.0, "learning_rate": 2.303e-05, "loss": 1.7207, "mean_token_accuracy": 0.6533408045768738, "num_tokens": 19676214.0, "step": 5395 }, { "epoch": 41.53846153846154, "grad_norm": 0.0, "learning_rate": 2.3005e-05, "loss": 1.7098, "mean_token_accuracy": 0.6526803016662598, "num_tokens": 19694850.0, "step": 5400 }, { "epoch": 41.57692307692308, "grad_norm": 0.0, "learning_rate": 2.298e-05, "loss": 1.767, "mean_token_accuracy": 0.6512164533138275, "num_tokens": 19712732.0, "step": 5405 }, { "epoch": 41.61538461538461, "grad_norm": 0.0, "learning_rate": 2.2955000000000003e-05, "loss": 1.7149, "mean_token_accuracy": 0.6531447738409042, "num_tokens": 19731634.0, "step": 5410 }, { "epoch": 41.65384615384615, "grad_norm": 0.0, "learning_rate": 2.2930000000000002e-05, "loss": 1.7672, "mean_token_accuracy": 0.6508445739746094, "num_tokens": 19748890.0, "step": 5415 }, { "epoch": 41.69230769230769, "grad_norm": 0.0, "learning_rate": 2.2905000000000002e-05, "loss": 1.7294, "mean_token_accuracy": 0.6476221710443497, "num_tokens": 19767058.0, "step": 5420 }, { "epoch": 41.73076923076923, "grad_norm": 0.0, "learning_rate": 2.288e-05, "loss": 1.713, "mean_token_accuracy": 0.6579526603221894, "num_tokens": 19785185.0, "step": 5425 }, { "epoch": 41.76923076923077, "grad_norm": 0.0, "learning_rate": 2.2855e-05, "loss": 1.6993, "mean_token_accuracy": 0.6542178004980087, "num_tokens": 19803873.0, "step": 5430 }, { "epoch": 41.80769230769231, "grad_norm": 0.0, "learning_rate": 2.283e-05, "loss": 1.7208, "mean_token_accuracy": 0.6572512924671173, "num_tokens": 19822419.0, "step": 5435 }, { "epoch": 41.84615384615385, "grad_norm": 0.0, "learning_rate": 2.2805000000000003e-05, "loss": 1.7922, "mean_token_accuracy": 0.6356712460517884, "num_tokens": 19840582.0, "step": 5440 }, { "epoch": 41.88461538461539, "grad_norm": 0.0, "learning_rate": 2.2780000000000002e-05, "loss": 1.7735, "mean_token_accuracy": 0.6435914099216461, "num_tokens": 19858971.0, "step": 5445 }, { "epoch": 41.92307692307692, "grad_norm": 0.0, "learning_rate": 2.2755e-05, "loss": 1.7703, "mean_token_accuracy": 0.6400933116674423, "num_tokens": 19876852.0, "step": 5450 }, { "epoch": 41.96153846153846, "grad_norm": 0.0, "learning_rate": 2.273e-05, "loss": 1.7577, "mean_token_accuracy": 0.6475079268217087, "num_tokens": 19894493.0, "step": 5455 }, { "epoch": 42.0, "grad_norm": 0.0, "learning_rate": 2.2705e-05, "loss": 1.7518, "mean_token_accuracy": 0.6474265098571778, "num_tokens": 19912410.0, "step": 5460 }, { "epoch": 42.03846153846154, "grad_norm": 0.0, "learning_rate": 2.268e-05, "loss": 1.7478, "mean_token_accuracy": 0.646801871061325, "num_tokens": 19930455.0, "step": 5465 }, { "epoch": 42.07692307692308, "grad_norm": 0.0, "learning_rate": 2.2655000000000002e-05, "loss": 1.8042, "mean_token_accuracy": 0.6378622591495514, "num_tokens": 19948140.0, "step": 5470 }, { "epoch": 42.11538461538461, "grad_norm": 0.0, "learning_rate": 2.2630000000000002e-05, "loss": 1.7676, "mean_token_accuracy": 0.6440225541591644, "num_tokens": 19965973.0, "step": 5475 }, { "epoch": 42.15384615384615, "grad_norm": 0.0, "learning_rate": 2.2605e-05, "loss": 1.7853, "mean_token_accuracy": 0.6422720283269883, "num_tokens": 19983738.0, "step": 5480 }, { "epoch": 42.19230769230769, "grad_norm": 0.0, "learning_rate": 2.258e-05, "loss": 1.7484, "mean_token_accuracy": 0.6497463047504425, "num_tokens": 20001740.0, "step": 5485 }, { "epoch": 42.23076923076923, "grad_norm": 0.0, "learning_rate": 2.2555e-05, "loss": 1.7095, "mean_token_accuracy": 0.6572997212409973, "num_tokens": 20019689.0, "step": 5490 }, { "epoch": 42.26923076923077, "grad_norm": 0.0, "learning_rate": 2.253e-05, "loss": 1.7257, "mean_token_accuracy": 0.6576829224824905, "num_tokens": 20037781.0, "step": 5495 }, { "epoch": 42.30769230769231, "grad_norm": 0.0, "learning_rate": 2.2505000000000002e-05, "loss": 1.7521, "mean_token_accuracy": 0.647400951385498, "num_tokens": 20056196.0, "step": 5500 }, { "epoch": 42.34615384615385, "grad_norm": 0.0, "learning_rate": 2.248e-05, "loss": 1.6555, "mean_token_accuracy": 0.6666512668132782, "num_tokens": 20075099.0, "step": 5505 }, { "epoch": 42.38461538461539, "grad_norm": 0.0, "learning_rate": 2.2455e-05, "loss": 1.7001, "mean_token_accuracy": 0.6635800451040268, "num_tokens": 20094237.0, "step": 5510 }, { "epoch": 42.42307692307692, "grad_norm": 0.0, "learning_rate": 2.243e-05, "loss": 1.7268, "mean_token_accuracy": 0.6491197019815445, "num_tokens": 20112703.0, "step": 5515 }, { "epoch": 42.46153846153846, "grad_norm": 0.0, "learning_rate": 2.2405e-05, "loss": 1.7641, "mean_token_accuracy": 0.6478333413600922, "num_tokens": 20130707.0, "step": 5520 }, { "epoch": 42.5, "grad_norm": 0.0, "learning_rate": 2.2380000000000003e-05, "loss": 1.7299, "mean_token_accuracy": 0.6528987497091293, "num_tokens": 20149020.0, "step": 5525 }, { "epoch": 42.53846153846154, "grad_norm": 0.0, "learning_rate": 2.2355000000000002e-05, "loss": 1.7313, "mean_token_accuracy": 0.6519591569900512, "num_tokens": 20167484.0, "step": 5530 }, { "epoch": 42.57692307692308, "grad_norm": 0.0, "learning_rate": 2.233e-05, "loss": 1.7865, "mean_token_accuracy": 0.6391574621200562, "num_tokens": 20184976.0, "step": 5535 }, { "epoch": 42.61538461538461, "grad_norm": 0.0, "learning_rate": 2.2305e-05, "loss": 1.6944, "mean_token_accuracy": 0.658808296918869, "num_tokens": 20203744.0, "step": 5540 }, { "epoch": 42.65384615384615, "grad_norm": 0.0, "learning_rate": 2.228e-05, "loss": 1.6489, "mean_token_accuracy": 0.6704173415899277, "num_tokens": 20222753.0, "step": 5545 }, { "epoch": 42.69230769230769, "grad_norm": 0.0, "learning_rate": 2.2255e-05, "loss": 1.7507, "mean_token_accuracy": 0.6489108324050903, "num_tokens": 20241135.0, "step": 5550 }, { "epoch": 42.73076923076923, "grad_norm": 0.0, "learning_rate": 2.2230000000000002e-05, "loss": 1.722, "mean_token_accuracy": 0.6489439159631729, "num_tokens": 20260178.0, "step": 5555 }, { "epoch": 42.76923076923077, "grad_norm": 0.0, "learning_rate": 2.2205000000000002e-05, "loss": 1.7384, "mean_token_accuracy": 0.6486925899982452, "num_tokens": 20278793.0, "step": 5560 }, { "epoch": 42.80769230769231, "grad_norm": 0.0, "learning_rate": 2.218e-05, "loss": 1.7011, "mean_token_accuracy": 0.6595232754945755, "num_tokens": 20297915.0, "step": 5565 }, { "epoch": 42.84615384615385, "grad_norm": 0.0, "learning_rate": 2.2155e-05, "loss": 1.756, "mean_token_accuracy": 0.6450414389371872, "num_tokens": 20316235.0, "step": 5570 }, { "epoch": 42.88461538461539, "grad_norm": 0.0, "learning_rate": 2.213e-05, "loss": 1.8509, "mean_token_accuracy": 0.6292482733726501, "num_tokens": 20333291.0, "step": 5575 }, { "epoch": 42.92307692307692, "grad_norm": 0.0, "learning_rate": 2.2105e-05, "loss": 1.7492, "mean_token_accuracy": 0.6515639036893844, "num_tokens": 20351293.0, "step": 5580 }, { "epoch": 42.96153846153846, "grad_norm": 0.0, "learning_rate": 2.2080000000000002e-05, "loss": 1.7454, "mean_token_accuracy": 0.6521316289901733, "num_tokens": 20368835.0, "step": 5585 }, { "epoch": 43.0, "grad_norm": 0.0, "learning_rate": 2.2055e-05, "loss": 1.7799, "mean_token_accuracy": 0.6442039728164672, "num_tokens": 20386515.0, "step": 5590 }, { "epoch": 43.03846153846154, "grad_norm": 0.0, "learning_rate": 2.203e-05, "loss": 1.7707, "mean_token_accuracy": 0.6441226691007614, "num_tokens": 20404313.0, "step": 5595 }, { "epoch": 43.07692307692308, "grad_norm": 0.0, "learning_rate": 2.2005e-05, "loss": 1.7275, "mean_token_accuracy": 0.6514497637748718, "num_tokens": 20422886.0, "step": 5600 }, { "epoch": 43.11538461538461, "grad_norm": 0.0, "learning_rate": 2.198e-05, "loss": 1.7403, "mean_token_accuracy": 0.6523116052150726, "num_tokens": 20440808.0, "step": 5605 }, { "epoch": 43.15384615384615, "grad_norm": 0.0, "learning_rate": 2.1955e-05, "loss": 1.6582, "mean_token_accuracy": 0.6698237389326096, "num_tokens": 20459519.0, "step": 5610 }, { "epoch": 43.19230769230769, "grad_norm": 0.0, "learning_rate": 2.1930000000000002e-05, "loss": 1.7039, "mean_token_accuracy": 0.6586238622665406, "num_tokens": 20478450.0, "step": 5615 }, { "epoch": 43.23076923076923, "grad_norm": 0.0, "learning_rate": 2.1905e-05, "loss": 1.733, "mean_token_accuracy": 0.650734207034111, "num_tokens": 20497721.0, "step": 5620 }, { "epoch": 43.26923076923077, "grad_norm": 0.0, "learning_rate": 2.188e-05, "loss": 1.7549, "mean_token_accuracy": 0.6470336526632309, "num_tokens": 20515794.0, "step": 5625 }, { "epoch": 43.30769230769231, "grad_norm": 0.0, "learning_rate": 2.1855e-05, "loss": 1.7582, "mean_token_accuracy": 0.6463449627161026, "num_tokens": 20533916.0, "step": 5630 }, { "epoch": 43.34615384615385, "grad_norm": 0.0, "learning_rate": 2.183e-05, "loss": 1.6911, "mean_token_accuracy": 0.6570004016160965, "num_tokens": 20552781.0, "step": 5635 }, { "epoch": 43.38461538461539, "grad_norm": 0.0, "learning_rate": 2.1805e-05, "loss": 1.7714, "mean_token_accuracy": 0.6484808444976806, "num_tokens": 20570987.0, "step": 5640 }, { "epoch": 43.42307692307692, "grad_norm": 0.0, "learning_rate": 2.178e-05, "loss": 1.7396, "mean_token_accuracy": 0.6534300118684768, "num_tokens": 20589119.0, "step": 5645 }, { "epoch": 43.46153846153846, "grad_norm": 0.0, "learning_rate": 2.1755e-05, "loss": 1.7263, "mean_token_accuracy": 0.6558996856212616, "num_tokens": 20607573.0, "step": 5650 }, { "epoch": 43.5, "grad_norm": 0.0, "learning_rate": 2.173e-05, "loss": 1.7667, "mean_token_accuracy": 0.6426133215427399, "num_tokens": 20625925.0, "step": 5655 }, { "epoch": 43.53846153846154, "grad_norm": 0.0, "learning_rate": 2.1705e-05, "loss": 1.7072, "mean_token_accuracy": 0.6583232104778289, "num_tokens": 20644265.0, "step": 5660 }, { "epoch": 43.57692307692308, "grad_norm": 0.0, "learning_rate": 2.168e-05, "loss": 1.7284, "mean_token_accuracy": 0.6528914481401443, "num_tokens": 20662528.0, "step": 5665 }, { "epoch": 43.61538461538461, "grad_norm": 0.0, "learning_rate": 2.1655000000000002e-05, "loss": 1.6837, "mean_token_accuracy": 0.6582222819328308, "num_tokens": 20681100.0, "step": 5670 }, { "epoch": 43.65384615384615, "grad_norm": 0.0, "learning_rate": 2.163e-05, "loss": 1.7536, "mean_token_accuracy": 0.6478887408971786, "num_tokens": 20699280.0, "step": 5675 }, { "epoch": 43.69230769230769, "grad_norm": 0.0, "learning_rate": 2.1605e-05, "loss": 1.7702, "mean_token_accuracy": 0.6438897788524628, "num_tokens": 20717346.0, "step": 5680 }, { "epoch": 43.73076923076923, "grad_norm": 0.0, "learning_rate": 2.158e-05, "loss": 1.7388, "mean_token_accuracy": 0.6525939971208572, "num_tokens": 20735654.0, "step": 5685 }, { "epoch": 43.76923076923077, "grad_norm": 0.0, "learning_rate": 2.1555e-05, "loss": 1.7583, "mean_token_accuracy": 0.649162980914116, "num_tokens": 20753638.0, "step": 5690 }, { "epoch": 43.80769230769231, "grad_norm": 0.0, "learning_rate": 2.153e-05, "loss": 1.7555, "mean_token_accuracy": 0.6506625056266785, "num_tokens": 20771873.0, "step": 5695 }, { "epoch": 43.84615384615385, "grad_norm": 0.0, "learning_rate": 2.1505e-05, "loss": 1.7904, "mean_token_accuracy": 0.6409453451633453, "num_tokens": 20789218.0, "step": 5700 }, { "epoch": 43.88461538461539, "grad_norm": 0.0, "learning_rate": 2.148e-05, "loss": 1.7477, "mean_token_accuracy": 0.6445836216211319, "num_tokens": 20807034.0, "step": 5705 }, { "epoch": 43.92307692307692, "grad_norm": 0.0, "learning_rate": 2.1455e-05, "loss": 1.7596, "mean_token_accuracy": 0.6507911443710327, "num_tokens": 20824903.0, "step": 5710 }, { "epoch": 43.96153846153846, "grad_norm": 0.0, "learning_rate": 2.143e-05, "loss": 1.7877, "mean_token_accuracy": 0.6402207911014557, "num_tokens": 20842868.0, "step": 5715 }, { "epoch": 44.0, "grad_norm": 0.0, "learning_rate": 2.1405e-05, "loss": 1.7548, "mean_token_accuracy": 0.6519168674945831, "num_tokens": 20860620.0, "step": 5720 }, { "epoch": 44.03846153846154, "grad_norm": 0.0, "learning_rate": 2.138e-05, "loss": 1.7833, "mean_token_accuracy": 0.6388791173696518, "num_tokens": 20878837.0, "step": 5725 }, { "epoch": 44.07692307692308, "grad_norm": 0.0, "learning_rate": 2.1355e-05, "loss": 1.7785, "mean_token_accuracy": 0.6488111823797226, "num_tokens": 20896892.0, "step": 5730 }, { "epoch": 44.11538461538461, "grad_norm": 0.0, "learning_rate": 2.133e-05, "loss": 1.8372, "mean_token_accuracy": 0.630810198187828, "num_tokens": 20914256.0, "step": 5735 }, { "epoch": 44.15384615384615, "grad_norm": 0.0, "learning_rate": 2.1305e-05, "loss": 1.6904, "mean_token_accuracy": 0.6603229939937592, "num_tokens": 20932992.0, "step": 5740 }, { "epoch": 44.19230769230769, "grad_norm": 0.0, "learning_rate": 2.128e-05, "loss": 1.7463, "mean_token_accuracy": 0.65241838991642, "num_tokens": 20950902.0, "step": 5745 }, { "epoch": 44.23076923076923, "grad_norm": 0.0, "learning_rate": 2.1255e-05, "loss": 1.7803, "mean_token_accuracy": 0.6396133095026016, "num_tokens": 20969175.0, "step": 5750 }, { "epoch": 44.26923076923077, "grad_norm": 0.0, "learning_rate": 2.123e-05, "loss": 1.7431, "mean_token_accuracy": 0.6489101707935333, "num_tokens": 20987238.0, "step": 5755 }, { "epoch": 44.30769230769231, "grad_norm": 0.0, "learning_rate": 2.1205e-05, "loss": 1.7022, "mean_token_accuracy": 0.658707058429718, "num_tokens": 21005379.0, "step": 5760 }, { "epoch": 44.34615384615385, "grad_norm": 0.0, "learning_rate": 2.118e-05, "loss": 1.7081, "mean_token_accuracy": 0.65692238509655, "num_tokens": 21023486.0, "step": 5765 }, { "epoch": 44.38461538461539, "grad_norm": 0.0, "learning_rate": 2.1155e-05, "loss": 1.7568, "mean_token_accuracy": 0.6409606844186783, "num_tokens": 21041662.0, "step": 5770 }, { "epoch": 44.42307692307692, "grad_norm": 0.0, "learning_rate": 2.113e-05, "loss": 1.7182, "mean_token_accuracy": 0.6524233877658844, "num_tokens": 21060206.0, "step": 5775 }, { "epoch": 44.46153846153846, "grad_norm": 0.0, "learning_rate": 2.1105e-05, "loss": 1.7231, "mean_token_accuracy": 0.655006617307663, "num_tokens": 21078736.0, "step": 5780 }, { "epoch": 44.5, "grad_norm": 0.0, "learning_rate": 2.1079999999999998e-05, "loss": 1.7458, "mean_token_accuracy": 0.648220095038414, "num_tokens": 21097223.0, "step": 5785 }, { "epoch": 44.53846153846154, "grad_norm": 0.0, "learning_rate": 2.1055e-05, "loss": 1.6928, "mean_token_accuracy": 0.6601225137710571, "num_tokens": 21116083.0, "step": 5790 }, { "epoch": 44.57692307692308, "grad_norm": 0.0, "learning_rate": 2.103e-05, "loss": 1.7913, "mean_token_accuracy": 0.6423264652490616, "num_tokens": 21133615.0, "step": 5795 }, { "epoch": 44.61538461538461, "grad_norm": 0.0, "learning_rate": 2.1005e-05, "loss": 1.6964, "mean_token_accuracy": 0.6554509729146958, "num_tokens": 21152459.0, "step": 5800 }, { "epoch": 44.65384615384615, "grad_norm": 0.0, "learning_rate": 2.098e-05, "loss": 1.6493, "mean_token_accuracy": 0.6667488932609558, "num_tokens": 21171676.0, "step": 5805 }, { "epoch": 44.69230769230769, "grad_norm": 0.0, "learning_rate": 2.0955e-05, "loss": 1.8191, "mean_token_accuracy": 0.6378259479999542, "num_tokens": 21188876.0, "step": 5810 }, { "epoch": 44.73076923076923, "grad_norm": 0.0, "learning_rate": 2.093e-05, "loss": 1.7046, "mean_token_accuracy": 0.6647377282381057, "num_tokens": 21207312.0, "step": 5815 }, { "epoch": 44.76923076923077, "grad_norm": 0.0, "learning_rate": 2.0905000000000004e-05, "loss": 1.7338, "mean_token_accuracy": 0.6526555448770524, "num_tokens": 21225855.0, "step": 5820 }, { "epoch": 44.80769230769231, "grad_norm": 0.0, "learning_rate": 2.0880000000000003e-05, "loss": 1.7651, "mean_token_accuracy": 0.6445731610059738, "num_tokens": 21244188.0, "step": 5825 }, { "epoch": 44.84615384615385, "grad_norm": 0.0, "learning_rate": 2.0855000000000003e-05, "loss": 1.7282, "mean_token_accuracy": 0.6482422709465027, "num_tokens": 21262701.0, "step": 5830 }, { "epoch": 44.88461538461539, "grad_norm": 0.0, "learning_rate": 2.0830000000000002e-05, "loss": 1.79, "mean_token_accuracy": 0.6363201081752777, "num_tokens": 21280639.0, "step": 5835 }, { "epoch": 44.92307692307692, "grad_norm": 0.0, "learning_rate": 2.0805e-05, "loss": 1.7229, "mean_token_accuracy": 0.6573170632123947, "num_tokens": 21298845.0, "step": 5840 }, { "epoch": 44.96153846153846, "grad_norm": 0.0, "learning_rate": 2.078e-05, "loss": 1.6992, "mean_token_accuracy": 0.6632038950920105, "num_tokens": 21317164.0, "step": 5845 }, { "epoch": 45.0, "grad_norm": 0.0, "learning_rate": 2.0755000000000004e-05, "loss": 1.7663, "mean_token_accuracy": 0.6449836879968643, "num_tokens": 21334725.0, "step": 5850 }, { "epoch": 45.03846153846154, "grad_norm": 0.0, "learning_rate": 2.0730000000000003e-05, "loss": 1.7384, "mean_token_accuracy": 0.6504657328128814, "num_tokens": 21353119.0, "step": 5855 }, { "epoch": 45.07692307692308, "grad_norm": 0.0, "learning_rate": 2.0705000000000003e-05, "loss": 1.6998, "mean_token_accuracy": 0.6628474831581116, "num_tokens": 21371719.0, "step": 5860 }, { "epoch": 45.11538461538461, "grad_norm": 0.0, "learning_rate": 2.0680000000000002e-05, "loss": 1.7366, "mean_token_accuracy": 0.6500916868448258, "num_tokens": 21390734.0, "step": 5865 }, { "epoch": 45.15384615384615, "grad_norm": 0.0, "learning_rate": 2.0655e-05, "loss": 1.7111, "mean_token_accuracy": 0.6544335097074508, "num_tokens": 21409214.0, "step": 5870 }, { "epoch": 45.19230769230769, "grad_norm": 0.0, "learning_rate": 2.063e-05, "loss": 1.6598, "mean_token_accuracy": 0.6647601097822189, "num_tokens": 21428341.0, "step": 5875 }, { "epoch": 45.23076923076923, "grad_norm": 0.0, "learning_rate": 2.0605000000000003e-05, "loss": 1.7528, "mean_token_accuracy": 0.6484750181436538, "num_tokens": 21446444.0, "step": 5880 }, { "epoch": 45.26923076923077, "grad_norm": 0.0, "learning_rate": 2.0580000000000003e-05, "loss": 1.7472, "mean_token_accuracy": 0.6514826148748398, "num_tokens": 21464094.0, "step": 5885 }, { "epoch": 45.30769230769231, "grad_norm": 0.0, "learning_rate": 2.0555000000000002e-05, "loss": 1.6722, "mean_token_accuracy": 0.6630917578935623, "num_tokens": 21482239.0, "step": 5890 }, { "epoch": 45.34615384615385, "grad_norm": 0.0, "learning_rate": 2.053e-05, "loss": 1.7819, "mean_token_accuracy": 0.6405300617218017, "num_tokens": 21499829.0, "step": 5895 }, { "epoch": 45.38461538461539, "grad_norm": 0.0, "learning_rate": 2.0505e-05, "loss": 1.7291, "mean_token_accuracy": 0.6537360846996307, "num_tokens": 21518451.0, "step": 5900 }, { "epoch": 45.42307692307692, "grad_norm": 0.0, "learning_rate": 2.048e-05, "loss": 1.7311, "mean_token_accuracy": 0.6522845417261124, "num_tokens": 21536579.0, "step": 5905 }, { "epoch": 45.46153846153846, "grad_norm": 0.0, "learning_rate": 2.0455000000000003e-05, "loss": 1.6915, "mean_token_accuracy": 0.658150565624237, "num_tokens": 21555889.0, "step": 5910 }, { "epoch": 45.5, "grad_norm": 0.0, "learning_rate": 2.0430000000000003e-05, "loss": 1.7755, "mean_token_accuracy": 0.6407826870679856, "num_tokens": 21574267.0, "step": 5915 }, { "epoch": 45.53846153846154, "grad_norm": 0.0, "learning_rate": 2.0405000000000002e-05, "loss": 1.7662, "mean_token_accuracy": 0.6452647089958191, "num_tokens": 21591748.0, "step": 5920 }, { "epoch": 45.57692307692308, "grad_norm": 0.0, "learning_rate": 2.038e-05, "loss": 1.8003, "mean_token_accuracy": 0.6423941493034363, "num_tokens": 21608855.0, "step": 5925 }, { "epoch": 45.61538461538461, "grad_norm": 0.0, "learning_rate": 2.0355e-05, "loss": 1.7603, "mean_token_accuracy": 0.6519896745681762, "num_tokens": 21626524.0, "step": 5930 }, { "epoch": 45.65384615384615, "grad_norm": 0.0, "learning_rate": 2.033e-05, "loss": 1.7864, "mean_token_accuracy": 0.6421408951282501, "num_tokens": 21644165.0, "step": 5935 }, { "epoch": 45.69230769230769, "grad_norm": 0.0, "learning_rate": 2.0305000000000003e-05, "loss": 1.8047, "mean_token_accuracy": 0.6421581089496613, "num_tokens": 21661822.0, "step": 5940 }, { "epoch": 45.73076923076923, "grad_norm": 0.0, "learning_rate": 2.0280000000000002e-05, "loss": 1.6908, "mean_token_accuracy": 0.6561397314071655, "num_tokens": 21681068.0, "step": 5945 }, { "epoch": 45.76923076923077, "grad_norm": 0.0, "learning_rate": 2.0255000000000002e-05, "loss": 1.7477, "mean_token_accuracy": 0.6448839485645295, "num_tokens": 21699251.0, "step": 5950 }, { "epoch": 45.80769230769231, "grad_norm": 0.0, "learning_rate": 2.023e-05, "loss": 1.8038, "mean_token_accuracy": 0.6432386726140976, "num_tokens": 21716596.0, "step": 5955 }, { "epoch": 45.84615384615385, "grad_norm": 0.0, "learning_rate": 2.0205e-05, "loss": 1.768, "mean_token_accuracy": 0.6459046006202698, "num_tokens": 21734632.0, "step": 5960 }, { "epoch": 45.88461538461539, "grad_norm": 0.0, "learning_rate": 2.0180000000000003e-05, "loss": 1.7534, "mean_token_accuracy": 0.6509894400835037, "num_tokens": 21753024.0, "step": 5965 }, { "epoch": 45.92307692307692, "grad_norm": 0.0, "learning_rate": 2.0155000000000003e-05, "loss": 1.6896, "mean_token_accuracy": 0.663582494854927, "num_tokens": 21771809.0, "step": 5970 }, { "epoch": 45.96153846153846, "grad_norm": 0.0, "learning_rate": 2.0130000000000002e-05, "loss": 1.7161, "mean_token_accuracy": 0.6539533823728562, "num_tokens": 21790432.0, "step": 5975 }, { "epoch": 46.0, "grad_norm": 0.0, "learning_rate": 2.0105e-05, "loss": 1.7338, "mean_token_accuracy": 0.6510831654071808, "num_tokens": 21808830.0, "step": 5980 }, { "epoch": 46.03846153846154, "grad_norm": 0.0, "learning_rate": 2.008e-05, "loss": 1.7729, "mean_token_accuracy": 0.6468154609203338, "num_tokens": 21826336.0, "step": 5985 }, { "epoch": 46.07692307692308, "grad_norm": 0.0, "learning_rate": 2.0055e-05, "loss": 1.7439, "mean_token_accuracy": 0.657364284992218, "num_tokens": 21844521.0, "step": 5990 }, { "epoch": 46.11538461538461, "grad_norm": 0.0, "learning_rate": 2.0030000000000003e-05, "loss": 1.7688, "mean_token_accuracy": 0.645710214972496, "num_tokens": 21862800.0, "step": 5995 }, { "epoch": 46.15384615384615, "grad_norm": 0.0, "learning_rate": 2.0005000000000002e-05, "loss": 1.7315, "mean_token_accuracy": 0.6578894317150116, "num_tokens": 21880938.0, "step": 6000 }, { "epoch": 46.19230769230769, "grad_norm": 0.0, "learning_rate": 1.9980000000000002e-05, "loss": 1.7818, "mean_token_accuracy": 0.641998165845871, "num_tokens": 21899058.0, "step": 6005 }, { "epoch": 46.23076923076923, "grad_norm": 0.0, "learning_rate": 1.9955e-05, "loss": 1.7064, "mean_token_accuracy": 0.6609808683395386, "num_tokens": 21917113.0, "step": 6010 }, { "epoch": 46.26923076923077, "grad_norm": 0.0, "learning_rate": 1.993e-05, "loss": 1.7452, "mean_token_accuracy": 0.6411201417446136, "num_tokens": 21935754.0, "step": 6015 }, { "epoch": 46.30769230769231, "grad_norm": 0.0, "learning_rate": 1.9905e-05, "loss": 1.7566, "mean_token_accuracy": 0.6457954943180084, "num_tokens": 21954031.0, "step": 6020 }, { "epoch": 46.34615384615385, "grad_norm": 0.0, "learning_rate": 1.9880000000000003e-05, "loss": 1.7037, "mean_token_accuracy": 0.6573837339878082, "num_tokens": 21972598.0, "step": 6025 }, { "epoch": 46.38461538461539, "grad_norm": 0.0, "learning_rate": 1.9855000000000002e-05, "loss": 1.7076, "mean_token_accuracy": 0.658588969707489, "num_tokens": 21991074.0, "step": 6030 }, { "epoch": 46.42307692307692, "grad_norm": 0.0, "learning_rate": 1.983e-05, "loss": 1.7319, "mean_token_accuracy": 0.6501603156328202, "num_tokens": 22009260.0, "step": 6035 }, { "epoch": 46.46153846153846, "grad_norm": 0.0, "learning_rate": 1.9805e-05, "loss": 1.6804, "mean_token_accuracy": 0.65768404006958, "num_tokens": 22028277.0, "step": 6040 }, { "epoch": 46.5, "grad_norm": 0.0, "learning_rate": 1.978e-05, "loss": 1.7751, "mean_token_accuracy": 0.6457584232091904, "num_tokens": 22046348.0, "step": 6045 }, { "epoch": 46.53846153846154, "grad_norm": 0.0, "learning_rate": 1.9755e-05, "loss": 1.8084, "mean_token_accuracy": 0.6365677118301392, "num_tokens": 22064065.0, "step": 6050 }, { "epoch": 46.57692307692308, "grad_norm": 0.0, "learning_rate": 1.9730000000000003e-05, "loss": 1.7312, "mean_token_accuracy": 0.6494260609149933, "num_tokens": 22082601.0, "step": 6055 }, { "epoch": 46.61538461538461, "grad_norm": 0.0, "learning_rate": 1.9705000000000002e-05, "loss": 1.7834, "mean_token_accuracy": 0.6389273494482041, "num_tokens": 22100180.0, "step": 6060 }, { "epoch": 46.65384615384615, "grad_norm": 0.0, "learning_rate": 1.968e-05, "loss": 1.7366, "mean_token_accuracy": 0.6541613578796387, "num_tokens": 22118685.0, "step": 6065 }, { "epoch": 46.69230769230769, "grad_norm": 0.0, "learning_rate": 1.9655e-05, "loss": 1.7336, "mean_token_accuracy": 0.6524038523435592, "num_tokens": 22136815.0, "step": 6070 }, { "epoch": 46.73076923076923, "grad_norm": 0.0, "learning_rate": 1.963e-05, "loss": 1.7857, "mean_token_accuracy": 0.6441529363393783, "num_tokens": 22154538.0, "step": 6075 }, { "epoch": 46.76923076923077, "grad_norm": 0.0, "learning_rate": 1.9605e-05, "loss": 1.7715, "mean_token_accuracy": 0.6463933974504471, "num_tokens": 22173083.0, "step": 6080 }, { "epoch": 46.80769230769231, "grad_norm": 0.0, "learning_rate": 1.9580000000000002e-05, "loss": 1.6809, "mean_token_accuracy": 0.6644227296113968, "num_tokens": 22191785.0, "step": 6085 }, { "epoch": 46.84615384615385, "grad_norm": 0.0, "learning_rate": 1.9555e-05, "loss": 1.6764, "mean_token_accuracy": 0.6619850039482117, "num_tokens": 22210517.0, "step": 6090 }, { "epoch": 46.88461538461539, "grad_norm": 0.0, "learning_rate": 1.953e-05, "loss": 1.729, "mean_token_accuracy": 0.6496035814285278, "num_tokens": 22228596.0, "step": 6095 }, { "epoch": 46.92307692307692, "grad_norm": 0.0, "learning_rate": 1.9505e-05, "loss": 1.7949, "mean_token_accuracy": 0.6454124927520752, "num_tokens": 22245784.0, "step": 6100 }, { "epoch": 46.96153846153846, "grad_norm": 0.0, "learning_rate": 1.948e-05, "loss": 1.6937, "mean_token_accuracy": 0.6616695493459701, "num_tokens": 22263991.0, "step": 6105 }, { "epoch": 47.0, "grad_norm": 0.0, "learning_rate": 1.9455000000000003e-05, "loss": 1.7195, "mean_token_accuracy": 0.6506162971258164, "num_tokens": 22282935.0, "step": 6110 }, { "epoch": 47.03846153846154, "grad_norm": 0.0, "learning_rate": 1.9430000000000002e-05, "loss": 1.7305, "mean_token_accuracy": 0.6571007430553436, "num_tokens": 22301303.0, "step": 6115 }, { "epoch": 47.07692307692308, "grad_norm": 0.0, "learning_rate": 1.9405e-05, "loss": 1.6624, "mean_token_accuracy": 0.6648898661136627, "num_tokens": 22320863.0, "step": 6120 }, { "epoch": 47.11538461538461, "grad_norm": 0.0, "learning_rate": 1.938e-05, "loss": 1.7835, "mean_token_accuracy": 0.6378375381231308, "num_tokens": 22338722.0, "step": 6125 }, { "epoch": 47.15384615384615, "grad_norm": 0.0, "learning_rate": 1.9355e-05, "loss": 1.7585, "mean_token_accuracy": 0.6448232650756835, "num_tokens": 22357250.0, "step": 6130 }, { "epoch": 47.19230769230769, "grad_norm": 0.0, "learning_rate": 1.933e-05, "loss": 1.7201, "mean_token_accuracy": 0.6556304097175598, "num_tokens": 22375225.0, "step": 6135 }, { "epoch": 47.23076923076923, "grad_norm": 0.0, "learning_rate": 1.9305000000000002e-05, "loss": 1.6685, "mean_token_accuracy": 0.6670328795909881, "num_tokens": 22394135.0, "step": 6140 }, { "epoch": 47.26923076923077, "grad_norm": 0.0, "learning_rate": 1.9280000000000002e-05, "loss": 1.6467, "mean_token_accuracy": 0.6665792822837829, "num_tokens": 22413510.0, "step": 6145 }, { "epoch": 47.30769230769231, "grad_norm": 0.0, "learning_rate": 1.9255e-05, "loss": 1.7692, "mean_token_accuracy": 0.6390243858098984, "num_tokens": 22432228.0, "step": 6150 }, { "epoch": 47.34615384615385, "grad_norm": 0.0, "learning_rate": 1.923e-05, "loss": 1.7202, "mean_token_accuracy": 0.6546848744153977, "num_tokens": 22450356.0, "step": 6155 }, { "epoch": 47.38461538461539, "grad_norm": 0.0, "learning_rate": 1.9205e-05, "loss": 1.8106, "mean_token_accuracy": 0.6357954889535904, "num_tokens": 22467696.0, "step": 6160 }, { "epoch": 47.42307692307692, "grad_norm": 0.0, "learning_rate": 1.918e-05, "loss": 1.7551, "mean_token_accuracy": 0.6453001827001572, "num_tokens": 22486250.0, "step": 6165 }, { "epoch": 47.46153846153846, "grad_norm": 0.0, "learning_rate": 1.9155000000000002e-05, "loss": 1.7741, "mean_token_accuracy": 0.6439648687839508, "num_tokens": 22504172.0, "step": 6170 }, { "epoch": 47.5, "grad_norm": 0.0, "learning_rate": 1.913e-05, "loss": 1.7398, "mean_token_accuracy": 0.6509507864713668, "num_tokens": 22521860.0, "step": 6175 }, { "epoch": 47.53846153846154, "grad_norm": 0.0, "learning_rate": 1.9105e-05, "loss": 1.7535, "mean_token_accuracy": 0.6506534874439239, "num_tokens": 22539795.0, "step": 6180 }, { "epoch": 47.57692307692308, "grad_norm": 0.0, "learning_rate": 1.908e-05, "loss": 1.6702, "mean_token_accuracy": 0.6614595890045166, "num_tokens": 22558751.0, "step": 6185 }, { "epoch": 47.61538461538461, "grad_norm": 0.0, "learning_rate": 1.9055e-05, "loss": 1.7339, "mean_token_accuracy": 0.6529056757688523, "num_tokens": 22576706.0, "step": 6190 }, { "epoch": 47.65384615384615, "grad_norm": 0.0, "learning_rate": 1.903e-05, "loss": 1.7892, "mean_token_accuracy": 0.6409416913986206, "num_tokens": 22594485.0, "step": 6195 }, { "epoch": 47.69230769230769, "grad_norm": 0.0, "learning_rate": 1.9005000000000002e-05, "loss": 1.6885, "mean_token_accuracy": 0.6648684412240982, "num_tokens": 22613662.0, "step": 6200 }, { "epoch": 47.73076923076923, "grad_norm": 0.0, "learning_rate": 1.898e-05, "loss": 1.7837, "mean_token_accuracy": 0.6461777001619339, "num_tokens": 22631498.0, "step": 6205 }, { "epoch": 47.76923076923077, "grad_norm": 0.0, "learning_rate": 1.8955e-05, "loss": 1.8125, "mean_token_accuracy": 0.6402515649795533, "num_tokens": 22648506.0, "step": 6210 }, { "epoch": 47.80769230769231, "grad_norm": 0.0, "learning_rate": 1.893e-05, "loss": 1.7494, "mean_token_accuracy": 0.6493428587913513, "num_tokens": 22666870.0, "step": 6215 }, { "epoch": 47.84615384615385, "grad_norm": 0.0, "learning_rate": 1.8905e-05, "loss": 1.7876, "mean_token_accuracy": 0.6424293428659439, "num_tokens": 22684255.0, "step": 6220 }, { "epoch": 47.88461538461539, "grad_norm": 0.0, "learning_rate": 1.888e-05, "loss": 1.7513, "mean_token_accuracy": 0.6502956181764603, "num_tokens": 22702158.0, "step": 6225 }, { "epoch": 47.92307692307692, "grad_norm": 0.0, "learning_rate": 1.8855e-05, "loss": 1.6915, "mean_token_accuracy": 0.6557220876216888, "num_tokens": 22720730.0, "step": 6230 }, { "epoch": 47.96153846153846, "grad_norm": 0.0, "learning_rate": 1.883e-05, "loss": 1.7548, "mean_token_accuracy": 0.6476509481668472, "num_tokens": 22739451.0, "step": 6235 }, { "epoch": 48.0, "grad_norm": 0.0, "learning_rate": 1.8805e-05, "loss": 1.7672, "mean_token_accuracy": 0.6477993667125702, "num_tokens": 22757040.0, "step": 6240 }, { "epoch": 48.03846153846154, "grad_norm": 0.0, "learning_rate": 1.878e-05, "loss": 1.7979, "mean_token_accuracy": 0.6381140261888504, "num_tokens": 22775082.0, "step": 6245 }, { "epoch": 48.07692307692308, "grad_norm": 0.0, "learning_rate": 1.8755e-05, "loss": 1.7172, "mean_token_accuracy": 0.6539857119321824, "num_tokens": 22793573.0, "step": 6250 }, { "epoch": 48.11538461538461, "grad_norm": 0.0, "learning_rate": 1.8730000000000002e-05, "loss": 1.7259, "mean_token_accuracy": 0.652483606338501, "num_tokens": 22811965.0, "step": 6255 }, { "epoch": 48.15384615384615, "grad_norm": 0.0, "learning_rate": 1.8705e-05, "loss": 1.7343, "mean_token_accuracy": 0.6539576113224029, "num_tokens": 22829928.0, "step": 6260 }, { "epoch": 48.19230769230769, "grad_norm": 0.0, "learning_rate": 1.868e-05, "loss": 1.7318, "mean_token_accuracy": 0.651011449098587, "num_tokens": 22848013.0, "step": 6265 }, { "epoch": 48.23076923076923, "grad_norm": 0.0, "learning_rate": 1.8655e-05, "loss": 1.705, "mean_token_accuracy": 0.6651150584220886, "num_tokens": 22866785.0, "step": 6270 }, { "epoch": 48.26923076923077, "grad_norm": 0.0, "learning_rate": 1.863e-05, "loss": 1.7495, "mean_token_accuracy": 0.6475239574909211, "num_tokens": 22884516.0, "step": 6275 }, { "epoch": 48.30769230769231, "grad_norm": 0.0, "learning_rate": 1.8605e-05, "loss": 1.7714, "mean_token_accuracy": 0.6414877325296402, "num_tokens": 22902483.0, "step": 6280 }, { "epoch": 48.34615384615385, "grad_norm": 0.0, "learning_rate": 1.858e-05, "loss": 1.7391, "mean_token_accuracy": 0.6488156080245971, "num_tokens": 22921114.0, "step": 6285 }, { "epoch": 48.38461538461539, "grad_norm": 0.0, "learning_rate": 1.8555e-05, "loss": 1.7538, "mean_token_accuracy": 0.6502360552549362, "num_tokens": 22938888.0, "step": 6290 }, { "epoch": 48.42307692307692, "grad_norm": 0.0, "learning_rate": 1.853e-05, "loss": 1.7299, "mean_token_accuracy": 0.649975848197937, "num_tokens": 22957254.0, "step": 6295 }, { "epoch": 48.46153846153846, "grad_norm": 0.0, "learning_rate": 1.8505e-05, "loss": 1.7488, "mean_token_accuracy": 0.6526569038629532, "num_tokens": 22975073.0, "step": 6300 }, { "epoch": 48.5, "grad_norm": 0.0, "learning_rate": 1.848e-05, "loss": 1.7108, "mean_token_accuracy": 0.6544829219579696, "num_tokens": 22993939.0, "step": 6305 }, { "epoch": 48.53846153846154, "grad_norm": 0.0, "learning_rate": 1.8455e-05, "loss": 1.7625, "mean_token_accuracy": 0.643173098564148, "num_tokens": 23012560.0, "step": 6310 }, { "epoch": 48.57692307692308, "grad_norm": 0.0, "learning_rate": 1.843e-05, "loss": 1.7734, "mean_token_accuracy": 0.6458878636360168, "num_tokens": 23030129.0, "step": 6315 }, { "epoch": 48.61538461538461, "grad_norm": 0.0, "learning_rate": 1.8405e-05, "loss": 1.7598, "mean_token_accuracy": 0.6463797986507416, "num_tokens": 23047988.0, "step": 6320 }, { "epoch": 48.65384615384615, "grad_norm": 0.0, "learning_rate": 1.838e-05, "loss": 1.7542, "mean_token_accuracy": 0.6464673429727554, "num_tokens": 23065816.0, "step": 6325 }, { "epoch": 48.69230769230769, "grad_norm": 0.0, "learning_rate": 1.8355e-05, "loss": 1.742, "mean_token_accuracy": 0.6510752677917481, "num_tokens": 23083562.0, "step": 6330 }, { "epoch": 48.73076923076923, "grad_norm": 0.0, "learning_rate": 1.833e-05, "loss": 1.7406, "mean_token_accuracy": 0.6523085534572601, "num_tokens": 23101646.0, "step": 6335 }, { "epoch": 48.76923076923077, "grad_norm": 0.0, "learning_rate": 1.8305e-05, "loss": 1.7408, "mean_token_accuracy": 0.6524953216314315, "num_tokens": 23120020.0, "step": 6340 }, { "epoch": 48.80769230769231, "grad_norm": 0.0, "learning_rate": 1.828e-05, "loss": 1.6759, "mean_token_accuracy": 0.6643141090869904, "num_tokens": 23139519.0, "step": 6345 }, { "epoch": 48.84615384615385, "grad_norm": 0.0, "learning_rate": 1.8255e-05, "loss": 1.8242, "mean_token_accuracy": 0.6340841352939606, "num_tokens": 23156832.0, "step": 6350 }, { "epoch": 48.88461538461539, "grad_norm": 0.0, "learning_rate": 1.823e-05, "loss": 1.6869, "mean_token_accuracy": 0.6609957993030549, "num_tokens": 23175409.0, "step": 6355 }, { "epoch": 48.92307692307692, "grad_norm": 0.0, "learning_rate": 1.8205e-05, "loss": 1.7518, "mean_token_accuracy": 0.6469810694456101, "num_tokens": 23193876.0, "step": 6360 }, { "epoch": 48.96153846153846, "grad_norm": 0.0, "learning_rate": 1.818e-05, "loss": 1.6923, "mean_token_accuracy": 0.6596918433904648, "num_tokens": 23212532.0, "step": 6365 }, { "epoch": 49.0, "grad_norm": 0.0, "learning_rate": 1.8154999999999998e-05, "loss": 1.7207, "mean_token_accuracy": 0.6570287615060806, "num_tokens": 23231145.0, "step": 6370 }, { "epoch": 49.03846153846154, "grad_norm": 0.0, "learning_rate": 1.813e-05, "loss": 1.7494, "mean_token_accuracy": 0.6495041728019715, "num_tokens": 23249429.0, "step": 6375 }, { "epoch": 49.07692307692308, "grad_norm": 0.0, "learning_rate": 1.8105e-05, "loss": 1.6628, "mean_token_accuracy": 0.6711469799280166, "num_tokens": 23268600.0, "step": 6380 }, { "epoch": 49.11538461538461, "grad_norm": 0.0, "learning_rate": 1.808e-05, "loss": 1.7377, "mean_token_accuracy": 0.6483930766582489, "num_tokens": 23286588.0, "step": 6385 }, { "epoch": 49.15384615384615, "grad_norm": 0.0, "learning_rate": 1.8055e-05, "loss": 1.7444, "mean_token_accuracy": 0.6448217332363129, "num_tokens": 23304489.0, "step": 6390 }, { "epoch": 49.19230769230769, "grad_norm": 0.0, "learning_rate": 1.803e-05, "loss": 1.7013, "mean_token_accuracy": 0.6568325966596603, "num_tokens": 23323802.0, "step": 6395 }, { "epoch": 49.23076923076923, "grad_norm": 0.0, "learning_rate": 1.8005e-05, "loss": 1.6943, "mean_token_accuracy": 0.6581353217363357, "num_tokens": 23342349.0, "step": 6400 }, { "epoch": 49.26923076923077, "grad_norm": 0.0, "learning_rate": 1.798e-05, "loss": 1.739, "mean_token_accuracy": 0.6524305224418641, "num_tokens": 23360368.0, "step": 6405 }, { "epoch": 49.30769230769231, "grad_norm": 0.0, "learning_rate": 1.7955e-05, "loss": 1.7507, "mean_token_accuracy": 0.6451392889022827, "num_tokens": 23378793.0, "step": 6410 }, { "epoch": 49.34615384615385, "grad_norm": 0.0, "learning_rate": 1.793e-05, "loss": 1.7621, "mean_token_accuracy": 0.6497738718986511, "num_tokens": 23396781.0, "step": 6415 }, { "epoch": 49.38461538461539, "grad_norm": 0.0, "learning_rate": 1.7905e-05, "loss": 1.7673, "mean_token_accuracy": 0.6516094326972961, "num_tokens": 23413811.0, "step": 6420 }, { "epoch": 49.42307692307692, "grad_norm": 0.0, "learning_rate": 1.7879999999999998e-05, "loss": 1.7673, "mean_token_accuracy": 0.6500208914279938, "num_tokens": 23431562.0, "step": 6425 }, { "epoch": 49.46153846153846, "grad_norm": 0.0, "learning_rate": 1.7855e-05, "loss": 1.7738, "mean_token_accuracy": 0.6465479761362076, "num_tokens": 23449644.0, "step": 6430 }, { "epoch": 49.5, "grad_norm": 0.0, "learning_rate": 1.783e-05, "loss": 1.7867, "mean_token_accuracy": 0.6393932700157166, "num_tokens": 23466835.0, "step": 6435 }, { "epoch": 49.53846153846154, "grad_norm": 0.0, "learning_rate": 1.7805000000000003e-05, "loss": 1.6852, "mean_token_accuracy": 0.6625799834728241, "num_tokens": 23485932.0, "step": 6440 }, { "epoch": 49.57692307692308, "grad_norm": 0.0, "learning_rate": 1.7780000000000003e-05, "loss": 1.7437, "mean_token_accuracy": 0.6509669572114944, "num_tokens": 23504065.0, "step": 6445 }, { "epoch": 49.61538461538461, "grad_norm": 0.0, "learning_rate": 1.7755000000000002e-05, "loss": 1.705, "mean_token_accuracy": 0.6565153568983078, "num_tokens": 23522965.0, "step": 6450 }, { "epoch": 49.65384615384615, "grad_norm": 0.0, "learning_rate": 1.773e-05, "loss": 1.7125, "mean_token_accuracy": 0.6582383394241333, "num_tokens": 23541321.0, "step": 6455 }, { "epoch": 49.69230769230769, "grad_norm": 0.0, "learning_rate": 1.7705e-05, "loss": 1.7213, "mean_token_accuracy": 0.6557675808668136, "num_tokens": 23559830.0, "step": 6460 }, { "epoch": 49.73076923076923, "grad_norm": 0.0, "learning_rate": 1.7680000000000004e-05, "loss": 1.7611, "mean_token_accuracy": 0.6487846702337265, "num_tokens": 23578070.0, "step": 6465 }, { "epoch": 49.76923076923077, "grad_norm": 0.0, "learning_rate": 1.7655000000000003e-05, "loss": 1.7967, "mean_token_accuracy": 0.639831280708313, "num_tokens": 23596069.0, "step": 6470 }, { "epoch": 49.80769230769231, "grad_norm": 0.0, "learning_rate": 1.7630000000000002e-05, "loss": 1.8093, "mean_token_accuracy": 0.6422061920166016, "num_tokens": 23612951.0, "step": 6475 }, { "epoch": 49.84615384615385, "grad_norm": 0.0, "learning_rate": 1.7605000000000002e-05, "loss": 1.7392, "mean_token_accuracy": 0.6497550249099732, "num_tokens": 23632333.0, "step": 6480 }, { "epoch": 49.88461538461539, "grad_norm": 0.0, "learning_rate": 1.758e-05, "loss": 1.6955, "mean_token_accuracy": 0.6517337173223495, "num_tokens": 23651103.0, "step": 6485 }, { "epoch": 49.92307692307692, "grad_norm": 0.0, "learning_rate": 1.7555e-05, "loss": 1.7078, "mean_token_accuracy": 0.6560462713241577, "num_tokens": 23669731.0, "step": 6490 }, { "epoch": 49.96153846153846, "grad_norm": 0.0, "learning_rate": 1.7530000000000003e-05, "loss": 1.817, "mean_token_accuracy": 0.6334926426410675, "num_tokens": 23687316.0, "step": 6495 }, { "epoch": 50.0, "grad_norm": 0.0, "learning_rate": 1.7505000000000003e-05, "loss": 1.7466, "mean_token_accuracy": 0.6494122266769409, "num_tokens": 23705250.0, "step": 6500 }, { "epoch": 50.03846153846154, "grad_norm": 0.0, "learning_rate": 1.7480000000000002e-05, "loss": 1.7627, "mean_token_accuracy": 0.6493338525295258, "num_tokens": 23722587.0, "step": 6505 }, { "epoch": 50.07692307692308, "grad_norm": 0.0, "learning_rate": 1.7455e-05, "loss": 1.7611, "mean_token_accuracy": 0.6497463405132293, "num_tokens": 23740287.0, "step": 6510 }, { "epoch": 50.11538461538461, "grad_norm": 0.0, "learning_rate": 1.743e-05, "loss": 1.7443, "mean_token_accuracy": 0.6468771427869797, "num_tokens": 23758657.0, "step": 6515 }, { "epoch": 50.15384615384615, "grad_norm": 0.0, "learning_rate": 1.7405e-05, "loss": 1.7767, "mean_token_accuracy": 0.6426906883716583, "num_tokens": 23776842.0, "step": 6520 }, { "epoch": 50.19230769230769, "grad_norm": 0.0, "learning_rate": 1.7380000000000003e-05, "loss": 1.7058, "mean_token_accuracy": 0.6551610261201859, "num_tokens": 23795678.0, "step": 6525 }, { "epoch": 50.23076923076923, "grad_norm": 0.0, "learning_rate": 1.7355000000000002e-05, "loss": 1.713, "mean_token_accuracy": 0.6565469861030578, "num_tokens": 23814122.0, "step": 6530 }, { "epoch": 50.26923076923077, "grad_norm": 0.0, "learning_rate": 1.7330000000000002e-05, "loss": 1.6964, "mean_token_accuracy": 0.6581017464399338, "num_tokens": 23832733.0, "step": 6535 }, { "epoch": 50.30769230769231, "grad_norm": 0.0, "learning_rate": 1.7305e-05, "loss": 1.7557, "mean_token_accuracy": 0.6457631707191467, "num_tokens": 23851205.0, "step": 6540 }, { "epoch": 50.34615384615385, "grad_norm": 0.0, "learning_rate": 1.728e-05, "loss": 1.7517, "mean_token_accuracy": 0.6463135719299317, "num_tokens": 23869701.0, "step": 6545 }, { "epoch": 50.38461538461539, "grad_norm": 0.0, "learning_rate": 1.7255000000000003e-05, "loss": 1.7099, "mean_token_accuracy": 0.6558305144309997, "num_tokens": 23887966.0, "step": 6550 }, { "epoch": 50.42307692307692, "grad_norm": 0.0, "learning_rate": 1.7230000000000003e-05, "loss": 1.703, "mean_token_accuracy": 0.6580185920000077, "num_tokens": 23906795.0, "step": 6555 }, { "epoch": 50.46153846153846, "grad_norm": 0.0, "learning_rate": 1.7205000000000002e-05, "loss": 1.7364, "mean_token_accuracy": 0.6480530560016632, "num_tokens": 23925538.0, "step": 6560 }, { "epoch": 50.5, "grad_norm": 0.0, "learning_rate": 1.718e-05, "loss": 1.7476, "mean_token_accuracy": 0.6457035690546036, "num_tokens": 23943937.0, "step": 6565 }, { "epoch": 50.53846153846154, "grad_norm": 0.0, "learning_rate": 1.7155e-05, "loss": 1.829, "mean_token_accuracy": 0.6348341315984726, "num_tokens": 23961264.0, "step": 6570 }, { "epoch": 50.57692307692308, "grad_norm": 0.0, "learning_rate": 1.713e-05, "loss": 1.6595, "mean_token_accuracy": 0.6654309093952179, "num_tokens": 23980824.0, "step": 6575 }, { "epoch": 50.61538461538461, "grad_norm": 0.0, "learning_rate": 1.7105000000000003e-05, "loss": 1.7533, "mean_token_accuracy": 0.6445026308298111, "num_tokens": 23998733.0, "step": 6580 }, { "epoch": 50.65384615384615, "grad_norm": 0.0, "learning_rate": 1.7080000000000002e-05, "loss": 1.7202, "mean_token_accuracy": 0.6478159755468369, "num_tokens": 24017155.0, "step": 6585 }, { "epoch": 50.69230769230769, "grad_norm": 0.0, "learning_rate": 1.7055000000000002e-05, "loss": 1.7928, "mean_token_accuracy": 0.6426284611225128, "num_tokens": 24034300.0, "step": 6590 }, { "epoch": 50.73076923076923, "grad_norm": 0.0, "learning_rate": 1.703e-05, "loss": 1.7132, "mean_token_accuracy": 0.6564747720956803, "num_tokens": 24052640.0, "step": 6595 }, { "epoch": 50.76923076923077, "grad_norm": 0.0, "learning_rate": 1.7005e-05, "loss": 1.7754, "mean_token_accuracy": 0.6513616561889648, "num_tokens": 24070441.0, "step": 6600 }, { "epoch": 50.80769230769231, "grad_norm": 0.0, "learning_rate": 1.698e-05, "loss": 1.7644, "mean_token_accuracy": 0.644871911406517, "num_tokens": 24088585.0, "step": 6605 }, { "epoch": 50.84615384615385, "grad_norm": 0.0, "learning_rate": 1.6955000000000003e-05, "loss": 1.7347, "mean_token_accuracy": 0.6584911167621612, "num_tokens": 24106472.0, "step": 6610 }, { "epoch": 50.88461538461539, "grad_norm": 0.0, "learning_rate": 1.6930000000000002e-05, "loss": 1.7469, "mean_token_accuracy": 0.6565795034170151, "num_tokens": 24124773.0, "step": 6615 }, { "epoch": 50.92307692307692, "grad_norm": 0.0, "learning_rate": 1.6905e-05, "loss": 1.7342, "mean_token_accuracy": 0.6567032128572464, "num_tokens": 24142871.0, "step": 6620 }, { "epoch": 50.96153846153846, "grad_norm": 0.0, "learning_rate": 1.688e-05, "loss": 1.6891, "mean_token_accuracy": 0.6616051197052002, "num_tokens": 24161666.0, "step": 6625 }, { "epoch": 51.0, "grad_norm": 0.0, "learning_rate": 1.6855e-05, "loss": 1.7768, "mean_token_accuracy": 0.6428048938512803, "num_tokens": 24179355.0, "step": 6630 }, { "epoch": 51.03846153846154, "grad_norm": 0.0, "learning_rate": 1.683e-05, "loss": 1.6311, "mean_token_accuracy": 0.6698222011327744, "num_tokens": 24198872.0, "step": 6635 }, { "epoch": 51.07692307692308, "grad_norm": 0.0, "learning_rate": 1.6805000000000003e-05, "loss": 1.7177, "mean_token_accuracy": 0.6608267247676849, "num_tokens": 24216830.0, "step": 6640 }, { "epoch": 51.11538461538461, "grad_norm": 0.0, "learning_rate": 1.6780000000000002e-05, "loss": 1.7234, "mean_token_accuracy": 0.6543210685253144, "num_tokens": 24235082.0, "step": 6645 }, { "epoch": 51.15384615384615, "grad_norm": 0.0, "learning_rate": 1.6755e-05, "loss": 1.7839, "mean_token_accuracy": 0.64230475127697, "num_tokens": 24252383.0, "step": 6650 }, { "epoch": 51.19230769230769, "grad_norm": 0.0, "learning_rate": 1.673e-05, "loss": 1.7651, "mean_token_accuracy": 0.6459890872240066, "num_tokens": 24270356.0, "step": 6655 }, { "epoch": 51.23076923076923, "grad_norm": 0.0, "learning_rate": 1.6705e-05, "loss": 1.7523, "mean_token_accuracy": 0.6558059513568878, "num_tokens": 24288236.0, "step": 6660 }, { "epoch": 51.26923076923077, "grad_norm": 0.0, "learning_rate": 1.668e-05, "loss": 1.7298, "mean_token_accuracy": 0.6469644725322723, "num_tokens": 24306844.0, "step": 6665 }, { "epoch": 51.30769230769231, "grad_norm": 0.0, "learning_rate": 1.6655000000000002e-05, "loss": 1.7604, "mean_token_accuracy": 0.6506382346153259, "num_tokens": 24325111.0, "step": 6670 }, { "epoch": 51.34615384615385, "grad_norm": 0.0, "learning_rate": 1.6630000000000002e-05, "loss": 1.7607, "mean_token_accuracy": 0.6459864258766175, "num_tokens": 24343133.0, "step": 6675 }, { "epoch": 51.38461538461539, "grad_norm": 0.0, "learning_rate": 1.6605e-05, "loss": 1.7669, "mean_token_accuracy": 0.6461174368858338, "num_tokens": 24361431.0, "step": 6680 }, { "epoch": 51.42307692307692, "grad_norm": 0.0, "learning_rate": 1.658e-05, "loss": 1.6647, "mean_token_accuracy": 0.6634925454854965, "num_tokens": 24380890.0, "step": 6685 }, { "epoch": 51.46153846153846, "grad_norm": 0.0, "learning_rate": 1.6555e-05, "loss": 1.7547, "mean_token_accuracy": 0.650789025425911, "num_tokens": 24398752.0, "step": 6690 }, { "epoch": 51.5, "grad_norm": 0.0, "learning_rate": 1.6530000000000003e-05, "loss": 1.8203, "mean_token_accuracy": 0.6345427900552749, "num_tokens": 24416377.0, "step": 6695 }, { "epoch": 51.53846153846154, "grad_norm": 0.0, "learning_rate": 1.6505000000000002e-05, "loss": 1.7334, "mean_token_accuracy": 0.6506275236606598, "num_tokens": 24435235.0, "step": 6700 }, { "epoch": 51.57692307692308, "grad_norm": 0.0, "learning_rate": 1.648e-05, "loss": 1.7543, "mean_token_accuracy": 0.6435793161392211, "num_tokens": 24453138.0, "step": 6705 }, { "epoch": 51.61538461538461, "grad_norm": 0.0, "learning_rate": 1.6455e-05, "loss": 1.7537, "mean_token_accuracy": 0.644529914855957, "num_tokens": 24471067.0, "step": 6710 }, { "epoch": 51.65384615384615, "grad_norm": 0.0, "learning_rate": 1.643e-05, "loss": 1.6671, "mean_token_accuracy": 0.6617798268795013, "num_tokens": 24489441.0, "step": 6715 }, { "epoch": 51.69230769230769, "grad_norm": 0.0, "learning_rate": 1.6405e-05, "loss": 1.7789, "mean_token_accuracy": 0.6475713908672333, "num_tokens": 24507517.0, "step": 6720 }, { "epoch": 51.73076923076923, "grad_norm": 0.0, "learning_rate": 1.6380000000000002e-05, "loss": 1.7109, "mean_token_accuracy": 0.6559187889099121, "num_tokens": 24525635.0, "step": 6725 }, { "epoch": 51.76923076923077, "grad_norm": 0.0, "learning_rate": 1.6355000000000002e-05, "loss": 1.6891, "mean_token_accuracy": 0.6629273265600204, "num_tokens": 24544388.0, "step": 6730 }, { "epoch": 51.80769230769231, "grad_norm": 0.0, "learning_rate": 1.633e-05, "loss": 1.7177, "mean_token_accuracy": 0.653970816731453, "num_tokens": 24563413.0, "step": 6735 }, { "epoch": 51.84615384615385, "grad_norm": 0.0, "learning_rate": 1.6305e-05, "loss": 1.7994, "mean_token_accuracy": 0.6389773488044739, "num_tokens": 24581168.0, "step": 6740 }, { "epoch": 51.88461538461539, "grad_norm": 0.0, "learning_rate": 1.628e-05, "loss": 1.7425, "mean_token_accuracy": 0.6551438093185424, "num_tokens": 24599489.0, "step": 6745 }, { "epoch": 51.92307692307692, "grad_norm": 0.0, "learning_rate": 1.6255e-05, "loss": 1.735, "mean_token_accuracy": 0.6468292713165283, "num_tokens": 24617539.0, "step": 6750 }, { "epoch": 51.96153846153846, "grad_norm": 0.0, "learning_rate": 1.6230000000000002e-05, "loss": 1.7414, "mean_token_accuracy": 0.653847748041153, "num_tokens": 24635357.0, "step": 6755 }, { "epoch": 52.0, "grad_norm": 0.0, "learning_rate": 1.6205e-05, "loss": 1.7919, "mean_token_accuracy": 0.6327983260154724, "num_tokens": 24653460.0, "step": 6760 }, { "epoch": 52.03846153846154, "grad_norm": 0.0, "learning_rate": 1.618e-05, "loss": 1.6387, "mean_token_accuracy": 0.6734457373619079, "num_tokens": 24672969.0, "step": 6765 }, { "epoch": 52.07692307692308, "grad_norm": 0.0, "learning_rate": 1.6155e-05, "loss": 1.7751, "mean_token_accuracy": 0.6482141762971878, "num_tokens": 24690768.0, "step": 6770 }, { "epoch": 52.11538461538461, "grad_norm": 0.0, "learning_rate": 1.613e-05, "loss": 1.6709, "mean_token_accuracy": 0.6655151665210723, "num_tokens": 24709986.0, "step": 6775 }, { "epoch": 52.15384615384615, "grad_norm": 0.0, "learning_rate": 1.6105e-05, "loss": 1.8157, "mean_token_accuracy": 0.6377092868089675, "num_tokens": 24726980.0, "step": 6780 }, { "epoch": 52.19230769230769, "grad_norm": 0.0, "learning_rate": 1.6080000000000002e-05, "loss": 1.758, "mean_token_accuracy": 0.6477128326892853, "num_tokens": 24744869.0, "step": 6785 }, { "epoch": 52.23076923076923, "grad_norm": 0.0, "learning_rate": 1.6055e-05, "loss": 1.74, "mean_token_accuracy": 0.6518424063920975, "num_tokens": 24763369.0, "step": 6790 }, { "epoch": 52.26923076923077, "grad_norm": 0.0, "learning_rate": 1.603e-05, "loss": 1.6843, "mean_token_accuracy": 0.662993323802948, "num_tokens": 24782101.0, "step": 6795 }, { "epoch": 52.30769230769231, "grad_norm": 0.0, "learning_rate": 1.6005e-05, "loss": 1.718, "mean_token_accuracy": 0.6558027982711792, "num_tokens": 24800571.0, "step": 6800 }, { "epoch": 52.34615384615385, "grad_norm": 0.0, "learning_rate": 1.598e-05, "loss": 1.7319, "mean_token_accuracy": 0.6475233376026154, "num_tokens": 24818676.0, "step": 6805 }, { "epoch": 52.38461538461539, "grad_norm": 0.0, "learning_rate": 1.5955e-05, "loss": 1.6884, "mean_token_accuracy": 0.6593423992395401, "num_tokens": 24837669.0, "step": 6810 }, { "epoch": 52.42307692307692, "grad_norm": 0.0, "learning_rate": 1.593e-05, "loss": 1.7498, "mean_token_accuracy": 0.6527423232793808, "num_tokens": 24855694.0, "step": 6815 }, { "epoch": 52.46153846153846, "grad_norm": 0.0, "learning_rate": 1.5905e-05, "loss": 1.7311, "mean_token_accuracy": 0.6554641664028168, "num_tokens": 24874368.0, "step": 6820 }, { "epoch": 52.5, "grad_norm": 0.0, "learning_rate": 1.588e-05, "loss": 1.7152, "mean_token_accuracy": 0.6568386435508728, "num_tokens": 24893164.0, "step": 6825 }, { "epoch": 52.53846153846154, "grad_norm": 0.0, "learning_rate": 1.5855e-05, "loss": 1.7097, "mean_token_accuracy": 0.6543055295944213, "num_tokens": 24911502.0, "step": 6830 }, { "epoch": 52.57692307692308, "grad_norm": 0.0, "learning_rate": 1.583e-05, "loss": 1.8051, "mean_token_accuracy": 0.6321907073259354, "num_tokens": 24929552.0, "step": 6835 }, { "epoch": 52.61538461538461, "grad_norm": 0.0, "learning_rate": 1.5805000000000002e-05, "loss": 1.7726, "mean_token_accuracy": 0.6486641079187393, "num_tokens": 24947565.0, "step": 6840 }, { "epoch": 52.65384615384615, "grad_norm": 0.0, "learning_rate": 1.578e-05, "loss": 1.7256, "mean_token_accuracy": 0.6513917237520218, "num_tokens": 24966020.0, "step": 6845 }, { "epoch": 52.69230769230769, "grad_norm": 0.0, "learning_rate": 1.5755e-05, "loss": 1.7481, "mean_token_accuracy": 0.6502928107976913, "num_tokens": 24983952.0, "step": 6850 }, { "epoch": 52.73076923076923, "grad_norm": 0.0, "learning_rate": 1.573e-05, "loss": 1.6641, "mean_token_accuracy": 0.6641529649496078, "num_tokens": 25002526.0, "step": 6855 }, { "epoch": 52.76923076923077, "grad_norm": 0.0, "learning_rate": 1.5705e-05, "loss": 1.7879, "mean_token_accuracy": 0.6445726931095124, "num_tokens": 25020198.0, "step": 6860 }, { "epoch": 52.80769230769231, "grad_norm": 0.0, "learning_rate": 1.568e-05, "loss": 1.831, "mean_token_accuracy": 0.6385212212800979, "num_tokens": 25037190.0, "step": 6865 }, { "epoch": 52.84615384615385, "grad_norm": 0.0, "learning_rate": 1.5655000000000002e-05, "loss": 1.7463, "mean_token_accuracy": 0.6501509785652161, "num_tokens": 25055025.0, "step": 6870 }, { "epoch": 52.88461538461539, "grad_norm": 0.0, "learning_rate": 1.563e-05, "loss": 1.739, "mean_token_accuracy": 0.6465972781181335, "num_tokens": 25073502.0, "step": 6875 }, { "epoch": 52.92307692307692, "grad_norm": 0.0, "learning_rate": 1.5605e-05, "loss": 1.7514, "mean_token_accuracy": 0.6448877215385437, "num_tokens": 25092178.0, "step": 6880 }, { "epoch": 52.96153846153846, "grad_norm": 0.0, "learning_rate": 1.558e-05, "loss": 1.7896, "mean_token_accuracy": 0.6419251203536988, "num_tokens": 25109978.0, "step": 6885 }, { "epoch": 53.0, "grad_norm": 0.0, "learning_rate": 1.5555e-05, "loss": 1.8081, "mean_token_accuracy": 0.6363903671503067, "num_tokens": 25127565.0, "step": 6890 }, { "epoch": 53.03846153846154, "grad_norm": 0.0, "learning_rate": 1.553e-05, "loss": 1.7896, "mean_token_accuracy": 0.6396276503801346, "num_tokens": 25145088.0, "step": 6895 }, { "epoch": 53.07692307692308, "grad_norm": 0.0, "learning_rate": 1.5505e-05, "loss": 1.717, "mean_token_accuracy": 0.6515821665525436, "num_tokens": 25163828.0, "step": 6900 }, { "epoch": 53.11538461538461, "grad_norm": 0.0, "learning_rate": 1.548e-05, "loss": 1.7229, "mean_token_accuracy": 0.6592198878526687, "num_tokens": 25181797.0, "step": 6905 }, { "epoch": 53.15384615384615, "grad_norm": 0.0, "learning_rate": 1.5455e-05, "loss": 1.7218, "mean_token_accuracy": 0.6529275476932526, "num_tokens": 25200618.0, "step": 6910 }, { "epoch": 53.19230769230769, "grad_norm": 0.0, "learning_rate": 1.543e-05, "loss": 1.8069, "mean_token_accuracy": 0.6347218960523605, "num_tokens": 25218034.0, "step": 6915 }, { "epoch": 53.23076923076923, "grad_norm": 0.0, "learning_rate": 1.5405e-05, "loss": 1.7021, "mean_token_accuracy": 0.6613812148571014, "num_tokens": 25236653.0, "step": 6920 }, { "epoch": 53.26923076923077, "grad_norm": 0.0, "learning_rate": 1.538e-05, "loss": 1.6697, "mean_token_accuracy": 0.6670442432165146, "num_tokens": 25255752.0, "step": 6925 }, { "epoch": 53.30769230769231, "grad_norm": 0.0, "learning_rate": 1.5355e-05, "loss": 1.7525, "mean_token_accuracy": 0.6471881836652755, "num_tokens": 25273909.0, "step": 6930 }, { "epoch": 53.34615384615385, "grad_norm": 0.0, "learning_rate": 1.533e-05, "loss": 1.7417, "mean_token_accuracy": 0.6538171172142029, "num_tokens": 25291903.0, "step": 6935 }, { "epoch": 53.38461538461539, "grad_norm": 0.0, "learning_rate": 1.5305e-05, "loss": 1.7168, "mean_token_accuracy": 0.654968672990799, "num_tokens": 25310540.0, "step": 6940 }, { "epoch": 53.42307692307692, "grad_norm": 0.0, "learning_rate": 1.528e-05, "loss": 1.726, "mean_token_accuracy": 0.652405110001564, "num_tokens": 25328964.0, "step": 6945 }, { "epoch": 53.46153846153846, "grad_norm": 0.0, "learning_rate": 1.5255e-05, "loss": 1.7646, "mean_token_accuracy": 0.6477054178714752, "num_tokens": 25346432.0, "step": 6950 }, { "epoch": 53.5, "grad_norm": 0.0, "learning_rate": 1.523e-05, "loss": 1.7439, "mean_token_accuracy": 0.6488854259252548, "num_tokens": 25364382.0, "step": 6955 }, { "epoch": 53.53846153846154, "grad_norm": 0.0, "learning_rate": 1.5205e-05, "loss": 1.7786, "mean_token_accuracy": 0.6465272456407547, "num_tokens": 25382179.0, "step": 6960 }, { "epoch": 53.57692307692308, "grad_norm": 0.0, "learning_rate": 1.518e-05, "loss": 1.7934, "mean_token_accuracy": 0.6389431089162827, "num_tokens": 25400797.0, "step": 6965 }, { "epoch": 53.61538461538461, "grad_norm": 0.0, "learning_rate": 1.5155e-05, "loss": 1.71, "mean_token_accuracy": 0.6540043205022812, "num_tokens": 25418880.0, "step": 6970 }, { "epoch": 53.65384615384615, "grad_norm": 0.0, "learning_rate": 1.5129999999999999e-05, "loss": 1.6958, "mean_token_accuracy": 0.6611298769712448, "num_tokens": 25437618.0, "step": 6975 }, { "epoch": 53.69230769230769, "grad_norm": 0.0, "learning_rate": 1.5105e-05, "loss": 1.7339, "mean_token_accuracy": 0.6556006103754044, "num_tokens": 25455960.0, "step": 6980 }, { "epoch": 53.73076923076923, "grad_norm": 0.0, "learning_rate": 1.508e-05, "loss": 1.7452, "mean_token_accuracy": 0.6496385037899017, "num_tokens": 25474285.0, "step": 6985 }, { "epoch": 53.76923076923077, "grad_norm": 0.0, "learning_rate": 1.5054999999999999e-05, "loss": 1.7495, "mean_token_accuracy": 0.6455465704202652, "num_tokens": 25492111.0, "step": 6990 }, { "epoch": 53.80769230769231, "grad_norm": 0.0, "learning_rate": 1.503e-05, "loss": 1.7091, "mean_token_accuracy": 0.6587622851133347, "num_tokens": 25510592.0, "step": 6995 }, { "epoch": 53.84615384615385, "grad_norm": 0.0, "learning_rate": 1.5005e-05, "loss": 1.7738, "mean_token_accuracy": 0.646448740363121, "num_tokens": 25528517.0, "step": 7000 }, { "epoch": 53.88461538461539, "grad_norm": 0.0, "learning_rate": 1.4979999999999999e-05, "loss": 1.8031, "mean_token_accuracy": 0.6375834941864014, "num_tokens": 25546409.0, "step": 7005 }, { "epoch": 53.92307692307692, "grad_norm": 0.0, "learning_rate": 1.4955e-05, "loss": 1.7659, "mean_token_accuracy": 0.6465475499629975, "num_tokens": 25564683.0, "step": 7010 }, { "epoch": 53.96153846153846, "grad_norm": 0.0, "learning_rate": 1.493e-05, "loss": 1.7459, "mean_token_accuracy": 0.6489285141229629, "num_tokens": 25582903.0, "step": 7015 }, { "epoch": 54.0, "grad_norm": 0.0, "learning_rate": 1.4904999999999999e-05, "loss": 1.6906, "mean_token_accuracy": 0.6570352196693421, "num_tokens": 25601670.0, "step": 7020 }, { "epoch": 54.03846153846154, "grad_norm": 0.0, "learning_rate": 1.488e-05, "loss": 1.7228, "mean_token_accuracy": 0.6555858552455902, "num_tokens": 25620461.0, "step": 7025 }, { "epoch": 54.07692307692308, "grad_norm": 0.0, "learning_rate": 1.4855e-05, "loss": 1.817, "mean_token_accuracy": 0.6386265754699707, "num_tokens": 25637942.0, "step": 7030 }, { "epoch": 54.11538461538461, "grad_norm": 0.0, "learning_rate": 1.4829999999999999e-05, "loss": 1.6514, "mean_token_accuracy": 0.6691981256008148, "num_tokens": 25656864.0, "step": 7035 }, { "epoch": 54.15384615384615, "grad_norm": 0.0, "learning_rate": 1.4805e-05, "loss": 1.7832, "mean_token_accuracy": 0.6411665648221969, "num_tokens": 25675005.0, "step": 7040 }, { "epoch": 54.19230769230769, "grad_norm": 0.0, "learning_rate": 1.4779999999999999e-05, "loss": 1.6956, "mean_token_accuracy": 0.6606644123792649, "num_tokens": 25694037.0, "step": 7045 }, { "epoch": 54.23076923076923, "grad_norm": 0.0, "learning_rate": 1.4755e-05, "loss": 1.7537, "mean_token_accuracy": 0.6480139315128326, "num_tokens": 25712064.0, "step": 7050 }, { "epoch": 54.26923076923077, "grad_norm": 0.0, "learning_rate": 1.473e-05, "loss": 1.701, "mean_token_accuracy": 0.6573342353105545, "num_tokens": 25731391.0, "step": 7055 }, { "epoch": 54.30769230769231, "grad_norm": 0.0, "learning_rate": 1.4704999999999999e-05, "loss": 1.7046, "mean_token_accuracy": 0.6552079916000366, "num_tokens": 25750436.0, "step": 7060 }, { "epoch": 54.34615384615385, "grad_norm": 0.0, "learning_rate": 1.4680000000000002e-05, "loss": 1.7022, "mean_token_accuracy": 0.6563819646835327, "num_tokens": 25768998.0, "step": 7065 }, { "epoch": 54.38461538461539, "grad_norm": 0.0, "learning_rate": 1.4655000000000003e-05, "loss": 1.7062, "mean_token_accuracy": 0.6605098664760589, "num_tokens": 25787124.0, "step": 7070 }, { "epoch": 54.42307692307692, "grad_norm": 0.0, "learning_rate": 1.4630000000000002e-05, "loss": 1.7444, "mean_token_accuracy": 0.6469592988491059, "num_tokens": 25805583.0, "step": 7075 }, { "epoch": 54.46153846153846, "grad_norm": 0.0, "learning_rate": 1.4605000000000002e-05, "loss": 1.7117, "mean_token_accuracy": 0.6542593479156494, "num_tokens": 25824061.0, "step": 7080 }, { "epoch": 54.5, "grad_norm": 0.0, "learning_rate": 1.4580000000000003e-05, "loss": 1.7751, "mean_token_accuracy": 0.6425804078578949, "num_tokens": 25842073.0, "step": 7085 }, { "epoch": 54.53846153846154, "grad_norm": 0.0, "learning_rate": 1.4555000000000002e-05, "loss": 1.7454, "mean_token_accuracy": 0.6489378064870834, "num_tokens": 25860639.0, "step": 7090 }, { "epoch": 54.57692307692308, "grad_norm": 0.0, "learning_rate": 1.4530000000000001e-05, "loss": 1.721, "mean_token_accuracy": 0.6531164467334747, "num_tokens": 25878985.0, "step": 7095 }, { "epoch": 54.61538461538461, "grad_norm": 0.0, "learning_rate": 1.4505000000000003e-05, "loss": 1.7618, "mean_token_accuracy": 0.6497918337583541, "num_tokens": 25896813.0, "step": 7100 }, { "epoch": 54.65384615384615, "grad_norm": 0.0, "learning_rate": 1.4480000000000002e-05, "loss": 1.85, "mean_token_accuracy": 0.6287777543067932, "num_tokens": 25913707.0, "step": 7105 }, { "epoch": 54.69230769230769, "grad_norm": 0.0, "learning_rate": 1.4455000000000001e-05, "loss": 1.739, "mean_token_accuracy": 0.6525210946798324, "num_tokens": 25931516.0, "step": 7110 }, { "epoch": 54.73076923076923, "grad_norm": 0.0, "learning_rate": 1.4430000000000002e-05, "loss": 1.7648, "mean_token_accuracy": 0.6474990725517273, "num_tokens": 25949015.0, "step": 7115 }, { "epoch": 54.76923076923077, "grad_norm": 0.0, "learning_rate": 1.4405000000000002e-05, "loss": 1.7407, "mean_token_accuracy": 0.6535341709852218, "num_tokens": 25967346.0, "step": 7120 }, { "epoch": 54.80769230769231, "grad_norm": 0.0, "learning_rate": 1.4380000000000001e-05, "loss": 1.7919, "mean_token_accuracy": 0.6459776431322097, "num_tokens": 25984246.0, "step": 7125 }, { "epoch": 54.84615384615385, "grad_norm": 0.0, "learning_rate": 1.4355000000000002e-05, "loss": 1.6747, "mean_token_accuracy": 0.6653303891420365, "num_tokens": 26003170.0, "step": 7130 }, { "epoch": 54.88461538461539, "grad_norm": 0.0, "learning_rate": 1.4330000000000002e-05, "loss": 1.7999, "mean_token_accuracy": 0.6397199392318725, "num_tokens": 26020817.0, "step": 7135 }, { "epoch": 54.92307692307692, "grad_norm": 0.0, "learning_rate": 1.4305000000000001e-05, "loss": 1.7184, "mean_token_accuracy": 0.6556466907262802, "num_tokens": 26039316.0, "step": 7140 }, { "epoch": 54.96153846153846, "grad_norm": 0.0, "learning_rate": 1.4280000000000002e-05, "loss": 1.7178, "mean_token_accuracy": 0.6527464896440506, "num_tokens": 26058030.0, "step": 7145 }, { "epoch": 55.0, "grad_norm": 0.0, "learning_rate": 1.4255000000000002e-05, "loss": 1.7908, "mean_token_accuracy": 0.6422353565692902, "num_tokens": 26075775.0, "step": 7150 }, { "epoch": 55.03846153846154, "grad_norm": 0.0, "learning_rate": 1.4230000000000001e-05, "loss": 1.7951, "mean_token_accuracy": 0.6395128637552261, "num_tokens": 26093593.0, "step": 7155 }, { "epoch": 55.07692307692308, "grad_norm": 0.0, "learning_rate": 1.4205000000000002e-05, "loss": 1.7587, "mean_token_accuracy": 0.6477168142795563, "num_tokens": 26111565.0, "step": 7160 }, { "epoch": 55.11538461538461, "grad_norm": 0.0, "learning_rate": 1.4180000000000001e-05, "loss": 1.6735, "mean_token_accuracy": 0.660878399014473, "num_tokens": 26130288.0, "step": 7165 }, { "epoch": 55.15384615384615, "grad_norm": 0.0, "learning_rate": 1.4155000000000001e-05, "loss": 1.7429, "mean_token_accuracy": 0.644039872288704, "num_tokens": 26148751.0, "step": 7170 }, { "epoch": 55.19230769230769, "grad_norm": 0.0, "learning_rate": 1.4130000000000002e-05, "loss": 1.743, "mean_token_accuracy": 0.6448181957006455, "num_tokens": 26167250.0, "step": 7175 }, { "epoch": 55.23076923076923, "grad_norm": 0.0, "learning_rate": 1.4105000000000001e-05, "loss": 1.7366, "mean_token_accuracy": 0.6529501020908356, "num_tokens": 26185026.0, "step": 7180 }, { "epoch": 55.26923076923077, "grad_norm": 0.0, "learning_rate": 1.408e-05, "loss": 1.687, "mean_token_accuracy": 0.6574739754199982, "num_tokens": 26204005.0, "step": 7185 }, { "epoch": 55.30769230769231, "grad_norm": 0.0, "learning_rate": 1.4055000000000002e-05, "loss": 1.8161, "mean_token_accuracy": 0.6342040151357651, "num_tokens": 26221347.0, "step": 7190 }, { "epoch": 55.34615384615385, "grad_norm": 0.0, "learning_rate": 1.4030000000000001e-05, "loss": 1.7202, "mean_token_accuracy": 0.6536721408367157, "num_tokens": 26239690.0, "step": 7195 }, { "epoch": 55.38461538461539, "grad_norm": 0.0, "learning_rate": 1.4005000000000002e-05, "loss": 1.7623, "mean_token_accuracy": 0.6440413445234299, "num_tokens": 26257855.0, "step": 7200 }, { "epoch": 55.42307692307692, "grad_norm": 0.0, "learning_rate": 1.3980000000000002e-05, "loss": 1.7581, "mean_token_accuracy": 0.651086950302124, "num_tokens": 26275173.0, "step": 7205 }, { "epoch": 55.46153846153846, "grad_norm": 0.0, "learning_rate": 1.3955000000000001e-05, "loss": 1.7523, "mean_token_accuracy": 0.6518444687128067, "num_tokens": 26293084.0, "step": 7210 }, { "epoch": 55.5, "grad_norm": 0.0, "learning_rate": 1.3930000000000002e-05, "loss": 1.7744, "mean_token_accuracy": 0.6451671838760376, "num_tokens": 26310728.0, "step": 7215 }, { "epoch": 55.53846153846154, "grad_norm": 0.0, "learning_rate": 1.3905000000000002e-05, "loss": 1.7131, "mean_token_accuracy": 0.6529065489768981, "num_tokens": 26329504.0, "step": 7220 }, { "epoch": 55.57692307692308, "grad_norm": 0.0, "learning_rate": 1.3880000000000001e-05, "loss": 1.6938, "mean_token_accuracy": 0.6577755719423294, "num_tokens": 26348129.0, "step": 7225 }, { "epoch": 55.61538461538461, "grad_norm": 0.0, "learning_rate": 1.3855000000000002e-05, "loss": 1.7149, "mean_token_accuracy": 0.6628178864717483, "num_tokens": 26366596.0, "step": 7230 }, { "epoch": 55.65384615384615, "grad_norm": 0.0, "learning_rate": 1.3830000000000001e-05, "loss": 1.7266, "mean_token_accuracy": 0.6519777953624726, "num_tokens": 26384852.0, "step": 7235 }, { "epoch": 55.69230769230769, "grad_norm": 0.0, "learning_rate": 1.3805e-05, "loss": 1.7232, "mean_token_accuracy": 0.6492596924304962, "num_tokens": 26403619.0, "step": 7240 }, { "epoch": 55.73076923076923, "grad_norm": 0.0, "learning_rate": 1.3780000000000002e-05, "loss": 1.6343, "mean_token_accuracy": 0.6683441162109375, "num_tokens": 26423343.0, "step": 7245 }, { "epoch": 55.76923076923077, "grad_norm": 0.0, "learning_rate": 1.3755000000000001e-05, "loss": 1.8149, "mean_token_accuracy": 0.6388367056846619, "num_tokens": 26440984.0, "step": 7250 }, { "epoch": 55.80769230769231, "grad_norm": 0.0, "learning_rate": 1.373e-05, "loss": 1.7501, "mean_token_accuracy": 0.6547516256570816, "num_tokens": 26459173.0, "step": 7255 }, { "epoch": 55.84615384615385, "grad_norm": 0.0, "learning_rate": 1.3705000000000002e-05, "loss": 1.7926, "mean_token_accuracy": 0.6436378836631775, "num_tokens": 26477032.0, "step": 7260 }, { "epoch": 55.88461538461539, "grad_norm": 0.0, "learning_rate": 1.3680000000000001e-05, "loss": 1.7543, "mean_token_accuracy": 0.6499706447124481, "num_tokens": 26495861.0, "step": 7265 }, { "epoch": 55.92307692307692, "grad_norm": 0.0, "learning_rate": 1.3655e-05, "loss": 1.714, "mean_token_accuracy": 0.6544960319995881, "num_tokens": 26514486.0, "step": 7270 }, { "epoch": 55.96153846153846, "grad_norm": 0.0, "learning_rate": 1.3630000000000002e-05, "loss": 1.7599, "mean_token_accuracy": 0.6501589000225068, "num_tokens": 26532126.0, "step": 7275 }, { "epoch": 56.0, "grad_norm": 0.0, "learning_rate": 1.3605000000000001e-05, "loss": 1.7521, "mean_token_accuracy": 0.6498360931873322, "num_tokens": 26549880.0, "step": 7280 }, { "epoch": 56.03846153846154, "grad_norm": 0.0, "learning_rate": 1.358e-05, "loss": 1.7991, "mean_token_accuracy": 0.6392385989427567, "num_tokens": 26567257.0, "step": 7285 }, { "epoch": 56.07692307692308, "grad_norm": 0.0, "learning_rate": 1.3555000000000002e-05, "loss": 1.7625, "mean_token_accuracy": 0.6449432462453842, "num_tokens": 26585384.0, "step": 7290 }, { "epoch": 56.11538461538461, "grad_norm": 0.0, "learning_rate": 1.3530000000000001e-05, "loss": 1.7256, "mean_token_accuracy": 0.6524859637022018, "num_tokens": 26604011.0, "step": 7295 }, { "epoch": 56.15384615384615, "grad_norm": 0.0, "learning_rate": 1.3505e-05, "loss": 1.6922, "mean_token_accuracy": 0.6660002678632736, "num_tokens": 26622949.0, "step": 7300 }, { "epoch": 56.19230769230769, "grad_norm": 0.0, "learning_rate": 1.3480000000000001e-05, "loss": 1.7609, "mean_token_accuracy": 0.6495236337184906, "num_tokens": 26640892.0, "step": 7305 }, { "epoch": 56.23076923076923, "grad_norm": 0.0, "learning_rate": 1.3455e-05, "loss": 1.7394, "mean_token_accuracy": 0.6511315196752548, "num_tokens": 26659096.0, "step": 7310 }, { "epoch": 56.26923076923077, "grad_norm": 0.0, "learning_rate": 1.343e-05, "loss": 1.7669, "mean_token_accuracy": 0.6419103145599365, "num_tokens": 26677122.0, "step": 7315 }, { "epoch": 56.30769230769231, "grad_norm": 0.0, "learning_rate": 1.3405000000000001e-05, "loss": 1.7607, "mean_token_accuracy": 0.6484110444784165, "num_tokens": 26694580.0, "step": 7320 }, { "epoch": 56.34615384615385, "grad_norm": 0.0, "learning_rate": 1.338e-05, "loss": 1.7041, "mean_token_accuracy": 0.6528016269207001, "num_tokens": 26713265.0, "step": 7325 }, { "epoch": 56.38461538461539, "grad_norm": 0.0, "learning_rate": 1.3355e-05, "loss": 1.7255, "mean_token_accuracy": 0.6498448342084885, "num_tokens": 26731833.0, "step": 7330 }, { "epoch": 56.42307692307692, "grad_norm": 0.0, "learning_rate": 1.3330000000000001e-05, "loss": 1.7544, "mean_token_accuracy": 0.6504902809858322, "num_tokens": 26749542.0, "step": 7335 }, { "epoch": 56.46153846153846, "grad_norm": 0.0, "learning_rate": 1.3305e-05, "loss": 1.7841, "mean_token_accuracy": 0.6441579282283783, "num_tokens": 26767123.0, "step": 7340 }, { "epoch": 56.5, "grad_norm": 0.0, "learning_rate": 1.3280000000000002e-05, "loss": 1.7293, "mean_token_accuracy": 0.6540816992521286, "num_tokens": 26785021.0, "step": 7345 }, { "epoch": 56.53846153846154, "grad_norm": 0.0, "learning_rate": 1.3255000000000001e-05, "loss": 1.7353, "mean_token_accuracy": 0.6520332425832749, "num_tokens": 26803690.0, "step": 7350 }, { "epoch": 56.57692307692308, "grad_norm": 0.0, "learning_rate": 1.323e-05, "loss": 1.7765, "mean_token_accuracy": 0.6428980946540832, "num_tokens": 26821866.0, "step": 7355 }, { "epoch": 56.61538461538461, "grad_norm": 0.0, "learning_rate": 1.3205000000000001e-05, "loss": 1.7009, "mean_token_accuracy": 0.6572036474943161, "num_tokens": 26840558.0, "step": 7360 }, { "epoch": 56.65384615384615, "grad_norm": 0.0, "learning_rate": 1.3180000000000001e-05, "loss": 1.7503, "mean_token_accuracy": 0.6493770986795425, "num_tokens": 26859028.0, "step": 7365 }, { "epoch": 56.69230769230769, "grad_norm": 0.0, "learning_rate": 1.3155e-05, "loss": 1.7131, "mean_token_accuracy": 0.6502207726240158, "num_tokens": 26877909.0, "step": 7370 }, { "epoch": 56.73076923076923, "grad_norm": 0.0, "learning_rate": 1.3130000000000001e-05, "loss": 1.7571, "mean_token_accuracy": 0.6498846352100373, "num_tokens": 26895371.0, "step": 7375 }, { "epoch": 56.76923076923077, "grad_norm": 0.0, "learning_rate": 1.3105e-05, "loss": 1.7048, "mean_token_accuracy": 0.6559781163930893, "num_tokens": 26914341.0, "step": 7380 }, { "epoch": 56.80769230769231, "grad_norm": 0.0, "learning_rate": 1.308e-05, "loss": 1.7642, "mean_token_accuracy": 0.6446814894676208, "num_tokens": 26932597.0, "step": 7385 }, { "epoch": 56.84615384615385, "grad_norm": 0.0, "learning_rate": 1.3055000000000001e-05, "loss": 1.7919, "mean_token_accuracy": 0.6469128876924515, "num_tokens": 26950520.0, "step": 7390 }, { "epoch": 56.88461538461539, "grad_norm": 0.0, "learning_rate": 1.303e-05, "loss": 1.724, "mean_token_accuracy": 0.6555041134357452, "num_tokens": 26969262.0, "step": 7395 }, { "epoch": 56.92307692307692, "grad_norm": 0.0, "learning_rate": 1.3005e-05, "loss": 1.7015, "mean_token_accuracy": 0.661999249458313, "num_tokens": 26987507.0, "step": 7400 }, { "epoch": 56.96153846153846, "grad_norm": 0.0, "learning_rate": 1.2980000000000001e-05, "loss": 1.6936, "mean_token_accuracy": 0.6539974421262741, "num_tokens": 27005866.0, "step": 7405 }, { "epoch": 57.0, "grad_norm": 0.0, "learning_rate": 1.2955e-05, "loss": 1.7419, "mean_token_accuracy": 0.6442842125892639, "num_tokens": 27023985.0, "step": 7410 }, { "epoch": 57.03846153846154, "grad_norm": 0.0, "learning_rate": 1.293e-05, "loss": 1.7412, "mean_token_accuracy": 0.6453804969787598, "num_tokens": 27042588.0, "step": 7415 }, { "epoch": 57.07692307692308, "grad_norm": 0.0, "learning_rate": 1.2905000000000001e-05, "loss": 1.7348, "mean_token_accuracy": 0.6512410253286361, "num_tokens": 27060805.0, "step": 7420 }, { "epoch": 57.11538461538461, "grad_norm": 0.0, "learning_rate": 1.288e-05, "loss": 1.7818, "mean_token_accuracy": 0.6462918251752854, "num_tokens": 27078281.0, "step": 7425 }, { "epoch": 57.15384615384615, "grad_norm": 0.0, "learning_rate": 1.2855e-05, "loss": 1.7089, "mean_token_accuracy": 0.6547515660524368, "num_tokens": 27097554.0, "step": 7430 }, { "epoch": 57.19230769230769, "grad_norm": 0.0, "learning_rate": 1.283e-05, "loss": 1.7108, "mean_token_accuracy": 0.6577676206827163, "num_tokens": 27116052.0, "step": 7435 }, { "epoch": 57.23076923076923, "grad_norm": 0.0, "learning_rate": 1.2805e-05, "loss": 1.7134, "mean_token_accuracy": 0.6552876085042953, "num_tokens": 27134390.0, "step": 7440 }, { "epoch": 57.26923076923077, "grad_norm": 0.0, "learning_rate": 1.278e-05, "loss": 1.7117, "mean_token_accuracy": 0.6523223549127579, "num_tokens": 27153262.0, "step": 7445 }, { "epoch": 57.30769230769231, "grad_norm": 0.0, "learning_rate": 1.2755e-05, "loss": 1.7682, "mean_token_accuracy": 0.645848673582077, "num_tokens": 27171115.0, "step": 7450 }, { "epoch": 57.34615384615385, "grad_norm": 0.0, "learning_rate": 1.273e-05, "loss": 1.7698, "mean_token_accuracy": 0.6474648177623749, "num_tokens": 27188950.0, "step": 7455 }, { "epoch": 57.38461538461539, "grad_norm": 0.0, "learning_rate": 1.2705e-05, "loss": 1.7511, "mean_token_accuracy": 0.6489328056573868, "num_tokens": 27207207.0, "step": 7460 }, { "epoch": 57.42307692307692, "grad_norm": 0.0, "learning_rate": 1.268e-05, "loss": 1.7531, "mean_token_accuracy": 0.6518824428319931, "num_tokens": 27225139.0, "step": 7465 }, { "epoch": 57.46153846153846, "grad_norm": 0.0, "learning_rate": 1.2655e-05, "loss": 1.7544, "mean_token_accuracy": 0.6468398660421372, "num_tokens": 27243113.0, "step": 7470 }, { "epoch": 57.5, "grad_norm": 0.0, "learning_rate": 1.263e-05, "loss": 1.7591, "mean_token_accuracy": 0.6493138283491134, "num_tokens": 27261095.0, "step": 7475 }, { "epoch": 57.53846153846154, "grad_norm": 0.0, "learning_rate": 1.2605e-05, "loss": 1.7276, "mean_token_accuracy": 0.6510590463876724, "num_tokens": 27279536.0, "step": 7480 }, { "epoch": 57.57692307692308, "grad_norm": 0.0, "learning_rate": 1.258e-05, "loss": 1.7754, "mean_token_accuracy": 0.6381223618984222, "num_tokens": 27297805.0, "step": 7485 }, { "epoch": 57.61538461538461, "grad_norm": 0.0, "learning_rate": 1.2555000000000001e-05, "loss": 1.7051, "mean_token_accuracy": 0.6624167144298554, "num_tokens": 27316540.0, "step": 7490 }, { "epoch": 57.65384615384615, "grad_norm": 0.0, "learning_rate": 1.253e-05, "loss": 1.6621, "mean_token_accuracy": 0.6645474463701249, "num_tokens": 27336007.0, "step": 7495 }, { "epoch": 57.69230769230769, "grad_norm": 0.0, "learning_rate": 1.2505e-05, "loss": 1.7167, "mean_token_accuracy": 0.6529757559299469, "num_tokens": 27354439.0, "step": 7500 }, { "epoch": 57.73076923076923, "grad_norm": 0.0, "learning_rate": 1.248e-05, "loss": 1.7176, "mean_token_accuracy": 0.6583882302045823, "num_tokens": 27372701.0, "step": 7505 }, { "epoch": 57.76923076923077, "grad_norm": 0.0, "learning_rate": 1.2455e-05, "loss": 1.7395, "mean_token_accuracy": 0.6499318182468414, "num_tokens": 27390861.0, "step": 7510 }, { "epoch": 57.80769230769231, "grad_norm": 0.0, "learning_rate": 1.243e-05, "loss": 1.7021, "mean_token_accuracy": 0.654238548874855, "num_tokens": 27409107.0, "step": 7515 }, { "epoch": 57.84615384615385, "grad_norm": 0.0, "learning_rate": 1.2405e-05, "loss": 1.758, "mean_token_accuracy": 0.6471449792385101, "num_tokens": 27427046.0, "step": 7520 }, { "epoch": 57.88461538461539, "grad_norm": 0.0, "learning_rate": 1.238e-05, "loss": 1.7179, "mean_token_accuracy": 0.6572539776563644, "num_tokens": 27445469.0, "step": 7525 }, { "epoch": 57.92307692307692, "grad_norm": 0.0, "learning_rate": 1.2355e-05, "loss": 1.7317, "mean_token_accuracy": 0.653878676891327, "num_tokens": 27463877.0, "step": 7530 }, { "epoch": 57.96153846153846, "grad_norm": 0.0, "learning_rate": 1.233e-05, "loss": 1.836, "mean_token_accuracy": 0.6318351715803147, "num_tokens": 27480728.0, "step": 7535 }, { "epoch": 58.0, "grad_norm": 0.0, "learning_rate": 1.2305000000000002e-05, "loss": 1.8048, "mean_token_accuracy": 0.6375920951366425, "num_tokens": 27498090.0, "step": 7540 }, { "epoch": 58.03846153846154, "grad_norm": 0.0, "learning_rate": 1.2280000000000001e-05, "loss": 1.7156, "mean_token_accuracy": 0.6615946650505066, "num_tokens": 27516714.0, "step": 7545 }, { "epoch": 58.07692307692308, "grad_norm": 0.0, "learning_rate": 1.2255e-05, "loss": 1.8136, "mean_token_accuracy": 0.6379787772893906, "num_tokens": 27534566.0, "step": 7550 }, { "epoch": 58.11538461538461, "grad_norm": 0.0, "learning_rate": 1.2230000000000001e-05, "loss": 1.7963, "mean_token_accuracy": 0.6373230516910553, "num_tokens": 27552046.0, "step": 7555 }, { "epoch": 58.15384615384615, "grad_norm": 0.0, "learning_rate": 1.2205000000000001e-05, "loss": 1.7564, "mean_token_accuracy": 0.6494860410690307, "num_tokens": 27569697.0, "step": 7560 }, { "epoch": 58.19230769230769, "grad_norm": 0.0, "learning_rate": 1.2180000000000002e-05, "loss": 1.7436, "mean_token_accuracy": 0.6535836279392242, "num_tokens": 27588049.0, "step": 7565 }, { "epoch": 58.23076923076923, "grad_norm": 0.0, "learning_rate": 1.2155000000000001e-05, "loss": 1.7814, "mean_token_accuracy": 0.6435631811618805, "num_tokens": 27605236.0, "step": 7570 }, { "epoch": 58.26923076923077, "grad_norm": 0.0, "learning_rate": 1.213e-05, "loss": 1.7401, "mean_token_accuracy": 0.6488517135381698, "num_tokens": 27623605.0, "step": 7575 }, { "epoch": 58.30769230769231, "grad_norm": 0.0, "learning_rate": 1.2105000000000002e-05, "loss": 1.7388, "mean_token_accuracy": 0.6520835846662522, "num_tokens": 27641906.0, "step": 7580 }, { "epoch": 58.34615384615385, "grad_norm": 0.0, "learning_rate": 1.2080000000000001e-05, "loss": 1.7482, "mean_token_accuracy": 0.6481060594320297, "num_tokens": 27660198.0, "step": 7585 }, { "epoch": 58.38461538461539, "grad_norm": 0.0, "learning_rate": 1.2055e-05, "loss": 1.8091, "mean_token_accuracy": 0.6401431083679199, "num_tokens": 27677904.0, "step": 7590 }, { "epoch": 58.42307692307692, "grad_norm": 0.0, "learning_rate": 1.2030000000000002e-05, "loss": 1.7091, "mean_token_accuracy": 0.6640697896480561, "num_tokens": 27695801.0, "step": 7595 }, { "epoch": 58.46153846153846, "grad_norm": 0.0, "learning_rate": 1.2005000000000001e-05, "loss": 1.7289, "mean_token_accuracy": 0.6482616752386093, "num_tokens": 27713869.0, "step": 7600 }, { "epoch": 58.5, "grad_norm": 0.0, "learning_rate": 1.198e-05, "loss": 1.7355, "mean_token_accuracy": 0.6519585400819778, "num_tokens": 27731955.0, "step": 7605 }, { "epoch": 58.53846153846154, "grad_norm": 0.0, "learning_rate": 1.1955000000000002e-05, "loss": 1.7163, "mean_token_accuracy": 0.6518133997917175, "num_tokens": 27750713.0, "step": 7610 }, { "epoch": 58.57692307692308, "grad_norm": 0.0, "learning_rate": 1.1930000000000001e-05, "loss": 1.7286, "mean_token_accuracy": 0.6503622680902481, "num_tokens": 27769167.0, "step": 7615 }, { "epoch": 58.61538461538461, "grad_norm": 0.0, "learning_rate": 1.1905e-05, "loss": 1.6179, "mean_token_accuracy": 0.676550367474556, "num_tokens": 27788253.0, "step": 7620 }, { "epoch": 58.65384615384615, "grad_norm": 0.0, "learning_rate": 1.1880000000000001e-05, "loss": 1.7219, "mean_token_accuracy": 0.6493042975664138, "num_tokens": 27807507.0, "step": 7625 }, { "epoch": 58.69230769230769, "grad_norm": 0.0, "learning_rate": 1.1855e-05, "loss": 1.7596, "mean_token_accuracy": 0.6421011120080948, "num_tokens": 27825718.0, "step": 7630 }, { "epoch": 58.73076923076923, "grad_norm": 0.0, "learning_rate": 1.183e-05, "loss": 1.7286, "mean_token_accuracy": 0.6499491780996323, "num_tokens": 27843945.0, "step": 7635 }, { "epoch": 58.76923076923077, "grad_norm": 0.0, "learning_rate": 1.1805000000000001e-05, "loss": 1.7409, "mean_token_accuracy": 0.6533534795045852, "num_tokens": 27862495.0, "step": 7640 }, { "epoch": 58.80769230769231, "grad_norm": 0.0, "learning_rate": 1.178e-05, "loss": 1.767, "mean_token_accuracy": 0.6454948484897614, "num_tokens": 27880609.0, "step": 7645 }, { "epoch": 58.84615384615385, "grad_norm": 0.0, "learning_rate": 1.1755e-05, "loss": 1.7275, "mean_token_accuracy": 0.6586075276136398, "num_tokens": 27899029.0, "step": 7650 }, { "epoch": 58.88461538461539, "grad_norm": 0.0, "learning_rate": 1.1730000000000001e-05, "loss": 1.7258, "mean_token_accuracy": 0.6555096328258514, "num_tokens": 27917125.0, "step": 7655 }, { "epoch": 58.92307692307692, "grad_norm": 0.0, "learning_rate": 1.1705e-05, "loss": 1.6887, "mean_token_accuracy": 0.6544747292995453, "num_tokens": 27936195.0, "step": 7660 }, { "epoch": 58.96153846153846, "grad_norm": 0.0, "learning_rate": 1.168e-05, "loss": 1.8034, "mean_token_accuracy": 0.6353619664907455, "num_tokens": 27953877.0, "step": 7665 }, { "epoch": 59.0, "grad_norm": 0.0, "learning_rate": 1.1655000000000001e-05, "loss": 1.7287, "mean_token_accuracy": 0.6526497393846512, "num_tokens": 27972195.0, "step": 7670 }, { "epoch": 59.03846153846154, "grad_norm": 0.0, "learning_rate": 1.163e-05, "loss": 1.7736, "mean_token_accuracy": 0.6459016293287277, "num_tokens": 27989703.0, "step": 7675 }, { "epoch": 59.07692307692308, "grad_norm": 0.0, "learning_rate": 1.1605e-05, "loss": 1.7243, "mean_token_accuracy": 0.6570581525564194, "num_tokens": 28007876.0, "step": 7680 }, { "epoch": 59.11538461538461, "grad_norm": 0.0, "learning_rate": 1.1580000000000001e-05, "loss": 1.6764, "mean_token_accuracy": 0.6651537865400314, "num_tokens": 28026168.0, "step": 7685 }, { "epoch": 59.15384615384615, "grad_norm": 0.0, "learning_rate": 1.1555e-05, "loss": 1.7593, "mean_token_accuracy": 0.6441430777311326, "num_tokens": 28044444.0, "step": 7690 }, { "epoch": 59.19230769230769, "grad_norm": 0.0, "learning_rate": 1.153e-05, "loss": 1.8019, "mean_token_accuracy": 0.637511157989502, "num_tokens": 28062363.0, "step": 7695 }, { "epoch": 59.23076923076923, "grad_norm": 0.0, "learning_rate": 1.1505e-05, "loss": 1.6635, "mean_token_accuracy": 0.665479126572609, "num_tokens": 28081357.0, "step": 7700 }, { "epoch": 59.26923076923077, "grad_norm": 0.0, "learning_rate": 1.148e-05, "loss": 1.6759, "mean_token_accuracy": 0.6660502076148986, "num_tokens": 28099996.0, "step": 7705 }, { "epoch": 59.30769230769231, "grad_norm": 0.0, "learning_rate": 1.1455000000000001e-05, "loss": 1.7881, "mean_token_accuracy": 0.6390215069055557, "num_tokens": 28117982.0, "step": 7710 }, { "epoch": 59.34615384615385, "grad_norm": 0.0, "learning_rate": 1.143e-05, "loss": 1.7389, "mean_token_accuracy": 0.6523732364177703, "num_tokens": 28136640.0, "step": 7715 }, { "epoch": 59.38461538461539, "grad_norm": 0.0, "learning_rate": 1.1405e-05, "loss": 1.7337, "mean_token_accuracy": 0.6487939029932022, "num_tokens": 28155122.0, "step": 7720 }, { "epoch": 59.42307692307692, "grad_norm": 0.0, "learning_rate": 1.1380000000000001e-05, "loss": 1.7896, "mean_token_accuracy": 0.6361289143562316, "num_tokens": 28173409.0, "step": 7725 }, { "epoch": 59.46153846153846, "grad_norm": 0.0, "learning_rate": 1.1355e-05, "loss": 1.6714, "mean_token_accuracy": 0.6632986754179001, "num_tokens": 28191671.0, "step": 7730 }, { "epoch": 59.5, "grad_norm": 0.0, "learning_rate": 1.133e-05, "loss": 1.7393, "mean_token_accuracy": 0.6471075922250747, "num_tokens": 28210071.0, "step": 7735 }, { "epoch": 59.53846153846154, "grad_norm": 0.0, "learning_rate": 1.1305000000000001e-05, "loss": 1.7713, "mean_token_accuracy": 0.6440162390470505, "num_tokens": 28227999.0, "step": 7740 }, { "epoch": 59.57692307692308, "grad_norm": 0.0, "learning_rate": 1.128e-05, "loss": 1.7417, "mean_token_accuracy": 0.6511160790920257, "num_tokens": 28246129.0, "step": 7745 }, { "epoch": 59.61538461538461, "grad_norm": 0.0, "learning_rate": 1.1255e-05, "loss": 1.6313, "mean_token_accuracy": 0.6731823951005935, "num_tokens": 28265694.0, "step": 7750 }, { "epoch": 59.65384615384615, "grad_norm": 0.0, "learning_rate": 1.1230000000000001e-05, "loss": 1.7091, "mean_token_accuracy": 0.6587099075317383, "num_tokens": 28283812.0, "step": 7755 }, { "epoch": 59.69230769230769, "grad_norm": 0.0, "learning_rate": 1.1205e-05, "loss": 1.775, "mean_token_accuracy": 0.6434663653373718, "num_tokens": 28301500.0, "step": 7760 }, { "epoch": 59.73076923076923, "grad_norm": 0.0, "learning_rate": 1.118e-05, "loss": 1.7526, "mean_token_accuracy": 0.6484665781259537, "num_tokens": 28320482.0, "step": 7765 }, { "epoch": 59.76923076923077, "grad_norm": 0.0, "learning_rate": 1.1155e-05, "loss": 1.7621, "mean_token_accuracy": 0.6493111461400985, "num_tokens": 28338521.0, "step": 7770 }, { "epoch": 59.80769230769231, "grad_norm": 0.0, "learning_rate": 1.113e-05, "loss": 1.7517, "mean_token_accuracy": 0.6488117009401322, "num_tokens": 28356527.0, "step": 7775 }, { "epoch": 59.84615384615385, "grad_norm": 0.0, "learning_rate": 1.1105e-05, "loss": 1.7549, "mean_token_accuracy": 0.6550350069999695, "num_tokens": 28374046.0, "step": 7780 }, { "epoch": 59.88461538461539, "grad_norm": 0.0, "learning_rate": 1.108e-05, "loss": 1.7463, "mean_token_accuracy": 0.6461563289165497, "num_tokens": 28392669.0, "step": 7785 }, { "epoch": 59.92307692307692, "grad_norm": 0.0, "learning_rate": 1.1055e-05, "loss": 1.7571, "mean_token_accuracy": 0.6455042749643326, "num_tokens": 28410567.0, "step": 7790 }, { "epoch": 59.96153846153846, "grad_norm": 0.0, "learning_rate": 1.103e-05, "loss": 1.817, "mean_token_accuracy": 0.6328691780567169, "num_tokens": 28427919.0, "step": 7795 }, { "epoch": 60.0, "grad_norm": 0.0, "learning_rate": 1.1005e-05, "loss": 1.7394, "mean_token_accuracy": 0.6530990123748779, "num_tokens": 28446300.0, "step": 7800 }, { "epoch": 60.03846153846154, "grad_norm": 0.0, "learning_rate": 1.098e-05, "loss": 1.7301, "mean_token_accuracy": 0.6500816881656647, "num_tokens": 28464951.0, "step": 7805 }, { "epoch": 60.07692307692308, "grad_norm": 0.0, "learning_rate": 1.0955e-05, "loss": 1.7905, "mean_token_accuracy": 0.6456600069999695, "num_tokens": 28482557.0, "step": 7810 }, { "epoch": 60.11538461538461, "grad_norm": 0.0, "learning_rate": 1.093e-05, "loss": 1.719, "mean_token_accuracy": 0.6553211659193039, "num_tokens": 28500556.0, "step": 7815 }, { "epoch": 60.15384615384615, "grad_norm": 0.0, "learning_rate": 1.0905e-05, "loss": 1.804, "mean_token_accuracy": 0.635945051908493, "num_tokens": 28517958.0, "step": 7820 }, { "epoch": 60.19230769230769, "grad_norm": 0.0, "learning_rate": 1.088e-05, "loss": 1.7217, "mean_token_accuracy": 0.6568192034959793, "num_tokens": 28536147.0, "step": 7825 }, { "epoch": 60.23076923076923, "grad_norm": 0.0, "learning_rate": 1.0855e-05, "loss": 1.7808, "mean_token_accuracy": 0.6401394218206405, "num_tokens": 28554578.0, "step": 7830 }, { "epoch": 60.26923076923077, "grad_norm": 0.0, "learning_rate": 1.083e-05, "loss": 1.7614, "mean_token_accuracy": 0.6466422110795975, "num_tokens": 28572574.0, "step": 7835 }, { "epoch": 60.30769230769231, "grad_norm": 0.0, "learning_rate": 1.0804999999999999e-05, "loss": 1.7444, "mean_token_accuracy": 0.6499844253063202, "num_tokens": 28590779.0, "step": 7840 }, { "epoch": 60.34615384615385, "grad_norm": 0.0, "learning_rate": 1.0780000000000002e-05, "loss": 1.737, "mean_token_accuracy": 0.6540430366992951, "num_tokens": 28609298.0, "step": 7845 }, { "epoch": 60.38461538461539, "grad_norm": 0.0, "learning_rate": 1.0755000000000001e-05, "loss": 1.7464, "mean_token_accuracy": 0.6548949480056763, "num_tokens": 28626786.0, "step": 7850 }, { "epoch": 60.42307692307692, "grad_norm": 0.0, "learning_rate": 1.073e-05, "loss": 1.7265, "mean_token_accuracy": 0.6512384116649628, "num_tokens": 28645473.0, "step": 7855 }, { "epoch": 60.46153846153846, "grad_norm": 0.0, "learning_rate": 1.0705000000000002e-05, "loss": 1.6828, "mean_token_accuracy": 0.662873387336731, "num_tokens": 28664176.0, "step": 7860 }, { "epoch": 60.5, "grad_norm": 0.0, "learning_rate": 1.0680000000000001e-05, "loss": 1.7298, "mean_token_accuracy": 0.6527964979410171, "num_tokens": 28682577.0, "step": 7865 }, { "epoch": 60.53846153846154, "grad_norm": 0.0, "learning_rate": 1.0655e-05, "loss": 1.7012, "mean_token_accuracy": 0.6583037704229355, "num_tokens": 28701898.0, "step": 7870 }, { "epoch": 60.57692307692308, "grad_norm": 0.0, "learning_rate": 1.0630000000000002e-05, "loss": 1.7366, "mean_token_accuracy": 0.6519467145204544, "num_tokens": 28720551.0, "step": 7875 }, { "epoch": 60.61538461538461, "grad_norm": 0.0, "learning_rate": 1.0605000000000001e-05, "loss": 1.7621, "mean_token_accuracy": 0.6454852074384689, "num_tokens": 28738693.0, "step": 7880 }, { "epoch": 60.65384615384615, "grad_norm": 0.0, "learning_rate": 1.058e-05, "loss": 1.7185, "mean_token_accuracy": 0.6532977163791657, "num_tokens": 28757044.0, "step": 7885 }, { "epoch": 60.69230769230769, "grad_norm": 0.0, "learning_rate": 1.0555000000000001e-05, "loss": 1.749, "mean_token_accuracy": 0.651301595568657, "num_tokens": 28775068.0, "step": 7890 }, { "epoch": 60.73076923076923, "grad_norm": 0.0, "learning_rate": 1.053e-05, "loss": 1.7901, "mean_token_accuracy": 0.6404271066188812, "num_tokens": 28792926.0, "step": 7895 }, { "epoch": 60.76923076923077, "grad_norm": 0.0, "learning_rate": 1.0505e-05, "loss": 1.7377, "mean_token_accuracy": 0.6532931387424469, "num_tokens": 28810975.0, "step": 7900 }, { "epoch": 60.80769230769231, "grad_norm": 0.0, "learning_rate": 1.0480000000000001e-05, "loss": 1.8055, "mean_token_accuracy": 0.6381255328655243, "num_tokens": 28828534.0, "step": 7905 }, { "epoch": 60.84615384615385, "grad_norm": 0.0, "learning_rate": 1.0455e-05, "loss": 1.7612, "mean_token_accuracy": 0.6478750109672546, "num_tokens": 28846473.0, "step": 7910 }, { "epoch": 60.88461538461539, "grad_norm": 0.0, "learning_rate": 1.043e-05, "loss": 1.7125, "mean_token_accuracy": 0.6560309410095215, "num_tokens": 28864548.0, "step": 7915 }, { "epoch": 60.92307692307692, "grad_norm": 0.0, "learning_rate": 1.0405000000000001e-05, "loss": 1.7297, "mean_token_accuracy": 0.6469286412000657, "num_tokens": 28882849.0, "step": 7920 }, { "epoch": 60.96153846153846, "grad_norm": 0.0, "learning_rate": 1.038e-05, "loss": 1.6918, "mean_token_accuracy": 0.6562754690647126, "num_tokens": 28902085.0, "step": 7925 }, { "epoch": 61.0, "grad_norm": 0.0, "learning_rate": 1.0355000000000002e-05, "loss": 1.7195, "mean_token_accuracy": 0.6521394342184067, "num_tokens": 28920405.0, "step": 7930 }, { "epoch": 61.03846153846154, "grad_norm": 0.0, "learning_rate": 1.0330000000000001e-05, "loss": 1.7788, "mean_token_accuracy": 0.6419234901666642, "num_tokens": 28937930.0, "step": 7935 }, { "epoch": 61.07692307692308, "grad_norm": 0.0, "learning_rate": 1.0305e-05, "loss": 1.8234, "mean_token_accuracy": 0.6357664644718171, "num_tokens": 28955347.0, "step": 7940 }, { "epoch": 61.11538461538461, "grad_norm": 0.0, "learning_rate": 1.0280000000000002e-05, "loss": 1.7796, "mean_token_accuracy": 0.6469300240278244, "num_tokens": 28972764.0, "step": 7945 }, { "epoch": 61.15384615384615, "grad_norm": 0.0, "learning_rate": 1.0255000000000001e-05, "loss": 1.7454, "mean_token_accuracy": 0.6496328443288804, "num_tokens": 28991130.0, "step": 7950 }, { "epoch": 61.19230769230769, "grad_norm": 0.0, "learning_rate": 1.023e-05, "loss": 1.7164, "mean_token_accuracy": 0.6583995938301086, "num_tokens": 29009479.0, "step": 7955 }, { "epoch": 61.23076923076923, "grad_norm": 0.0, "learning_rate": 1.0205000000000001e-05, "loss": 1.7438, "mean_token_accuracy": 0.6459792077541351, "num_tokens": 29027531.0, "step": 7960 }, { "epoch": 61.26923076923077, "grad_norm": 0.0, "learning_rate": 1.018e-05, "loss": 1.7467, "mean_token_accuracy": 0.6560082226991654, "num_tokens": 29045399.0, "step": 7965 }, { "epoch": 61.30769230769231, "grad_norm": 0.0, "learning_rate": 1.0155e-05, "loss": 1.6793, "mean_token_accuracy": 0.6648034691810608, "num_tokens": 29064129.0, "step": 7970 }, { "epoch": 61.34615384615385, "grad_norm": 0.0, "learning_rate": 1.0130000000000001e-05, "loss": 1.7599, "mean_token_accuracy": 0.6455144137144089, "num_tokens": 29082239.0, "step": 7975 }, { "epoch": 61.38461538461539, "grad_norm": 0.0, "learning_rate": 1.0105e-05, "loss": 1.7216, "mean_token_accuracy": 0.6553202778100967, "num_tokens": 29100554.0, "step": 7980 }, { "epoch": 61.42307692307692, "grad_norm": 0.0, "learning_rate": 1.008e-05, "loss": 1.7646, "mean_token_accuracy": 0.6500458031892776, "num_tokens": 29118533.0, "step": 7985 }, { "epoch": 61.46153846153846, "grad_norm": 0.0, "learning_rate": 1.0055000000000001e-05, "loss": 1.7526, "mean_token_accuracy": 0.6455101191997528, "num_tokens": 29137065.0, "step": 7990 }, { "epoch": 61.5, "grad_norm": 0.0, "learning_rate": 1.003e-05, "loss": 1.7622, "mean_token_accuracy": 0.6454584419727325, "num_tokens": 29156036.0, "step": 7995 }, { "epoch": 61.53846153846154, "grad_norm": 0.0, "learning_rate": 1.0005e-05, "loss": 1.7365, "mean_token_accuracy": 0.6461439549922943, "num_tokens": 29174330.0, "step": 8000 }, { "epoch": 61.57692307692308, "grad_norm": 0.0, "learning_rate": 9.980000000000001e-06, "loss": 1.7046, "mean_token_accuracy": 0.661187008023262, "num_tokens": 29192595.0, "step": 8005 }, { "epoch": 61.61538461538461, "grad_norm": 0.0, "learning_rate": 9.955e-06, "loss": 1.6992, "mean_token_accuracy": 0.6574147075414658, "num_tokens": 29211106.0, "step": 8010 }, { "epoch": 61.65384615384615, "grad_norm": 0.0, "learning_rate": 9.93e-06, "loss": 1.6859, "mean_token_accuracy": 0.6642335325479507, "num_tokens": 29230108.0, "step": 8015 }, { "epoch": 61.69230769230769, "grad_norm": 0.0, "learning_rate": 9.905000000000001e-06, "loss": 1.8063, "mean_token_accuracy": 0.6333356082439423, "num_tokens": 29247611.0, "step": 8020 }, { "epoch": 61.73076923076923, "grad_norm": 0.0, "learning_rate": 9.88e-06, "loss": 1.7164, "mean_token_accuracy": 0.6516272902488709, "num_tokens": 29265885.0, "step": 8025 }, { "epoch": 61.76923076923077, "grad_norm": 0.0, "learning_rate": 9.855e-06, "loss": 1.7593, "mean_token_accuracy": 0.6486380189657212, "num_tokens": 29284235.0, "step": 8030 }, { "epoch": 61.80769230769231, "grad_norm": 0.0, "learning_rate": 9.83e-06, "loss": 1.7202, "mean_token_accuracy": 0.6530221402645111, "num_tokens": 29302328.0, "step": 8035 }, { "epoch": 61.84615384615385, "grad_norm": 0.0, "learning_rate": 9.805e-06, "loss": 1.7455, "mean_token_accuracy": 0.6459253072738648, "num_tokens": 29320675.0, "step": 8040 }, { "epoch": 61.88461538461539, "grad_norm": 0.0, "learning_rate": 9.78e-06, "loss": 1.7512, "mean_token_accuracy": 0.6454969674348832, "num_tokens": 29339644.0, "step": 8045 }, { "epoch": 61.92307692307692, "grad_norm": 0.0, "learning_rate": 9.755e-06, "loss": 1.7042, "mean_token_accuracy": 0.6602411061525345, "num_tokens": 29357996.0, "step": 8050 }, { "epoch": 61.96153846153846, "grad_norm": 0.0, "learning_rate": 9.73e-06, "loss": 1.6993, "mean_token_accuracy": 0.657492709159851, "num_tokens": 29376683.0, "step": 8055 }, { "epoch": 62.0, "grad_norm": 0.0, "learning_rate": 9.705e-06, "loss": 1.7704, "mean_token_accuracy": 0.6440939038991929, "num_tokens": 29394510.0, "step": 8060 }, { "epoch": 62.03846153846154, "grad_norm": 0.0, "learning_rate": 9.68e-06, "loss": 1.7601, "mean_token_accuracy": 0.6529957592487335, "num_tokens": 29412388.0, "step": 8065 }, { "epoch": 62.07692307692308, "grad_norm": 0.0, "learning_rate": 9.655e-06, "loss": 1.754, "mean_token_accuracy": 0.6488070756196975, "num_tokens": 29430324.0, "step": 8070 }, { "epoch": 62.11538461538461, "grad_norm": 0.0, "learning_rate": 9.630000000000001e-06, "loss": 1.7514, "mean_token_accuracy": 0.6483252078294754, "num_tokens": 29448473.0, "step": 8075 }, { "epoch": 62.15384615384615, "grad_norm": 0.0, "learning_rate": 9.605e-06, "loss": 1.7805, "mean_token_accuracy": 0.6442840725183487, "num_tokens": 29466357.0, "step": 8080 }, { "epoch": 62.19230769230769, "grad_norm": 0.0, "learning_rate": 9.58e-06, "loss": 1.7854, "mean_token_accuracy": 0.6450077176094056, "num_tokens": 29484406.0, "step": 8085 }, { "epoch": 62.23076923076923, "grad_norm": 0.0, "learning_rate": 9.555e-06, "loss": 1.6738, "mean_token_accuracy": 0.6603300988674163, "num_tokens": 29503619.0, "step": 8090 }, { "epoch": 62.26923076923077, "grad_norm": 0.0, "learning_rate": 9.53e-06, "loss": 1.7563, "mean_token_accuracy": 0.6429074674844741, "num_tokens": 29521125.0, "step": 8095 }, { "epoch": 62.30769230769231, "grad_norm": 0.0, "learning_rate": 9.505e-06, "loss": 1.7227, "mean_token_accuracy": 0.6537641167640686, "num_tokens": 29539856.0, "step": 8100 }, { "epoch": 62.34615384615385, "grad_norm": 0.0, "learning_rate": 9.48e-06, "loss": 1.7475, "mean_token_accuracy": 0.6510567903518677, "num_tokens": 29557912.0, "step": 8105 }, { "epoch": 62.38461538461539, "grad_norm": 0.0, "learning_rate": 9.455e-06, "loss": 1.7125, "mean_token_accuracy": 0.6581447750329972, "num_tokens": 29576445.0, "step": 8110 }, { "epoch": 62.42307692307692, "grad_norm": 0.0, "learning_rate": 9.43e-06, "loss": 1.8141, "mean_token_accuracy": 0.634006318449974, "num_tokens": 29594170.0, "step": 8115 }, { "epoch": 62.46153846153846, "grad_norm": 0.0, "learning_rate": 9.405e-06, "loss": 1.6697, "mean_token_accuracy": 0.6640021294355393, "num_tokens": 29612931.0, "step": 8120 }, { "epoch": 62.5, "grad_norm": 0.0, "learning_rate": 9.38e-06, "loss": 1.8029, "mean_token_accuracy": 0.6408674240112304, "num_tokens": 29630058.0, "step": 8125 }, { "epoch": 62.53846153846154, "grad_norm": 0.0, "learning_rate": 9.355e-06, "loss": 1.665, "mean_token_accuracy": 0.6620132178068161, "num_tokens": 29649586.0, "step": 8130 }, { "epoch": 62.57692307692308, "grad_norm": 0.0, "learning_rate": 9.33e-06, "loss": 1.7189, "mean_token_accuracy": 0.6585166305303574, "num_tokens": 29668203.0, "step": 8135 }, { "epoch": 62.61538461538461, "grad_norm": 0.0, "learning_rate": 9.305e-06, "loss": 1.7752, "mean_token_accuracy": 0.6421294629573822, "num_tokens": 29686310.0, "step": 8140 }, { "epoch": 62.65384615384615, "grad_norm": 0.0, "learning_rate": 9.28e-06, "loss": 1.6784, "mean_token_accuracy": 0.6623817592859268, "num_tokens": 29705325.0, "step": 8145 }, { "epoch": 62.69230769230769, "grad_norm": 0.0, "learning_rate": 9.255e-06, "loss": 1.6656, "mean_token_accuracy": 0.6657170474529266, "num_tokens": 29724477.0, "step": 8150 }, { "epoch": 62.73076923076923, "grad_norm": 0.0, "learning_rate": 9.23e-06, "loss": 1.7482, "mean_token_accuracy": 0.6518024027347564, "num_tokens": 29742661.0, "step": 8155 }, { "epoch": 62.76923076923077, "grad_norm": 0.0, "learning_rate": 9.205e-06, "loss": 1.7613, "mean_token_accuracy": 0.6471410870552063, "num_tokens": 29760581.0, "step": 8160 }, { "epoch": 62.80769230769231, "grad_norm": 0.0, "learning_rate": 9.180000000000002e-06, "loss": 1.7447, "mean_token_accuracy": 0.6508060991764069, "num_tokens": 29778883.0, "step": 8165 }, { "epoch": 62.84615384615385, "grad_norm": 0.0, "learning_rate": 9.155000000000001e-06, "loss": 1.7297, "mean_token_accuracy": 0.6586943835020065, "num_tokens": 29796604.0, "step": 8170 }, { "epoch": 62.88461538461539, "grad_norm": 0.0, "learning_rate": 9.13e-06, "loss": 1.8185, "mean_token_accuracy": 0.6354205548763275, "num_tokens": 29813853.0, "step": 8175 }, { "epoch": 62.92307692307692, "grad_norm": 0.0, "learning_rate": 9.105000000000002e-06, "loss": 1.7358, "mean_token_accuracy": 0.6511748641729355, "num_tokens": 29832138.0, "step": 8180 }, { "epoch": 62.96153846153846, "grad_norm": 0.0, "learning_rate": 9.080000000000001e-06, "loss": 1.7322, "mean_token_accuracy": 0.6481782138347626, "num_tokens": 29850755.0, "step": 8185 }, { "epoch": 63.0, "grad_norm": 0.0, "learning_rate": 9.055e-06, "loss": 1.7469, "mean_token_accuracy": 0.6486757427453995, "num_tokens": 29868615.0, "step": 8190 }, { "epoch": 63.03846153846154, "grad_norm": 0.0, "learning_rate": 9.030000000000002e-06, "loss": 1.7337, "mean_token_accuracy": 0.648679456114769, "num_tokens": 29886878.0, "step": 8195 }, { "epoch": 63.07692307692308, "grad_norm": 0.0, "learning_rate": 9.005000000000001e-06, "loss": 1.7119, "mean_token_accuracy": 0.658037719130516, "num_tokens": 29905302.0, "step": 8200 }, { "epoch": 63.11538461538461, "grad_norm": 0.0, "learning_rate": 8.98e-06, "loss": 1.7854, "mean_token_accuracy": 0.6415458977222442, "num_tokens": 29923223.0, "step": 8205 }, { "epoch": 63.15384615384615, "grad_norm": 0.0, "learning_rate": 8.955000000000002e-06, "loss": 1.6884, "mean_token_accuracy": 0.6621178448200226, "num_tokens": 29941689.0, "step": 8210 }, { "epoch": 63.19230769230769, "grad_norm": 0.0, "learning_rate": 8.930000000000001e-06, "loss": 1.7898, "mean_token_accuracy": 0.6401251316070556, "num_tokens": 29959317.0, "step": 8215 }, { "epoch": 63.23076923076923, "grad_norm": 0.0, "learning_rate": 8.905e-06, "loss": 1.6928, "mean_token_accuracy": 0.6512804627418518, "num_tokens": 29978320.0, "step": 8220 }, { "epoch": 63.26923076923077, "grad_norm": 0.0, "learning_rate": 8.880000000000001e-06, "loss": 1.7525, "mean_token_accuracy": 0.6476843535900116, "num_tokens": 29996596.0, "step": 8225 }, { "epoch": 63.30769230769231, "grad_norm": 0.0, "learning_rate": 8.855e-06, "loss": 1.6552, "mean_token_accuracy": 0.6682954013347626, "num_tokens": 30015615.0, "step": 8230 }, { "epoch": 63.34615384615385, "grad_norm": 0.0, "learning_rate": 8.83e-06, "loss": 1.7519, "mean_token_accuracy": 0.6464314192533493, "num_tokens": 30033857.0, "step": 8235 }, { "epoch": 63.38461538461539, "grad_norm": 0.0, "learning_rate": 8.805000000000001e-06, "loss": 1.8065, "mean_token_accuracy": 0.6360196202993393, "num_tokens": 30051356.0, "step": 8240 }, { "epoch": 63.42307692307692, "grad_norm": 0.0, "learning_rate": 8.78e-06, "loss": 1.7291, "mean_token_accuracy": 0.661040186882019, "num_tokens": 30069584.0, "step": 8245 }, { "epoch": 63.46153846153846, "grad_norm": 0.0, "learning_rate": 8.755e-06, "loss": 1.7783, "mean_token_accuracy": 0.6421218931674957, "num_tokens": 30087772.0, "step": 8250 }, { "epoch": 63.5, "grad_norm": 0.0, "learning_rate": 8.730000000000001e-06, "loss": 1.7815, "mean_token_accuracy": 0.6430323421955109, "num_tokens": 30105880.0, "step": 8255 }, { "epoch": 63.53846153846154, "grad_norm": 0.0, "learning_rate": 8.705e-06, "loss": 1.7272, "mean_token_accuracy": 0.6486560940742493, "num_tokens": 30124841.0, "step": 8260 }, { "epoch": 63.57692307692308, "grad_norm": 0.0, "learning_rate": 8.68e-06, "loss": 1.757, "mean_token_accuracy": 0.6492252767086029, "num_tokens": 30142267.0, "step": 8265 }, { "epoch": 63.61538461538461, "grad_norm": 0.0, "learning_rate": 8.655000000000001e-06, "loss": 1.7874, "mean_token_accuracy": 0.6347816586494446, "num_tokens": 30160045.0, "step": 8270 }, { "epoch": 63.65384615384615, "grad_norm": 0.0, "learning_rate": 8.63e-06, "loss": 1.729, "mean_token_accuracy": 0.6490083158016204, "num_tokens": 30178837.0, "step": 8275 }, { "epoch": 63.69230769230769, "grad_norm": 0.0, "learning_rate": 8.605e-06, "loss": 1.7159, "mean_token_accuracy": 0.6582196593284607, "num_tokens": 30197308.0, "step": 8280 }, { "epoch": 63.73076923076923, "grad_norm": 0.0, "learning_rate": 8.580000000000001e-06, "loss": 1.6787, "mean_token_accuracy": 0.6678410351276398, "num_tokens": 30215988.0, "step": 8285 }, { "epoch": 63.76923076923077, "grad_norm": 0.0, "learning_rate": 8.555e-06, "loss": 1.7973, "mean_token_accuracy": 0.637213721871376, "num_tokens": 30233493.0, "step": 8290 }, { "epoch": 63.80769230769231, "grad_norm": 0.0, "learning_rate": 8.53e-06, "loss": 1.7189, "mean_token_accuracy": 0.656983396410942, "num_tokens": 30251845.0, "step": 8295 }, { "epoch": 63.84615384615385, "grad_norm": 0.0, "learning_rate": 8.505e-06, "loss": 1.8205, "mean_token_accuracy": 0.6382214099168777, "num_tokens": 30268899.0, "step": 8300 }, { "epoch": 63.88461538461539, "grad_norm": 0.0, "learning_rate": 8.48e-06, "loss": 1.7196, "mean_token_accuracy": 0.6534265607595444, "num_tokens": 30287324.0, "step": 8305 }, { "epoch": 63.92307692307692, "grad_norm": 0.0, "learning_rate": 8.455000000000001e-06, "loss": 1.755, "mean_token_accuracy": 0.649156528711319, "num_tokens": 30305098.0, "step": 8310 }, { "epoch": 63.96153846153846, "grad_norm": 0.0, "learning_rate": 8.43e-06, "loss": 1.6393, "mean_token_accuracy": 0.6654347360134125, "num_tokens": 30324316.0, "step": 8315 }, { "epoch": 64.0, "grad_norm": 0.0, "learning_rate": 8.405e-06, "loss": 1.7622, "mean_token_accuracy": 0.647731038928032, "num_tokens": 30342720.0, "step": 8320 }, { "epoch": 64.03846153846153, "grad_norm": 0.0, "learning_rate": 8.380000000000001e-06, "loss": 1.7291, "mean_token_accuracy": 0.6533550292253494, "num_tokens": 30360719.0, "step": 8325 }, { "epoch": 64.07692307692308, "grad_norm": 0.0, "learning_rate": 8.355e-06, "loss": 1.7221, "mean_token_accuracy": 0.6565493553876877, "num_tokens": 30379966.0, "step": 8330 }, { "epoch": 64.11538461538461, "grad_norm": 0.0, "learning_rate": 8.33e-06, "loss": 1.6687, "mean_token_accuracy": 0.6646548688411713, "num_tokens": 30398817.0, "step": 8335 }, { "epoch": 64.15384615384616, "grad_norm": 0.0, "learning_rate": 8.305000000000001e-06, "loss": 1.8145, "mean_token_accuracy": 0.6381383389234543, "num_tokens": 30416271.0, "step": 8340 }, { "epoch": 64.1923076923077, "grad_norm": 0.0, "learning_rate": 8.28e-06, "loss": 1.7004, "mean_token_accuracy": 0.6580903172492981, "num_tokens": 30434471.0, "step": 8345 }, { "epoch": 64.23076923076923, "grad_norm": 0.0, "learning_rate": 8.255e-06, "loss": 1.6885, "mean_token_accuracy": 0.6602605700492858, "num_tokens": 30453103.0, "step": 8350 }, { "epoch": 64.26923076923077, "grad_norm": 0.0, "learning_rate": 8.23e-06, "loss": 1.6849, "mean_token_accuracy": 0.6583765029907227, "num_tokens": 30471800.0, "step": 8355 }, { "epoch": 64.3076923076923, "grad_norm": 0.0, "learning_rate": 8.205e-06, "loss": 1.7548, "mean_token_accuracy": 0.648283451795578, "num_tokens": 30490316.0, "step": 8360 }, { "epoch": 64.34615384615384, "grad_norm": 0.0, "learning_rate": 8.18e-06, "loss": 1.7349, "mean_token_accuracy": 0.6548443913459778, "num_tokens": 30509159.0, "step": 8365 }, { "epoch": 64.38461538461539, "grad_norm": 0.0, "learning_rate": 8.155e-06, "loss": 1.788, "mean_token_accuracy": 0.6422168552875519, "num_tokens": 30526994.0, "step": 8370 }, { "epoch": 64.42307692307692, "grad_norm": 0.0, "learning_rate": 8.13e-06, "loss": 1.7371, "mean_token_accuracy": 0.6487582057714463, "num_tokens": 30545290.0, "step": 8375 }, { "epoch": 64.46153846153847, "grad_norm": 0.0, "learning_rate": 8.105e-06, "loss": 1.7529, "mean_token_accuracy": 0.649326092004776, "num_tokens": 30563300.0, "step": 8380 }, { "epoch": 64.5, "grad_norm": 0.0, "learning_rate": 8.08e-06, "loss": 1.7657, "mean_token_accuracy": 0.6501888036727905, "num_tokens": 30581217.0, "step": 8385 }, { "epoch": 64.53846153846153, "grad_norm": 0.0, "learning_rate": 8.055e-06, "loss": 1.6891, "mean_token_accuracy": 0.6601796567440033, "num_tokens": 30600060.0, "step": 8390 }, { "epoch": 64.57692307692308, "grad_norm": 0.0, "learning_rate": 8.03e-06, "loss": 1.7908, "mean_token_accuracy": 0.6441485762596131, "num_tokens": 30617690.0, "step": 8395 }, { "epoch": 64.61538461538461, "grad_norm": 0.0, "learning_rate": 8.005e-06, "loss": 1.7446, "mean_token_accuracy": 0.6539681345224381, "num_tokens": 30636302.0, "step": 8400 }, { "epoch": 64.65384615384616, "grad_norm": 0.0, "learning_rate": 7.98e-06, "loss": 1.732, "mean_token_accuracy": 0.653337499499321, "num_tokens": 30654393.0, "step": 8405 }, { "epoch": 64.6923076923077, "grad_norm": 0.0, "learning_rate": 7.955e-06, "loss": 1.7691, "mean_token_accuracy": 0.649465736746788, "num_tokens": 30672362.0, "step": 8410 }, { "epoch": 64.73076923076923, "grad_norm": 0.0, "learning_rate": 7.93e-06, "loss": 1.7322, "mean_token_accuracy": 0.6514311760663987, "num_tokens": 30690881.0, "step": 8415 }, { "epoch": 64.76923076923077, "grad_norm": 0.0, "learning_rate": 7.905e-06, "loss": 1.7387, "mean_token_accuracy": 0.6512942612171173, "num_tokens": 30708498.0, "step": 8420 }, { "epoch": 64.8076923076923, "grad_norm": 0.0, "learning_rate": 7.879999999999999e-06, "loss": 1.7456, "mean_token_accuracy": 0.644093918800354, "num_tokens": 30726723.0, "step": 8425 }, { "epoch": 64.84615384615384, "grad_norm": 0.0, "learning_rate": 7.855e-06, "loss": 1.7664, "mean_token_accuracy": 0.6423812568187713, "num_tokens": 30745189.0, "step": 8430 }, { "epoch": 64.88461538461539, "grad_norm": 0.0, "learning_rate": 7.83e-06, "loss": 1.7892, "mean_token_accuracy": 0.6339890867471695, "num_tokens": 30762976.0, "step": 8435 }, { "epoch": 64.92307692307692, "grad_norm": 0.0, "learning_rate": 7.805e-06, "loss": 1.7893, "mean_token_accuracy": 0.6393702328205109, "num_tokens": 30779931.0, "step": 8440 }, { "epoch": 64.96153846153847, "grad_norm": 0.0, "learning_rate": 7.78e-06, "loss": 1.6928, "mean_token_accuracy": 0.6551145076751709, "num_tokens": 30798591.0, "step": 8445 }, { "epoch": 65.0, "grad_norm": 0.0, "learning_rate": 7.755e-06, "loss": 1.7471, "mean_token_accuracy": 0.6472322821617127, "num_tokens": 30816825.0, "step": 8450 }, { "epoch": 65.03846153846153, "grad_norm": 0.0, "learning_rate": 7.73e-06, "loss": 1.7898, "mean_token_accuracy": 0.6398821353912354, "num_tokens": 30834858.0, "step": 8455 }, { "epoch": 65.07692307692308, "grad_norm": 0.0, "learning_rate": 7.705e-06, "loss": 1.7869, "mean_token_accuracy": 0.6389808714389801, "num_tokens": 30853070.0, "step": 8460 }, { "epoch": 65.11538461538461, "grad_norm": 0.0, "learning_rate": 7.68e-06, "loss": 1.697, "mean_token_accuracy": 0.6605163842439652, "num_tokens": 30872040.0, "step": 8465 }, { "epoch": 65.15384615384616, "grad_norm": 0.0, "learning_rate": 7.655e-06, "loss": 1.7485, "mean_token_accuracy": 0.6444921791553497, "num_tokens": 30890649.0, "step": 8470 }, { "epoch": 65.1923076923077, "grad_norm": 0.0, "learning_rate": 7.630000000000001e-06, "loss": 1.708, "mean_token_accuracy": 0.6534979492425919, "num_tokens": 30909370.0, "step": 8475 }, { "epoch": 65.23076923076923, "grad_norm": 0.0, "learning_rate": 7.605000000000001e-06, "loss": 1.6808, "mean_token_accuracy": 0.667791223526001, "num_tokens": 30928442.0, "step": 8480 }, { "epoch": 65.26923076923077, "grad_norm": 0.0, "learning_rate": 7.580000000000001e-06, "loss": 1.7272, "mean_token_accuracy": 0.6559307098388671, "num_tokens": 30946632.0, "step": 8485 }, { "epoch": 65.3076923076923, "grad_norm": 0.0, "learning_rate": 7.555000000000001e-06, "loss": 1.7644, "mean_token_accuracy": 0.6492012023925782, "num_tokens": 30964389.0, "step": 8490 }, { "epoch": 65.34615384615384, "grad_norm": 0.0, "learning_rate": 7.530000000000001e-06, "loss": 1.6994, "mean_token_accuracy": 0.6621949762105942, "num_tokens": 30982829.0, "step": 8495 }, { "epoch": 65.38461538461539, "grad_norm": 0.0, "learning_rate": 7.505000000000001e-06, "loss": 1.6986, "mean_token_accuracy": 0.6562123388051987, "num_tokens": 31001822.0, "step": 8500 }, { "epoch": 65.42307692307692, "grad_norm": 0.0, "learning_rate": 7.480000000000001e-06, "loss": 1.7202, "mean_token_accuracy": 0.6557656317949295, "num_tokens": 31020258.0, "step": 8505 }, { "epoch": 65.46153846153847, "grad_norm": 0.0, "learning_rate": 7.455000000000001e-06, "loss": 1.7205, "mean_token_accuracy": 0.6543114900588989, "num_tokens": 31038288.0, "step": 8510 }, { "epoch": 65.5, "grad_norm": 0.0, "learning_rate": 7.430000000000001e-06, "loss": 1.8082, "mean_token_accuracy": 0.6374682754278183, "num_tokens": 31055635.0, "step": 8515 }, { "epoch": 65.53846153846153, "grad_norm": 0.0, "learning_rate": 7.405000000000001e-06, "loss": 1.718, "mean_token_accuracy": 0.6601694732904434, "num_tokens": 31073830.0, "step": 8520 }, { "epoch": 65.57692307692308, "grad_norm": 0.0, "learning_rate": 7.3800000000000005e-06, "loss": 1.6752, "mean_token_accuracy": 0.6580190539360047, "num_tokens": 31092611.0, "step": 8525 }, { "epoch": 65.61538461538461, "grad_norm": 0.0, "learning_rate": 7.355000000000001e-06, "loss": 1.8009, "mean_token_accuracy": 0.6394985377788543, "num_tokens": 31110126.0, "step": 8530 }, { "epoch": 65.65384615384616, "grad_norm": 0.0, "learning_rate": 7.330000000000001e-06, "loss": 1.7496, "mean_token_accuracy": 0.6522016197443008, "num_tokens": 31128158.0, "step": 8535 }, { "epoch": 65.6923076923077, "grad_norm": 0.0, "learning_rate": 7.305e-06, "loss": 1.7608, "mean_token_accuracy": 0.6444296389818192, "num_tokens": 31146338.0, "step": 8540 }, { "epoch": 65.73076923076923, "grad_norm": 0.0, "learning_rate": 7.280000000000001e-06, "loss": 1.7123, "mean_token_accuracy": 0.6542566984891891, "num_tokens": 31165043.0, "step": 8545 }, { "epoch": 65.76923076923077, "grad_norm": 0.0, "learning_rate": 7.255000000000001e-06, "loss": 1.7491, "mean_token_accuracy": 0.6535750329494476, "num_tokens": 31183433.0, "step": 8550 }, { "epoch": 65.8076923076923, "grad_norm": 0.0, "learning_rate": 7.230000000000001e-06, "loss": 1.729, "mean_token_accuracy": 0.6469509840011597, "num_tokens": 31201276.0, "step": 8555 }, { "epoch": 65.84615384615384, "grad_norm": 0.0, "learning_rate": 7.2050000000000005e-06, "loss": 1.7259, "mean_token_accuracy": 0.6532280802726745, "num_tokens": 31219465.0, "step": 8560 }, { "epoch": 65.88461538461539, "grad_norm": 0.0, "learning_rate": 7.180000000000001e-06, "loss": 1.7995, "mean_token_accuracy": 0.6446830004453659, "num_tokens": 31236938.0, "step": 8565 }, { "epoch": 65.92307692307692, "grad_norm": 0.0, "learning_rate": 7.155000000000001e-06, "loss": 1.7361, "mean_token_accuracy": 0.6507874131202698, "num_tokens": 31255303.0, "step": 8570 }, { "epoch": 65.96153846153847, "grad_norm": 0.0, "learning_rate": 7.13e-06, "loss": 1.7875, "mean_token_accuracy": 0.6378930985927582, "num_tokens": 31273030.0, "step": 8575 }, { "epoch": 66.0, "grad_norm": 0.0, "learning_rate": 7.105000000000001e-06, "loss": 1.7661, "mean_token_accuracy": 0.6479190230369568, "num_tokens": 31290930.0, "step": 8580 }, { "epoch": 66.03846153846153, "grad_norm": 0.0, "learning_rate": 7.080000000000001e-06, "loss": 1.694, "mean_token_accuracy": 0.6604873329401016, "num_tokens": 31309688.0, "step": 8585 }, { "epoch": 66.07692307692308, "grad_norm": 0.0, "learning_rate": 7.055e-06, "loss": 1.768, "mean_token_accuracy": 0.6487796038389206, "num_tokens": 31327646.0, "step": 8590 }, { "epoch": 66.11538461538461, "grad_norm": 0.0, "learning_rate": 7.0300000000000005e-06, "loss": 1.7308, "mean_token_accuracy": 0.6545611709356308, "num_tokens": 31346349.0, "step": 8595 }, { "epoch": 66.15384615384616, "grad_norm": 0.0, "learning_rate": 7.005000000000001e-06, "loss": 1.7115, "mean_token_accuracy": 0.6564001739025116, "num_tokens": 31364569.0, "step": 8600 }, { "epoch": 66.1923076923077, "grad_norm": 0.0, "learning_rate": 6.98e-06, "loss": 1.7538, "mean_token_accuracy": 0.6477825731039047, "num_tokens": 31382670.0, "step": 8605 }, { "epoch": 66.23076923076923, "grad_norm": 0.0, "learning_rate": 6.955e-06, "loss": 1.7799, "mean_token_accuracy": 0.6439489036798477, "num_tokens": 31400493.0, "step": 8610 }, { "epoch": 66.26923076923077, "grad_norm": 0.0, "learning_rate": 6.9300000000000006e-06, "loss": 1.7762, "mean_token_accuracy": 0.6380701899528504, "num_tokens": 31419137.0, "step": 8615 }, { "epoch": 66.3076923076923, "grad_norm": 0.0, "learning_rate": 6.905e-06, "loss": 1.5876, "mean_token_accuracy": 0.6766410171985626, "num_tokens": 31439373.0, "step": 8620 }, { "epoch": 66.34615384615384, "grad_norm": 0.0, "learning_rate": 6.88e-06, "loss": 1.7701, "mean_token_accuracy": 0.6451247215270997, "num_tokens": 31457394.0, "step": 8625 }, { "epoch": 66.38461538461539, "grad_norm": 0.0, "learning_rate": 6.8550000000000004e-06, "loss": 1.7894, "mean_token_accuracy": 0.6450399339199067, "num_tokens": 31474686.0, "step": 8630 }, { "epoch": 66.42307692307692, "grad_norm": 0.0, "learning_rate": 6.830000000000001e-06, "loss": 1.7736, "mean_token_accuracy": 0.6471458494663238, "num_tokens": 31492406.0, "step": 8635 }, { "epoch": 66.46153846153847, "grad_norm": 0.0, "learning_rate": 6.805e-06, "loss": 1.6584, "mean_token_accuracy": 0.6657146126031875, "num_tokens": 31511342.0, "step": 8640 }, { "epoch": 66.5, "grad_norm": 0.0, "learning_rate": 6.78e-06, "loss": 1.7357, "mean_token_accuracy": 0.6563478767871856, "num_tokens": 31529627.0, "step": 8645 }, { "epoch": 66.53846153846153, "grad_norm": 0.0, "learning_rate": 6.7550000000000005e-06, "loss": 1.7731, "mean_token_accuracy": 0.6471589595079422, "num_tokens": 31546956.0, "step": 8650 }, { "epoch": 66.57692307692308, "grad_norm": 0.0, "learning_rate": 6.73e-06, "loss": 1.7086, "mean_token_accuracy": 0.6558182656764984, "num_tokens": 31565836.0, "step": 8655 }, { "epoch": 66.61538461538461, "grad_norm": 0.0, "learning_rate": 6.705e-06, "loss": 1.7636, "mean_token_accuracy": 0.6479879409074784, "num_tokens": 31583762.0, "step": 8660 }, { "epoch": 66.65384615384616, "grad_norm": 0.0, "learning_rate": 6.68e-06, "loss": 1.7374, "mean_token_accuracy": 0.6501346677541733, "num_tokens": 31601864.0, "step": 8665 }, { "epoch": 66.6923076923077, "grad_norm": 0.0, "learning_rate": 6.655e-06, "loss": 1.7447, "mean_token_accuracy": 0.6492434620857239, "num_tokens": 31619969.0, "step": 8670 }, { "epoch": 66.73076923076923, "grad_norm": 0.0, "learning_rate": 6.63e-06, "loss": 1.7562, "mean_token_accuracy": 0.6430635213851928, "num_tokens": 31638170.0, "step": 8675 }, { "epoch": 66.76923076923077, "grad_norm": 0.0, "learning_rate": 6.605e-06, "loss": 1.6433, "mean_token_accuracy": 0.6654472947120667, "num_tokens": 31657320.0, "step": 8680 }, { "epoch": 66.8076923076923, "grad_norm": 0.0, "learning_rate": 6.58e-06, "loss": 1.7904, "mean_token_accuracy": 0.6401758790016174, "num_tokens": 31674990.0, "step": 8685 }, { "epoch": 66.84615384615384, "grad_norm": 0.0, "learning_rate": 6.555e-06, "loss": 1.7889, "mean_token_accuracy": 0.6432503432035446, "num_tokens": 31692632.0, "step": 8690 }, { "epoch": 66.88461538461539, "grad_norm": 0.0, "learning_rate": 6.53e-06, "loss": 1.8144, "mean_token_accuracy": 0.6369650483131408, "num_tokens": 31710059.0, "step": 8695 }, { "epoch": 66.92307692307692, "grad_norm": 0.0, "learning_rate": 6.505e-06, "loss": 1.6865, "mean_token_accuracy": 0.6562155693769455, "num_tokens": 31729323.0, "step": 8700 }, { "epoch": 66.96153846153847, "grad_norm": 0.0, "learning_rate": 6.48e-06, "loss": 1.8384, "mean_token_accuracy": 0.6272630810737609, "num_tokens": 31746601.0, "step": 8705 }, { "epoch": 67.0, "grad_norm": 0.0, "learning_rate": 6.455e-06, "loss": 1.6929, "mean_token_accuracy": 0.6641561448574066, "num_tokens": 31765035.0, "step": 8710 }, { "epoch": 67.03846153846153, "grad_norm": 0.0, "learning_rate": 6.43e-06, "loss": 1.754, "mean_token_accuracy": 0.6518040865659713, "num_tokens": 31783454.0, "step": 8715 }, { "epoch": 67.07692307692308, "grad_norm": 0.0, "learning_rate": 6.405e-06, "loss": 1.7343, "mean_token_accuracy": 0.6550649493932724, "num_tokens": 31801583.0, "step": 8720 }, { "epoch": 67.11538461538461, "grad_norm": 0.0, "learning_rate": 6.38e-06, "loss": 1.7363, "mean_token_accuracy": 0.6507348865270615, "num_tokens": 31819899.0, "step": 8725 }, { "epoch": 67.15384615384616, "grad_norm": 0.0, "learning_rate": 6.355e-06, "loss": 1.7462, "mean_token_accuracy": 0.6472515076398849, "num_tokens": 31838133.0, "step": 8730 }, { "epoch": 67.1923076923077, "grad_norm": 0.0, "learning_rate": 6.3299999999999995e-06, "loss": 1.7469, "mean_token_accuracy": 0.6492879390716553, "num_tokens": 31856487.0, "step": 8735 }, { "epoch": 67.23076923076923, "grad_norm": 0.0, "learning_rate": 6.305e-06, "loss": 1.7255, "mean_token_accuracy": 0.6533631831407547, "num_tokens": 31874474.0, "step": 8740 }, { "epoch": 67.26923076923077, "grad_norm": 0.0, "learning_rate": 6.28e-06, "loss": 1.6755, "mean_token_accuracy": 0.6652740865945816, "num_tokens": 31893418.0, "step": 8745 }, { "epoch": 67.3076923076923, "grad_norm": 0.0, "learning_rate": 6.254999999999999e-06, "loss": 1.7896, "mean_token_accuracy": 0.639714989066124, "num_tokens": 31911351.0, "step": 8750 }, { "epoch": 67.34615384615384, "grad_norm": 0.0, "learning_rate": 6.2300000000000005e-06, "loss": 1.7045, "mean_token_accuracy": 0.653382807970047, "num_tokens": 31930240.0, "step": 8755 }, { "epoch": 67.38461538461539, "grad_norm": 0.0, "learning_rate": 6.205000000000001e-06, "loss": 1.8023, "mean_token_accuracy": 0.6487904995679855, "num_tokens": 31947197.0, "step": 8760 }, { "epoch": 67.42307692307692, "grad_norm": 0.0, "learning_rate": 6.18e-06, "loss": 1.7365, "mean_token_accuracy": 0.6566553086042404, "num_tokens": 31965045.0, "step": 8765 }, { "epoch": 67.46153846153847, "grad_norm": 0.0, "learning_rate": 6.155e-06, "loss": 1.7982, "mean_token_accuracy": 0.6392051458358765, "num_tokens": 31982476.0, "step": 8770 }, { "epoch": 67.5, "grad_norm": 0.0, "learning_rate": 6.130000000000001e-06, "loss": 1.7112, "mean_token_accuracy": 0.6520894914865494, "num_tokens": 32001093.0, "step": 8775 }, { "epoch": 67.53846153846153, "grad_norm": 0.0, "learning_rate": 6.105e-06, "loss": 1.7363, "mean_token_accuracy": 0.651617681980133, "num_tokens": 32019131.0, "step": 8780 }, { "epoch": 67.57692307692308, "grad_norm": 0.0, "learning_rate": 6.08e-06, "loss": 1.7092, "mean_token_accuracy": 0.6570148169994354, "num_tokens": 32037428.0, "step": 8785 }, { "epoch": 67.61538461538461, "grad_norm": 0.0, "learning_rate": 6.0550000000000005e-06, "loss": 1.7594, "mean_token_accuracy": 0.6468034237623215, "num_tokens": 32055491.0, "step": 8790 }, { "epoch": 67.65384615384616, "grad_norm": 0.0, "learning_rate": 6.03e-06, "loss": 1.7737, "mean_token_accuracy": 0.6413617432117462, "num_tokens": 32073407.0, "step": 8795 }, { "epoch": 67.6923076923077, "grad_norm": 0.0, "learning_rate": 6.005e-06, "loss": 1.733, "mean_token_accuracy": 0.6513085424900055, "num_tokens": 32092096.0, "step": 8800 }, { "epoch": 67.73076923076923, "grad_norm": 0.0, "learning_rate": 5.98e-06, "loss": 1.6929, "mean_token_accuracy": 0.6587874621152878, "num_tokens": 32110873.0, "step": 8805 }, { "epoch": 67.76923076923077, "grad_norm": 0.0, "learning_rate": 5.955000000000001e-06, "loss": 1.6503, "mean_token_accuracy": 0.6717524290084839, "num_tokens": 32129582.0, "step": 8810 }, { "epoch": 67.8076923076923, "grad_norm": 0.0, "learning_rate": 5.93e-06, "loss": 1.724, "mean_token_accuracy": 0.6568674236536026, "num_tokens": 32148017.0, "step": 8815 }, { "epoch": 67.84615384615384, "grad_norm": 0.0, "learning_rate": 5.905e-06, "loss": 1.7912, "mean_token_accuracy": 0.642418372631073, "num_tokens": 32165982.0, "step": 8820 }, { "epoch": 67.88461538461539, "grad_norm": 0.0, "learning_rate": 5.8800000000000005e-06, "loss": 1.6831, "mean_token_accuracy": 0.6590377599000931, "num_tokens": 32185226.0, "step": 8825 }, { "epoch": 67.92307692307692, "grad_norm": 0.0, "learning_rate": 5.855e-06, "loss": 1.8126, "mean_token_accuracy": 0.6364179074764251, "num_tokens": 32202532.0, "step": 8830 }, { "epoch": 67.96153846153847, "grad_norm": 0.0, "learning_rate": 5.83e-06, "loss": 1.842, "mean_token_accuracy": 0.629499414563179, "num_tokens": 32219939.0, "step": 8835 }, { "epoch": 68.0, "grad_norm": 0.0, "learning_rate": 5.805e-06, "loss": 1.7138, "mean_token_accuracy": 0.651302108168602, "num_tokens": 32239140.0, "step": 8840 }, { "epoch": 68.03846153846153, "grad_norm": 0.0, "learning_rate": 5.78e-06, "loss": 1.7806, "mean_token_accuracy": 0.6400546163320542, "num_tokens": 32257043.0, "step": 8845 }, { "epoch": 68.07692307692308, "grad_norm": 0.0, "learning_rate": 5.755e-06, "loss": 1.7155, "mean_token_accuracy": 0.6579635113477706, "num_tokens": 32275029.0, "step": 8850 }, { "epoch": 68.11538461538461, "grad_norm": 0.0, "learning_rate": 5.73e-06, "loss": 1.7521, "mean_token_accuracy": 0.6502374112606049, "num_tokens": 32292737.0, "step": 8855 }, { "epoch": 68.15384615384616, "grad_norm": 0.0, "learning_rate": 5.705e-06, "loss": 1.7176, "mean_token_accuracy": 0.6563062936067581, "num_tokens": 32311081.0, "step": 8860 }, { "epoch": 68.1923076923077, "grad_norm": 0.0, "learning_rate": 5.680000000000001e-06, "loss": 1.7643, "mean_token_accuracy": 0.6508367985486985, "num_tokens": 32328973.0, "step": 8865 }, { "epoch": 68.23076923076923, "grad_norm": 0.0, "learning_rate": 5.655000000000001e-06, "loss": 1.7741, "mean_token_accuracy": 0.6429946154356003, "num_tokens": 32346455.0, "step": 8870 }, { "epoch": 68.26923076923077, "grad_norm": 0.0, "learning_rate": 5.63e-06, "loss": 1.7832, "mean_token_accuracy": 0.6436864167451859, "num_tokens": 32364893.0, "step": 8875 }, { "epoch": 68.3076923076923, "grad_norm": 0.0, "learning_rate": 5.6050000000000005e-06, "loss": 1.6646, "mean_token_accuracy": 0.66810921728611, "num_tokens": 32383625.0, "step": 8880 }, { "epoch": 68.34615384615384, "grad_norm": 0.0, "learning_rate": 5.580000000000001e-06, "loss": 1.7547, "mean_token_accuracy": 0.6530828505754471, "num_tokens": 32401543.0, "step": 8885 }, { "epoch": 68.38461538461539, "grad_norm": 0.0, "learning_rate": 5.555e-06, "loss": 1.7685, "mean_token_accuracy": 0.6441059678792953, "num_tokens": 32419418.0, "step": 8890 }, { "epoch": 68.42307692307692, "grad_norm": 0.0, "learning_rate": 5.53e-06, "loss": 1.8152, "mean_token_accuracy": 0.6348958969116211, "num_tokens": 32436930.0, "step": 8895 }, { "epoch": 68.46153846153847, "grad_norm": 0.0, "learning_rate": 5.505000000000001e-06, "loss": 1.7356, "mean_token_accuracy": 0.6481510013341903, "num_tokens": 32455486.0, "step": 8900 }, { "epoch": 68.5, "grad_norm": 0.0, "learning_rate": 5.48e-06, "loss": 1.6469, "mean_token_accuracy": 0.6719613999128342, "num_tokens": 32475172.0, "step": 8905 }, { "epoch": 68.53846153846153, "grad_norm": 0.0, "learning_rate": 5.455e-06, "loss": 1.7171, "mean_token_accuracy": 0.6506960332393646, "num_tokens": 32493791.0, "step": 8910 }, { "epoch": 68.57692307692308, "grad_norm": 0.0, "learning_rate": 5.4300000000000005e-06, "loss": 1.7171, "mean_token_accuracy": 0.6513607323169708, "num_tokens": 32512069.0, "step": 8915 }, { "epoch": 68.61538461538461, "grad_norm": 0.0, "learning_rate": 5.405e-06, "loss": 1.6989, "mean_token_accuracy": 0.6555921375751496, "num_tokens": 32530853.0, "step": 8920 }, { "epoch": 68.65384615384616, "grad_norm": 0.0, "learning_rate": 5.38e-06, "loss": 1.7921, "mean_token_accuracy": 0.6404555082321167, "num_tokens": 32547929.0, "step": 8925 }, { "epoch": 68.6923076923077, "grad_norm": 0.0, "learning_rate": 5.355e-06, "loss": 1.7402, "mean_token_accuracy": 0.6483048588037491, "num_tokens": 32566393.0, "step": 8930 }, { "epoch": 68.73076923076923, "grad_norm": 0.0, "learning_rate": 5.330000000000001e-06, "loss": 1.7599, "mean_token_accuracy": 0.6482443422079086, "num_tokens": 32584222.0, "step": 8935 }, { "epoch": 68.76923076923077, "grad_norm": 0.0, "learning_rate": 5.305e-06, "loss": 1.7298, "mean_token_accuracy": 0.6541435539722442, "num_tokens": 32602652.0, "step": 8940 }, { "epoch": 68.8076923076923, "grad_norm": 0.0, "learning_rate": 5.28e-06, "loss": 1.8005, "mean_token_accuracy": 0.6381476581096649, "num_tokens": 32620436.0, "step": 8945 }, { "epoch": 68.84615384615384, "grad_norm": 0.0, "learning_rate": 5.2550000000000005e-06, "loss": 1.7707, "mean_token_accuracy": 0.6429592072963715, "num_tokens": 32638580.0, "step": 8950 }, { "epoch": 68.88461538461539, "grad_norm": 0.0, "learning_rate": 5.23e-06, "loss": 1.7375, "mean_token_accuracy": 0.6491723597049713, "num_tokens": 32657574.0, "step": 8955 }, { "epoch": 68.92307692307692, "grad_norm": 0.0, "learning_rate": 5.205e-06, "loss": 1.6892, "mean_token_accuracy": 0.6607306629419327, "num_tokens": 32676715.0, "step": 8960 }, { "epoch": 68.96153846153847, "grad_norm": 0.0, "learning_rate": 5.18e-06, "loss": 1.7511, "mean_token_accuracy": 0.6513434231281281, "num_tokens": 32694775.0, "step": 8965 }, { "epoch": 69.0, "grad_norm": 0.0, "learning_rate": 5.155e-06, "loss": 1.7027, "mean_token_accuracy": 0.6560029089450836, "num_tokens": 32713245.0, "step": 8970 }, { "epoch": 69.03846153846153, "grad_norm": 0.0, "learning_rate": 5.13e-06, "loss": 1.7155, "mean_token_accuracy": 0.6513414025306702, "num_tokens": 32732025.0, "step": 8975 }, { "epoch": 69.07692307692308, "grad_norm": 0.0, "learning_rate": 5.105e-06, "loss": 1.7302, "mean_token_accuracy": 0.6497060090303421, "num_tokens": 32751198.0, "step": 8980 }, { "epoch": 69.11538461538461, "grad_norm": 0.0, "learning_rate": 5.08e-06, "loss": 1.7842, "mean_token_accuracy": 0.6456437230110168, "num_tokens": 32768638.0, "step": 8985 }, { "epoch": 69.15384615384616, "grad_norm": 0.0, "learning_rate": 5.055e-06, "loss": 1.7089, "mean_token_accuracy": 0.6585129857063293, "num_tokens": 32786796.0, "step": 8990 }, { "epoch": 69.1923076923077, "grad_norm": 0.0, "learning_rate": 5.03e-06, "loss": 1.8112, "mean_token_accuracy": 0.6334796369075775, "num_tokens": 32804348.0, "step": 8995 }, { "epoch": 69.23076923076923, "grad_norm": 0.0, "learning_rate": 5.005e-06, "loss": 1.7595, "mean_token_accuracy": 0.6397767275571823, "num_tokens": 32822708.0, "step": 9000 }, { "epoch": 69.26923076923077, "grad_norm": 0.0, "learning_rate": 4.98e-06, "loss": 1.724, "mean_token_accuracy": 0.6544425517320633, "num_tokens": 32841355.0, "step": 9005 }, { "epoch": 69.3076923076923, "grad_norm": 0.0, "learning_rate": 4.955e-06, "loss": 1.7819, "mean_token_accuracy": 0.6387849062681198, "num_tokens": 32859210.0, "step": 9010 }, { "epoch": 69.34615384615384, "grad_norm": 0.0, "learning_rate": 4.93e-06, "loss": 1.717, "mean_token_accuracy": 0.6556219637393952, "num_tokens": 32877341.0, "step": 9015 }, { "epoch": 69.38461538461539, "grad_norm": 0.0, "learning_rate": 4.9050000000000005e-06, "loss": 1.7134, "mean_token_accuracy": 0.6564109534025192, "num_tokens": 32895282.0, "step": 9020 }, { "epoch": 69.42307692307692, "grad_norm": 0.0, "learning_rate": 4.880000000000001e-06, "loss": 1.7273, "mean_token_accuracy": 0.6536680698394776, "num_tokens": 32912961.0, "step": 9025 }, { "epoch": 69.46153846153847, "grad_norm": 0.0, "learning_rate": 4.855e-06, "loss": 1.7511, "mean_token_accuracy": 0.645021739602089, "num_tokens": 32931470.0, "step": 9030 }, { "epoch": 69.5, "grad_norm": 0.0, "learning_rate": 4.83e-06, "loss": 1.8157, "mean_token_accuracy": 0.6354044616222382, "num_tokens": 32949088.0, "step": 9035 }, { "epoch": 69.53846153846153, "grad_norm": 0.0, "learning_rate": 4.805000000000001e-06, "loss": 1.7142, "mean_token_accuracy": 0.6553671360015869, "num_tokens": 32967668.0, "step": 9040 }, { "epoch": 69.57692307692308, "grad_norm": 0.0, "learning_rate": 4.780000000000001e-06, "loss": 1.7314, "mean_token_accuracy": 0.6554368674755097, "num_tokens": 32985426.0, "step": 9045 }, { "epoch": 69.61538461538461, "grad_norm": 0.0, "learning_rate": 4.755e-06, "loss": 1.7828, "mean_token_accuracy": 0.6435191184282303, "num_tokens": 33003912.0, "step": 9050 }, { "epoch": 69.65384615384616, "grad_norm": 0.0, "learning_rate": 4.7300000000000005e-06, "loss": 1.6889, "mean_token_accuracy": 0.6613710403442383, "num_tokens": 33022739.0, "step": 9055 }, { "epoch": 69.6923076923077, "grad_norm": 0.0, "learning_rate": 4.705000000000001e-06, "loss": 1.7765, "mean_token_accuracy": 0.6455245047807694, "num_tokens": 33040528.0, "step": 9060 }, { "epoch": 69.73076923076923, "grad_norm": 0.0, "learning_rate": 4.68e-06, "loss": 1.734, "mean_token_accuracy": 0.6510042399168015, "num_tokens": 33058790.0, "step": 9065 }, { "epoch": 69.76923076923077, "grad_norm": 0.0, "learning_rate": 4.655e-06, "loss": 1.7801, "mean_token_accuracy": 0.6483240693807601, "num_tokens": 33076529.0, "step": 9070 }, { "epoch": 69.8076923076923, "grad_norm": 0.0, "learning_rate": 4.6300000000000006e-06, "loss": 1.706, "mean_token_accuracy": 0.6595020055770874, "num_tokens": 33095203.0, "step": 9075 }, { "epoch": 69.84615384615384, "grad_norm": 0.0, "learning_rate": 4.605e-06, "loss": 1.7031, "mean_token_accuracy": 0.6580096065998078, "num_tokens": 33113524.0, "step": 9080 }, { "epoch": 69.88461538461539, "grad_norm": 0.0, "learning_rate": 4.58e-06, "loss": 1.7544, "mean_token_accuracy": 0.6489035278558731, "num_tokens": 33131870.0, "step": 9085 }, { "epoch": 69.92307692307692, "grad_norm": 0.0, "learning_rate": 4.5550000000000004e-06, "loss": 1.7172, "mean_token_accuracy": 0.6561585962772369, "num_tokens": 33150722.0, "step": 9090 }, { "epoch": 69.96153846153847, "grad_norm": 0.0, "learning_rate": 4.53e-06, "loss": 1.6798, "mean_token_accuracy": 0.6648322910070419, "num_tokens": 33169488.0, "step": 9095 }, { "epoch": 70.0, "grad_norm": 0.0, "learning_rate": 4.505e-06, "loss": 1.7484, "mean_token_accuracy": 0.6524565041065216, "num_tokens": 33187350.0, "step": 9100 }, { "epoch": 70.03846153846153, "grad_norm": 0.0, "learning_rate": 4.48e-06, "loss": 1.7404, "mean_token_accuracy": 0.6531502634286881, "num_tokens": 33205235.0, "step": 9105 }, { "epoch": 70.07692307692308, "grad_norm": 0.0, "learning_rate": 4.4550000000000005e-06, "loss": 1.6612, "mean_token_accuracy": 0.6699670910835266, "num_tokens": 33224500.0, "step": 9110 }, { "epoch": 70.11538461538461, "grad_norm": 0.0, "learning_rate": 4.43e-06, "loss": 1.7034, "mean_token_accuracy": 0.6562402963638305, "num_tokens": 33243151.0, "step": 9115 }, { "epoch": 70.15384615384616, "grad_norm": 0.0, "learning_rate": 4.405e-06, "loss": 1.7118, "mean_token_accuracy": 0.6556406348943711, "num_tokens": 33261667.0, "step": 9120 }, { "epoch": 70.1923076923077, "grad_norm": 0.0, "learning_rate": 4.38e-06, "loss": 1.7115, "mean_token_accuracy": 0.6550081551074982, "num_tokens": 33280058.0, "step": 9125 }, { "epoch": 70.23076923076923, "grad_norm": 0.0, "learning_rate": 4.355e-06, "loss": 1.6983, "mean_token_accuracy": 0.6546464413404465, "num_tokens": 33299469.0, "step": 9130 }, { "epoch": 70.26923076923077, "grad_norm": 0.0, "learning_rate": 4.33e-06, "loss": 1.7084, "mean_token_accuracy": 0.6555704712867737, "num_tokens": 33317456.0, "step": 9135 }, { "epoch": 70.3076923076923, "grad_norm": 0.0, "learning_rate": 4.305e-06, "loss": 1.7392, "mean_token_accuracy": 0.649817219376564, "num_tokens": 33336143.0, "step": 9140 }, { "epoch": 70.34615384615384, "grad_norm": 0.0, "learning_rate": 4.28e-06, "loss": 1.7933, "mean_token_accuracy": 0.6456594437360763, "num_tokens": 33353477.0, "step": 9145 }, { "epoch": 70.38461538461539, "grad_norm": 0.0, "learning_rate": 4.255e-06, "loss": 1.7628, "mean_token_accuracy": 0.6442059248685836, "num_tokens": 33371827.0, "step": 9150 }, { "epoch": 70.42307692307692, "grad_norm": 0.0, "learning_rate": 4.23e-06, "loss": 1.784, "mean_token_accuracy": 0.6425058543682098, "num_tokens": 33390073.0, "step": 9155 }, { "epoch": 70.46153846153847, "grad_norm": 0.0, "learning_rate": 4.2049999999999996e-06, "loss": 1.8011, "mean_token_accuracy": 0.6376978397369385, "num_tokens": 33408396.0, "step": 9160 }, { "epoch": 70.5, "grad_norm": 0.0, "learning_rate": 4.18e-06, "loss": 1.7991, "mean_token_accuracy": 0.6374391674995422, "num_tokens": 33425771.0, "step": 9165 }, { "epoch": 70.53846153846153, "grad_norm": 0.0, "learning_rate": 4.155e-06, "loss": 1.8244, "mean_token_accuracy": 0.6361591160297394, "num_tokens": 33443218.0, "step": 9170 }, { "epoch": 70.57692307692308, "grad_norm": 0.0, "learning_rate": 4.13e-06, "loss": 1.7396, "mean_token_accuracy": 0.6503497928380966, "num_tokens": 33462100.0, "step": 9175 }, { "epoch": 70.61538461538461, "grad_norm": 0.0, "learning_rate": 4.1050000000000005e-06, "loss": 1.7632, "mean_token_accuracy": 0.6452891588211059, "num_tokens": 33479921.0, "step": 9180 }, { "epoch": 70.65384615384616, "grad_norm": 0.0, "learning_rate": 4.080000000000001e-06, "loss": 1.7462, "mean_token_accuracy": 0.6508474260568619, "num_tokens": 33497623.0, "step": 9185 }, { "epoch": 70.6923076923077, "grad_norm": 0.0, "learning_rate": 4.055e-06, "loss": 1.7899, "mean_token_accuracy": 0.6407651424407959, "num_tokens": 33515029.0, "step": 9190 }, { "epoch": 70.73076923076923, "grad_norm": 0.0, "learning_rate": 4.03e-06, "loss": 1.677, "mean_token_accuracy": 0.6583020657300949, "num_tokens": 33533481.0, "step": 9195 }, { "epoch": 70.76923076923077, "grad_norm": 0.0, "learning_rate": 4.005000000000001e-06, "loss": 1.7266, "mean_token_accuracy": 0.6561136662960052, "num_tokens": 33551333.0, "step": 9200 }, { "epoch": 70.8076923076923, "grad_norm": 0.0, "learning_rate": 3.98e-06, "loss": 1.793, "mean_token_accuracy": 0.6400943785905838, "num_tokens": 33569017.0, "step": 9205 }, { "epoch": 70.84615384615384, "grad_norm": 0.0, "learning_rate": 3.955e-06, "loss": 1.7658, "mean_token_accuracy": 0.6454756885766983, "num_tokens": 33587052.0, "step": 9210 }, { "epoch": 70.88461538461539, "grad_norm": 0.0, "learning_rate": 3.9300000000000005e-06, "loss": 1.7387, "mean_token_accuracy": 0.6554303973913193, "num_tokens": 33604732.0, "step": 9215 }, { "epoch": 70.92307692307692, "grad_norm": 0.0, "learning_rate": 3.905000000000001e-06, "loss": 1.7101, "mean_token_accuracy": 0.6562773436307907, "num_tokens": 33623715.0, "step": 9220 }, { "epoch": 70.96153846153847, "grad_norm": 0.0, "learning_rate": 3.88e-06, "loss": 1.6681, "mean_token_accuracy": 0.6675117045640946, "num_tokens": 33642936.0, "step": 9225 }, { "epoch": 71.0, "grad_norm": 0.0, "learning_rate": 3.855e-06, "loss": 1.7163, "mean_token_accuracy": 0.6542115420103073, "num_tokens": 33661455.0, "step": 9230 }, { "epoch": 71.03846153846153, "grad_norm": 0.0, "learning_rate": 3.830000000000001e-06, "loss": 1.7668, "mean_token_accuracy": 0.6549015700817108, "num_tokens": 33679273.0, "step": 9235 }, { "epoch": 71.07692307692308, "grad_norm": 0.0, "learning_rate": 3.8050000000000004e-06, "loss": 1.7505, "mean_token_accuracy": 0.6491267591714859, "num_tokens": 33697022.0, "step": 9240 }, { "epoch": 71.11538461538461, "grad_norm": 0.0, "learning_rate": 3.7800000000000002e-06, "loss": 1.6273, "mean_token_accuracy": 0.6743998020887375, "num_tokens": 33715961.0, "step": 9245 }, { "epoch": 71.15384615384616, "grad_norm": 0.0, "learning_rate": 3.755e-06, "loss": 1.722, "mean_token_accuracy": 0.657853615283966, "num_tokens": 33734029.0, "step": 9250 }, { "epoch": 71.1923076923077, "grad_norm": 0.0, "learning_rate": 3.7300000000000003e-06, "loss": 1.7182, "mean_token_accuracy": 0.653509646654129, "num_tokens": 33752377.0, "step": 9255 }, { "epoch": 71.23076923076923, "grad_norm": 0.0, "learning_rate": 3.705e-06, "loss": 1.7218, "mean_token_accuracy": 0.6558929115533829, "num_tokens": 33770568.0, "step": 9260 }, { "epoch": 71.26923076923077, "grad_norm": 0.0, "learning_rate": 3.68e-06, "loss": 1.6921, "mean_token_accuracy": 0.6597426235675812, "num_tokens": 33789676.0, "step": 9265 }, { "epoch": 71.3076923076923, "grad_norm": 0.0, "learning_rate": 3.655e-06, "loss": 1.7497, "mean_token_accuracy": 0.6486911535263061, "num_tokens": 33807428.0, "step": 9270 }, { "epoch": 71.34615384615384, "grad_norm": 0.0, "learning_rate": 3.63e-06, "loss": 1.7888, "mean_token_accuracy": 0.6441873848438263, "num_tokens": 33825382.0, "step": 9275 }, { "epoch": 71.38461538461539, "grad_norm": 0.0, "learning_rate": 3.6050000000000002e-06, "loss": 1.7271, "mean_token_accuracy": 0.6498310655355454, "num_tokens": 33844179.0, "step": 9280 }, { "epoch": 71.42307692307692, "grad_norm": 0.0, "learning_rate": 3.58e-06, "loss": 1.7322, "mean_token_accuracy": 0.6521621882915497, "num_tokens": 33862565.0, "step": 9285 }, { "epoch": 71.46153846153847, "grad_norm": 0.0, "learning_rate": 3.555e-06, "loss": 1.7472, "mean_token_accuracy": 0.6527814954519272, "num_tokens": 33881172.0, "step": 9290 }, { "epoch": 71.5, "grad_norm": 0.0, "learning_rate": 3.53e-06, "loss": 1.6943, "mean_token_accuracy": 0.661613667011261, "num_tokens": 33899656.0, "step": 9295 }, { "epoch": 71.53846153846153, "grad_norm": 0.0, "learning_rate": 3.505e-06, "loss": 1.7055, "mean_token_accuracy": 0.647359648346901, "num_tokens": 33918686.0, "step": 9300 }, { "epoch": 71.57692307692308, "grad_norm": 0.0, "learning_rate": 3.4799999999999997e-06, "loss": 1.6996, "mean_token_accuracy": 0.6571532785892487, "num_tokens": 33937081.0, "step": 9305 }, { "epoch": 71.61538461538461, "grad_norm": 0.0, "learning_rate": 3.455e-06, "loss": 1.7333, "mean_token_accuracy": 0.6503713637590408, "num_tokens": 33954828.0, "step": 9310 }, { "epoch": 71.65384615384616, "grad_norm": 0.0, "learning_rate": 3.4299999999999998e-06, "loss": 1.6752, "mean_token_accuracy": 0.6633023738861084, "num_tokens": 33973326.0, "step": 9315 }, { "epoch": 71.6923076923077, "grad_norm": 0.0, "learning_rate": 3.405e-06, "loss": 1.757, "mean_token_accuracy": 0.6501501977443696, "num_tokens": 33991262.0, "step": 9320 }, { "epoch": 71.73076923076923, "grad_norm": 0.0, "learning_rate": 3.38e-06, "loss": 1.6826, "mean_token_accuracy": 0.6632235527038575, "num_tokens": 34010592.0, "step": 9325 }, { "epoch": 71.76923076923077, "grad_norm": 0.0, "learning_rate": 3.3550000000000005e-06, "loss": 1.7825, "mean_token_accuracy": 0.6419746041297912, "num_tokens": 34028492.0, "step": 9330 }, { "epoch": 71.8076923076923, "grad_norm": 0.0, "learning_rate": 3.3300000000000003e-06, "loss": 1.8112, "mean_token_accuracy": 0.6337680786848068, "num_tokens": 34045754.0, "step": 9335 }, { "epoch": 71.84615384615384, "grad_norm": 0.0, "learning_rate": 3.3050000000000005e-06, "loss": 1.7796, "mean_token_accuracy": 0.6412538975477219, "num_tokens": 34063747.0, "step": 9340 }, { "epoch": 71.88461538461539, "grad_norm": 0.0, "learning_rate": 3.2800000000000004e-06, "loss": 1.8211, "mean_token_accuracy": 0.6337336391210556, "num_tokens": 34080973.0, "step": 9345 }, { "epoch": 71.92307692307692, "grad_norm": 0.0, "learning_rate": 3.2550000000000006e-06, "loss": 1.7922, "mean_token_accuracy": 0.6401098728179931, "num_tokens": 34099093.0, "step": 9350 }, { "epoch": 71.96153846153847, "grad_norm": 0.0, "learning_rate": 3.2300000000000004e-06, "loss": 1.8057, "mean_token_accuracy": 0.6371114939451218, "num_tokens": 34117109.0, "step": 9355 }, { "epoch": 72.0, "grad_norm": 0.0, "learning_rate": 3.2050000000000002e-06, "loss": 1.7633, "mean_token_accuracy": 0.6458366394042969, "num_tokens": 34135560.0, "step": 9360 }, { "epoch": 72.03846153846153, "grad_norm": 0.0, "learning_rate": 3.1800000000000005e-06, "loss": 1.7497, "mean_token_accuracy": 0.6472752332687378, "num_tokens": 34153754.0, "step": 9365 }, { "epoch": 72.07692307692308, "grad_norm": 0.0, "learning_rate": 3.1550000000000003e-06, "loss": 1.7499, "mean_token_accuracy": 0.6482025504112243, "num_tokens": 34172147.0, "step": 9370 }, { "epoch": 72.11538461538461, "grad_norm": 0.0, "learning_rate": 3.13e-06, "loss": 1.7281, "mean_token_accuracy": 0.6510056018829345, "num_tokens": 34191430.0, "step": 9375 }, { "epoch": 72.15384615384616, "grad_norm": 0.0, "learning_rate": 3.1050000000000003e-06, "loss": 1.725, "mean_token_accuracy": 0.6562888383865356, "num_tokens": 34209348.0, "step": 9380 }, { "epoch": 72.1923076923077, "grad_norm": 0.0, "learning_rate": 3.08e-06, "loss": 1.6941, "mean_token_accuracy": 0.6613337188959122, "num_tokens": 34227848.0, "step": 9385 }, { "epoch": 72.23076923076923, "grad_norm": 0.0, "learning_rate": 3.0550000000000004e-06, "loss": 1.6769, "mean_token_accuracy": 0.6630650132894516, "num_tokens": 34246818.0, "step": 9390 }, { "epoch": 72.26923076923077, "grad_norm": 0.0, "learning_rate": 3.0300000000000002e-06, "loss": 1.75, "mean_token_accuracy": 0.6472353667020798, "num_tokens": 34264967.0, "step": 9395 }, { "epoch": 72.3076923076923, "grad_norm": 0.0, "learning_rate": 3.005e-06, "loss": 1.7359, "mean_token_accuracy": 0.6492182224988937, "num_tokens": 34283311.0, "step": 9400 }, { "epoch": 72.34615384615384, "grad_norm": 0.0, "learning_rate": 2.9800000000000003e-06, "loss": 1.7515, "mean_token_accuracy": 0.6472180843353271, "num_tokens": 34301877.0, "step": 9405 }, { "epoch": 72.38461538461539, "grad_norm": 0.0, "learning_rate": 2.955e-06, "loss": 1.7565, "mean_token_accuracy": 0.6518394500017166, "num_tokens": 34319958.0, "step": 9410 }, { "epoch": 72.42307692307692, "grad_norm": 0.0, "learning_rate": 2.93e-06, "loss": 1.7295, "mean_token_accuracy": 0.6590041428804397, "num_tokens": 34338437.0, "step": 9415 }, { "epoch": 72.46153846153847, "grad_norm": 0.0, "learning_rate": 2.905e-06, "loss": 1.8063, "mean_token_accuracy": 0.6363181412220001, "num_tokens": 34356326.0, "step": 9420 }, { "epoch": 72.5, "grad_norm": 0.0, "learning_rate": 2.88e-06, "loss": 1.811, "mean_token_accuracy": 0.6379002064466477, "num_tokens": 34373863.0, "step": 9425 }, { "epoch": 72.53846153846153, "grad_norm": 0.0, "learning_rate": 2.855e-06, "loss": 1.7939, "mean_token_accuracy": 0.64038887321949, "num_tokens": 34391352.0, "step": 9430 }, { "epoch": 72.57692307692308, "grad_norm": 0.0, "learning_rate": 2.83e-06, "loss": 1.7532, "mean_token_accuracy": 0.6454617887735367, "num_tokens": 34409347.0, "step": 9435 }, { "epoch": 72.61538461538461, "grad_norm": 0.0, "learning_rate": 2.805e-06, "loss": 1.7184, "mean_token_accuracy": 0.6529299259185791, "num_tokens": 34427248.0, "step": 9440 }, { "epoch": 72.65384615384616, "grad_norm": 0.0, "learning_rate": 2.78e-06, "loss": 1.7713, "mean_token_accuracy": 0.6492305964231491, "num_tokens": 34445017.0, "step": 9445 }, { "epoch": 72.6923076923077, "grad_norm": 0.0, "learning_rate": 2.7550000000000003e-06, "loss": 1.7775, "mean_token_accuracy": 0.6450568825006485, "num_tokens": 34462759.0, "step": 9450 }, { "epoch": 72.73076923076923, "grad_norm": 0.0, "learning_rate": 2.73e-06, "loss": 1.791, "mean_token_accuracy": 0.6397794485092163, "num_tokens": 34480097.0, "step": 9455 }, { "epoch": 72.76923076923077, "grad_norm": 0.0, "learning_rate": 2.7050000000000004e-06, "loss": 1.7267, "mean_token_accuracy": 0.6520882248878479, "num_tokens": 34498104.0, "step": 9460 }, { "epoch": 72.8076923076923, "grad_norm": 0.0, "learning_rate": 2.68e-06, "loss": 1.7094, "mean_token_accuracy": 0.6541548132896423, "num_tokens": 34516905.0, "step": 9465 }, { "epoch": 72.84615384615384, "grad_norm": 0.0, "learning_rate": 2.655e-06, "loss": 1.6942, "mean_token_accuracy": 0.6577578127384186, "num_tokens": 34535704.0, "step": 9470 }, { "epoch": 72.88461538461539, "grad_norm": 0.0, "learning_rate": 2.6300000000000002e-06, "loss": 1.7822, "mean_token_accuracy": 0.6439197540283204, "num_tokens": 34553487.0, "step": 9475 }, { "epoch": 72.92307692307692, "grad_norm": 0.0, "learning_rate": 2.605e-06, "loss": 1.6994, "mean_token_accuracy": 0.6599182695150375, "num_tokens": 34572061.0, "step": 9480 }, { "epoch": 72.96153846153847, "grad_norm": 0.0, "learning_rate": 2.5800000000000003e-06, "loss": 1.6874, "mean_token_accuracy": 0.6625257313251496, "num_tokens": 34590845.0, "step": 9485 }, { "epoch": 73.0, "grad_norm": 0.0, "learning_rate": 2.555e-06, "loss": 1.6979, "mean_token_accuracy": 0.6583450496196747, "num_tokens": 34609665.0, "step": 9490 }, { "epoch": 73.03846153846153, "grad_norm": 0.0, "learning_rate": 2.53e-06, "loss": 1.7997, "mean_token_accuracy": 0.6400972425937652, "num_tokens": 34627692.0, "step": 9495 }, { "epoch": 73.07692307692308, "grad_norm": 0.0, "learning_rate": 2.505e-06, "loss": 1.7601, "mean_token_accuracy": 0.6496747404336929, "num_tokens": 34645368.0, "step": 9500 }, { "epoch": 73.11538461538461, "grad_norm": 0.0, "learning_rate": 2.48e-06, "loss": 1.7188, "mean_token_accuracy": 0.6584489345550537, "num_tokens": 34663574.0, "step": 9505 }, { "epoch": 73.15384615384616, "grad_norm": 0.0, "learning_rate": 2.4550000000000002e-06, "loss": 1.6701, "mean_token_accuracy": 0.664915868639946, "num_tokens": 34682190.0, "step": 9510 }, { "epoch": 73.1923076923077, "grad_norm": 0.0, "learning_rate": 2.43e-06, "loss": 1.718, "mean_token_accuracy": 0.6526754826307297, "num_tokens": 34700323.0, "step": 9515 }, { "epoch": 73.23076923076923, "grad_norm": 0.0, "learning_rate": 2.405e-06, "loss": 1.7598, "mean_token_accuracy": 0.648136830329895, "num_tokens": 34718585.0, "step": 9520 }, { "epoch": 73.26923076923077, "grad_norm": 0.0, "learning_rate": 2.38e-06, "loss": 1.8001, "mean_token_accuracy": 0.6407902717590332, "num_tokens": 34736230.0, "step": 9525 }, { "epoch": 73.3076923076923, "grad_norm": 0.0, "learning_rate": 2.3550000000000003e-06, "loss": 1.7617, "mean_token_accuracy": 0.6459957540035248, "num_tokens": 34754509.0, "step": 9530 }, { "epoch": 73.34615384615384, "grad_norm": 0.0, "learning_rate": 2.33e-06, "loss": 1.6889, "mean_token_accuracy": 0.6600727260112762, "num_tokens": 34772888.0, "step": 9535 }, { "epoch": 73.38461538461539, "grad_norm": 0.0, "learning_rate": 2.3050000000000004e-06, "loss": 1.7754, "mean_token_accuracy": 0.6485264748334885, "num_tokens": 34790801.0, "step": 9540 }, { "epoch": 73.42307692307692, "grad_norm": 0.0, "learning_rate": 2.28e-06, "loss": 1.7553, "mean_token_accuracy": 0.6497362434864045, "num_tokens": 34808604.0, "step": 9545 }, { "epoch": 73.46153846153847, "grad_norm": 0.0, "learning_rate": 2.255e-06, "loss": 1.7909, "mean_token_accuracy": 0.639983183145523, "num_tokens": 34826362.0, "step": 9550 }, { "epoch": 73.5, "grad_norm": 0.0, "learning_rate": 2.2300000000000002e-06, "loss": 1.7264, "mean_token_accuracy": 0.6542590230703353, "num_tokens": 34844689.0, "step": 9555 }, { "epoch": 73.53846153846153, "grad_norm": 0.0, "learning_rate": 2.205e-06, "loss": 1.7352, "mean_token_accuracy": 0.6505734384059906, "num_tokens": 34863598.0, "step": 9560 }, { "epoch": 73.57692307692308, "grad_norm": 0.0, "learning_rate": 2.1800000000000003e-06, "loss": 1.7534, "mean_token_accuracy": 0.644753509759903, "num_tokens": 34881884.0, "step": 9565 }, { "epoch": 73.61538461538461, "grad_norm": 0.0, "learning_rate": 2.155e-06, "loss": 1.7672, "mean_token_accuracy": 0.646284231543541, "num_tokens": 34900082.0, "step": 9570 }, { "epoch": 73.65384615384616, "grad_norm": 0.0, "learning_rate": 2.13e-06, "loss": 1.7086, "mean_token_accuracy": 0.6528146833181381, "num_tokens": 34919228.0, "step": 9575 }, { "epoch": 73.6923076923077, "grad_norm": 0.0, "learning_rate": 2.105e-06, "loss": 1.7914, "mean_token_accuracy": 0.6391745179891586, "num_tokens": 34936386.0, "step": 9580 }, { "epoch": 73.73076923076923, "grad_norm": 0.0, "learning_rate": 2.08e-06, "loss": 1.729, "mean_token_accuracy": 0.6509798347949982, "num_tokens": 34954716.0, "step": 9585 }, { "epoch": 73.76923076923077, "grad_norm": 0.0, "learning_rate": 2.055e-06, "loss": 1.6906, "mean_token_accuracy": 0.6624141722917557, "num_tokens": 34973992.0, "step": 9590 }, { "epoch": 73.8076923076923, "grad_norm": 0.0, "learning_rate": 2.03e-06, "loss": 1.7145, "mean_token_accuracy": 0.6552868157625198, "num_tokens": 34992491.0, "step": 9595 }, { "epoch": 73.84615384615384, "grad_norm": 0.0, "learning_rate": 2.005e-06, "loss": 1.763, "mean_token_accuracy": 0.6461809724569321, "num_tokens": 35010375.0, "step": 9600 }, { "epoch": 73.88461538461539, "grad_norm": 0.0, "learning_rate": 1.98e-06, "loss": 1.7652, "mean_token_accuracy": 0.6461312264204025, "num_tokens": 35028006.0, "step": 9605 }, { "epoch": 73.92307692307692, "grad_norm": 0.0, "learning_rate": 1.9550000000000003e-06, "loss": 1.7037, "mean_token_accuracy": 0.656490358710289, "num_tokens": 35046822.0, "step": 9610 }, { "epoch": 73.96153846153847, "grad_norm": 0.0, "learning_rate": 1.93e-06, "loss": 1.7048, "mean_token_accuracy": 0.6573001593351364, "num_tokens": 35065193.0, "step": 9615 }, { "epoch": 74.0, "grad_norm": 0.0, "learning_rate": 1.9050000000000002e-06, "loss": 1.7038, "mean_token_accuracy": 0.6570831030607224, "num_tokens": 35083770.0, "step": 9620 }, { "epoch": 74.03846153846153, "grad_norm": 0.0, "learning_rate": 1.8800000000000002e-06, "loss": 1.841, "mean_token_accuracy": 0.6277540296316146, "num_tokens": 35101050.0, "step": 9625 }, { "epoch": 74.07692307692308, "grad_norm": 0.0, "learning_rate": 1.8550000000000002e-06, "loss": 1.7552, "mean_token_accuracy": 0.6447629004716873, "num_tokens": 35119255.0, "step": 9630 }, { "epoch": 74.11538461538461, "grad_norm": 0.0, "learning_rate": 1.83e-06, "loss": 1.7651, "mean_token_accuracy": 0.6467320770025253, "num_tokens": 35137450.0, "step": 9635 }, { "epoch": 74.15384615384616, "grad_norm": 0.0, "learning_rate": 1.805e-06, "loss": 1.7862, "mean_token_accuracy": 0.6392817258834839, "num_tokens": 35155463.0, "step": 9640 }, { "epoch": 74.1923076923077, "grad_norm": 0.0, "learning_rate": 1.7800000000000001e-06, "loss": 1.7609, "mean_token_accuracy": 0.6492180019617081, "num_tokens": 35173695.0, "step": 9645 }, { "epoch": 74.23076923076923, "grad_norm": 0.0, "learning_rate": 1.7550000000000001e-06, "loss": 1.7774, "mean_token_accuracy": 0.6455636918544769, "num_tokens": 35191312.0, "step": 9650 }, { "epoch": 74.26923076923077, "grad_norm": 0.0, "learning_rate": 1.73e-06, "loss": 1.7323, "mean_token_accuracy": 0.656083607673645, "num_tokens": 35209464.0, "step": 9655 }, { "epoch": 74.3076923076923, "grad_norm": 0.0, "learning_rate": 1.705e-06, "loss": 1.6952, "mean_token_accuracy": 0.6594295680522919, "num_tokens": 35227948.0, "step": 9660 }, { "epoch": 74.34615384615384, "grad_norm": 0.0, "learning_rate": 1.68e-06, "loss": 1.7291, "mean_token_accuracy": 0.6519060641527176, "num_tokens": 35246383.0, "step": 9665 }, { "epoch": 74.38461538461539, "grad_norm": 0.0, "learning_rate": 1.655e-06, "loss": 1.7533, "mean_token_accuracy": 0.6488317787647248, "num_tokens": 35264529.0, "step": 9670 }, { "epoch": 74.42307692307692, "grad_norm": 0.0, "learning_rate": 1.6299999999999999e-06, "loss": 1.7144, "mean_token_accuracy": 0.6548575252294541, "num_tokens": 35283055.0, "step": 9675 }, { "epoch": 74.46153846153847, "grad_norm": 0.0, "learning_rate": 1.6049999999999999e-06, "loss": 1.7571, "mean_token_accuracy": 0.6488740295171738, "num_tokens": 35301939.0, "step": 9680 }, { "epoch": 74.5, "grad_norm": 0.0, "learning_rate": 1.5800000000000003e-06, "loss": 1.6961, "mean_token_accuracy": 0.6596014022827148, "num_tokens": 35320606.0, "step": 9685 }, { "epoch": 74.53846153846153, "grad_norm": 0.0, "learning_rate": 1.555e-06, "loss": 1.7047, "mean_token_accuracy": 0.6595968753099442, "num_tokens": 35338994.0, "step": 9690 }, { "epoch": 74.57692307692308, "grad_norm": 0.0, "learning_rate": 1.53e-06, "loss": 1.7309, "mean_token_accuracy": 0.6514750987291336, "num_tokens": 35357001.0, "step": 9695 }, { "epoch": 74.61538461538461, "grad_norm": 0.0, "learning_rate": 1.505e-06, "loss": 1.6814, "mean_token_accuracy": 0.6643463402986527, "num_tokens": 35375365.0, "step": 9700 }, { "epoch": 74.65384615384616, "grad_norm": 0.0, "learning_rate": 1.4800000000000002e-06, "loss": 1.7907, "mean_token_accuracy": 0.6419687926769256, "num_tokens": 35393033.0, "step": 9705 }, { "epoch": 74.6923076923077, "grad_norm": 0.0, "learning_rate": 1.455e-06, "loss": 1.7162, "mean_token_accuracy": 0.6562795639038086, "num_tokens": 35411155.0, "step": 9710 }, { "epoch": 74.73076923076923, "grad_norm": 0.0, "learning_rate": 1.43e-06, "loss": 1.7366, "mean_token_accuracy": 0.6549755454063415, "num_tokens": 35429377.0, "step": 9715 }, { "epoch": 74.76923076923077, "grad_norm": 0.0, "learning_rate": 1.405e-06, "loss": 1.6836, "mean_token_accuracy": 0.658492824435234, "num_tokens": 35448518.0, "step": 9720 }, { "epoch": 74.8076923076923, "grad_norm": 0.0, "learning_rate": 1.3800000000000001e-06, "loss": 1.7531, "mean_token_accuracy": 0.6465422600507736, "num_tokens": 35466311.0, "step": 9725 }, { "epoch": 74.84615384615384, "grad_norm": 0.0, "learning_rate": 1.355e-06, "loss": 1.7721, "mean_token_accuracy": 0.6479380965232849, "num_tokens": 35484088.0, "step": 9730 }, { "epoch": 74.88461538461539, "grad_norm": 0.0, "learning_rate": 1.33e-06, "loss": 1.747, "mean_token_accuracy": 0.6452481150627136, "num_tokens": 35502027.0, "step": 9735 }, { "epoch": 74.92307692307692, "grad_norm": 0.0, "learning_rate": 1.3050000000000002e-06, "loss": 1.7069, "mean_token_accuracy": 0.6560095697641373, "num_tokens": 35521179.0, "step": 9740 }, { "epoch": 74.96153846153847, "grad_norm": 0.0, "learning_rate": 1.28e-06, "loss": 1.7289, "mean_token_accuracy": 0.6569119274616242, "num_tokens": 35539684.0, "step": 9745 }, { "epoch": 75.0, "grad_norm": 0.0, "learning_rate": 1.255e-06, "loss": 1.7786, "mean_token_accuracy": 0.6401764988899231, "num_tokens": 35557875.0, "step": 9750 }, { "epoch": 75.03846153846153, "grad_norm": 0.0, "learning_rate": 1.23e-06, "loss": 1.7148, "mean_token_accuracy": 0.6562530130147934, "num_tokens": 35576356.0, "step": 9755 }, { "epoch": 75.07692307692308, "grad_norm": 0.0, "learning_rate": 1.2050000000000001e-06, "loss": 1.6865, "mean_token_accuracy": 0.6659580796957016, "num_tokens": 35594809.0, "step": 9760 }, { "epoch": 75.11538461538461, "grad_norm": 0.0, "learning_rate": 1.18e-06, "loss": 1.7825, "mean_token_accuracy": 0.6393451571464539, "num_tokens": 35612932.0, "step": 9765 }, { "epoch": 75.15384615384616, "grad_norm": 0.0, "learning_rate": 1.155e-06, "loss": 1.8061, "mean_token_accuracy": 0.6380451053380967, "num_tokens": 35630549.0, "step": 9770 }, { "epoch": 75.1923076923077, "grad_norm": 0.0, "learning_rate": 1.13e-06, "loss": 1.8032, "mean_token_accuracy": 0.6359036058187485, "num_tokens": 35648480.0, "step": 9775 }, { "epoch": 75.23076923076923, "grad_norm": 0.0, "learning_rate": 1.1050000000000002e-06, "loss": 1.763, "mean_token_accuracy": 0.6435405910015106, "num_tokens": 35666156.0, "step": 9780 }, { "epoch": 75.26923076923077, "grad_norm": 0.0, "learning_rate": 1.08e-06, "loss": 1.7269, "mean_token_accuracy": 0.6546905755996704, "num_tokens": 35684367.0, "step": 9785 }, { "epoch": 75.3076923076923, "grad_norm": 0.0, "learning_rate": 1.055e-06, "loss": 1.7454, "mean_token_accuracy": 0.6543306648731232, "num_tokens": 35703139.0, "step": 9790 }, { "epoch": 75.34615384615384, "grad_norm": 0.0, "learning_rate": 1.03e-06, "loss": 1.7799, "mean_token_accuracy": 0.6433266043663025, "num_tokens": 35721254.0, "step": 9795 }, { "epoch": 75.38461538461539, "grad_norm": 0.0, "learning_rate": 1.0050000000000001e-06, "loss": 1.7613, "mean_token_accuracy": 0.647157472372055, "num_tokens": 35739283.0, "step": 9800 }, { "epoch": 75.42307692307692, "grad_norm": 0.0, "learning_rate": 9.8e-07, "loss": 1.7482, "mean_token_accuracy": 0.6474371433258057, "num_tokens": 35758040.0, "step": 9805 }, { "epoch": 75.46153846153847, "grad_norm": 0.0, "learning_rate": 9.55e-07, "loss": 1.6272, "mean_token_accuracy": 0.6756436079740524, "num_tokens": 35777534.0, "step": 9810 }, { "epoch": 75.5, "grad_norm": 0.0, "learning_rate": 9.3e-07, "loss": 1.6917, "mean_token_accuracy": 0.6589653193950653, "num_tokens": 35796039.0, "step": 9815 }, { "epoch": 75.53846153846153, "grad_norm": 0.0, "learning_rate": 9.050000000000001e-07, "loss": 1.7085, "mean_token_accuracy": 0.6560279041528702, "num_tokens": 35814453.0, "step": 9820 }, { "epoch": 75.57692307692308, "grad_norm": 0.0, "learning_rate": 8.8e-07, "loss": 1.717, "mean_token_accuracy": 0.6553375959396363, "num_tokens": 35833053.0, "step": 9825 }, { "epoch": 75.61538461538461, "grad_norm": 0.0, "learning_rate": 8.550000000000001e-07, "loss": 1.7881, "mean_token_accuracy": 0.6422079712152481, "num_tokens": 35850712.0, "step": 9830 }, { "epoch": 75.65384615384616, "grad_norm": 0.0, "learning_rate": 8.300000000000001e-07, "loss": 1.7786, "mean_token_accuracy": 0.6427565932273864, "num_tokens": 35868743.0, "step": 9835 }, { "epoch": 75.6923076923077, "grad_norm": 0.0, "learning_rate": 8.05e-07, "loss": 1.7353, "mean_token_accuracy": 0.657978093624115, "num_tokens": 35886399.0, "step": 9840 }, { "epoch": 75.73076923076923, "grad_norm": 0.0, "learning_rate": 7.8e-07, "loss": 1.7547, "mean_token_accuracy": 0.6459228157997131, "num_tokens": 35904125.0, "step": 9845 }, { "epoch": 75.76923076923077, "grad_norm": 0.0, "learning_rate": 7.550000000000001e-07, "loss": 1.7782, "mean_token_accuracy": 0.6488775163888931, "num_tokens": 35921829.0, "step": 9850 }, { "epoch": 75.8076923076923, "grad_norm": 0.0, "learning_rate": 7.3e-07, "loss": 1.8236, "mean_token_accuracy": 0.6332837879657746, "num_tokens": 35939046.0, "step": 9855 }, { "epoch": 75.84615384615384, "grad_norm": 0.0, "learning_rate": 7.05e-07, "loss": 1.6545, "mean_token_accuracy": 0.6667064487934112, "num_tokens": 35957880.0, "step": 9860 }, { "epoch": 75.88461538461539, "grad_norm": 0.0, "learning_rate": 6.8e-07, "loss": 1.7049, "mean_token_accuracy": 0.6505464226007461, "num_tokens": 35977218.0, "step": 9865 }, { "epoch": 75.92307692307692, "grad_norm": 0.0, "learning_rate": 6.550000000000001e-07, "loss": 1.6894, "mean_token_accuracy": 0.6595763146877289, "num_tokens": 35996459.0, "step": 9870 }, { "epoch": 75.96153846153847, "grad_norm": 0.0, "learning_rate": 6.3e-07, "loss": 1.7751, "mean_token_accuracy": 0.6440996766090393, "num_tokens": 36013795.0, "step": 9875 }, { "epoch": 76.0, "grad_norm": 0.0, "learning_rate": 6.05e-07, "loss": 1.719, "mean_token_accuracy": 0.6536471992731094, "num_tokens": 36031980.0, "step": 9880 }, { "epoch": 76.03846153846153, "grad_norm": 0.0, "learning_rate": 5.8e-07, "loss": 1.7253, "mean_token_accuracy": 0.6511723041534424, "num_tokens": 36050841.0, "step": 9885 }, { "epoch": 76.07692307692308, "grad_norm": 0.0, "learning_rate": 5.550000000000001e-07, "loss": 1.7605, "mean_token_accuracy": 0.6479182183742523, "num_tokens": 36069596.0, "step": 9890 }, { "epoch": 76.11538461538461, "grad_norm": 0.0, "learning_rate": 5.3e-07, "loss": 1.8029, "mean_token_accuracy": 0.6407642930746078, "num_tokens": 36087230.0, "step": 9895 }, { "epoch": 76.15384615384616, "grad_norm": 0.0, "learning_rate": 5.05e-07, "loss": 1.7748, "mean_token_accuracy": 0.6411665916442871, "num_tokens": 36105433.0, "step": 9900 }, { "epoch": 76.1923076923077, "grad_norm": 0.0, "learning_rate": 4.8e-07, "loss": 1.7081, "mean_token_accuracy": 0.658719289302826, "num_tokens": 36123588.0, "step": 9905 }, { "epoch": 76.23076923076923, "grad_norm": 0.0, "learning_rate": 4.5500000000000004e-07, "loss": 1.6927, "mean_token_accuracy": 0.6572551965713501, "num_tokens": 36141761.0, "step": 9910 }, { "epoch": 76.26923076923077, "grad_norm": 0.0, "learning_rate": 4.3e-07, "loss": 1.7517, "mean_token_accuracy": 0.6524038553237915, "num_tokens": 36159646.0, "step": 9915 }, { "epoch": 76.3076923076923, "grad_norm": 0.0, "learning_rate": 4.05e-07, "loss": 1.8024, "mean_token_accuracy": 0.6330039739608765, "num_tokens": 36177566.0, "step": 9920 }, { "epoch": 76.34615384615384, "grad_norm": 0.0, "learning_rate": 3.8e-07, "loss": 1.7486, "mean_token_accuracy": 0.6493139892816544, "num_tokens": 36195783.0, "step": 9925 }, { "epoch": 76.38461538461539, "grad_norm": 0.0, "learning_rate": 3.5500000000000004e-07, "loss": 1.7019, "mean_token_accuracy": 0.6548255890607834, "num_tokens": 36214515.0, "step": 9930 }, { "epoch": 76.42307692307692, "grad_norm": 0.0, "learning_rate": 3.3e-07, "loss": 1.7909, "mean_token_accuracy": 0.6404381603002548, "num_tokens": 36231874.0, "step": 9935 }, { "epoch": 76.46153846153847, "grad_norm": 0.0, "learning_rate": 3.0500000000000004e-07, "loss": 1.7498, "mean_token_accuracy": 0.6454123288393021, "num_tokens": 36250099.0, "step": 9940 }, { "epoch": 76.5, "grad_norm": 0.0, "learning_rate": 2.8e-07, "loss": 1.7129, "mean_token_accuracy": 0.6584518551826477, "num_tokens": 36268558.0, "step": 9945 }, { "epoch": 76.53846153846153, "grad_norm": 0.0, "learning_rate": 2.5500000000000005e-07, "loss": 1.7443, "mean_token_accuracy": 0.6507739454507828, "num_tokens": 36286936.0, "step": 9950 }, { "epoch": 76.57692307692308, "grad_norm": 0.0, "learning_rate": 2.3e-07, "loss": 1.6551, "mean_token_accuracy": 0.6698577493429184, "num_tokens": 36305937.0, "step": 9955 }, { "epoch": 76.61538461538461, "grad_norm": 0.0, "learning_rate": 2.0500000000000002e-07, "loss": 1.7305, "mean_token_accuracy": 0.6495934307575226, "num_tokens": 36324168.0, "step": 9960 }, { "epoch": 76.65384615384616, "grad_norm": 0.0, "learning_rate": 1.8e-07, "loss": 1.7612, "mean_token_accuracy": 0.6480752646923065, "num_tokens": 36342496.0, "step": 9965 }, { "epoch": 76.6923076923077, "grad_norm": 0.0, "learning_rate": 1.55e-07, "loss": 1.6781, "mean_token_accuracy": 0.6652602583169938, "num_tokens": 36360871.0, "step": 9970 }, { "epoch": 76.73076923076923, "grad_norm": 0.0, "learning_rate": 1.3e-07, "loss": 1.7443, "mean_token_accuracy": 0.6487196087837219, "num_tokens": 36379371.0, "step": 9975 }, { "epoch": 76.76923076923077, "grad_norm": 0.0, "learning_rate": 1.05e-07, "loss": 1.737, "mean_token_accuracy": 0.6534163296222687, "num_tokens": 36398038.0, "step": 9980 }, { "epoch": 76.8076923076923, "grad_norm": 0.0, "learning_rate": 8e-08, "loss": 1.7521, "mean_token_accuracy": 0.6514124810695648, "num_tokens": 36415958.0, "step": 9985 }, { "epoch": 76.84615384615384, "grad_norm": 0.0, "learning_rate": 5.5e-08, "loss": 1.7213, "mean_token_accuracy": 0.6536638140678406, "num_tokens": 36434047.0, "step": 9990 }, { "epoch": 76.88461538461539, "grad_norm": 0.0, "learning_rate": 3e-08, "loss": 1.7509, "mean_token_accuracy": 0.6517219811677932, "num_tokens": 36452055.0, "step": 9995 }, { "epoch": 76.92307692307692, "grad_norm": 0.0, "learning_rate": 5e-09, "loss": 1.6948, "mean_token_accuracy": 0.6620088309049607, "num_tokens": 36470917.0, "step": 10000 } ], "logging_steps": 5, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 77, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8493505729091994e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }