{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 25.375, "learning_rate": 5.333333333333335e-07, "loss": 1.7543, "mean_token_accuracy": 0.7542933464050293, "num_input_tokens_seen": 3295, "num_tokens": 3295.0, "step": 5, "train_runtime": 15.7246, "train_tokens_per_second": 209.544 }, { "epoch": 0.002, "grad_norm": 20.25, "learning_rate": 1.2000000000000002e-06, "loss": 0.8715, "mean_token_accuracy": 0.8657029747962952, "num_input_tokens_seen": 7420, "num_tokens": 7420.0, "step": 10, "train_runtime": 17.2463, "train_tokens_per_second": 430.238 }, { "epoch": 0.003, "grad_norm": 44.25, "learning_rate": 1.8666666666666669e-06, "loss": 1.9388, "mean_token_accuracy": 0.7570277929306031, "num_input_tokens_seen": 11308, "num_tokens": 11308.0, "step": 15, "train_runtime": 18.7127, "train_tokens_per_second": 604.295 }, { "epoch": 0.004, "grad_norm": 12.875, "learning_rate": 2.5333333333333338e-06, "loss": 0.9017, "mean_token_accuracy": 0.8499565124511719, "num_input_tokens_seen": 15758, "num_tokens": 15758.0, "step": 20, "train_runtime": 20.1095, "train_tokens_per_second": 783.61 }, { "epoch": 0.005, "grad_norm": 8.125, "learning_rate": 3.2000000000000003e-06, "loss": 0.9578, "mean_token_accuracy": 0.8539481401443482, "num_input_tokens_seen": 20595, "num_tokens": 20595.0, "step": 25, "train_runtime": 21.6458, "train_tokens_per_second": 951.457 }, { "epoch": 0.006, "grad_norm": 16.625, "learning_rate": 3.866666666666667e-06, "loss": 1.1091, "mean_token_accuracy": 0.8002119779586792, "num_input_tokens_seen": 23842, "num_tokens": 23842.0, "step": 30, "train_runtime": 22.9953, "train_tokens_per_second": 1036.82 }, { "epoch": 0.007, "grad_norm": 114.0, "learning_rate": 4.533333333333334e-06, "loss": 0.8445, "mean_token_accuracy": 0.8750168204307556, "num_input_tokens_seen": 27533, "num_tokens": 27533.0, "step": 35, "train_runtime": 24.4948, "train_tokens_per_second": 1124.033 }, { "epoch": 0.008, "grad_norm": 8.0625, "learning_rate": 5.2e-06, "loss": 0.6957, "mean_token_accuracy": 0.8931362152099609, "num_input_tokens_seen": 30394, "num_tokens": 30394.0, "step": 40, "train_runtime": 25.8068, "train_tokens_per_second": 1177.751 }, { "epoch": 0.009, "grad_norm": 11.5625, "learning_rate": 5.8666666666666675e-06, "loss": 0.6756, "mean_token_accuracy": 0.8714292883872986, "num_input_tokens_seen": 33453, "num_tokens": 33453.0, "step": 45, "train_runtime": 27.1348, "train_tokens_per_second": 1232.843 }, { "epoch": 0.01, "grad_norm": 9.5, "learning_rate": 6.533333333333334e-06, "loss": 0.5579, "mean_token_accuracy": 0.8697065591812134, "num_input_tokens_seen": 36334, "num_tokens": 36334.0, "step": 50, "train_runtime": 28.4259, "train_tokens_per_second": 1278.201 }, { "epoch": 0.011, "grad_norm": 50.0, "learning_rate": 7.2000000000000005e-06, "loss": 0.4888, "mean_token_accuracy": 0.9049678444862366, "num_input_tokens_seen": 39190, "num_tokens": 39190.0, "step": 55, "train_runtime": 29.7597, "train_tokens_per_second": 1316.881 }, { "epoch": 0.012, "grad_norm": 16.375, "learning_rate": 7.866666666666667e-06, "loss": 0.6571, "mean_token_accuracy": 0.8493815183639526, "num_input_tokens_seen": 42770, "num_tokens": 42770.0, "step": 60, "train_runtime": 31.1995, "train_tokens_per_second": 1370.856 }, { "epoch": 0.013, "grad_norm": 4.15625, "learning_rate": 8.533333333333335e-06, "loss": 0.3241, "mean_token_accuracy": 0.9125397205352783, "num_input_tokens_seen": 48886, "num_tokens": 48886.0, "step": 65, "train_runtime": 32.7357, "train_tokens_per_second": 1493.354 }, { "epoch": 0.014, "grad_norm": 7.8125, "learning_rate": 9.200000000000002e-06, "loss": 0.5143, "mean_token_accuracy": 0.8600565195083618, "num_input_tokens_seen": 52404, "num_tokens": 52404.0, "step": 70, "train_runtime": 34.0706, "train_tokens_per_second": 1538.098 }, { "epoch": 0.015, "grad_norm": 5.9375, "learning_rate": 9.866666666666668e-06, "loss": 0.3192, "mean_token_accuracy": 0.9033533334732056, "num_input_tokens_seen": 57185, "num_tokens": 57185.0, "step": 75, "train_runtime": 35.4749, "train_tokens_per_second": 1611.983 }, { "epoch": 0.016, "grad_norm": 13.5, "learning_rate": 1.0533333333333333e-05, "loss": 0.3371, "mean_token_accuracy": 0.9086463451385498, "num_input_tokens_seen": 62052, "num_tokens": 62052.0, "step": 80, "train_runtime": 36.9279, "train_tokens_per_second": 1680.354 }, { "epoch": 0.017, "grad_norm": 5.28125, "learning_rate": 1.1200000000000001e-05, "loss": 0.4086, "mean_token_accuracy": 0.8853184342384338, "num_input_tokens_seen": 66463, "num_tokens": 66463.0, "step": 85, "train_runtime": 38.3926, "train_tokens_per_second": 1731.142 }, { "epoch": 0.018, "grad_norm": 10.0625, "learning_rate": 1.186666666666667e-05, "loss": 0.3525, "mean_token_accuracy": 0.8978313446044922, "num_input_tokens_seen": 69582, "num_tokens": 69582.0, "step": 90, "train_runtime": 39.7431, "train_tokens_per_second": 1750.794 }, { "epoch": 0.019, "grad_norm": 7.5, "learning_rate": 1.2533333333333336e-05, "loss": 0.3845, "mean_token_accuracy": 0.8846698999404907, "num_input_tokens_seen": 73240, "num_tokens": 73240.0, "step": 95, "train_runtime": 41.1184, "train_tokens_per_second": 1781.199 }, { "epoch": 0.02, "grad_norm": 10.5, "learning_rate": 1.3200000000000002e-05, "loss": 0.3562, "mean_token_accuracy": 0.8983347058296204, "num_input_tokens_seen": 77190, "num_tokens": 77190.0, "step": 100, "train_runtime": 42.5405, "train_tokens_per_second": 1814.506 }, { "epoch": 0.021, "grad_norm": 9.75, "learning_rate": 1.3866666666666669e-05, "loss": 0.5241, "mean_token_accuracy": 0.8488840222358703, "num_input_tokens_seen": 81391, "num_tokens": 81391.0, "step": 105, "train_runtime": 43.9757, "train_tokens_per_second": 1850.816 }, { "epoch": 0.022, "grad_norm": 46.0, "learning_rate": 1.4533333333333335e-05, "loss": 0.3992, "mean_token_accuracy": 0.8728883624076843, "num_input_tokens_seen": 85300, "num_tokens": 85300.0, "step": 110, "train_runtime": 45.2852, "train_tokens_per_second": 1883.617 }, { "epoch": 0.023, "grad_norm": 11.1875, "learning_rate": 1.5200000000000002e-05, "loss": 0.3643, "mean_token_accuracy": 0.8838326215744019, "num_input_tokens_seen": 87883, "num_tokens": 87883.0, "step": 115, "train_runtime": 46.5808, "train_tokens_per_second": 1886.68 }, { "epoch": 0.024, "grad_norm": 8.375, "learning_rate": 1.586666666666667e-05, "loss": 0.2374, "mean_token_accuracy": 0.9314336180686951, "num_input_tokens_seen": 92590, "num_tokens": 92590.0, "step": 120, "train_runtime": 48.0857, "train_tokens_per_second": 1925.519 }, { "epoch": 0.025, "grad_norm": 10.0, "learning_rate": 1.6533333333333333e-05, "loss": 0.3393, "mean_token_accuracy": 0.891490924358368, "num_input_tokens_seen": 95729, "num_tokens": 95729.0, "step": 125, "train_runtime": 49.4663, "train_tokens_per_second": 1935.238 }, { "epoch": 0.026, "grad_norm": 5.09375, "learning_rate": 1.72e-05, "loss": 0.2418, "mean_token_accuracy": 0.9252637505531311, "num_input_tokens_seen": 99994, "num_tokens": 99994.0, "step": 130, "train_runtime": 50.8605, "train_tokens_per_second": 1966.043 }, { "epoch": 0.027, "grad_norm": 4.6875, "learning_rate": 1.7866666666666666e-05, "loss": 0.3039, "mean_token_accuracy": 0.9059112310409546, "num_input_tokens_seen": 105164, "num_tokens": 105164.0, "step": 135, "train_runtime": 52.263, "train_tokens_per_second": 2012.208 }, { "epoch": 0.028, "grad_norm": 10.0625, "learning_rate": 1.8533333333333334e-05, "loss": 0.4135, "mean_token_accuracy": 0.8822806358337403, "num_input_tokens_seen": 108242, "num_tokens": 108242.0, "step": 140, "train_runtime": 53.5331, "train_tokens_per_second": 2021.963 }, { "epoch": 0.029, "grad_norm": 13.0, "learning_rate": 1.9200000000000003e-05, "loss": 0.4921, "mean_token_accuracy": 0.8458282351493835, "num_input_tokens_seen": 111663, "num_tokens": 111663.0, "step": 145, "train_runtime": 54.8623, "train_tokens_per_second": 2035.334 }, { "epoch": 0.03, "grad_norm": 6.90625, "learning_rate": 1.9866666666666667e-05, "loss": 0.3689, "mean_token_accuracy": 0.887873649597168, "num_input_tokens_seen": 114618, "num_tokens": 114618.0, "step": 150, "train_runtime": 56.1425, "train_tokens_per_second": 2041.556 }, { "epoch": 0.031, "grad_norm": 8.0, "learning_rate": 1.999996643350365e-05, "loss": 0.3827, "mean_token_accuracy": 0.8870453238487244, "num_input_tokens_seen": 117957, "num_tokens": 117957.0, "step": 155, "train_runtime": 57.4334, "train_tokens_per_second": 2053.806 }, { "epoch": 0.032, "grad_norm": 6.9375, "learning_rate": 1.999983006999844e-05, "loss": 0.2918, "mean_token_accuracy": 0.9162590026855468, "num_input_tokens_seen": 121848, "num_tokens": 121848.0, "step": 160, "train_runtime": 58.8328, "train_tokens_per_second": 2071.089 }, { "epoch": 0.033, "grad_norm": 12.6875, "learning_rate": 1.999958881300763e-05, "loss": 0.308, "mean_token_accuracy": 0.9007701873779297, "num_input_tokens_seen": 126237, "num_tokens": 126237.0, "step": 165, "train_runtime": 60.2676, "train_tokens_per_second": 2094.606 }, { "epoch": 0.034, "grad_norm": 10.875, "learning_rate": 1.99992426650619e-05, "loss": 0.3323, "mean_token_accuracy": 0.8900646448135376, "num_input_tokens_seen": 130020, "num_tokens": 130020.0, "step": 170, "train_runtime": 61.6395, "train_tokens_per_second": 2109.363 }, { "epoch": 0.035, "grad_norm": 8.375, "learning_rate": 1.9998791629792172e-05, "loss": 0.2782, "mean_token_accuracy": 0.9175599813461304, "num_input_tokens_seen": 133464, "num_tokens": 133464.0, "step": 175, "train_runtime": 62.9902, "train_tokens_per_second": 2118.805 }, { "epoch": 0.036, "grad_norm": 5.0, "learning_rate": 1.9998235711929593e-05, "loss": 0.3431, "mean_token_accuracy": 0.9037263512611389, "num_input_tokens_seen": 137588, "num_tokens": 137588.0, "step": 180, "train_runtime": 64.3511, "train_tokens_per_second": 2138.083 }, { "epoch": 0.037, "grad_norm": 13.1875, "learning_rate": 1.999757491730548e-05, "loss": 0.3999, "mean_token_accuracy": 0.8687745571136475, "num_input_tokens_seen": 141166, "num_tokens": 141166.0, "step": 185, "train_runtime": 65.7008, "train_tokens_per_second": 2148.619 }, { "epoch": 0.038, "grad_norm": 7.40625, "learning_rate": 1.9996809252851254e-05, "loss": 0.3465, "mean_token_accuracy": 0.8930590510368347, "num_input_tokens_seen": 144412, "num_tokens": 144412.0, "step": 190, "train_runtime": 66.9809, "train_tokens_per_second": 2156.018 }, { "epoch": 0.039, "grad_norm": 5.0, "learning_rate": 1.9995938726598374e-05, "loss": 0.3406, "mean_token_accuracy": 0.8985598683357239, "num_input_tokens_seen": 149897, "num_tokens": 149897.0, "step": 195, "train_runtime": 68.3906, "train_tokens_per_second": 2191.778 }, { "epoch": 0.04, "grad_norm": 5.59375, "learning_rate": 1.999496334767825e-05, "loss": 0.335, "mean_token_accuracy": 0.8968366861343384, "num_input_tokens_seen": 154122, "num_tokens": 154122.0, "step": 200, "train_runtime": 69.76, "train_tokens_per_second": 2209.318 }, { "epoch": 0.041, "grad_norm": 9.8125, "learning_rate": 1.9993883126322142e-05, "loss": 0.3496, "mean_token_accuracy": 0.8911818385124206, "num_input_tokens_seen": 159238, "num_tokens": 159238.0, "step": 205, "train_runtime": 71.3056, "train_tokens_per_second": 2233.178 }, { "epoch": 0.042, "grad_norm": 4.625, "learning_rate": 1.9992698073861067e-05, "loss": 0.2695, "mean_token_accuracy": 0.9158300042152405, "num_input_tokens_seen": 162204, "num_tokens": 162204.0, "step": 210, "train_runtime": 72.6133, "train_tokens_per_second": 2233.807 }, { "epoch": 0.043, "grad_norm": 15.3125, "learning_rate": 1.999140820272566e-05, "loss": 0.3636, "mean_token_accuracy": 0.8851131916046142, "num_input_tokens_seen": 165894, "num_tokens": 165894.0, "step": 215, "train_runtime": 73.9906, "train_tokens_per_second": 2242.097 }, { "epoch": 0.044, "grad_norm": 4.375, "learning_rate": 1.9990013526446056e-05, "loss": 0.2848, "mean_token_accuracy": 0.9103469967842102, "num_input_tokens_seen": 169700, "num_tokens": 169700.0, "step": 220, "train_runtime": 75.3519, "train_tokens_per_second": 2252.1 }, { "epoch": 0.045, "grad_norm": 11.4375, "learning_rate": 1.998851405965175e-05, "loss": 0.4081, "mean_token_accuracy": 0.8818454146385193, "num_input_tokens_seen": 173072, "num_tokens": 173072.0, "step": 225, "train_runtime": 76.7164, "train_tokens_per_second": 2255.998 }, { "epoch": 0.046, "grad_norm": 4.125, "learning_rate": 1.998690981807145e-05, "loss": 0.2645, "mean_token_accuracy": 0.9165312886238098, "num_input_tokens_seen": 178966, "num_tokens": 178966.0, "step": 230, "train_runtime": 78.2085, "train_tokens_per_second": 2288.32 }, { "epoch": 0.047, "grad_norm": 6.46875, "learning_rate": 1.9985200818532873e-05, "loss": 0.4496, "mean_token_accuracy": 0.8624901533126831, "num_input_tokens_seen": 183255, "num_tokens": 183255.0, "step": 235, "train_runtime": 79.6204, "train_tokens_per_second": 2301.61 }, { "epoch": 0.048, "grad_norm": 9.4375, "learning_rate": 1.9983387078962634e-05, "loss": 0.4141, "mean_token_accuracy": 0.8800104975700378, "num_input_tokens_seen": 186270, "num_tokens": 186270.0, "step": 240, "train_runtime": 80.9126, "train_tokens_per_second": 2302.115 }, { "epoch": 0.049, "grad_norm": 3.921875, "learning_rate": 1.998146861838599e-05, "loss": 0.2342, "mean_token_accuracy": 0.925393033027649, "num_input_tokens_seen": 190740, "num_tokens": 190740.0, "step": 245, "train_runtime": 82.2934, "train_tokens_per_second": 2317.805 }, { "epoch": 0.05, "grad_norm": 7.75, "learning_rate": 1.997944545692669e-05, "loss": 0.3031, "mean_token_accuracy": 0.9109432339668274, "num_input_tokens_seen": 194972, "num_tokens": 194972.0, "step": 250, "train_runtime": 83.7382, "train_tokens_per_second": 2328.353 }, { "epoch": 0.051, "grad_norm": 12.8125, "learning_rate": 1.9977317615806738e-05, "loss": 0.5655, "mean_token_accuracy": 0.8443936705589294, "num_input_tokens_seen": 199605, "num_tokens": 199605.0, "step": 255, "train_runtime": 85.1787, "train_tokens_per_second": 2343.368 }, { "epoch": 0.052, "grad_norm": 12.6875, "learning_rate": 1.997508511734618e-05, "loss": 0.4446, "mean_token_accuracy": 0.8792689681053162, "num_input_tokens_seen": 203674, "num_tokens": 203674.0, "step": 260, "train_runtime": 86.5729, "train_tokens_per_second": 2352.63 }, { "epoch": 0.053, "grad_norm": 6.375, "learning_rate": 1.997274798496287e-05, "loss": 0.335, "mean_token_accuracy": 0.8953316807746887, "num_input_tokens_seen": 207127, "num_tokens": 207127.0, "step": 265, "train_runtime": 87.9406, "train_tokens_per_second": 2355.306 }, { "epoch": 0.054, "grad_norm": 40.0, "learning_rate": 1.9970306243172223e-05, "loss": 0.3407, "mean_token_accuracy": 0.9025797843933105, "num_input_tokens_seen": 211118, "num_tokens": 211118.0, "step": 270, "train_runtime": 89.3109, "train_tokens_per_second": 2363.855 }, { "epoch": 0.055, "grad_norm": 12.125, "learning_rate": 1.9967759917586953e-05, "loss": 0.4502, "mean_token_accuracy": 0.8784193158149719, "num_input_tokens_seen": 215832, "num_tokens": 215832.0, "step": 275, "train_runtime": 90.7625, "train_tokens_per_second": 2377.987 }, { "epoch": 0.056, "grad_norm": 12.0625, "learning_rate": 1.9965109034916806e-05, "loss": 0.4911, "mean_token_accuracy": 0.857312798500061, "num_input_tokens_seen": 219524, "num_tokens": 219524.0, "step": 280, "train_runtime": 92.1769, "train_tokens_per_second": 2381.55 }, { "epoch": 0.057, "grad_norm": 9.9375, "learning_rate": 1.9962353622968296e-05, "loss": 0.4858, "mean_token_accuracy": 0.8528090715408325, "num_input_tokens_seen": 223340, "num_tokens": 223340.0, "step": 285, "train_runtime": 93.5608, "train_tokens_per_second": 2387.111 }, { "epoch": 0.058, "grad_norm": 7.25, "learning_rate": 1.9959493710644385e-05, "loss": 0.3891, "mean_token_accuracy": 0.8807625412940979, "num_input_tokens_seen": 227278, "num_tokens": 227278.0, "step": 290, "train_runtime": 94.9797, "train_tokens_per_second": 2392.911 }, { "epoch": 0.059, "grad_norm": 13.75, "learning_rate": 1.9956529327944198e-05, "loss": 0.337, "mean_token_accuracy": 0.8912041544914245, "num_input_tokens_seen": 233135, "num_tokens": 233135.0, "step": 295, "train_runtime": 96.4603, "train_tokens_per_second": 2416.9 }, { "epoch": 0.06, "grad_norm": 7.78125, "learning_rate": 1.995346050596271e-05, "loss": 0.3817, "mean_token_accuracy": 0.8886746883392334, "num_input_tokens_seen": 235828, "num_tokens": 235828.0, "step": 300, "train_runtime": 97.7786, "train_tokens_per_second": 2411.856 }, { "epoch": 0.061, "grad_norm": 4.53125, "learning_rate": 1.995028727689041e-05, "loss": 0.2725, "mean_token_accuracy": 0.9170053720474243, "num_input_tokens_seen": 238638, "num_tokens": 238638.0, "step": 305, "train_runtime": 99.0976, "train_tokens_per_second": 2408.11 }, { "epoch": 0.062, "grad_norm": 4.78125, "learning_rate": 1.9947009674012975e-05, "loss": 0.3301, "mean_token_accuracy": 0.895681357383728, "num_input_tokens_seen": 242172, "num_tokens": 242172.0, "step": 310, "train_runtime": 100.4308, "train_tokens_per_second": 2411.333 }, { "epoch": 0.063, "grad_norm": 3.546875, "learning_rate": 1.9943627731710896e-05, "loss": 0.2324, "mean_token_accuracy": 0.9405329465866089, "num_input_tokens_seen": 245827, "num_tokens": 245827.0, "step": 315, "train_runtime": 101.7954, "train_tokens_per_second": 2414.913 }, { "epoch": 0.064, "grad_norm": 5.78125, "learning_rate": 1.994014148545916e-05, "loss": 0.3654, "mean_token_accuracy": 0.8857975482940674, "num_input_tokens_seen": 250738, "num_tokens": 250738.0, "step": 320, "train_runtime": 103.202, "train_tokens_per_second": 2429.584 }, { "epoch": 0.065, "grad_norm": 14.5625, "learning_rate": 1.9936550971826835e-05, "loss": 0.5518, "mean_token_accuracy": 0.8401648521423339, "num_input_tokens_seen": 253189, "num_tokens": 253189.0, "step": 325, "train_runtime": 104.518, "train_tokens_per_second": 2422.444 }, { "epoch": 0.066, "grad_norm": 20.875, "learning_rate": 1.9932856228476705e-05, "loss": 0.3815, "mean_token_accuracy": 0.8773168802261353, "num_input_tokens_seen": 256678, "num_tokens": 256678.0, "step": 330, "train_runtime": 105.8144, "train_tokens_per_second": 2425.739 }, { "epoch": 0.067, "grad_norm": 9.25, "learning_rate": 1.9929057294164894e-05, "loss": 0.5549, "mean_token_accuracy": 0.848885440826416, "num_input_tokens_seen": 260411, "num_tokens": 260411.0, "step": 335, "train_runtime": 107.2233, "train_tokens_per_second": 2428.68 }, { "epoch": 0.068, "grad_norm": 14.125, "learning_rate": 1.9925154208740412e-05, "loss": 0.503, "mean_token_accuracy": 0.8559433341026306, "num_input_tokens_seen": 264604, "num_tokens": 264604.0, "step": 340, "train_runtime": 108.6163, "train_tokens_per_second": 2436.135 }, { "epoch": 0.069, "grad_norm": 7.96875, "learning_rate": 1.9921147013144782e-05, "loss": 0.3113, "mean_token_accuracy": 0.9154181480407715, "num_input_tokens_seen": 269653, "num_tokens": 269653.0, "step": 345, "train_runtime": 110.0572, "train_tokens_per_second": 2450.117 }, { "epoch": 0.07, "grad_norm": 4.90625, "learning_rate": 1.9917035749411585e-05, "loss": 0.4255, "mean_token_accuracy": 0.8897024989128113, "num_input_tokens_seen": 273940, "num_tokens": 273940.0, "step": 350, "train_runtime": 111.4659, "train_tokens_per_second": 2457.613 }, { "epoch": 0.071, "grad_norm": 20.5, "learning_rate": 1.9912820460666046e-05, "loss": 0.4981, "mean_token_accuracy": 0.8397509932518006, "num_input_tokens_seen": 278078, "num_tokens": 278078.0, "step": 355, "train_runtime": 112.8906, "train_tokens_per_second": 2463.251 }, { "epoch": 0.072, "grad_norm": 4.21875, "learning_rate": 1.9908501191124535e-05, "loss": 0.5018, "mean_token_accuracy": 0.8471596002578735, "num_input_tokens_seen": 281228, "num_tokens": 281228.0, "step": 360, "train_runtime": 114.1892, "train_tokens_per_second": 2462.825 }, { "epoch": 0.073, "grad_norm": 6.46875, "learning_rate": 1.9904077986094153e-05, "loss": 0.4473, "mean_token_accuracy": 0.8641816258430481, "num_input_tokens_seen": 285399, "num_tokens": 285399.0, "step": 365, "train_runtime": 115.5359, "train_tokens_per_second": 2470.22 }, { "epoch": 0.074, "grad_norm": 4.875, "learning_rate": 1.9899550891972224e-05, "loss": 0.3721, "mean_token_accuracy": 0.8825353264808655, "num_input_tokens_seen": 288646, "num_tokens": 288646.0, "step": 370, "train_runtime": 116.8283, "train_tokens_per_second": 2470.686 }, { "epoch": 0.075, "grad_norm": 9.625, "learning_rate": 1.9894919956245825e-05, "loss": 0.4836, "mean_token_accuracy": 0.8586413979530334, "num_input_tokens_seen": 292188, "num_tokens": 292188.0, "step": 375, "train_runtime": 118.2419, "train_tokens_per_second": 2471.104 }, { "epoch": 0.076, "grad_norm": 3.171875, "learning_rate": 1.9890185227491285e-05, "loss": 0.1799, "mean_token_accuracy": 0.9473751902580261, "num_input_tokens_seen": 297820, "num_tokens": 297820.0, "step": 380, "train_runtime": 119.8047, "train_tokens_per_second": 2485.878 }, { "epoch": 0.077, "grad_norm": 7.84375, "learning_rate": 1.988534675537366e-05, "loss": 0.2926, "mean_token_accuracy": 0.90623939037323, "num_input_tokens_seen": 301375, "num_tokens": 301375.0, "step": 385, "train_runtime": 121.1607, "train_tokens_per_second": 2487.398 }, { "epoch": 0.078, "grad_norm": 3.078125, "learning_rate": 1.9880404590646233e-05, "loss": 0.1832, "mean_token_accuracy": 0.9478045225143432, "num_input_tokens_seen": 304810, "num_tokens": 304810.0, "step": 390, "train_runtime": 122.4975, "train_tokens_per_second": 2488.295 }, { "epoch": 0.079, "grad_norm": 9.25, "learning_rate": 1.9875358785149982e-05, "loss": 0.3465, "mean_token_accuracy": 0.9022574305534363, "num_input_tokens_seen": 309719, "num_tokens": 309719.0, "step": 395, "train_runtime": 123.9967, "train_tokens_per_second": 2497.801 }, { "epoch": 0.08, "grad_norm": 2.640625, "learning_rate": 1.9870209391813013e-05, "loss": 0.2646, "mean_token_accuracy": 0.9134648203849792, "num_input_tokens_seen": 313542, "num_tokens": 313542.0, "step": 400, "train_runtime": 125.3495, "train_tokens_per_second": 2501.343 }, { "epoch": 0.081, "grad_norm": 34.5, "learning_rate": 1.9864956464650027e-05, "loss": 0.3524, "mean_token_accuracy": 0.8984482765197754, "num_input_tokens_seen": 318360, "num_tokens": 318360.0, "step": 405, "train_runtime": 126.7593, "train_tokens_per_second": 2511.532 }, { "epoch": 0.082, "grad_norm": 6.15625, "learning_rate": 1.985960005876174e-05, "loss": 0.3668, "mean_token_accuracy": 0.8949682712554932, "num_input_tokens_seen": 322050, "num_tokens": 322050.0, "step": 410, "train_runtime": 128.1602, "train_tokens_per_second": 2512.871 }, { "epoch": 0.083, "grad_norm": 20.375, "learning_rate": 1.9854140230334323e-05, "loss": 0.5019, "mean_token_accuracy": 0.8527088522911072, "num_input_tokens_seen": 324703, "num_tokens": 324703.0, "step": 415, "train_runtime": 129.4973, "train_tokens_per_second": 2507.412 }, { "epoch": 0.084, "grad_norm": 4.96875, "learning_rate": 1.984857703663879e-05, "loss": 0.304, "mean_token_accuracy": 0.9074190378189086, "num_input_tokens_seen": 328574, "num_tokens": 328574.0, "step": 420, "train_runtime": 131.1838, "train_tokens_per_second": 2504.684 }, { "epoch": 0.085, "grad_norm": 9.625, "learning_rate": 1.98429105360304e-05, "loss": 0.3258, "mean_token_accuracy": 0.8995503425598145, "num_input_tokens_seen": 332108, "num_tokens": 332108.0, "step": 425, "train_runtime": 132.5117, "train_tokens_per_second": 2506.253 }, { "epoch": 0.086, "grad_norm": 26.875, "learning_rate": 1.9837140787948082e-05, "loss": 0.4189, "mean_token_accuracy": 0.8601342797279358, "num_input_tokens_seen": 335534, "num_tokens": 335534.0, "step": 430, "train_runtime": 133.8334, "train_tokens_per_second": 2507.102 }, { "epoch": 0.087, "grad_norm": 21.875, "learning_rate": 1.983126785291375e-05, "loss": 0.4426, "mean_token_accuracy": 0.8721643924713135, "num_input_tokens_seen": 338656, "num_tokens": 338656.0, "step": 435, "train_runtime": 135.1926, "train_tokens_per_second": 2504.99 }, { "epoch": 0.088, "grad_norm": 7.78125, "learning_rate": 1.9825291792531717e-05, "loss": 0.473, "mean_token_accuracy": 0.8532997250556946, "num_input_tokens_seen": 343170, "num_tokens": 343170.0, "step": 440, "train_runtime": 136.6089, "train_tokens_per_second": 2512.061 }, { "epoch": 0.089, "grad_norm": 9.6875, "learning_rate": 1.9819212669488026e-05, "loss": 0.4124, "mean_token_accuracy": 0.8679243326187134, "num_input_tokens_seen": 348670, "num_tokens": 348670.0, "step": 445, "train_runtime": 138.0927, "train_tokens_per_second": 2524.897 }, { "epoch": 0.09, "grad_norm": 6.96875, "learning_rate": 1.9813030547549806e-05, "loss": 0.3884, "mean_token_accuracy": 0.8822393536567688, "num_input_tokens_seen": 351508, "num_tokens": 351508.0, "step": 450, "train_runtime": 139.4049, "train_tokens_per_second": 2521.489 }, { "epoch": 0.091, "grad_norm": 8.5625, "learning_rate": 1.9806745491564588e-05, "loss": 0.3235, "mean_token_accuracy": 0.8989403724670411, "num_input_tokens_seen": 354670, "num_tokens": 354670.0, "step": 455, "train_runtime": 140.712, "train_tokens_per_second": 2520.538 }, { "epoch": 0.092, "grad_norm": 4.09375, "learning_rate": 1.9800357567459633e-05, "loss": 0.2613, "mean_token_accuracy": 0.9180483818054199, "num_input_tokens_seen": 359858, "num_tokens": 359858.0, "step": 460, "train_runtime": 142.1734, "train_tokens_per_second": 2531.121 }, { "epoch": 0.093, "grad_norm": 13.125, "learning_rate": 1.9793866842241245e-05, "loss": 0.5006, "mean_token_accuracy": 0.8658200621604919, "num_input_tokens_seen": 364739, "num_tokens": 364739.0, "step": 465, "train_runtime": 143.6387, "train_tokens_per_second": 2539.281 }, { "epoch": 0.094, "grad_norm": 7.09375, "learning_rate": 1.978727338399406e-05, "loss": 0.494, "mean_token_accuracy": 0.8560270667076111, "num_input_tokens_seen": 369646, "num_tokens": 369646.0, "step": 470, "train_runtime": 145.0978, "train_tokens_per_second": 2547.564 }, { "epoch": 0.095, "grad_norm": 40.5, "learning_rate": 1.9780577261880336e-05, "loss": 0.4057, "mean_token_accuracy": 0.8605768799781799, "num_input_tokens_seen": 373027, "num_tokens": 373027.0, "step": 475, "train_runtime": 146.3949, "train_tokens_per_second": 2548.087 }, { "epoch": 0.096, "grad_norm": 6.3125, "learning_rate": 1.9773778546139228e-05, "loss": 0.2738, "mean_token_accuracy": 0.909856641292572, "num_input_tokens_seen": 376134, "num_tokens": 376134.0, "step": 480, "train_runtime": 147.753, "train_tokens_per_second": 2545.695 }, { "epoch": 0.097, "grad_norm": 7.21875, "learning_rate": 1.9766877308086038e-05, "loss": 0.4094, "mean_token_accuracy": 0.8764268159866333, "num_input_tokens_seen": 380804, "num_tokens": 380804.0, "step": 485, "train_runtime": 149.1257, "train_tokens_per_second": 2553.577 }, { "epoch": 0.098, "grad_norm": 8.8125, "learning_rate": 1.9759873620111492e-05, "loss": 0.3851, "mean_token_accuracy": 0.8840700626373291, "num_input_tokens_seen": 384926, "num_tokens": 384926.0, "step": 490, "train_runtime": 150.5503, "train_tokens_per_second": 2556.793 }, { "epoch": 0.099, "grad_norm": 13.9375, "learning_rate": 1.9752767555680967e-05, "loss": 0.4115, "mean_token_accuracy": 0.871186351776123, "num_input_tokens_seen": 388601, "num_tokens": 388601.0, "step": 495, "train_runtime": 151.8871, "train_tokens_per_second": 2558.487 }, { "epoch": 0.1, "grad_norm": 7.1875, "learning_rate": 1.974555918933371e-05, "loss": 0.4773, "mean_token_accuracy": 0.8502319931983948, "num_input_tokens_seen": 393118, "num_tokens": 393118.0, "step": 500, "train_runtime": 153.2947, "train_tokens_per_second": 2564.46 }, { "epoch": 0.101, "grad_norm": 11.5, "learning_rate": 1.9738248596682078e-05, "loss": 0.4178, "mean_token_accuracy": 0.8672469973564148, "num_input_tokens_seen": 397536, "num_tokens": 397536.0, "step": 505, "train_runtime": 154.6758, "train_tokens_per_second": 2570.124 }, { "epoch": 0.102, "grad_norm": 6.875, "learning_rate": 1.9730835854410726e-05, "loss": 0.3656, "mean_token_accuracy": 0.8897880911827087, "num_input_tokens_seen": 401812, "num_tokens": 401812.0, "step": 510, "train_runtime": 156.0971, "train_tokens_per_second": 2574.115 }, { "epoch": 0.103, "grad_norm": 3.6875, "learning_rate": 1.9723321040275816e-05, "loss": 0.351, "mean_token_accuracy": 0.8899795532226562, "num_input_tokens_seen": 405267, "num_tokens": 405267.0, "step": 515, "train_runtime": 157.3842, "train_tokens_per_second": 2575.017 }, { "epoch": 0.104, "grad_norm": 4.75, "learning_rate": 1.9715704233104188e-05, "loss": 0.3141, "mean_token_accuracy": 0.905564546585083, "num_input_tokens_seen": 411568, "num_tokens": 411568.0, "step": 520, "train_runtime": 158.9257, "train_tokens_per_second": 2589.689 }, { "epoch": 0.105, "grad_norm": 8.4375, "learning_rate": 1.9707985512792544e-05, "loss": 0.4037, "mean_token_accuracy": 0.8812000393867493, "num_input_tokens_seen": 416511, "num_tokens": 416511.0, "step": 525, "train_runtime": 160.4164, "train_tokens_per_second": 2596.436 }, { "epoch": 0.106, "grad_norm": 5.21875, "learning_rate": 1.9700164960306612e-05, "loss": 0.4526, "mean_token_accuracy": 0.8588366985321045, "num_input_tokens_seen": 419568, "num_tokens": 419568.0, "step": 530, "train_runtime": 161.7555, "train_tokens_per_second": 2593.84 }, { "epoch": 0.107, "grad_norm": 6.90625, "learning_rate": 1.9692242657680286e-05, "loss": 0.5419, "mean_token_accuracy": 0.8513948559761048, "num_input_tokens_seen": 422654, "num_tokens": 422654.0, "step": 535, "train_runtime": 163.0627, "train_tokens_per_second": 2591.972 }, { "epoch": 0.108, "grad_norm": 4.5, "learning_rate": 1.9684218688014773e-05, "loss": 0.3557, "mean_token_accuracy": 0.8887395739555359, "num_input_tokens_seen": 426240, "num_tokens": 426240.0, "step": 540, "train_runtime": 164.3794, "train_tokens_per_second": 2593.026 }, { "epoch": 0.109, "grad_norm": 12.0, "learning_rate": 1.9676093135477713e-05, "loss": 0.4518, "mean_token_accuracy": 0.8580608367919922, "num_input_tokens_seen": 429063, "num_tokens": 429063.0, "step": 545, "train_runtime": 165.6709, "train_tokens_per_second": 2589.851 }, { "epoch": 0.11, "grad_norm": 6.53125, "learning_rate": 1.9667866085302312e-05, "loss": 0.3081, "mean_token_accuracy": 0.9274378895759583, "num_input_tokens_seen": 432208, "num_tokens": 432208.0, "step": 550, "train_runtime": 166.987, "train_tokens_per_second": 2588.274 }, { "epoch": 0.111, "grad_norm": 4.28125, "learning_rate": 1.9659537623786428e-05, "loss": 0.202, "mean_token_accuracy": 0.931344747543335, "num_input_tokens_seen": 437933, "num_tokens": 437933.0, "step": 555, "train_runtime": 168.4489, "train_tokens_per_second": 2599.798 }, { "epoch": 0.112, "grad_norm": 8.75, "learning_rate": 1.965110783829169e-05, "loss": 0.376, "mean_token_accuracy": 0.900883948802948, "num_input_tokens_seen": 441776, "num_tokens": 441776.0, "step": 560, "train_runtime": 169.8636, "train_tokens_per_second": 2600.769 }, { "epoch": 0.113, "grad_norm": 8.0625, "learning_rate": 1.9642576817242553e-05, "loss": 0.3608, "mean_token_accuracy": 0.888201367855072, "num_input_tokens_seen": 445950, "num_tokens": 445950.0, "step": 565, "train_runtime": 171.2193, "train_tokens_per_second": 2604.554 }, { "epoch": 0.114, "grad_norm": 4.28125, "learning_rate": 1.963394465012539e-05, "loss": 0.2954, "mean_token_accuracy": 0.912472414970398, "num_input_tokens_seen": 450490, "num_tokens": 450490.0, "step": 570, "train_runtime": 172.6488, "train_tokens_per_second": 2609.286 }, { "epoch": 0.115, "grad_norm": 4.90625, "learning_rate": 1.962521142748755e-05, "loss": 0.35, "mean_token_accuracy": 0.8888274788856506, "num_input_tokens_seen": 454408, "num_tokens": 454408.0, "step": 575, "train_runtime": 174.0669, "train_tokens_per_second": 2610.536 }, { "epoch": 0.116, "grad_norm": 6.9375, "learning_rate": 1.961637724093641e-05, "loss": 0.3668, "mean_token_accuracy": 0.8867895603179932, "num_input_tokens_seen": 458938, "num_tokens": 458938.0, "step": 580, "train_runtime": 175.4362, "train_tokens_per_second": 2615.983 }, { "epoch": 0.117, "grad_norm": 14.875, "learning_rate": 1.9607442183138403e-05, "loss": 0.3355, "mean_token_accuracy": 0.8930761694908143, "num_input_tokens_seen": 463193, "num_tokens": 463193.0, "step": 585, "train_runtime": 176.8667, "train_tokens_per_second": 2618.882 }, { "epoch": 0.118, "grad_norm": 4.3125, "learning_rate": 1.9598406347818056e-05, "loss": 0.4601, "mean_token_accuracy": 0.8631152391433716, "num_input_tokens_seen": 468644, "num_tokens": 468644.0, "step": 590, "train_runtime": 178.2943, "train_tokens_per_second": 2628.486 }, { "epoch": 0.119, "grad_norm": 9.0, "learning_rate": 1.958926982975701e-05, "loss": 0.4553, "mean_token_accuracy": 0.8717481970787049, "num_input_tokens_seen": 471116, "num_tokens": 471116.0, "step": 595, "train_runtime": 179.5705, "train_tokens_per_second": 2623.571 }, { "epoch": 0.12, "grad_norm": 4.59375, "learning_rate": 1.958003272479301e-05, "loss": 0.2996, "mean_token_accuracy": 0.9096265316009522, "num_input_tokens_seen": 474596, "num_tokens": 474596.0, "step": 600, "train_runtime": 180.8863, "train_tokens_per_second": 2623.725 }, { "epoch": 0.121, "grad_norm": 5.75, "learning_rate": 1.9570695129818928e-05, "loss": 0.4149, "mean_token_accuracy": 0.8729338049888611, "num_input_tokens_seen": 477890, "num_tokens": 477890.0, "step": 605, "train_runtime": 182.2267, "train_tokens_per_second": 2622.503 }, { "epoch": 0.122, "grad_norm": 4.78125, "learning_rate": 1.9561257142781706e-05, "loss": 0.4101, "mean_token_accuracy": 0.8640504002571106, "num_input_tokens_seen": 483638, "num_tokens": 483638.0, "step": 610, "train_runtime": 183.6801, "train_tokens_per_second": 2633.045 }, { "epoch": 0.123, "grad_norm": 5.21875, "learning_rate": 1.9551718862681363e-05, "loss": 0.4018, "mean_token_accuracy": 0.8759877920150757, "num_input_tokens_seen": 486912, "num_tokens": 486912.0, "step": 615, "train_runtime": 185.0291, "train_tokens_per_second": 2631.543 }, { "epoch": 0.124, "grad_norm": 3.03125, "learning_rate": 1.9542080389569947e-05, "loss": 0.3335, "mean_token_accuracy": 0.8893043041229248, "num_input_tokens_seen": 490908, "num_tokens": 490908.0, "step": 620, "train_runtime": 186.3987, "train_tokens_per_second": 2633.645 }, { "epoch": 0.125, "grad_norm": 6.09375, "learning_rate": 1.953234182455048e-05, "loss": 0.3173, "mean_token_accuracy": 0.900081217288971, "num_input_tokens_seen": 494379, "num_tokens": 494379.0, "step": 625, "train_runtime": 187.7302, "train_tokens_per_second": 2633.455 }, { "epoch": 0.126, "grad_norm": 1.59375, "learning_rate": 1.9522503269775897e-05, "loss": 0.2303, "mean_token_accuracy": 0.9240877985954284, "num_input_tokens_seen": 500814, "num_tokens": 500814.0, "step": 630, "train_runtime": 189.2922, "train_tokens_per_second": 2645.72 }, { "epoch": 0.127, "grad_norm": 9.9375, "learning_rate": 1.951256482844799e-05, "loss": 0.2916, "mean_token_accuracy": 0.9116342306137085, "num_input_tokens_seen": 504621, "num_tokens": 504621.0, "step": 635, "train_runtime": 190.6854, "train_tokens_per_second": 2646.354 }, { "epoch": 0.128, "grad_norm": 4.21875, "learning_rate": 1.9502526604816293e-05, "loss": 0.2988, "mean_token_accuracy": 0.9047788977622986, "num_input_tokens_seen": 506980, "num_tokens": 506980.0, "step": 640, "train_runtime": 191.9573, "train_tokens_per_second": 2641.108 }, { "epoch": 0.129, "grad_norm": 8.125, "learning_rate": 1.9492388704177036e-05, "loss": 0.358, "mean_token_accuracy": 0.8898242115974426, "num_input_tokens_seen": 512687, "num_tokens": 512687.0, "step": 645, "train_runtime": 193.4445, "train_tokens_per_second": 2650.306 }, { "epoch": 0.13, "grad_norm": 8.5625, "learning_rate": 1.948215123287199e-05, "loss": 0.465, "mean_token_accuracy": 0.8623763680458069, "num_input_tokens_seen": 515470, "num_tokens": 515470.0, "step": 650, "train_runtime": 194.7332, "train_tokens_per_second": 2647.057 }, { "epoch": 0.131, "grad_norm": 9.6875, "learning_rate": 1.947181429828739e-05, "loss": 0.5495, "mean_token_accuracy": 0.8467213988304139, "num_input_tokens_seen": 518269, "num_tokens": 518269.0, "step": 655, "train_runtime": 196.0267, "train_tokens_per_second": 2643.87 }, { "epoch": 0.132, "grad_norm": 5.53125, "learning_rate": 1.9461378008852785e-05, "loss": 0.3559, "mean_token_accuracy": 0.8845923066139221, "num_input_tokens_seen": 522914, "num_tokens": 522914.0, "step": 660, "train_runtime": 197.5094, "train_tokens_per_second": 2647.54 }, { "epoch": 0.133, "grad_norm": 5.71875, "learning_rate": 1.9450842474039914e-05, "loss": 0.492, "mean_token_accuracy": 0.86255943775177, "num_input_tokens_seen": 526938, "num_tokens": 526938.0, "step": 665, "train_runtime": 198.9575, "train_tokens_per_second": 2648.495 }, { "epoch": 0.134, "grad_norm": 2.25, "learning_rate": 1.944020780436155e-05, "loss": 0.2636, "mean_token_accuracy": 0.9127674102783203, "num_input_tokens_seen": 530302, "num_tokens": 530302.0, "step": 670, "train_runtime": 200.2243, "train_tokens_per_second": 2648.54 }, { "epoch": 0.135, "grad_norm": 9.0625, "learning_rate": 1.942947411137035e-05, "loss": 0.3743, "mean_token_accuracy": 0.8907998204231262, "num_input_tokens_seen": 533257, "num_tokens": 533257.0, "step": 675, "train_runtime": 201.5504, "train_tokens_per_second": 2645.776 }, { "epoch": 0.136, "grad_norm": 2.921875, "learning_rate": 1.9418641507657673e-05, "loss": 0.2514, "mean_token_accuracy": 0.921090281009674, "num_input_tokens_seen": 537088, "num_tokens": 537088.0, "step": 680, "train_runtime": 202.9018, "train_tokens_per_second": 2647.034 }, { "epoch": 0.137, "grad_norm": 13.375, "learning_rate": 1.9407710106852405e-05, "loss": 0.4166, "mean_token_accuracy": 0.8800059676170349, "num_input_tokens_seen": 542201, "num_tokens": 542201.0, "step": 685, "train_runtime": 204.3567, "train_tokens_per_second": 2653.208 }, { "epoch": 0.138, "grad_norm": 8.5, "learning_rate": 1.9396680023619767e-05, "loss": 0.4388, "mean_token_accuracy": 0.8624199628829956, "num_input_tokens_seen": 545160, "num_tokens": 545160.0, "step": 690, "train_runtime": 205.7103, "train_tokens_per_second": 2650.134 }, { "epoch": 0.139, "grad_norm": 8.9375, "learning_rate": 1.9385551373660113e-05, "loss": 0.4954, "mean_token_accuracy": 0.8541293501853943, "num_input_tokens_seen": 547908, "num_tokens": 547908.0, "step": 695, "train_runtime": 206.9909, "train_tokens_per_second": 2647.015 }, { "epoch": 0.14, "grad_norm": 4.375, "learning_rate": 1.9374324273707717e-05, "loss": 0.4545, "mean_token_accuracy": 0.872514283657074, "num_input_tokens_seen": 550944, "num_tokens": 550944.0, "step": 700, "train_runtime": 208.2877, "train_tokens_per_second": 2645.11 }, { "epoch": 0.141, "grad_norm": 4.59375, "learning_rate": 1.9362998841529542e-05, "loss": 0.3849, "mean_token_accuracy": 0.8774855017662049, "num_input_tokens_seen": 553600, "num_tokens": 553600.0, "step": 705, "train_runtime": 209.5676, "train_tokens_per_second": 2641.629 }, { "epoch": 0.142, "grad_norm": 3.359375, "learning_rate": 1.9351575195924014e-05, "loss": 0.2589, "mean_token_accuracy": 0.9162958025932312, "num_input_tokens_seen": 557574, "num_tokens": 557574.0, "step": 710, "train_runtime": 210.8587, "train_tokens_per_second": 2644.301 }, { "epoch": 0.143, "grad_norm": 18.875, "learning_rate": 1.9340053456719768e-05, "loss": 0.3435, "mean_token_accuracy": 0.894795048236847, "num_input_tokens_seen": 561842, "num_tokens": 561842.0, "step": 715, "train_runtime": 212.2505, "train_tokens_per_second": 2647.07 }, { "epoch": 0.144, "grad_norm": 3.765625, "learning_rate": 1.9328433744774403e-05, "loss": 0.369, "mean_token_accuracy": 0.894304621219635, "num_input_tokens_seen": 566712, "num_tokens": 566712.0, "step": 720, "train_runtime": 213.6548, "train_tokens_per_second": 2652.466 }, { "epoch": 0.145, "grad_norm": 8.3125, "learning_rate": 1.931671618197319e-05, "loss": 0.5911, "mean_token_accuracy": 0.8231608152389527, "num_input_tokens_seen": 570866, "num_tokens": 570866.0, "step": 725, "train_runtime": 215.1019, "train_tokens_per_second": 2653.933 }, { "epoch": 0.146, "grad_norm": 7.6875, "learning_rate": 1.9304900891227825e-05, "loss": 0.3598, "mean_token_accuracy": 0.8848813533782959, "num_input_tokens_seen": 576416, "num_tokens": 576416.0, "step": 730, "train_runtime": 216.5592, "train_tokens_per_second": 2661.702 }, { "epoch": 0.147, "grad_norm": 12.625, "learning_rate": 1.9292987996475113e-05, "loss": 0.4869, "mean_token_accuracy": 0.8602707147598266, "num_input_tokens_seen": 580018, "num_tokens": 580018.0, "step": 735, "train_runtime": 217.9128, "train_tokens_per_second": 2661.697 }, { "epoch": 0.148, "grad_norm": 10.5, "learning_rate": 1.928097762267568e-05, "loss": 0.7007, "mean_token_accuracy": 0.7971182942390442, "num_input_tokens_seen": 583628, "num_tokens": 583628.0, "step": 740, "train_runtime": 219.2587, "train_tokens_per_second": 2661.823 }, { "epoch": 0.149, "grad_norm": 12.375, "learning_rate": 1.9268869895812673e-05, "loss": 0.2758, "mean_token_accuracy": 0.9111303687095642, "num_input_tokens_seen": 588820, "num_tokens": 588820.0, "step": 745, "train_runtime": 220.7254, "train_tokens_per_second": 2667.658 }, { "epoch": 0.15, "grad_norm": 5.34375, "learning_rate": 1.9256664942890412e-05, "loss": 0.3915, "mean_token_accuracy": 0.8697043657302856, "num_input_tokens_seen": 592422, "num_tokens": 592422.0, "step": 750, "train_runtime": 222.0407, "train_tokens_per_second": 2668.078 }, { "epoch": 0.151, "grad_norm": 4.03125, "learning_rate": 1.9244362891933077e-05, "loss": 0.381, "mean_token_accuracy": 0.8861169934272766, "num_input_tokens_seen": 595959, "num_tokens": 595959.0, "step": 755, "train_runtime": 223.4055, "train_tokens_per_second": 2667.611 }, { "epoch": 0.152, "grad_norm": 12.375, "learning_rate": 1.9231963871983367e-05, "loss": 0.4503, "mean_token_accuracy": 0.867646849155426, "num_input_tokens_seen": 600082, "num_tokens": 600082.0, "step": 760, "train_runtime": 224.7741, "train_tokens_per_second": 2669.712 }, { "epoch": 0.153, "grad_norm": 5.28125, "learning_rate": 1.9219468013101123e-05, "loss": 0.3421, "mean_token_accuracy": 0.8927922487258911, "num_input_tokens_seen": 604987, "num_tokens": 604987.0, "step": 765, "train_runtime": 226.1704, "train_tokens_per_second": 2674.917 }, { "epoch": 0.154, "grad_norm": 6.84375, "learning_rate": 1.9206875446362005e-05, "loss": 0.2455, "mean_token_accuracy": 0.9237758040428161, "num_input_tokens_seen": 609134, "num_tokens": 609134.0, "step": 770, "train_runtime": 227.6236, "train_tokens_per_second": 2676.058 }, { "epoch": 0.155, "grad_norm": 5.65625, "learning_rate": 1.919418630385607e-05, "loss": 0.2398, "mean_token_accuracy": 0.9231855273246765, "num_input_tokens_seen": 615022, "num_tokens": 615022.0, "step": 775, "train_runtime": 229.1713, "train_tokens_per_second": 2683.678 }, { "epoch": 0.156, "grad_norm": 5.53125, "learning_rate": 1.918140071868642e-05, "loss": 0.4564, "mean_token_accuracy": 0.8663341403007507, "num_input_tokens_seen": 617162, "num_tokens": 617162.0, "step": 780, "train_runtime": 230.4502, "train_tokens_per_second": 2678.071 }, { "epoch": 0.157, "grad_norm": 9.875, "learning_rate": 1.9168518824967797e-05, "loss": 0.5911, "mean_token_accuracy": 0.8241053462028504, "num_input_tokens_seen": 621439, "num_tokens": 621439.0, "step": 785, "train_runtime": 231.8521, "train_tokens_per_second": 2680.325 }, { "epoch": 0.158, "grad_norm": 8.1875, "learning_rate": 1.9155540757825168e-05, "loss": 0.6218, "mean_token_accuracy": 0.8190143942832947, "num_input_tokens_seen": 625808, "num_tokens": 625808.0, "step": 790, "train_runtime": 233.2673, "train_tokens_per_second": 2682.794 }, { "epoch": 0.159, "grad_norm": 6.9375, "learning_rate": 1.9142466653392317e-05, "loss": 0.3379, "mean_token_accuracy": 0.9036740303039551, "num_input_tokens_seen": 629429, "num_tokens": 629429.0, "step": 795, "train_runtime": 234.7136, "train_tokens_per_second": 2681.689 }, { "epoch": 0.16, "grad_norm": 2.484375, "learning_rate": 1.912929664881041e-05, "loss": 0.2102, "mean_token_accuracy": 0.9352823853492737, "num_input_tokens_seen": 635260, "num_tokens": 635260.0, "step": 800, "train_runtime": 236.4013, "train_tokens_per_second": 2687.211 }, { "epoch": 0.161, "grad_norm": 2.9375, "learning_rate": 1.911603088222657e-05, "loss": 0.3155, "mean_token_accuracy": 0.9152554869651794, "num_input_tokens_seen": 640340, "num_tokens": 640340.0, "step": 805, "train_runtime": 237.8703, "train_tokens_per_second": 2691.971 }, { "epoch": 0.162, "grad_norm": 4.71875, "learning_rate": 1.9102669492792406e-05, "loss": 0.3328, "mean_token_accuracy": 0.9042384028434753, "num_input_tokens_seen": 644708, "num_tokens": 644708.0, "step": 810, "train_runtime": 239.3119, "train_tokens_per_second": 2694.008 }, { "epoch": 0.163, "grad_norm": 24.75, "learning_rate": 1.908921262066257e-05, "loss": 0.4024, "mean_token_accuracy": 0.8811074137687683, "num_input_tokens_seen": 648428, "num_tokens": 648428.0, "step": 815, "train_runtime": 240.6694, "train_tokens_per_second": 2694.269 }, { "epoch": 0.164, "grad_norm": 2.90625, "learning_rate": 1.9075660406993285e-05, "loss": 0.3146, "mean_token_accuracy": 0.9160866856575012, "num_input_tokens_seen": 653176, "num_tokens": 653176.0, "step": 820, "train_runtime": 242.1653, "train_tokens_per_second": 2697.232 }, { "epoch": 0.165, "grad_norm": 9.0625, "learning_rate": 1.906201299394086e-05, "loss": 0.5898, "mean_token_accuracy": 0.8210569024085999, "num_input_tokens_seen": 657962, "num_tokens": 657962.0, "step": 825, "train_runtime": 243.592, "train_tokens_per_second": 2701.082 }, { "epoch": 0.166, "grad_norm": 3.96875, "learning_rate": 1.9048270524660197e-05, "loss": 0.3627, "mean_token_accuracy": 0.885036563873291, "num_input_tokens_seen": 662438, "num_tokens": 662438.0, "step": 830, "train_runtime": 244.9496, "train_tokens_per_second": 2704.385 }, { "epoch": 0.167, "grad_norm": 3.359375, "learning_rate": 1.90344331433033e-05, "loss": 0.3109, "mean_token_accuracy": 0.8919856309890747, "num_input_tokens_seen": 666091, "num_tokens": 666091.0, "step": 835, "train_runtime": 246.3127, "train_tokens_per_second": 2704.25 }, { "epoch": 0.168, "grad_norm": 3.578125, "learning_rate": 1.9020500995017747e-05, "loss": 0.2433, "mean_token_accuracy": 0.9184059023857116, "num_input_tokens_seen": 670986, "num_tokens": 670986.0, "step": 840, "train_runtime": 247.6926, "train_tokens_per_second": 2708.947 }, { "epoch": 0.169, "grad_norm": 5.6875, "learning_rate": 1.900647422594519e-05, "loss": 0.2867, "mean_token_accuracy": 0.9081339240074158, "num_input_tokens_seen": 675552, "num_tokens": 675552.0, "step": 845, "train_runtime": 249.1667, "train_tokens_per_second": 2711.245 }, { "epoch": 0.17, "grad_norm": 8.6875, "learning_rate": 1.8992352983219785e-05, "loss": 0.3521, "mean_token_accuracy": 0.8919368028640747, "num_input_tokens_seen": 678902, "num_tokens": 678902.0, "step": 850, "train_runtime": 250.5673, "train_tokens_per_second": 2709.459 }, { "epoch": 0.171, "grad_norm": 5.25, "learning_rate": 1.89781374149667e-05, "loss": 0.4009, "mean_token_accuracy": 0.8807836771011353, "num_input_tokens_seen": 683493, "num_tokens": 683493.0, "step": 855, "train_runtime": 251.9928, "train_tokens_per_second": 2712.352 }, { "epoch": 0.172, "grad_norm": 7.53125, "learning_rate": 1.8963827670300512e-05, "loss": 0.3769, "mean_token_accuracy": 0.9033098697662354, "num_input_tokens_seen": 686446, "num_tokens": 686446.0, "step": 860, "train_runtime": 253.3431, "train_tokens_per_second": 2709.551 }, { "epoch": 0.173, "grad_norm": 4.65625, "learning_rate": 1.894942389932367e-05, "loss": 0.5358, "mean_token_accuracy": 0.8468871474266052, "num_input_tokens_seen": 690168, "num_tokens": 690168.0, "step": 865, "train_runtime": 254.7238, "train_tokens_per_second": 2709.476 }, { "epoch": 0.174, "grad_norm": 4.5625, "learning_rate": 1.8934926253124922e-05, "loss": 0.2558, "mean_token_accuracy": 0.917995834350586, "num_input_tokens_seen": 695888, "num_tokens": 695888.0, "step": 870, "train_runtime": 256.1639, "train_tokens_per_second": 2716.573 }, { "epoch": 0.175, "grad_norm": 4.4375, "learning_rate": 1.892033488377771e-05, "loss": 0.4556, "mean_token_accuracy": 0.863447082042694, "num_input_tokens_seen": 699998, "num_tokens": 699998.0, "step": 875, "train_runtime": 257.595, "train_tokens_per_second": 2717.436 }, { "epoch": 0.176, "grad_norm": 7.96875, "learning_rate": 1.8905649944338596e-05, "loss": 0.3706, "mean_token_accuracy": 0.9007187366485596, "num_input_tokens_seen": 704454, "num_tokens": 704454.0, "step": 880, "train_runtime": 259.0199, "train_tokens_per_second": 2719.691 }, { "epoch": 0.177, "grad_norm": 8.4375, "learning_rate": 1.8890871588845653e-05, "loss": 0.4786, "mean_token_accuracy": 0.8600791692733765, "num_input_tokens_seen": 708551, "num_tokens": 708551.0, "step": 885, "train_runtime": 260.4768, "train_tokens_per_second": 2720.208 }, { "epoch": 0.178, "grad_norm": 10.75, "learning_rate": 1.8875999972316826e-05, "loss": 0.5986, "mean_token_accuracy": 0.8269249320030212, "num_input_tokens_seen": 712152, "num_tokens": 712152.0, "step": 890, "train_runtime": 261.7945, "train_tokens_per_second": 2720.271 }, { "epoch": 0.179, "grad_norm": 7.75, "learning_rate": 1.8861035250748343e-05, "loss": 0.6697, "mean_token_accuracy": 0.8020060300827027, "num_input_tokens_seen": 716001, "num_tokens": 716001.0, "step": 895, "train_runtime": 263.2824, "train_tokens_per_second": 2719.518 }, { "epoch": 0.18, "grad_norm": 5.09375, "learning_rate": 1.8845977581113048e-05, "loss": 0.4512, "mean_token_accuracy": 0.8687720775604248, "num_input_tokens_seen": 719402, "num_tokens": 719402.0, "step": 900, "train_runtime": 264.681, "train_tokens_per_second": 2717.996 }, { "epoch": 0.181, "grad_norm": 18.875, "learning_rate": 1.883082712135877e-05, "loss": 0.4768, "mean_token_accuracy": 0.8565496563911438, "num_input_tokens_seen": 722837, "num_tokens": 722837.0, "step": 905, "train_runtime": 265.9931, "train_tokens_per_second": 2717.503 }, { "epoch": 0.182, "grad_norm": 8.375, "learning_rate": 1.8815584030406663e-05, "loss": 0.57, "mean_token_accuracy": 0.8366148829460144, "num_input_tokens_seen": 726256, "num_tokens": 726256.0, "step": 910, "train_runtime": 267.4801, "train_tokens_per_second": 2715.177 }, { "epoch": 0.183, "grad_norm": 7.8125, "learning_rate": 1.8800248468149545e-05, "loss": 0.6508, "mean_token_accuracy": 0.8012928366661072, "num_input_tokens_seen": 730305, "num_tokens": 730305.0, "step": 915, "train_runtime": 268.9659, "train_tokens_per_second": 2715.233 }, { "epoch": 0.184, "grad_norm": 4.125, "learning_rate": 1.8784820595450198e-05, "loss": 0.3294, "mean_token_accuracy": 0.8942192554473877, "num_input_tokens_seen": 735002, "num_tokens": 735002.0, "step": 920, "train_runtime": 270.4668, "train_tokens_per_second": 2717.531 }, { "epoch": 0.185, "grad_norm": 6.375, "learning_rate": 1.876930057413971e-05, "loss": 0.5548, "mean_token_accuracy": 0.837021780014038, "num_input_tokens_seen": 738527, "num_tokens": 738527.0, "step": 925, "train_runtime": 271.7837, "train_tokens_per_second": 2717.334 }, { "epoch": 0.186, "grad_norm": 4.1875, "learning_rate": 1.875368856701576e-05, "loss": 0.2685, "mean_token_accuracy": 0.9202930212020874, "num_input_tokens_seen": 743188, "num_tokens": 743188.0, "step": 930, "train_runtime": 273.2667, "train_tokens_per_second": 2719.643 }, { "epoch": 0.187, "grad_norm": 3.90625, "learning_rate": 1.873798473784092e-05, "loss": 0.2575, "mean_token_accuracy": 0.9148669958114624, "num_input_tokens_seen": 747469, "num_tokens": 747469.0, "step": 935, "train_runtime": 274.7027, "train_tokens_per_second": 2721.01 }, { "epoch": 0.188, "grad_norm": 10.125, "learning_rate": 1.872218925134092e-05, "loss": 0.3521, "mean_token_accuracy": 0.8804177641868591, "num_input_tokens_seen": 750734, "num_tokens": 750734.0, "step": 940, "train_runtime": 275.9861, "train_tokens_per_second": 2720.187 }, { "epoch": 0.189, "grad_norm": 19.875, "learning_rate": 1.870630227320294e-05, "loss": 0.4695, "mean_token_accuracy": 0.8530766487121582, "num_input_tokens_seen": 753560, "num_tokens": 753560.0, "step": 945, "train_runtime": 277.3409, "train_tokens_per_second": 2717.09 }, { "epoch": 0.19, "grad_norm": 3.109375, "learning_rate": 1.8690323970073874e-05, "loss": 0.2616, "mean_token_accuracy": 0.9173449516296387, "num_input_tokens_seen": 758080, "num_tokens": 758080.0, "step": 950, "train_runtime": 278.8188, "train_tokens_per_second": 2718.898 }, { "epoch": 0.191, "grad_norm": 49.75, "learning_rate": 1.8674254509558544e-05, "loss": 0.5679, "mean_token_accuracy": 0.8285036325454712, "num_input_tokens_seen": 761148, "num_tokens": 761148.0, "step": 955, "train_runtime": 280.1393, "train_tokens_per_second": 2717.034 }, { "epoch": 0.192, "grad_norm": 9.8125, "learning_rate": 1.8658094060218e-05, "loss": 0.4511, "mean_token_accuracy": 0.8581412553787231, "num_input_tokens_seen": 764988, "num_tokens": 764988.0, "step": 960, "train_runtime": 281.5432, "train_tokens_per_second": 2717.125 }, { "epoch": 0.193, "grad_norm": 4.40625, "learning_rate": 1.86418427915677e-05, "loss": 0.2334, "mean_token_accuracy": 0.92465158700943, "num_input_tokens_seen": 769167, "num_tokens": 769167.0, "step": 965, "train_runtime": 282.9319, "train_tokens_per_second": 2718.558 }, { "epoch": 0.194, "grad_norm": 3.5625, "learning_rate": 1.862550087407577e-05, "loss": 0.2151, "mean_token_accuracy": 0.9331644415855408, "num_input_tokens_seen": 772772, "num_tokens": 772772.0, "step": 970, "train_runtime": 284.3013, "train_tokens_per_second": 2718.145 }, { "epoch": 0.195, "grad_norm": 9.3125, "learning_rate": 1.8609068479161182e-05, "loss": 0.4503, "mean_token_accuracy": 0.8710917949676513, "num_input_tokens_seen": 776383, "num_tokens": 776383.0, "step": 975, "train_runtime": 285.6822, "train_tokens_per_second": 2717.646 }, { "epoch": 0.196, "grad_norm": 8.4375, "learning_rate": 1.8592545779191993e-05, "loss": 0.377, "mean_token_accuracy": 0.8973711490631103, "num_input_tokens_seen": 780012, "num_tokens": 780012.0, "step": 980, "train_runtime": 287.0449, "train_tokens_per_second": 2717.387 }, { "epoch": 0.197, "grad_norm": 15.25, "learning_rate": 1.8575932947483503e-05, "loss": 0.5271, "mean_token_accuracy": 0.855127489566803, "num_input_tokens_seen": 784770, "num_tokens": 784770.0, "step": 985, "train_runtime": 288.5472, "train_tokens_per_second": 2719.728 }, { "epoch": 0.198, "grad_norm": 5.1875, "learning_rate": 1.8559230158296454e-05, "loss": 0.4855, "mean_token_accuracy": 0.8476455569267273, "num_input_tokens_seen": 788168, "num_tokens": 788168.0, "step": 990, "train_runtime": 289.8541, "train_tokens_per_second": 2719.189 }, { "epoch": 0.199, "grad_norm": 14.375, "learning_rate": 1.8542437586835202e-05, "loss": 0.5606, "mean_token_accuracy": 0.8517792344093322, "num_input_tokens_seen": 791733, "num_tokens": 791733.0, "step": 995, "train_runtime": 291.2125, "train_tokens_per_second": 2718.747 }, { "epoch": 0.2, "grad_norm": 39.25, "learning_rate": 1.8525555409245877e-05, "loss": 0.3594, "mean_token_accuracy": 0.8901917099952698, "num_input_tokens_seen": 794766, "num_tokens": 794766.0, "step": 1000, "train_runtime": 292.5109, "train_tokens_per_second": 2717.047 }, { "epoch": 0.201, "grad_norm": 6.78125, "learning_rate": 1.8508583802614534e-05, "loss": 0.1611, "mean_token_accuracy": 0.9469778895378113, "num_input_tokens_seen": 797797, "num_tokens": 797797.0, "step": 1005, "train_runtime": 293.7972, "train_tokens_per_second": 2715.469 }, { "epoch": 0.202, "grad_norm": 11.1875, "learning_rate": 1.849152294496529e-05, "loss": 0.2862, "mean_token_accuracy": 0.9087902545928955, "num_input_tokens_seen": 800840, "num_tokens": 800840.0, "step": 1010, "train_runtime": 295.0835, "train_tokens_per_second": 2713.944 }, { "epoch": 0.203, "grad_norm": 6.59375, "learning_rate": 1.8474373015258472e-05, "loss": 0.4383, "mean_token_accuracy": 0.8819673299789429, "num_input_tokens_seen": 803453, "num_tokens": 803453.0, "step": 1015, "train_runtime": 296.4109, "train_tokens_per_second": 2710.605 }, { "epoch": 0.204, "grad_norm": 8.75, "learning_rate": 1.845713419338873e-05, "loss": 0.6584, "mean_token_accuracy": 0.789824640750885, "num_input_tokens_seen": 806862, "num_tokens": 806862.0, "step": 1020, "train_runtime": 297.7335, "train_tokens_per_second": 2710.014 }, { "epoch": 0.205, "grad_norm": 7.96875, "learning_rate": 1.843980666018315e-05, "loss": 0.3265, "mean_token_accuracy": 0.9028616309165954, "num_input_tokens_seen": 810034, "num_tokens": 810034.0, "step": 1025, "train_runtime": 299.0119, "train_tokens_per_second": 2709.036 }, { "epoch": 0.206, "grad_norm": 5.84375, "learning_rate": 1.842239059739935e-05, "loss": 0.4315, "mean_token_accuracy": 0.8716623306274414, "num_input_tokens_seen": 812972, "num_tokens": 812972.0, "step": 1030, "train_runtime": 300.3165, "train_tokens_per_second": 2707.051 }, { "epoch": 0.207, "grad_norm": 6.0, "learning_rate": 1.840488618772359e-05, "loss": 0.3874, "mean_token_accuracy": 0.8816864848136902, "num_input_tokens_seen": 815936, "num_tokens": 815936.0, "step": 1035, "train_runtime": 301.5871, "train_tokens_per_second": 2705.474 }, { "epoch": 0.208, "grad_norm": 5.3125, "learning_rate": 1.8387293614768843e-05, "loss": 0.353, "mean_token_accuracy": 0.8836291790008545, "num_input_tokens_seen": 819818, "num_tokens": 819818.0, "step": 1040, "train_runtime": 302.8898, "train_tokens_per_second": 2706.654 }, { "epoch": 0.209, "grad_norm": 4.6875, "learning_rate": 1.8369613063072875e-05, "loss": 0.7399, "mean_token_accuracy": 0.7805561184883117, "num_input_tokens_seen": 823407, "num_tokens": 823407.0, "step": 1045, "train_runtime": 304.2532, "train_tokens_per_second": 2706.321 }, { "epoch": 0.21, "grad_norm": 5.71875, "learning_rate": 1.835184471809631e-05, "loss": 0.4171, "mean_token_accuracy": 0.8717728018760681, "num_input_tokens_seen": 827948, "num_tokens": 827948.0, "step": 1050, "train_runtime": 305.6637, "train_tokens_per_second": 2708.69 }, { "epoch": 0.211, "grad_norm": 3.390625, "learning_rate": 1.8333988766220676e-05, "loss": 0.4492, "mean_token_accuracy": 0.8556033611297608, "num_input_tokens_seen": 831900, "num_tokens": 831900.0, "step": 1055, "train_runtime": 307.0775, "train_tokens_per_second": 2709.088 }, { "epoch": 0.212, "grad_norm": 8.0, "learning_rate": 1.831604539474646e-05, "loss": 0.368, "mean_token_accuracy": 0.8837300181388855, "num_input_tokens_seen": 836888, "num_tokens": 836888.0, "step": 1060, "train_runtime": 308.5121, "train_tokens_per_second": 2712.658 }, { "epoch": 0.213, "grad_norm": 9.75, "learning_rate": 1.8298014791891138e-05, "loss": 0.5238, "mean_token_accuracy": 0.8480489015579223, "num_input_tokens_seen": 840939, "num_tokens": 840939.0, "step": 1065, "train_runtime": 309.8767, "train_tokens_per_second": 2713.786 }, { "epoch": 0.214, "grad_norm": 5.40625, "learning_rate": 1.8279897146787204e-05, "loss": 0.3442, "mean_token_accuracy": 0.8955756068229676, "num_input_tokens_seen": 845104, "num_tokens": 845104.0, "step": 1070, "train_runtime": 311.2611, "train_tokens_per_second": 2715.096 }, { "epoch": 0.215, "grad_norm": 11.75, "learning_rate": 1.8261692649480174e-05, "loss": 0.5964, "mean_token_accuracy": 0.819271183013916, "num_input_tokens_seen": 848511, "num_tokens": 848511.0, "step": 1075, "train_runtime": 312.6762, "train_tokens_per_second": 2713.705 }, { "epoch": 0.216, "grad_norm": 9.6875, "learning_rate": 1.8243401490926623e-05, "loss": 0.3502, "mean_token_accuracy": 0.9055861949920654, "num_input_tokens_seen": 852766, "num_tokens": 852766.0, "step": 1080, "train_runtime": 314.0898, "train_tokens_per_second": 2715.038 }, { "epoch": 0.217, "grad_norm": 8.0625, "learning_rate": 1.822502386299214e-05, "loss": 0.5568, "mean_token_accuracy": 0.8429929614067078, "num_input_tokens_seen": 857361, "num_tokens": 857361.0, "step": 1085, "train_runtime": 315.5312, "train_tokens_per_second": 2717.199 }, { "epoch": 0.218, "grad_norm": 9.75, "learning_rate": 1.820655995844935e-05, "loss": 0.3878, "mean_token_accuracy": 0.8906150460243225, "num_input_tokens_seen": 861842, "num_tokens": 861842.0, "step": 1090, "train_runtime": 316.9345, "train_tokens_per_second": 2719.306 }, { "epoch": 0.219, "grad_norm": 11.0, "learning_rate": 1.818800997097587e-05, "loss": 0.5616, "mean_token_accuracy": 0.8594035148620606, "num_input_tokens_seen": 865910, "num_tokens": 865910.0, "step": 1095, "train_runtime": 318.3538, "train_tokens_per_second": 2719.961 }, { "epoch": 0.22, "grad_norm": 3.421875, "learning_rate": 1.8169374095152298e-05, "loss": 0.2283, "mean_token_accuracy": 0.9218070864677429, "num_input_tokens_seen": 869194, "num_tokens": 869194.0, "step": 1100, "train_runtime": 319.6829, "train_tokens_per_second": 2718.926 }, { "epoch": 0.221, "grad_norm": 11.75, "learning_rate": 1.8150652526460146e-05, "loss": 0.3372, "mean_token_accuracy": 0.8953383088111877, "num_input_tokens_seen": 873119, "num_tokens": 873119.0, "step": 1105, "train_runtime": 321.0213, "train_tokens_per_second": 2719.817 }, { "epoch": 0.222, "grad_norm": 5.6875, "learning_rate": 1.8131845461279813e-05, "loss": 0.4103, "mean_token_accuracy": 0.8891323924064636, "num_input_tokens_seen": 876268, "num_tokens": 876268.0, "step": 1110, "train_runtime": 322.3536, "train_tokens_per_second": 2718.344 }, { "epoch": 0.223, "grad_norm": 4.53125, "learning_rate": 1.8112953096888517e-05, "loss": 0.4233, "mean_token_accuracy": 0.8740591049194336, "num_input_tokens_seen": 880110, "num_tokens": 880110.0, "step": 1115, "train_runtime": 323.6679, "train_tokens_per_second": 2719.176 }, { "epoch": 0.224, "grad_norm": 11.8125, "learning_rate": 1.8093975631458215e-05, "loss": 0.5525, "mean_token_accuracy": 0.8331067323684692, "num_input_tokens_seen": 883712, "num_tokens": 883712.0, "step": 1120, "train_runtime": 325.0654, "train_tokens_per_second": 2718.567 }, { "epoch": 0.225, "grad_norm": 3.6875, "learning_rate": 1.8074913264053547e-05, "loss": 0.4436, "mean_token_accuracy": 0.8731826782226563, "num_input_tokens_seen": 887945, "num_tokens": 887945.0, "step": 1125, "train_runtime": 326.4954, "train_tokens_per_second": 2719.625 }, { "epoch": 0.226, "grad_norm": 3.640625, "learning_rate": 1.8055766194629717e-05, "loss": 0.2229, "mean_token_accuracy": 0.9289170622825622, "num_input_tokens_seen": 893294, "num_tokens": 893294.0, "step": 1130, "train_runtime": 327.9663, "train_tokens_per_second": 2723.737 }, { "epoch": 0.227, "grad_norm": 4.21875, "learning_rate": 1.8036534624030428e-05, "loss": 0.3664, "mean_token_accuracy": 0.9010273218154907, "num_input_tokens_seen": 896809, "num_tokens": 896809.0, "step": 1135, "train_runtime": 329.303, "train_tokens_per_second": 2723.355 }, { "epoch": 0.228, "grad_norm": 5.25, "learning_rate": 1.8017218753985758e-05, "loss": 0.3774, "mean_token_accuracy": 0.8850043416023254, "num_input_tokens_seen": 901160, "num_tokens": 901160.0, "step": 1140, "train_runtime": 330.6712, "train_tokens_per_second": 2725.245 }, { "epoch": 0.229, "grad_norm": 9.875, "learning_rate": 1.7997818787110043e-05, "loss": 0.3873, "mean_token_accuracy": 0.8873796463012695, "num_input_tokens_seen": 905628, "num_tokens": 905628.0, "step": 1145, "train_runtime": 332.0595, "train_tokens_per_second": 2727.306 }, { "epoch": 0.23, "grad_norm": 6.5, "learning_rate": 1.7978334926899748e-05, "loss": 0.3582, "mean_token_accuracy": 0.8932291030883789, "num_input_tokens_seen": 908698, "num_tokens": 908698.0, "step": 1150, "train_runtime": 333.4082, "train_tokens_per_second": 2725.482 }, { "epoch": 0.231, "grad_norm": 4.78125, "learning_rate": 1.795876737773136e-05, "loss": 0.3881, "mean_token_accuracy": 0.8805781602859497, "num_input_tokens_seen": 913006, "num_tokens": 913006.0, "step": 1155, "train_runtime": 334.8153, "train_tokens_per_second": 2726.894 }, { "epoch": 0.232, "grad_norm": 8.625, "learning_rate": 1.79391163448592e-05, "loss": 0.5232, "mean_token_accuracy": 0.8424407482147217, "num_input_tokens_seen": 915976, "num_tokens": 915976.0, "step": 1160, "train_runtime": 336.1402, "train_tokens_per_second": 2724.982 }, { "epoch": 0.233, "grad_norm": 7.4375, "learning_rate": 1.7919382034413306e-05, "loss": 0.3478, "mean_token_accuracy": 0.8923591732978821, "num_input_tokens_seen": 918918, "num_tokens": 918918.0, "step": 1165, "train_runtime": 337.4968, "train_tokens_per_second": 2722.746 }, { "epoch": 0.234, "grad_norm": 17.5, "learning_rate": 1.789956465339726e-05, "loss": 0.4392, "mean_token_accuracy": 0.873762559890747, "num_input_tokens_seen": 923990, "num_tokens": 923990.0, "step": 1170, "train_runtime": 338.998, "train_tokens_per_second": 2725.65 }, { "epoch": 0.235, "grad_norm": 4.6875, "learning_rate": 1.7879664409686007e-05, "loss": 0.5179, "mean_token_accuracy": 0.8277951955795289, "num_input_tokens_seen": 927366, "num_tokens": 927366.0, "step": 1175, "train_runtime": 340.3749, "train_tokens_per_second": 2724.543 }, { "epoch": 0.236, "grad_norm": 7.21875, "learning_rate": 1.7859681512023694e-05, "loss": 0.4557, "mean_token_accuracy": 0.87276850938797, "num_input_tokens_seen": 931564, "num_tokens": 931564.0, "step": 1180, "train_runtime": 341.8298, "train_tokens_per_second": 2725.227 }, { "epoch": 0.237, "grad_norm": 6.1875, "learning_rate": 1.7839616170021452e-05, "loss": 0.4456, "mean_token_accuracy": 0.8761956691741943, "num_input_tokens_seen": 935673, "num_tokens": 935673.0, "step": 1185, "train_runtime": 343.2706, "train_tokens_per_second": 2725.759 }, { "epoch": 0.238, "grad_norm": 4.9375, "learning_rate": 1.7819468594155235e-05, "loss": 0.457, "mean_token_accuracy": 0.8697030186653137, "num_input_tokens_seen": 940364, "num_tokens": 940364.0, "step": 1190, "train_runtime": 344.6805, "train_tokens_per_second": 2728.219 }, { "epoch": 0.239, "grad_norm": 7.0, "learning_rate": 1.779923899576357e-05, "loss": 0.4861, "mean_token_accuracy": 0.851796555519104, "num_input_tokens_seen": 942265, "num_tokens": 942265.0, "step": 1195, "train_runtime": 346.0017, "train_tokens_per_second": 2723.296 }, { "epoch": 0.24, "grad_norm": 12.5625, "learning_rate": 1.7778927587045373e-05, "loss": 0.4914, "mean_token_accuracy": 0.8666201233863831, "num_input_tokens_seen": 946960, "num_tokens": 946960.0, "step": 1200, "train_runtime": 347.4676, "train_tokens_per_second": 2725.319 }, { "epoch": 0.241, "grad_norm": 3.265625, "learning_rate": 1.775853458105772e-05, "loss": 0.2755, "mean_token_accuracy": 0.9105640530586243, "num_input_tokens_seen": 951147, "num_tokens": 951147.0, "step": 1205, "train_runtime": 348.8418, "train_tokens_per_second": 2726.586 }, { "epoch": 0.242, "grad_norm": 9.75, "learning_rate": 1.773806019171358e-05, "loss": 0.7536, "mean_token_accuracy": 0.7889259934425354, "num_input_tokens_seen": 954796, "num_tokens": 954796.0, "step": 1210, "train_runtime": 350.215, "train_tokens_per_second": 2726.314 }, { "epoch": 0.243, "grad_norm": 4.75, "learning_rate": 1.7717504633779618e-05, "loss": 0.478, "mean_token_accuracy": 0.8523941397666931, "num_input_tokens_seen": 957742, "num_tokens": 957742.0, "step": 1215, "train_runtime": 351.5142, "train_tokens_per_second": 2724.618 }, { "epoch": 0.244, "grad_norm": 6.25, "learning_rate": 1.769686812287391e-05, "loss": 0.3188, "mean_token_accuracy": 0.8984705924987793, "num_input_tokens_seen": 960248, "num_tokens": 960248.0, "step": 1220, "train_runtime": 352.8303, "train_tokens_per_second": 2721.557 }, { "epoch": 0.245, "grad_norm": 6.8125, "learning_rate": 1.7676150875463688e-05, "loss": 0.5472, "mean_token_accuracy": 0.8612379431724548, "num_input_tokens_seen": 962863, "num_tokens": 962863.0, "step": 1225, "train_runtime": 354.1677, "train_tokens_per_second": 2718.664 }, { "epoch": 0.246, "grad_norm": 8.8125, "learning_rate": 1.7655353108863068e-05, "loss": 0.4648, "mean_token_accuracy": 0.8674867272377014, "num_input_tokens_seen": 967310, "num_tokens": 967310.0, "step": 1230, "train_runtime": 355.9588, "train_tokens_per_second": 2717.477 }, { "epoch": 0.247, "grad_norm": 3.046875, "learning_rate": 1.7634475041230796e-05, "loss": 0.2089, "mean_token_accuracy": 0.9286256074905396, "num_input_tokens_seen": 971111, "num_tokens": 971111.0, "step": 1235, "train_runtime": 357.6863, "train_tokens_per_second": 2714.979 }, { "epoch": 0.248, "grad_norm": 9.875, "learning_rate": 1.7613516891567907e-05, "loss": 0.361, "mean_token_accuracy": 0.8855413556098938, "num_input_tokens_seen": 973830, "num_tokens": 973830.0, "step": 1240, "train_runtime": 358.9988, "train_tokens_per_second": 2712.627 }, { "epoch": 0.249, "grad_norm": 4.75, "learning_rate": 1.759247887971548e-05, "loss": 0.2766, "mean_token_accuracy": 0.9154811263084411, "num_input_tokens_seen": 978505, "num_tokens": 978505.0, "step": 1245, "train_runtime": 360.4577, "train_tokens_per_second": 2714.618 }, { "epoch": 0.25, "grad_norm": 4.46875, "learning_rate": 1.7571361226352305e-05, "loss": 0.29, "mean_token_accuracy": 0.919904088973999, "num_input_tokens_seen": 982126, "num_tokens": 982126.0, "step": 1250, "train_runtime": 361.7865, "train_tokens_per_second": 2714.656 }, { "epoch": 0.251, "grad_norm": 3.546875, "learning_rate": 1.7550164152992573e-05, "loss": 0.3863, "mean_token_accuracy": 0.8768803834915161, "num_input_tokens_seen": 985887, "num_tokens": 985887.0, "step": 1255, "train_runtime": 363.1268, "train_tokens_per_second": 2714.994 }, { "epoch": 0.252, "grad_norm": 2.546875, "learning_rate": 1.752888788198355e-05, "loss": 0.436, "mean_token_accuracy": 0.8765869021415711, "num_input_tokens_seen": 991662, "num_tokens": 991662.0, "step": 1260, "train_runtime": 364.6306, "train_tokens_per_second": 2719.635 }, { "epoch": 0.253, "grad_norm": 5.59375, "learning_rate": 1.7507532636503256e-05, "loss": 0.3187, "mean_token_accuracy": 0.9084311723709106, "num_input_tokens_seen": 995454, "num_tokens": 995454.0, "step": 1265, "train_runtime": 366.0426, "train_tokens_per_second": 2719.503 }, { "epoch": 0.254, "grad_norm": 6.4375, "learning_rate": 1.7486098640558105e-05, "loss": 0.4074, "mean_token_accuracy": 0.8747420072555542, "num_input_tokens_seen": 999158, "num_tokens": 999158.0, "step": 1270, "train_runtime": 367.3826, "train_tokens_per_second": 2719.666 }, { "epoch": 0.255, "grad_norm": 10.9375, "learning_rate": 1.746458611898058e-05, "loss": 0.4546, "mean_token_accuracy": 0.8622093319892883, "num_input_tokens_seen": 1003721, "num_tokens": 1003721.0, "step": 1275, "train_runtime": 368.8155, "train_tokens_per_second": 2721.472 }, { "epoch": 0.256, "grad_norm": 8.25, "learning_rate": 1.7442995297426846e-05, "loss": 0.5134, "mean_token_accuracy": 0.8471454620361328, "num_input_tokens_seen": 1006824, "num_tokens": 1006824.0, "step": 1280, "train_runtime": 370.2289, "train_tokens_per_second": 2719.464 }, { "epoch": 0.257, "grad_norm": 4.625, "learning_rate": 1.7421326402374406e-05, "loss": 0.3309, "mean_token_accuracy": 0.8942543745040894, "num_input_tokens_seen": 1012781, "num_tokens": 1012781.0, "step": 1285, "train_runtime": 371.7748, "train_tokens_per_second": 2724.179 }, { "epoch": 0.258, "grad_norm": 2.71875, "learning_rate": 1.7399579661119713e-05, "loss": 0.2874, "mean_token_accuracy": 0.9109650015830993, "num_input_tokens_seen": 1018450, "num_tokens": 1018450.0, "step": 1290, "train_runtime": 373.2451, "train_tokens_per_second": 2728.636 }, { "epoch": 0.259, "grad_norm": 4.53125, "learning_rate": 1.73777553017758e-05, "loss": 0.3053, "mean_token_accuracy": 0.9123963832855224, "num_input_tokens_seen": 1022830, "num_tokens": 1022830.0, "step": 1295, "train_runtime": 374.6594, "train_tokens_per_second": 2730.026 }, { "epoch": 0.26, "grad_norm": 4.59375, "learning_rate": 1.7355853553269865e-05, "loss": 0.3392, "mean_token_accuracy": 0.8937130093574523, "num_input_tokens_seen": 1026050, "num_tokens": 1026050.0, "step": 1300, "train_runtime": 375.9762, "train_tokens_per_second": 2729.029 }, { "epoch": 0.261, "grad_norm": 11.0625, "learning_rate": 1.7333874645340886e-05, "loss": 0.459, "mean_token_accuracy": 0.8555809378623962, "num_input_tokens_seen": 1030465, "num_tokens": 1030465.0, "step": 1305, "train_runtime": 377.4258, "train_tokens_per_second": 2730.245 }, { "epoch": 0.262, "grad_norm": 5.03125, "learning_rate": 1.7311818808537206e-05, "loss": 0.3151, "mean_token_accuracy": 0.9068294167518616, "num_input_tokens_seen": 1034112, "num_tokens": 1034112.0, "step": 1310, "train_runtime": 378.8585, "train_tokens_per_second": 2729.547 }, { "epoch": 0.263, "grad_norm": 3.828125, "learning_rate": 1.7289686274214116e-05, "loss": 0.308, "mean_token_accuracy": 0.9022178053855896, "num_input_tokens_seen": 1036942, "num_tokens": 1036942.0, "step": 1315, "train_runtime": 380.2265, "train_tokens_per_second": 2727.169 }, { "epoch": 0.264, "grad_norm": 3.140625, "learning_rate": 1.7267477274531432e-05, "loss": 0.404, "mean_token_accuracy": 0.8793684124946595, "num_input_tokens_seen": 1042210, "num_tokens": 1042210.0, "step": 1320, "train_runtime": 381.6841, "train_tokens_per_second": 2730.557 }, { "epoch": 0.265, "grad_norm": 19.375, "learning_rate": 1.724519204245105e-05, "loss": 0.4976, "mean_token_accuracy": 0.8482323169708252, "num_input_tokens_seen": 1045924, "num_tokens": 1045924.0, "step": 1325, "train_runtime": 383.058, "train_tokens_per_second": 2730.458 }, { "epoch": 0.266, "grad_norm": 7.875, "learning_rate": 1.7222830811734502e-05, "loss": 0.4166, "mean_token_accuracy": 0.8774193525314331, "num_input_tokens_seen": 1049630, "num_tokens": 1049630.0, "step": 1330, "train_runtime": 384.4378, "train_tokens_per_second": 2730.298 }, { "epoch": 0.267, "grad_norm": 15.3125, "learning_rate": 1.720039381694053e-05, "loss": 0.4612, "mean_token_accuracy": 0.8649683833122254, "num_input_tokens_seen": 1054221, "num_tokens": 1054221.0, "step": 1335, "train_runtime": 385.8368, "train_tokens_per_second": 2732.298 }, { "epoch": 0.268, "grad_norm": 9.125, "learning_rate": 1.7177881293422586e-05, "loss": 0.3372, "mean_token_accuracy": 0.8990665793418884, "num_input_tokens_seen": 1058678, "num_tokens": 1058678.0, "step": 1340, "train_runtime": 387.2824, "train_tokens_per_second": 2733.607 }, { "epoch": 0.269, "grad_norm": 4.8125, "learning_rate": 1.7155293477326385e-05, "loss": 0.4275, "mean_token_accuracy": 0.8608209133148194, "num_input_tokens_seen": 1062386, "num_tokens": 1062386.0, "step": 1345, "train_runtime": 388.6719, "train_tokens_per_second": 2733.375 }, { "epoch": 0.27, "grad_norm": 6.625, "learning_rate": 1.7132630605587433e-05, "loss": 0.3398, "mean_token_accuracy": 0.886587393283844, "num_input_tokens_seen": 1065268, "num_tokens": 1065268.0, "step": 1350, "train_runtime": 389.9975, "train_tokens_per_second": 2731.474 }, { "epoch": 0.271, "grad_norm": 4.6875, "learning_rate": 1.7109892915928535e-05, "loss": 0.3338, "mean_token_accuracy": 0.8928539037704468, "num_input_tokens_seen": 1070394, "num_tokens": 1070394.0, "step": 1355, "train_runtime": 391.4655, "train_tokens_per_second": 2734.325 }, { "epoch": 0.272, "grad_norm": 6.59375, "learning_rate": 1.7087080646857293e-05, "loss": 0.3113, "mean_token_accuracy": 0.903121554851532, "num_input_tokens_seen": 1073534, "num_tokens": 1073534.0, "step": 1360, "train_runtime": 392.7943, "train_tokens_per_second": 2733.069 }, { "epoch": 0.273, "grad_norm": 4.71875, "learning_rate": 1.706419403766361e-05, "loss": 0.2812, "mean_token_accuracy": 0.9164461135864258, "num_input_tokens_seen": 1078667, "num_tokens": 1078667.0, "step": 1365, "train_runtime": 394.2857, "train_tokens_per_second": 2735.749 }, { "epoch": 0.274, "grad_norm": 4.90625, "learning_rate": 1.7041233328417194e-05, "loss": 0.4228, "mean_token_accuracy": 0.8700190782546997, "num_input_tokens_seen": 1083166, "num_tokens": 1083166.0, "step": 1370, "train_runtime": 395.6732, "train_tokens_per_second": 2737.527 }, { "epoch": 0.275, "grad_norm": 4.375, "learning_rate": 1.7018198759965018e-05, "loss": 0.4187, "mean_token_accuracy": 0.8686501502990722, "num_input_tokens_seen": 1086884, "num_tokens": 1086884.0, "step": 1375, "train_runtime": 397.0752, "train_tokens_per_second": 2737.225 }, { "epoch": 0.276, "grad_norm": 1.8515625, "learning_rate": 1.69950905739288e-05, "loss": 0.2418, "mean_token_accuracy": 0.9215017199516297, "num_input_tokens_seen": 1090492, "num_tokens": 1090492.0, "step": 1380, "train_runtime": 398.3879, "train_tokens_per_second": 2737.262 }, { "epoch": 0.277, "grad_norm": 9.75, "learning_rate": 1.6971909012702483e-05, "loss": 0.4795, "mean_token_accuracy": 0.8581609725952148, "num_input_tokens_seen": 1093902, "num_tokens": 1093902.0, "step": 1385, "train_runtime": 399.6952, "train_tokens_per_second": 2736.841 }, { "epoch": 0.278, "grad_norm": 4.96875, "learning_rate": 1.6948654319449674e-05, "loss": 0.4869, "mean_token_accuracy": 0.8512163281440734, "num_input_tokens_seen": 1096200, "num_tokens": 1096200.0, "step": 1390, "train_runtime": 400.9841, "train_tokens_per_second": 2733.774 }, { "epoch": 0.279, "grad_norm": 5.65625, "learning_rate": 1.69253267381011e-05, "loss": 0.5716, "mean_token_accuracy": 0.8427738428115845, "num_input_tokens_seen": 1099569, "num_tokens": 1099569.0, "step": 1395, "train_runtime": 402.2983, "train_tokens_per_second": 2733.218 }, { "epoch": 0.28, "grad_norm": 6.1875, "learning_rate": 1.6901926513352052e-05, "loss": 0.4507, "mean_token_accuracy": 0.8652465105056762, "num_input_tokens_seen": 1103064, "num_tokens": 1103064.0, "step": 1400, "train_runtime": 403.621, "train_tokens_per_second": 2732.921 }, { "epoch": 0.281, "grad_norm": 4.5, "learning_rate": 1.6878453890659815e-05, "loss": 0.2425, "mean_token_accuracy": 0.920332396030426, "num_input_tokens_seen": 1107391, "num_tokens": 1107391.0, "step": 1405, "train_runtime": 405.0036, "train_tokens_per_second": 2734.274 }, { "epoch": 0.282, "grad_norm": 5.96875, "learning_rate": 1.685490911624109e-05, "loss": 0.4107, "mean_token_accuracy": 0.8746109724044799, "num_input_tokens_seen": 1110756, "num_tokens": 1110756.0, "step": 1410, "train_runtime": 406.2831, "train_tokens_per_second": 2733.946 }, { "epoch": 0.283, "grad_norm": 5.09375, "learning_rate": 1.6831292437069425e-05, "loss": 0.3496, "mean_token_accuracy": 0.8887304186820983, "num_input_tokens_seen": 1114054, "num_tokens": 1114054.0, "step": 1415, "train_runtime": 407.6087, "train_tokens_per_second": 2733.146 }, { "epoch": 0.284, "grad_norm": 7.21875, "learning_rate": 1.6807604100872604e-05, "loss": 0.3255, "mean_token_accuracy": 0.8966919183731079, "num_input_tokens_seen": 1117308, "num_tokens": 1117308.0, "step": 1420, "train_runtime": 408.9236, "train_tokens_per_second": 2732.315 }, { "epoch": 0.285, "grad_norm": 6.96875, "learning_rate": 1.6783844356130073e-05, "loss": 0.3484, "mean_token_accuracy": 0.8884515881538391, "num_input_tokens_seen": 1119847, "num_tokens": 1119847.0, "step": 1425, "train_runtime": 410.2169, "train_tokens_per_second": 2729.89 }, { "epoch": 0.286, "grad_norm": 12.0625, "learning_rate": 1.6760013452070304e-05, "loss": 0.3856, "mean_token_accuracy": 0.885358464717865, "num_input_tokens_seen": 1123256, "num_tokens": 1123256.0, "step": 1430, "train_runtime": 411.554, "train_tokens_per_second": 2729.304 }, { "epoch": 0.287, "grad_norm": 4.46875, "learning_rate": 1.6736111638668203e-05, "loss": 0.2603, "mean_token_accuracy": 0.9121795892715454, "num_input_tokens_seen": 1128236, "num_tokens": 1128236.0, "step": 1435, "train_runtime": 412.9517, "train_tokens_per_second": 2732.126 }, { "epoch": 0.288, "grad_norm": 5.6875, "learning_rate": 1.671213916664249e-05, "loss": 0.4436, "mean_token_accuracy": 0.881895923614502, "num_input_tokens_seen": 1132012, "num_tokens": 1132012.0, "step": 1440, "train_runtime": 414.2715, "train_tokens_per_second": 2732.536 }, { "epoch": 0.289, "grad_norm": 3.625, "learning_rate": 1.6688096287453048e-05, "loss": 0.4607, "mean_token_accuracy": 0.8540219783782959, "num_input_tokens_seen": 1135580, "num_tokens": 1135580.0, "step": 1445, "train_runtime": 415.6116, "train_tokens_per_second": 2732.311 }, { "epoch": 0.29, "grad_norm": 3.84375, "learning_rate": 1.66639832532983e-05, "loss": 0.4296, "mean_token_accuracy": 0.8724186539649963, "num_input_tokens_seen": 1138622, "num_tokens": 1138622.0, "step": 1450, "train_runtime": 416.9206, "train_tokens_per_second": 2731.028 }, { "epoch": 0.291, "grad_norm": 4.53125, "learning_rate": 1.663980031711257e-05, "loss": 0.4924, "mean_token_accuracy": 0.8534921526908874, "num_input_tokens_seen": 1141498, "num_tokens": 1141498.0, "step": 1455, "train_runtime": 418.2359, "train_tokens_per_second": 2729.316 }, { "epoch": 0.292, "grad_norm": 5.15625, "learning_rate": 1.661554773256341e-05, "loss": 0.4439, "mean_token_accuracy": 0.8630551338195801, "num_input_tokens_seen": 1145494, "num_tokens": 1145494.0, "step": 1460, "train_runtime": 419.5808, "train_tokens_per_second": 2730.091 }, { "epoch": 0.293, "grad_norm": 4.34375, "learning_rate": 1.6591225754048963e-05, "loss": 0.2226, "mean_token_accuracy": 0.9342629194259644, "num_input_tokens_seen": 1150032, "num_tokens": 1150032.0, "step": 1465, "train_runtime": 421.0488, "train_tokens_per_second": 2731.351 }, { "epoch": 0.294, "grad_norm": 6.5625, "learning_rate": 1.6566834636695264e-05, "loss": 0.3171, "mean_token_accuracy": 0.8947740912437439, "num_input_tokens_seen": 1152916, "num_tokens": 1152916.0, "step": 1470, "train_runtime": 422.3577, "train_tokens_per_second": 2729.715 }, { "epoch": 0.295, "grad_norm": 7.15625, "learning_rate": 1.6542374636353605e-05, "loss": 0.319, "mean_token_accuracy": 0.9039330840110779, "num_input_tokens_seen": 1156358, "num_tokens": 1156358.0, "step": 1475, "train_runtime": 423.7473, "train_tokens_per_second": 2728.886 }, { "epoch": 0.296, "grad_norm": 7.9375, "learning_rate": 1.6517846009597804e-05, "loss": 0.4551, "mean_token_accuracy": 0.8637072443962097, "num_input_tokens_seen": 1160492, "num_tokens": 1160492.0, "step": 1480, "train_runtime": 425.2186, "train_tokens_per_second": 2729.166 }, { "epoch": 0.297, "grad_norm": 5.1875, "learning_rate": 1.6493249013721558e-05, "loss": 0.3279, "mean_token_accuracy": 0.904472005367279, "num_input_tokens_seen": 1163652, "num_tokens": 1163652.0, "step": 1485, "train_runtime": 426.5984, "train_tokens_per_second": 2727.746 }, { "epoch": 0.298, "grad_norm": 2.953125, "learning_rate": 1.646858390673571e-05, "loss": 0.3966, "mean_token_accuracy": 0.8890823006629944, "num_input_tokens_seen": 1166460, "num_tokens": 1166460.0, "step": 1490, "train_runtime": 427.9248, "train_tokens_per_second": 2725.853 }, { "epoch": 0.299, "grad_norm": 18.125, "learning_rate": 1.644385094736556e-05, "loss": 0.4795, "mean_token_accuracy": 0.8650658369064331, "num_input_tokens_seen": 1169175, "num_tokens": 1169175.0, "step": 1495, "train_runtime": 429.2873, "train_tokens_per_second": 2723.525 }, { "epoch": 0.3, "grad_norm": 5.34375, "learning_rate": 1.6419050395048147e-05, "loss": 0.1976, "mean_token_accuracy": 0.9400353074073792, "num_input_tokens_seen": 1174002, "num_tokens": 1174002.0, "step": 1500, "train_runtime": 430.8167, "train_tokens_per_second": 2725.062 }, { "epoch": 0.301, "grad_norm": 4.34375, "learning_rate": 1.639418250992954e-05, "loss": 0.4356, "mean_token_accuracy": 0.8579196453094482, "num_input_tokens_seen": 1178102, "num_tokens": 1178102.0, "step": 1505, "train_runtime": 432.2342, "train_tokens_per_second": 2725.611 }, { "epoch": 0.302, "grad_norm": 12.0, "learning_rate": 1.636924755286207e-05, "loss": 0.5529, "mean_token_accuracy": 0.8390391945838929, "num_input_tokens_seen": 1182056, "num_tokens": 1182056.0, "step": 1510, "train_runtime": 433.7152, "train_tokens_per_second": 2725.42 }, { "epoch": 0.303, "grad_norm": 13.0, "learning_rate": 1.6344245785401653e-05, "loss": 0.2867, "mean_token_accuracy": 0.9180734515190124, "num_input_tokens_seen": 1184704, "num_tokens": 1184704.0, "step": 1515, "train_runtime": 435.0345, "train_tokens_per_second": 2723.241 }, { "epoch": 0.304, "grad_norm": 4.46875, "learning_rate": 1.631917746980499e-05, "loss": 0.313, "mean_token_accuracy": 0.9152883291244507, "num_input_tokens_seen": 1190086, "num_tokens": 1190086.0, "step": 1520, "train_runtime": 436.5231, "train_tokens_per_second": 2726.284 }, { "epoch": 0.305, "grad_norm": 3.546875, "learning_rate": 1.629404286902685e-05, "loss": 0.2344, "mean_token_accuracy": 0.9208229303359985, "num_input_tokens_seen": 1194412, "num_tokens": 1194412.0, "step": 1525, "train_runtime": 437.9832, "train_tokens_per_second": 2727.072 }, { "epoch": 0.306, "grad_norm": 4.8125, "learning_rate": 1.6268842246717307e-05, "loss": 0.3499, "mean_token_accuracy": 0.8941912174224853, "num_input_tokens_seen": 1198668, "num_tokens": 1198668.0, "step": 1530, "train_runtime": 439.3325, "train_tokens_per_second": 2728.385 }, { "epoch": 0.307, "grad_norm": 7.59375, "learning_rate": 1.624357586721896e-05, "loss": 0.4294, "mean_token_accuracy": 0.8654176831245423, "num_input_tokens_seen": 1201776, "num_tokens": 1201776.0, "step": 1535, "train_runtime": 440.6854, "train_tokens_per_second": 2727.061 }, { "epoch": 0.308, "grad_norm": 5.90625, "learning_rate": 1.6218243995564177e-05, "loss": 0.3436, "mean_token_accuracy": 0.8835726976394653, "num_input_tokens_seen": 1205828, "num_tokens": 1205828.0, "step": 1540, "train_runtime": 442.0433, "train_tokens_per_second": 2727.85 }, { "epoch": 0.309, "grad_norm": 9.3125, "learning_rate": 1.61928468974723e-05, "loss": 0.3526, "mean_token_accuracy": 0.8929942369461059, "num_input_tokens_seen": 1209471, "num_tokens": 1209471.0, "step": 1545, "train_runtime": 443.4144, "train_tokens_per_second": 2727.631 }, { "epoch": 0.31, "grad_norm": 4.65625, "learning_rate": 1.6167384839346872e-05, "loss": 0.376, "mean_token_accuracy": 0.8891526460647583, "num_input_tokens_seen": 1212388, "num_tokens": 1212388.0, "step": 1550, "train_runtime": 444.7552, "train_tokens_per_second": 2725.967 }, { "epoch": 0.311, "grad_norm": 9.9375, "learning_rate": 1.6141858088272838e-05, "loss": 0.5845, "mean_token_accuracy": 0.8364776134490967, "num_input_tokens_seen": 1215280, "num_tokens": 1215280.0, "step": 1555, "train_runtime": 446.0869, "train_tokens_per_second": 2724.312 }, { "epoch": 0.312, "grad_norm": 6.5625, "learning_rate": 1.6116266912013734e-05, "loss": 0.5478, "mean_token_accuracy": 0.8418053030967713, "num_input_tokens_seen": 1218644, "num_tokens": 1218644.0, "step": 1560, "train_runtime": 447.4448, "train_tokens_per_second": 2723.563 }, { "epoch": 0.313, "grad_norm": 3.796875, "learning_rate": 1.609061157900889e-05, "loss": 0.4766, "mean_token_accuracy": 0.8726443290710449, "num_input_tokens_seen": 1223361, "num_tokens": 1223361.0, "step": 1565, "train_runtime": 448.9378, "train_tokens_per_second": 2725.012 }, { "epoch": 0.314, "grad_norm": 4.21875, "learning_rate": 1.6064892358370608e-05, "loss": 0.3091, "mean_token_accuracy": 0.9060997486114502, "num_input_tokens_seen": 1228142, "num_tokens": 1228142.0, "step": 1570, "train_runtime": 450.3068, "train_tokens_per_second": 2727.345 }, { "epoch": 0.315, "grad_norm": 6.5625, "learning_rate": 1.603910951988135e-05, "loss": 0.3955, "mean_token_accuracy": 0.8809860944747925, "num_input_tokens_seen": 1232007, "num_tokens": 1232007.0, "step": 1575, "train_runtime": 451.7031, "train_tokens_per_second": 2727.471 }, { "epoch": 0.316, "grad_norm": 4.71875, "learning_rate": 1.601326333399088e-05, "loss": 0.239, "mean_token_accuracy": 0.922086501121521, "num_input_tokens_seen": 1236266, "num_tokens": 1236266.0, "step": 1580, "train_runtime": 453.1395, "train_tokens_per_second": 2728.224 }, { "epoch": 0.317, "grad_norm": 4.875, "learning_rate": 1.598735407181347e-05, "loss": 0.3103, "mean_token_accuracy": 0.8965014100074769, "num_input_tokens_seen": 1240103, "num_tokens": 1240103.0, "step": 1585, "train_runtime": 454.4869, "train_tokens_per_second": 2728.578 }, { "epoch": 0.318, "grad_norm": 7.1875, "learning_rate": 1.596138200512501e-05, "loss": 0.3539, "mean_token_accuracy": 0.8828752398490906, "num_input_tokens_seen": 1243756, "num_tokens": 1243756.0, "step": 1590, "train_runtime": 455.8171, "train_tokens_per_second": 2728.63 }, { "epoch": 0.319, "grad_norm": 4.9375, "learning_rate": 1.5935347406360192e-05, "loss": 0.3486, "mean_token_accuracy": 0.8902850866317749, "num_input_tokens_seen": 1247510, "num_tokens": 1247510.0, "step": 1595, "train_runtime": 457.1654, "train_tokens_per_second": 2728.793 }, { "epoch": 0.32, "grad_norm": 2.9375, "learning_rate": 1.5909250548609644e-05, "loss": 0.2869, "mean_token_accuracy": 0.8999109387397766, "num_input_tokens_seen": 1252078, "num_tokens": 1252078.0, "step": 1600, "train_runtime": 458.5633, "train_tokens_per_second": 2730.437 }, { "epoch": 0.321, "grad_norm": 3.90625, "learning_rate": 1.5883091705617045e-05, "loss": 0.2874, "mean_token_accuracy": 0.9061586618423462, "num_input_tokens_seen": 1258099, "num_tokens": 1258099.0, "step": 1605, "train_runtime": 460.128, "train_tokens_per_second": 2734.237 }, { "epoch": 0.322, "grad_norm": 3.890625, "learning_rate": 1.585687115177629e-05, "loss": 0.3432, "mean_token_accuracy": 0.9033243179321289, "num_input_tokens_seen": 1261674, "num_tokens": 1261674.0, "step": 1610, "train_runtime": 461.4648, "train_tokens_per_second": 2734.063 }, { "epoch": 0.323, "grad_norm": 3.296875, "learning_rate": 1.5830589162128574e-05, "loss": 0.4557, "mean_token_accuracy": 0.863908314704895, "num_input_tokens_seen": 1264885, "num_tokens": 1264885.0, "step": 1615, "train_runtime": 462.7821, "train_tokens_per_second": 2733.219 }, { "epoch": 0.324, "grad_norm": 7.96875, "learning_rate": 1.5804246012359535e-05, "loss": 0.3607, "mean_token_accuracy": 0.8884742975234985, "num_input_tokens_seen": 1270154, "num_tokens": 1270154.0, "step": 1620, "train_runtime": 464.2486, "train_tokens_per_second": 2735.935 }, { "epoch": 0.325, "grad_norm": 15.125, "learning_rate": 1.5777841978796348e-05, "loss": 0.5129, "mean_token_accuracy": 0.8510098934173584, "num_input_tokens_seen": 1274175, "num_tokens": 1274175.0, "step": 1625, "train_runtime": 465.6263, "train_tokens_per_second": 2736.476 }, { "epoch": 0.326, "grad_norm": 8.125, "learning_rate": 1.575137733840483e-05, "loss": 0.4741, "mean_token_accuracy": 0.8565960168838501, "num_input_tokens_seen": 1278140, "num_tokens": 1278140.0, "step": 1630, "train_runtime": 466.9543, "train_tokens_per_second": 2737.184 }, { "epoch": 0.327, "grad_norm": 6.0, "learning_rate": 1.572485236878654e-05, "loss": 0.3004, "mean_token_accuracy": 0.8983551144599915, "num_input_tokens_seen": 1281676, "num_tokens": 1281676.0, "step": 1635, "train_runtime": 468.2363, "train_tokens_per_second": 2737.242 }, { "epoch": 0.328, "grad_norm": 3.84375, "learning_rate": 1.5698267348175852e-05, "loss": 0.2986, "mean_token_accuracy": 0.8952516198158265, "num_input_tokens_seen": 1286288, "num_tokens": 1286288.0, "step": 1640, "train_runtime": 469.6177, "train_tokens_per_second": 2739.011 }, { "epoch": 0.329, "grad_norm": 4.3125, "learning_rate": 1.5671622555437055e-05, "loss": 0.48, "mean_token_accuracy": 0.8539553761482239, "num_input_tokens_seen": 1290241, "num_tokens": 1290241.0, "step": 1645, "train_runtime": 470.988, "train_tokens_per_second": 2739.435 }, { "epoch": 0.33, "grad_norm": 7.40625, "learning_rate": 1.5644918270061418e-05, "loss": 0.3883, "mean_token_accuracy": 0.8611080050468445, "num_input_tokens_seen": 1292286, "num_tokens": 1292286.0, "step": 1650, "train_runtime": 472.3019, "train_tokens_per_second": 2736.144 }, { "epoch": 0.331, "grad_norm": 4.21875, "learning_rate": 1.5618154772164257e-05, "loss": 0.2928, "mean_token_accuracy": 0.9078295946121215, "num_input_tokens_seen": 1296849, "num_tokens": 1296849.0, "step": 1655, "train_runtime": 474.5184, "train_tokens_per_second": 2732.979 }, { "epoch": 0.332, "grad_norm": 5.25, "learning_rate": 1.5591332342482002e-05, "loss": 0.3343, "mean_token_accuracy": 0.899256420135498, "num_input_tokens_seen": 1301432, "num_tokens": 1301432.0, "step": 1660, "train_runtime": 475.9265, "train_tokens_per_second": 2734.523 }, { "epoch": 0.333, "grad_norm": 3.515625, "learning_rate": 1.5564451262369247e-05, "loss": 0.2112, "mean_token_accuracy": 0.9284720182418823, "num_input_tokens_seen": 1305175, "num_tokens": 1305175.0, "step": 1665, "train_runtime": 477.2473, "train_tokens_per_second": 2734.798 }, { "epoch": 0.334, "grad_norm": 4.125, "learning_rate": 1.55375118137958e-05, "loss": 0.3074, "mean_token_accuracy": 0.914725911617279, "num_input_tokens_seen": 1309082, "num_tokens": 1309082.0, "step": 1670, "train_runtime": 478.5953, "train_tokens_per_second": 2735.259 }, { "epoch": 0.335, "grad_norm": 4.75, "learning_rate": 1.5510514279343736e-05, "loss": 0.4159, "mean_token_accuracy": 0.858314061164856, "num_input_tokens_seen": 1311270, "num_tokens": 1311270.0, "step": 1675, "train_runtime": 479.8907, "train_tokens_per_second": 2732.435 }, { "epoch": 0.336, "grad_norm": 2.90625, "learning_rate": 1.5483458942204407e-05, "loss": 0.195, "mean_token_accuracy": 0.936877703666687, "num_input_tokens_seen": 1316562, "num_tokens": 1316562.0, "step": 1680, "train_runtime": 481.3543, "train_tokens_per_second": 2735.12 }, { "epoch": 0.337, "grad_norm": 6.28125, "learning_rate": 1.5456346086175508e-05, "loss": 0.3471, "mean_token_accuracy": 0.8903882741928101, "num_input_tokens_seen": 1319054, "num_tokens": 1319054.0, "step": 1685, "train_runtime": 482.6243, "train_tokens_per_second": 2733.086 }, { "epoch": 0.338, "grad_norm": 9.0, "learning_rate": 1.542917599565806e-05, "loss": 0.3655, "mean_token_accuracy": 0.8902895212173462, "num_input_tokens_seen": 1323050, "num_tokens": 1323050.0, "step": 1690, "train_runtime": 484.0226, "train_tokens_per_second": 2733.446 }, { "epoch": 0.339, "grad_norm": 4.125, "learning_rate": 1.540194895565346e-05, "loss": 0.6057, "mean_token_accuracy": 0.8183093190193176, "num_input_tokens_seen": 1326686, "num_tokens": 1326686.0, "step": 1695, "train_runtime": 485.3602, "train_tokens_per_second": 2733.405 }, { "epoch": 0.34, "grad_norm": 9.25, "learning_rate": 1.5374665251760474e-05, "loss": 0.3168, "mean_token_accuracy": 0.8834209084510803, "num_input_tokens_seen": 1329388, "num_tokens": 1329388.0, "step": 1700, "train_runtime": 486.6418, "train_tokens_per_second": 2731.759 }, { "epoch": 0.341, "grad_norm": 5.09375, "learning_rate": 1.5347325170172246e-05, "loss": 0.4745, "mean_token_accuracy": 0.8453391075134278, "num_input_tokens_seen": 1331655, "num_tokens": 1331655.0, "step": 1705, "train_runtime": 487.9428, "train_tokens_per_second": 2729.121 }, { "epoch": 0.342, "grad_norm": 3.703125, "learning_rate": 1.531992899767329e-05, "loss": 0.2648, "mean_token_accuracy": 0.918376910686493, "num_input_tokens_seen": 1336544, "num_tokens": 1336544.0, "step": 1710, "train_runtime": 489.4456, "train_tokens_per_second": 2730.73 }, { "epoch": 0.343, "grad_norm": 8.875, "learning_rate": 1.5292477021636498e-05, "loss": 0.2836, "mean_token_accuracy": 0.9088773965835572, "num_input_tokens_seen": 1340755, "num_tokens": 1340755.0, "step": 1715, "train_runtime": 490.8225, "train_tokens_per_second": 2731.649 }, { "epoch": 0.344, "grad_norm": 3.390625, "learning_rate": 1.5264969530020105e-05, "loss": 0.3069, "mean_token_accuracy": 0.9017263770103454, "num_input_tokens_seen": 1344922, "num_tokens": 1344922.0, "step": 1720, "train_runtime": 492.1729, "train_tokens_per_second": 2732.621 }, { "epoch": 0.345, "grad_norm": 12.6875, "learning_rate": 1.5237406811364682e-05, "loss": 0.4221, "mean_token_accuracy": 0.8690052151679992, "num_input_tokens_seen": 1349096, "num_tokens": 1349096.0, "step": 1725, "train_runtime": 493.5873, "train_tokens_per_second": 2733.247 }, { "epoch": 0.346, "grad_norm": 3.21875, "learning_rate": 1.5209789154790107e-05, "loss": 0.3546, "mean_token_accuracy": 0.8887965559959412, "num_input_tokens_seen": 1353228, "num_tokens": 1353228.0, "step": 1730, "train_runtime": 494.9611, "train_tokens_per_second": 2734.009 }, { "epoch": 0.347, "grad_norm": 4.53125, "learning_rate": 1.5182116849992528e-05, "loss": 0.4081, "mean_token_accuracy": 0.871317195892334, "num_input_tokens_seen": 1357693, "num_tokens": 1357693.0, "step": 1735, "train_runtime": 496.3109, "train_tokens_per_second": 2735.57 }, { "epoch": 0.348, "grad_norm": 9.5, "learning_rate": 1.5154390187241328e-05, "loss": 0.3959, "mean_token_accuracy": 0.8937503576278687, "num_input_tokens_seen": 1361580, "num_tokens": 1361580.0, "step": 1740, "train_runtime": 497.6866, "train_tokens_per_second": 2735.818 }, { "epoch": 0.349, "grad_norm": 4.40625, "learning_rate": 1.512660945737608e-05, "loss": 0.4503, "mean_token_accuracy": 0.8690906643867493, "num_input_tokens_seen": 1365285, "num_tokens": 1365285.0, "step": 1745, "train_runtime": 499.0906, "train_tokens_per_second": 2735.546 }, { "epoch": 0.35, "grad_norm": 5.15625, "learning_rate": 1.5098774951803492e-05, "loss": 0.4635, "mean_token_accuracy": 0.8728008508682251, "num_input_tokens_seen": 1368706, "num_tokens": 1368706.0, "step": 1750, "train_runtime": 500.4987, "train_tokens_per_second": 2734.684 }, { "epoch": 0.351, "grad_norm": 3.375, "learning_rate": 1.507088696249436e-05, "loss": 0.3882, "mean_token_accuracy": 0.8878094792366028, "num_input_tokens_seen": 1372455, "num_tokens": 1372455.0, "step": 1755, "train_runtime": 501.7939, "train_tokens_per_second": 2735.097 }, { "epoch": 0.352, "grad_norm": 6.34375, "learning_rate": 1.5042945781980494e-05, "loss": 0.3036, "mean_token_accuracy": 0.9164047122001648, "num_input_tokens_seen": 1375942, "num_tokens": 1375942.0, "step": 1760, "train_runtime": 503.1442, "train_tokens_per_second": 2734.687 }, { "epoch": 0.353, "grad_norm": 7.875, "learning_rate": 1.5014951703351655e-05, "loss": 0.372, "mean_token_accuracy": 0.8814056634902954, "num_input_tokens_seen": 1378851, "num_tokens": 1378851.0, "step": 1765, "train_runtime": 504.4583, "train_tokens_per_second": 2733.33 }, { "epoch": 0.354, "grad_norm": 2.875, "learning_rate": 1.4986905020252482e-05, "loss": 0.2687, "mean_token_accuracy": 0.9079343557357789, "num_input_tokens_seen": 1383938, "num_tokens": 1383938.0, "step": 1770, "train_runtime": 505.8701, "train_tokens_per_second": 2735.758 }, { "epoch": 0.355, "grad_norm": 17.375, "learning_rate": 1.4958806026879411e-05, "loss": 0.4914, "mean_token_accuracy": 0.8433244347572326, "num_input_tokens_seen": 1387254, "num_tokens": 1387254.0, "step": 1775, "train_runtime": 507.1933, "train_tokens_per_second": 2735.158 }, { "epoch": 0.356, "grad_norm": 11.375, "learning_rate": 1.4930655017977583e-05, "loss": 0.5745, "mean_token_accuracy": 0.8225432753562927, "num_input_tokens_seen": 1390956, "num_tokens": 1390956.0, "step": 1780, "train_runtime": 508.5157, "train_tokens_per_second": 2735.326 }, { "epoch": 0.357, "grad_norm": 2.953125, "learning_rate": 1.4902452288837761e-05, "loss": 0.2893, "mean_token_accuracy": 0.9090275406837464, "num_input_tokens_seen": 1393802, "num_tokens": 1393802.0, "step": 1785, "train_runtime": 509.829, "train_tokens_per_second": 2733.862 }, { "epoch": 0.358, "grad_norm": 8.1875, "learning_rate": 1.4874198135293232e-05, "loss": 0.3913, "mean_token_accuracy": 0.8779014348983765, "num_input_tokens_seen": 1398106, "num_tokens": 1398106.0, "step": 1790, "train_runtime": 511.2561, "train_tokens_per_second": 2734.649 }, { "epoch": 0.359, "grad_norm": 5.0625, "learning_rate": 1.4845892853716692e-05, "loss": 0.528, "mean_token_accuracy": 0.8401447772979737, "num_input_tokens_seen": 1401288, "num_tokens": 1401288.0, "step": 1795, "train_runtime": 512.5385, "train_tokens_per_second": 2734.015 }, { "epoch": 0.36, "grad_norm": 8.8125, "learning_rate": 1.4817536741017153e-05, "loss": 0.4439, "mean_token_accuracy": 0.8615753293037415, "num_input_tokens_seen": 1405606, "num_tokens": 1405606.0, "step": 1800, "train_runtime": 513.9322, "train_tokens_per_second": 2735.003 }, { "epoch": 0.361, "grad_norm": 10.125, "learning_rate": 1.478913009463682e-05, "loss": 0.3117, "mean_token_accuracy": 0.9050674915313721, "num_input_tokens_seen": 1409995, "num_tokens": 1409995.0, "step": 1805, "train_runtime": 515.3747, "train_tokens_per_second": 2735.864 }, { "epoch": 0.362, "grad_norm": 7.21875, "learning_rate": 1.4760673212547975e-05, "loss": 0.4903, "mean_token_accuracy": 0.8609210968017578, "num_input_tokens_seen": 1411724, "num_tokens": 1411724.0, "step": 1810, "train_runtime": 516.6685, "train_tokens_per_second": 2732.359 }, { "epoch": 0.363, "grad_norm": 8.875, "learning_rate": 1.473216639324984e-05, "loss": 0.4058, "mean_token_accuracy": 0.8812272071838378, "num_input_tokens_seen": 1414087, "num_tokens": 1414087.0, "step": 1815, "train_runtime": 517.9592, "train_tokens_per_second": 2730.113 }, { "epoch": 0.364, "grad_norm": 4.71875, "learning_rate": 1.4703609935765463e-05, "loss": 0.3677, "mean_token_accuracy": 0.8769798755645752, "num_input_tokens_seen": 1418654, "num_tokens": 1418654.0, "step": 1820, "train_runtime": 519.3342, "train_tokens_per_second": 2731.678 }, { "epoch": 0.365, "grad_norm": 15.25, "learning_rate": 1.467500413963857e-05, "loss": 0.3667, "mean_token_accuracy": 0.8811217784881592, "num_input_tokens_seen": 1421907, "num_tokens": 1421907.0, "step": 1825, "train_runtime": 520.6277, "train_tokens_per_second": 2731.14 }, { "epoch": 0.366, "grad_norm": 2.625, "learning_rate": 1.4646349304930426e-05, "loss": 0.331, "mean_token_accuracy": 0.897026252746582, "num_input_tokens_seen": 1426568, "num_tokens": 1426568.0, "step": 1830, "train_runtime": 521.9999, "train_tokens_per_second": 2732.889 }, { "epoch": 0.367, "grad_norm": 14.75, "learning_rate": 1.4617645732216686e-05, "loss": 0.3936, "mean_token_accuracy": 0.8652776837348938, "num_input_tokens_seen": 1429423, "num_tokens": 1429423.0, "step": 1835, "train_runtime": 523.3625, "train_tokens_per_second": 2731.229 }, { "epoch": 0.368, "grad_norm": 5.375, "learning_rate": 1.4588893722584247e-05, "loss": 0.2626, "mean_token_accuracy": 0.9183725833892822, "num_input_tokens_seen": 1432446, "num_tokens": 1432446.0, "step": 1840, "train_runtime": 524.6578, "train_tokens_per_second": 2730.248 }, { "epoch": 0.369, "grad_norm": 6.28125, "learning_rate": 1.456009357762809e-05, "loss": 0.2919, "mean_token_accuracy": 0.9049648761749267, "num_input_tokens_seen": 1436465, "num_tokens": 1436465.0, "step": 1845, "train_runtime": 526.0172, "train_tokens_per_second": 2730.833 }, { "epoch": 0.37, "grad_norm": 8.0, "learning_rate": 1.4531245599448099e-05, "loss": 0.3978, "mean_token_accuracy": 0.8905661225318908, "num_input_tokens_seen": 1439406, "num_tokens": 1439406.0, "step": 1850, "train_runtime": 527.3748, "train_tokens_per_second": 2729.379 }, { "epoch": 0.371, "grad_norm": 8.6875, "learning_rate": 1.4502350090645919e-05, "loss": 0.5778, "mean_token_accuracy": 0.8246065855026246, "num_input_tokens_seen": 1442619, "num_tokens": 1442619.0, "step": 1855, "train_runtime": 528.6662, "train_tokens_per_second": 2728.79 }, { "epoch": 0.372, "grad_norm": 8.3125, "learning_rate": 1.4473407354321763e-05, "loss": 0.393, "mean_token_accuracy": 0.893315052986145, "num_input_tokens_seen": 1445476, "num_tokens": 1445476.0, "step": 1860, "train_runtime": 529.9658, "train_tokens_per_second": 2727.489 }, { "epoch": 0.373, "grad_norm": 13.6875, "learning_rate": 1.4444417694071242e-05, "loss": 0.5041, "mean_token_accuracy": 0.8483460783958435, "num_input_tokens_seen": 1448993, "num_tokens": 1448993.0, "step": 1865, "train_runtime": 531.2868, "train_tokens_per_second": 2727.327 }, { "epoch": 0.374, "grad_norm": 7.15625, "learning_rate": 1.4415381413982168e-05, "loss": 0.5209, "mean_token_accuracy": 0.8453391194343567, "num_input_tokens_seen": 1451752, "num_tokens": 1451752.0, "step": 1870, "train_runtime": 532.6034, "train_tokens_per_second": 2725.765 }, { "epoch": 0.375, "grad_norm": 4.65625, "learning_rate": 1.4386298818631388e-05, "loss": 0.33, "mean_token_accuracy": 0.9009163737297058, "num_input_tokens_seen": 1455937, "num_tokens": 1455937.0, "step": 1875, "train_runtime": 533.9923, "train_tokens_per_second": 2726.513 }, { "epoch": 0.376, "grad_norm": 4.34375, "learning_rate": 1.4357170213081556e-05, "loss": 0.2716, "mean_token_accuracy": 0.9017189979553223, "num_input_tokens_seen": 1459052, "num_tokens": 1459052.0, "step": 1880, "train_runtime": 535.2531, "train_tokens_per_second": 2725.91 }, { "epoch": 0.377, "grad_norm": 29.125, "learning_rate": 1.4327995902877972e-05, "loss": 0.4419, "mean_token_accuracy": 0.8788858532905579, "num_input_tokens_seen": 1461848, "num_tokens": 1461848.0, "step": 1885, "train_runtime": 536.5527, "train_tokens_per_second": 2724.519 }, { "epoch": 0.378, "grad_norm": 10.5625, "learning_rate": 1.4298776194045337e-05, "loss": 0.5692, "mean_token_accuracy": 0.8311569213867187, "num_input_tokens_seen": 1465946, "num_tokens": 1465946.0, "step": 1890, "train_runtime": 537.8933, "train_tokens_per_second": 2725.347 }, { "epoch": 0.379, "grad_norm": 4.1875, "learning_rate": 1.4269511393084572e-05, "loss": 0.3279, "mean_token_accuracy": 0.894130539894104, "num_input_tokens_seen": 1469856, "num_tokens": 1469856.0, "step": 1895, "train_runtime": 539.2565, "train_tokens_per_second": 2725.709 }, { "epoch": 0.38, "grad_norm": 7.03125, "learning_rate": 1.4240201806969594e-05, "loss": 0.507, "mean_token_accuracy": 0.859413743019104, "num_input_tokens_seen": 1472214, "num_tokens": 1472214.0, "step": 1900, "train_runtime": 540.5186, "train_tokens_per_second": 2723.706 }, { "epoch": 0.381, "grad_norm": 9.0, "learning_rate": 1.4210847743144087e-05, "loss": 0.5106, "mean_token_accuracy": 0.8437602400779725, "num_input_tokens_seen": 1476482, "num_tokens": 1476482.0, "step": 1905, "train_runtime": 541.9684, "train_tokens_per_second": 2724.295 }, { "epoch": 0.382, "grad_norm": 4.53125, "learning_rate": 1.4181449509518292e-05, "loss": 0.3337, "mean_token_accuracy": 0.9039829969406128, "num_input_tokens_seen": 1478646, "num_tokens": 1478646.0, "step": 1910, "train_runtime": 543.2997, "train_tokens_per_second": 2721.603 }, { "epoch": 0.383, "grad_norm": 7.59375, "learning_rate": 1.4152007414465771e-05, "loss": 0.3767, "mean_token_accuracy": 0.8849639058113098, "num_input_tokens_seen": 1481200, "num_tokens": 1481200.0, "step": 1915, "train_runtime": 544.6117, "train_tokens_per_second": 2719.736 }, { "epoch": 0.384, "grad_norm": 3.75, "learning_rate": 1.4122521766820172e-05, "loss": 0.3022, "mean_token_accuracy": 0.9061004638671875, "num_input_tokens_seen": 1484292, "num_tokens": 1484292.0, "step": 1920, "train_runtime": 545.943, "train_tokens_per_second": 2718.767 }, { "epoch": 0.385, "grad_norm": 5.375, "learning_rate": 1.409299287587198e-05, "loss": 0.1544, "mean_token_accuracy": 0.9460370540618896, "num_input_tokens_seen": 1488742, "num_tokens": 1488742.0, "step": 1925, "train_runtime": 547.2891, "train_tokens_per_second": 2720.211 }, { "epoch": 0.386, "grad_norm": 5.71875, "learning_rate": 1.406342105136529e-05, "loss": 0.2631, "mean_token_accuracy": 0.920508861541748, "num_input_tokens_seen": 1491722, "num_tokens": 1491722.0, "step": 1930, "train_runtime": 548.5948, "train_tokens_per_second": 2719.169 }, { "epoch": 0.387, "grad_norm": 5.3125, "learning_rate": 1.403380660349455e-05, "loss": 0.3088, "mean_token_accuracy": 0.9029843926429748, "num_input_tokens_seen": 1497495, "num_tokens": 1497495.0, "step": 1935, "train_runtime": 550.079, "train_tokens_per_second": 2722.327 }, { "epoch": 0.388, "grad_norm": 3.28125, "learning_rate": 1.4004149842901305e-05, "loss": 0.3075, "mean_token_accuracy": 0.9011724710464477, "num_input_tokens_seen": 1501518, "num_tokens": 1501518.0, "step": 1940, "train_runtime": 551.4578, "train_tokens_per_second": 2722.816 }, { "epoch": 0.389, "grad_norm": 3.546875, "learning_rate": 1.3974451080670934e-05, "loss": 0.273, "mean_token_accuracy": 0.9133571267127991, "num_input_tokens_seen": 1506021, "num_tokens": 1506021.0, "step": 1945, "train_runtime": 552.868, "train_tokens_per_second": 2724.015 }, { "epoch": 0.39, "grad_norm": 4.25, "learning_rate": 1.3944710628329409e-05, "loss": 0.3808, "mean_token_accuracy": 0.8746022820472718, "num_input_tokens_seen": 1510356, "num_tokens": 1510356.0, "step": 1950, "train_runtime": 554.231, "train_tokens_per_second": 2725.138 }, { "epoch": 0.391, "grad_norm": 5.40625, "learning_rate": 1.3914928797839996e-05, "loss": 0.1708, "mean_token_accuracy": 0.9450325250625611, "num_input_tokens_seen": 1514863, "num_tokens": 1514863.0, "step": 1955, "train_runtime": 555.7294, "train_tokens_per_second": 2725.901 }, { "epoch": 0.392, "grad_norm": 18.75, "learning_rate": 1.3885105901600006e-05, "loss": 0.4176, "mean_token_accuracy": 0.8832451820373535, "num_input_tokens_seen": 1517922, "num_tokens": 1517922.0, "step": 1960, "train_runtime": 557.0567, "train_tokens_per_second": 2724.897 }, { "epoch": 0.393, "grad_norm": 3.71875, "learning_rate": 1.3855242252437511e-05, "loss": 0.5018, "mean_token_accuracy": 0.8400147557258606, "num_input_tokens_seen": 1522079, "num_tokens": 1522079.0, "step": 1965, "train_runtime": 561.2358, "train_tokens_per_second": 2712.013 }, { "epoch": 0.394, "grad_norm": 6.6875, "learning_rate": 1.3825338163608055e-05, "loss": 0.215, "mean_token_accuracy": 0.9346987128257751, "num_input_tokens_seen": 1526446, "num_tokens": 1526446.0, "step": 1970, "train_runtime": 562.7258, "train_tokens_per_second": 2712.593 }, { "epoch": 0.395, "grad_norm": 4.53125, "learning_rate": 1.3795393948791382e-05, "loss": 0.3247, "mean_token_accuracy": 0.8956392765045166, "num_input_tokens_seen": 1530525, "num_tokens": 1530525.0, "step": 1975, "train_runtime": 564.1552, "train_tokens_per_second": 2712.95 }, { "epoch": 0.396, "grad_norm": 3.8125, "learning_rate": 1.3765409922088137e-05, "loss": 0.3458, "mean_token_accuracy": 0.9032378315925598, "num_input_tokens_seen": 1535636, "num_tokens": 1535636.0, "step": 1980, "train_runtime": 565.6427, "train_tokens_per_second": 2714.851 }, { "epoch": 0.397, "grad_norm": 6.125, "learning_rate": 1.373538639801657e-05, "loss": 0.3706, "mean_token_accuracy": 0.8847505211830139, "num_input_tokens_seen": 1538853, "num_tokens": 1538853.0, "step": 1985, "train_runtime": 566.9658, "train_tokens_per_second": 2714.19 }, { "epoch": 0.398, "grad_norm": 7.28125, "learning_rate": 1.370532369150924e-05, "loss": 0.1554, "mean_token_accuracy": 0.9498430013656616, "num_input_tokens_seen": 1543328, "num_tokens": 1543328.0, "step": 1990, "train_runtime": 568.4525, "train_tokens_per_second": 2714.964 }, { "epoch": 0.399, "grad_norm": 5.03125, "learning_rate": 1.3675222117909716e-05, "loss": 0.517, "mean_token_accuracy": 0.8419507265090942, "num_input_tokens_seen": 1546968, "num_tokens": 1546968.0, "step": 1995, "train_runtime": 569.8434, "train_tokens_per_second": 2714.725 }, { "epoch": 0.4, "grad_norm": 9.0625, "learning_rate": 1.3645081992969262e-05, "loss": 0.566, "mean_token_accuracy": 0.8237776041030884, "num_input_tokens_seen": 1549944, "num_tokens": 1549944.0, "step": 2000, "train_runtime": 571.105, "train_tokens_per_second": 2713.939 }, { "epoch": 0.401, "grad_norm": 14.0625, "learning_rate": 1.3614903632843523e-05, "loss": 0.3439, "mean_token_accuracy": 0.895255196094513, "num_input_tokens_seen": 1555583, "num_tokens": 1555583.0, "step": 2005, "train_runtime": 572.6316, "train_tokens_per_second": 2716.551 }, { "epoch": 0.402, "grad_norm": 7.125, "learning_rate": 1.3584687354089222e-05, "loss": 0.2885, "mean_token_accuracy": 0.8994025588035583, "num_input_tokens_seen": 1559040, "num_tokens": 1559040.0, "step": 2010, "train_runtime": 574.0407, "train_tokens_per_second": 2715.905 }, { "epoch": 0.403, "grad_norm": 5.96875, "learning_rate": 1.3554433473660818e-05, "loss": 0.3643, "mean_token_accuracy": 0.8918593525886536, "num_input_tokens_seen": 1564254, "num_tokens": 1564254.0, "step": 2015, "train_runtime": 575.5929, "train_tokens_per_second": 2717.64 }, { "epoch": 0.404, "grad_norm": 9.5625, "learning_rate": 1.3524142308907205e-05, "loss": 0.3808, "mean_token_accuracy": 0.8884156346321106, "num_input_tokens_seen": 1569780, "num_tokens": 1569780.0, "step": 2020, "train_runtime": 577.0471, "train_tokens_per_second": 2720.367 }, { "epoch": 0.405, "grad_norm": 5.03125, "learning_rate": 1.3493814177568365e-05, "loss": 0.3054, "mean_token_accuracy": 0.8950094819068909, "num_input_tokens_seen": 1575934, "num_tokens": 1575934.0, "step": 2025, "train_runtime": 578.557, "train_tokens_per_second": 2723.904 }, { "epoch": 0.406, "grad_norm": 20.125, "learning_rate": 1.3463449397772045e-05, "loss": 0.4324, "mean_token_accuracy": 0.8652998685836792, "num_input_tokens_seen": 1579396, "num_tokens": 1579396.0, "step": 2030, "train_runtime": 579.8813, "train_tokens_per_second": 2723.654 }, { "epoch": 0.407, "grad_norm": 4.8125, "learning_rate": 1.3433048288030424e-05, "loss": 0.3024, "mean_token_accuracy": 0.9057637214660644, "num_input_tokens_seen": 1582928, "num_tokens": 1582928.0, "step": 2035, "train_runtime": 581.185, "train_tokens_per_second": 2723.621 }, { "epoch": 0.408, "grad_norm": 27.875, "learning_rate": 1.3402611167236748e-05, "loss": 0.3592, "mean_token_accuracy": 0.8928472757339477, "num_input_tokens_seen": 1586220, "num_tokens": 1586220.0, "step": 2040, "train_runtime": 582.4366, "train_tokens_per_second": 2723.421 }, { "epoch": 0.409, "grad_norm": 11.4375, "learning_rate": 1.3372138354662018e-05, "loss": 0.3689, "mean_token_accuracy": 0.8966154336929322, "num_input_tokens_seen": 1591013, "num_tokens": 1591013.0, "step": 2045, "train_runtime": 583.9526, "train_tokens_per_second": 2724.559 }, { "epoch": 0.41, "grad_norm": 4.59375, "learning_rate": 1.3341630169951616e-05, "loss": 0.325, "mean_token_accuracy": 0.8969388484954834, "num_input_tokens_seen": 1595716, "num_tokens": 1595716.0, "step": 2050, "train_runtime": 585.3872, "train_tokens_per_second": 2725.915 }, { "epoch": 0.411, "grad_norm": 11.25, "learning_rate": 1.3311086933121961e-05, "loss": 0.3898, "mean_token_accuracy": 0.8860350251197815, "num_input_tokens_seen": 1601957, "num_tokens": 1601957.0, "step": 2055, "train_runtime": 586.9583, "train_tokens_per_second": 2729.252 }, { "epoch": 0.412, "grad_norm": 4.78125, "learning_rate": 1.3280508964557162e-05, "loss": 0.2818, "mean_token_accuracy": 0.9157093048095704, "num_input_tokens_seen": 1604860, "num_tokens": 1604860.0, "step": 2060, "train_runtime": 588.2678, "train_tokens_per_second": 2728.111 }, { "epoch": 0.413, "grad_norm": 5.375, "learning_rate": 1.3249896585005628e-05, "loss": 0.3648, "mean_token_accuracy": 0.8904520988464355, "num_input_tokens_seen": 1608139, "num_tokens": 1608139.0, "step": 2065, "train_runtime": 589.5731, "train_tokens_per_second": 2727.633 }, { "epoch": 0.414, "grad_norm": 8.1875, "learning_rate": 1.3219250115576745e-05, "loss": 0.3784, "mean_token_accuracy": 0.8896842718124389, "num_input_tokens_seen": 1611440, "num_tokens": 1611440.0, "step": 2070, "train_runtime": 590.8942, "train_tokens_per_second": 2727.121 }, { "epoch": 0.415, "grad_norm": 14.8125, "learning_rate": 1.3188569877737474e-05, "loss": 0.4041, "mean_token_accuracy": 0.8682814717292786, "num_input_tokens_seen": 1615742, "num_tokens": 1615742.0, "step": 2075, "train_runtime": 592.2675, "train_tokens_per_second": 2728.061 }, { "epoch": 0.416, "grad_norm": 7.28125, "learning_rate": 1.3157856193308988e-05, "loss": 0.4649, "mean_token_accuracy": 0.8379541397094726, "num_input_tokens_seen": 1619172, "num_tokens": 1619172.0, "step": 2080, "train_runtime": 593.6879, "train_tokens_per_second": 2727.312 }, { "epoch": 0.417, "grad_norm": 4.875, "learning_rate": 1.312710938446331e-05, "loss": 0.3506, "mean_token_accuracy": 0.8798881530761719, "num_input_tokens_seen": 1621582, "num_tokens": 1621582.0, "step": 2085, "train_runtime": 594.9829, "train_tokens_per_second": 2725.426 }, { "epoch": 0.418, "grad_norm": 6.75, "learning_rate": 1.309632977371991e-05, "loss": 0.2691, "mean_token_accuracy": 0.9161506056785583, "num_input_tokens_seen": 1629038, "num_tokens": 1629038.0, "step": 2090, "train_runtime": 596.6444, "train_tokens_per_second": 2730.333 }, { "epoch": 0.419, "grad_norm": 10.125, "learning_rate": 1.3065517683942339e-05, "loss": 0.4228, "mean_token_accuracy": 0.8642907261848449, "num_input_tokens_seen": 1633285, "num_tokens": 1633285.0, "step": 2095, "train_runtime": 598.0125, "train_tokens_per_second": 2731.189 }, { "epoch": 0.42, "grad_norm": 2.8125, "learning_rate": 1.3034673438334841e-05, "loss": 0.4915, "mean_token_accuracy": 0.8387593388557434, "num_input_tokens_seen": 1637038, "num_tokens": 1637038.0, "step": 2100, "train_runtime": 599.4016, "train_tokens_per_second": 2731.12 }, { "epoch": 0.421, "grad_norm": 8.8125, "learning_rate": 1.3003797360438961e-05, "loss": 0.5165, "mean_token_accuracy": 0.8393840789794922, "num_input_tokens_seen": 1641127, "num_tokens": 1641127.0, "step": 2105, "train_runtime": 600.8437, "train_tokens_per_second": 2731.371 }, { "epoch": 0.422, "grad_norm": 3.765625, "learning_rate": 1.297288977413014e-05, "loss": 0.2927, "mean_token_accuracy": 0.8987934708595275, "num_input_tokens_seen": 1644768, "num_tokens": 1644768.0, "step": 2110, "train_runtime": 602.1758, "train_tokens_per_second": 2731.375 }, { "epoch": 0.423, "grad_norm": 4.25, "learning_rate": 1.2941951003614337e-05, "loss": 0.3698, "mean_token_accuracy": 0.8691359996795655, "num_input_tokens_seen": 1647662, "num_tokens": 1647662.0, "step": 2115, "train_runtime": 603.4884, "train_tokens_per_second": 2730.23 }, { "epoch": 0.424, "grad_norm": 4.0, "learning_rate": 1.2910981373424614e-05, "loss": 0.3312, "mean_token_accuracy": 0.8966627955436707, "num_input_tokens_seen": 1651660, "num_tokens": 1651660.0, "step": 2120, "train_runtime": 606.3131, "train_tokens_per_second": 2724.104 }, { "epoch": 0.425, "grad_norm": 7.3125, "learning_rate": 1.2879981208417735e-05, "loss": 0.4595, "mean_token_accuracy": 0.8588733911514282, "num_input_tokens_seen": 1655353, "num_tokens": 1655353.0, "step": 2125, "train_runtime": 607.7494, "train_tokens_per_second": 2723.743 }, { "epoch": 0.426, "grad_norm": 18.5, "learning_rate": 1.2848950833770764e-05, "loss": 0.6376, "mean_token_accuracy": 0.801353394985199, "num_input_tokens_seen": 1660222, "num_tokens": 1660222.0, "step": 2130, "train_runtime": 609.1909, "train_tokens_per_second": 2725.29 }, { "epoch": 0.427, "grad_norm": 2.765625, "learning_rate": 1.2817890574977648e-05, "loss": 0.3933, "mean_token_accuracy": 0.882224977016449, "num_input_tokens_seen": 1663399, "num_tokens": 1663399.0, "step": 2135, "train_runtime": 610.5507, "train_tokens_per_second": 2724.424 }, { "epoch": 0.428, "grad_norm": 6.21875, "learning_rate": 1.2786800757845802e-05, "loss": 0.3157, "mean_token_accuracy": 0.9001128673553467, "num_input_tokens_seen": 1667418, "num_tokens": 1667418.0, "step": 2140, "train_runtime": 611.8995, "train_tokens_per_second": 2724.987 }, { "epoch": 0.429, "grad_norm": 5.28125, "learning_rate": 1.2755681708492696e-05, "loss": 0.2464, "mean_token_accuracy": 0.9160644888877869, "num_input_tokens_seen": 1670855, "num_tokens": 1670855.0, "step": 2145, "train_runtime": 613.2198, "train_tokens_per_second": 2724.724 }, { "epoch": 0.43, "grad_norm": 3.90625, "learning_rate": 1.2724533753342433e-05, "loss": 0.2774, "mean_token_accuracy": 0.9104950308799744, "num_input_tokens_seen": 1674522, "num_tokens": 1674522.0, "step": 2150, "train_runtime": 614.6569, "train_tokens_per_second": 2724.32 }, { "epoch": 0.431, "grad_norm": 11.125, "learning_rate": 1.2693357219122331e-05, "loss": 0.3511, "mean_token_accuracy": 0.8901524662971496, "num_input_tokens_seen": 1678198, "num_tokens": 1678198.0, "step": 2155, "train_runtime": 615.9782, "train_tokens_per_second": 2724.444 }, { "epoch": 0.432, "grad_norm": 8.125, "learning_rate": 1.266215243285947e-05, "loss": 0.327, "mean_token_accuracy": 0.9042328000068665, "num_input_tokens_seen": 1682748, "num_tokens": 1682748.0, "step": 2160, "train_runtime": 617.9256, "train_tokens_per_second": 2723.221 }, { "epoch": 0.433, "grad_norm": 2.46875, "learning_rate": 1.2630919721877299e-05, "loss": 0.2358, "mean_token_accuracy": 0.9289053440093994, "num_input_tokens_seen": 1688172, "num_tokens": 1688172.0, "step": 2165, "train_runtime": 620.0438, "train_tokens_per_second": 2722.666 }, { "epoch": 0.434, "grad_norm": 3.84375, "learning_rate": 1.2599659413792176e-05, "loss": 0.3456, "mean_token_accuracy": 0.897593104839325, "num_input_tokens_seen": 1691462, "num_tokens": 1691462.0, "step": 2170, "train_runtime": 621.4001, "train_tokens_per_second": 2722.018 }, { "epoch": 0.435, "grad_norm": 7.71875, "learning_rate": 1.2568371836509936e-05, "loss": 0.315, "mean_token_accuracy": 0.9076021313667297, "num_input_tokens_seen": 1695409, "num_tokens": 1695409.0, "step": 2175, "train_runtime": 623.0091, "train_tokens_per_second": 2721.323 }, { "epoch": 0.436, "grad_norm": 4.9375, "learning_rate": 1.2537057318222468e-05, "loss": 0.4477, "mean_token_accuracy": 0.858843207359314, "num_input_tokens_seen": 1699994, "num_tokens": 1699994.0, "step": 2180, "train_runtime": 624.3544, "train_tokens_per_second": 2722.803 }, { "epoch": 0.437, "grad_norm": 3.875, "learning_rate": 1.2505716187404242e-05, "loss": 0.1781, "mean_token_accuracy": 0.9425875663757324, "num_input_tokens_seen": 1704186, "num_tokens": 1704186.0, "step": 2185, "train_runtime": 625.7278, "train_tokens_per_second": 2723.526 }, { "epoch": 0.438, "grad_norm": 3.65625, "learning_rate": 1.2474348772808897e-05, "loss": 0.4269, "mean_token_accuracy": 0.8716431856155396, "num_input_tokens_seen": 1707752, "num_tokens": 1707752.0, "step": 2190, "train_runtime": 627.0085, "train_tokens_per_second": 2723.651 }, { "epoch": 0.439, "grad_norm": 8.4375, "learning_rate": 1.2442955403465768e-05, "loss": 0.3852, "mean_token_accuracy": 0.8761202812194824, "num_input_tokens_seen": 1709560, "num_tokens": 1709560.0, "step": 2195, "train_runtime": 628.3225, "train_tokens_per_second": 2720.832 }, { "epoch": 0.44, "grad_norm": 3.359375, "learning_rate": 1.2411536408676443e-05, "loss": 0.4103, "mean_token_accuracy": 0.8680987000465393, "num_input_tokens_seen": 1713200, "num_tokens": 1713200.0, "step": 2200, "train_runtime": 629.6795, "train_tokens_per_second": 2720.749 }, { "epoch": 0.441, "grad_norm": 3.421875, "learning_rate": 1.238009211801131e-05, "loss": 0.3001, "mean_token_accuracy": 0.9097898602485657, "num_input_tokens_seen": 1716480, "num_tokens": 1716480.0, "step": 2205, "train_runtime": 630.9552, "train_tokens_per_second": 2720.447 }, { "epoch": 0.442, "grad_norm": 4.75, "learning_rate": 1.23486228613061e-05, "loss": 0.246, "mean_token_accuracy": 0.9165369153022767, "num_input_tokens_seen": 1719056, "num_tokens": 1719056.0, "step": 2210, "train_runtime": 632.2446, "train_tokens_per_second": 2718.973 }, { "epoch": 0.443, "grad_norm": 10.25, "learning_rate": 1.2317128968658424e-05, "loss": 0.3063, "mean_token_accuracy": 0.8894321918487549, "num_input_tokens_seen": 1723673, "num_tokens": 1723673.0, "step": 2215, "train_runtime": 633.6711, "train_tokens_per_second": 2720.138 }, { "epoch": 0.444, "grad_norm": 6.78125, "learning_rate": 1.2285610770424311e-05, "loss": 0.2168, "mean_token_accuracy": 0.937922191619873, "num_input_tokens_seen": 1728402, "num_tokens": 1728402.0, "step": 2220, "train_runtime": 635.1646, "train_tokens_per_second": 2721.188 }, { "epoch": 0.445, "grad_norm": 7.375, "learning_rate": 1.225406859721475e-05, "loss": 0.2319, "mean_token_accuracy": 0.9208413362503052, "num_input_tokens_seen": 1733450, "num_tokens": 1733450.0, "step": 2225, "train_runtime": 636.6397, "train_tokens_per_second": 2722.812 }, { "epoch": 0.446, "grad_norm": 5.21875, "learning_rate": 1.222250277989221e-05, "loss": 0.3974, "mean_token_accuracy": 0.8743439197540284, "num_input_tokens_seen": 1737542, "num_tokens": 1737542.0, "step": 2230, "train_runtime": 637.9849, "train_tokens_per_second": 2723.484 }, { "epoch": 0.447, "grad_norm": 6.65625, "learning_rate": 1.2190913649567185e-05, "loss": 0.3102, "mean_token_accuracy": 0.9004545211791992, "num_input_tokens_seen": 1741076, "num_tokens": 1741076.0, "step": 2235, "train_runtime": 639.3524, "train_tokens_per_second": 2723.187 }, { "epoch": 0.448, "grad_norm": 14.5, "learning_rate": 1.2159301537594691e-05, "loss": 0.4398, "mean_token_accuracy": 0.8797985434532165, "num_input_tokens_seen": 1746190, "num_tokens": 1746190.0, "step": 2240, "train_runtime": 640.8048, "train_tokens_per_second": 2724.995 }, { "epoch": 0.449, "grad_norm": 6.65625, "learning_rate": 1.2127666775570837e-05, "loss": 0.3422, "mean_token_accuracy": 0.8996375560760498, "num_input_tokens_seen": 1749955, "num_tokens": 1749955.0, "step": 2245, "train_runtime": 642.1944, "train_tokens_per_second": 2724.962 }, { "epoch": 0.45, "grad_norm": 5.09375, "learning_rate": 1.2096009695329298e-05, "loss": 0.3634, "mean_token_accuracy": 0.8894420862197876, "num_input_tokens_seen": 1753028, "num_tokens": 1753028.0, "step": 2250, "train_runtime": 643.4737, "train_tokens_per_second": 2724.32 }, { "epoch": 0.451, "grad_norm": 5.4375, "learning_rate": 1.206433062893787e-05, "loss": 0.3893, "mean_token_accuracy": 0.880610990524292, "num_input_tokens_seen": 1756933, "num_tokens": 1756933.0, "step": 2255, "train_runtime": 644.7948, "train_tokens_per_second": 2724.794 }, { "epoch": 0.452, "grad_norm": 3.765625, "learning_rate": 1.2032629908694969e-05, "loss": 0.2505, "mean_token_accuracy": 0.9136337995529175, "num_input_tokens_seen": 1759856, "num_tokens": 1759856.0, "step": 2260, "train_runtime": 646.0434, "train_tokens_per_second": 2724.052 }, { "epoch": 0.453, "grad_norm": 7.6875, "learning_rate": 1.200090786712615e-05, "loss": 0.3275, "mean_token_accuracy": 0.8906514167785644, "num_input_tokens_seen": 1764510, "num_tokens": 1764510.0, "step": 2265, "train_runtime": 647.4446, "train_tokens_per_second": 2725.345 }, { "epoch": 0.454, "grad_norm": 5.3125, "learning_rate": 1.1969164836980618e-05, "loss": 0.4931, "mean_token_accuracy": 0.8605551838874816, "num_input_tokens_seen": 1768298, "num_tokens": 1768298.0, "step": 2270, "train_runtime": 648.8236, "train_tokens_per_second": 2725.391 }, { "epoch": 0.455, "grad_norm": 6.40625, "learning_rate": 1.193740115122774e-05, "loss": 0.2532, "mean_token_accuracy": 0.9197658658027649, "num_input_tokens_seen": 1772960, "num_tokens": 1772960.0, "step": 2275, "train_runtime": 650.2546, "train_tokens_per_second": 2726.563 }, { "epoch": 0.456, "grad_norm": 3.5, "learning_rate": 1.190561714305355e-05, "loss": 0.3234, "mean_token_accuracy": 0.8924820899963379, "num_input_tokens_seen": 1776280, "num_tokens": 1776280.0, "step": 2280, "train_runtime": 651.5396, "train_tokens_per_second": 2726.281 }, { "epoch": 0.457, "grad_norm": 6.0625, "learning_rate": 1.187381314585725e-05, "loss": 0.3403, "mean_token_accuracy": 0.8993414878845215, "num_input_tokens_seen": 1779938, "num_tokens": 1779938.0, "step": 2285, "train_runtime": 652.883, "train_tokens_per_second": 2726.274 }, { "epoch": 0.458, "grad_norm": 20.25, "learning_rate": 1.184198949324772e-05, "loss": 0.4745, "mean_token_accuracy": 0.8561808824539184, "num_input_tokens_seen": 1784186, "num_tokens": 1784186.0, "step": 2290, "train_runtime": 654.276, "train_tokens_per_second": 2726.962 }, { "epoch": 0.459, "grad_norm": 4.875, "learning_rate": 1.1810146519040023e-05, "loss": 0.4238, "mean_token_accuracy": 0.8682867407798767, "num_input_tokens_seen": 1786977, "num_tokens": 1786977.0, "step": 2295, "train_runtime": 655.6088, "train_tokens_per_second": 2725.676 }, { "epoch": 0.46, "grad_norm": 3.859375, "learning_rate": 1.1778284557251887e-05, "loss": 0.4535, "mean_token_accuracy": 0.8526682019233703, "num_input_tokens_seen": 1790946, "num_tokens": 1790946.0, "step": 2300, "train_runtime": 656.9665, "train_tokens_per_second": 2726.084 }, { "epoch": 0.461, "grad_norm": 12.6875, "learning_rate": 1.1746403942100215e-05, "loss": 0.3479, "mean_token_accuracy": 0.8904070377349853, "num_input_tokens_seen": 1794165, "num_tokens": 1794165.0, "step": 2305, "train_runtime": 658.2943, "train_tokens_per_second": 2725.476 }, { "epoch": 0.462, "grad_norm": 7.78125, "learning_rate": 1.1714505007997576e-05, "loss": 0.4484, "mean_token_accuracy": 0.8657440900802612, "num_input_tokens_seen": 1798028, "num_tokens": 1798028.0, "step": 2310, "train_runtime": 659.719, "train_tokens_per_second": 2725.445 }, { "epoch": 0.463, "grad_norm": 3.5625, "learning_rate": 1.1682588089548692e-05, "loss": 0.3318, "mean_token_accuracy": 0.8864567637443542, "num_input_tokens_seen": 1803273, "num_tokens": 1803273.0, "step": 2315, "train_runtime": 661.128, "train_tokens_per_second": 2727.57 }, { "epoch": 0.464, "grad_norm": 6.3125, "learning_rate": 1.1650653521546937e-05, "loss": 0.3026, "mean_token_accuracy": 0.909605062007904, "num_input_tokens_seen": 1807836, "num_tokens": 1807836.0, "step": 2320, "train_runtime": 662.5541, "train_tokens_per_second": 2728.586 }, { "epoch": 0.465, "grad_norm": 9.5625, "learning_rate": 1.1618701638970815e-05, "loss": 0.2707, "mean_token_accuracy": 0.9235754489898682, "num_input_tokens_seen": 1813175, "num_tokens": 1813175.0, "step": 2325, "train_runtime": 663.9877, "train_tokens_per_second": 2730.736 }, { "epoch": 0.466, "grad_norm": 7.84375, "learning_rate": 1.1586732776980456e-05, "loss": 0.6452, "mean_token_accuracy": 0.8138893008232116, "num_input_tokens_seen": 1816534, "num_tokens": 1816534.0, "step": 2330, "train_runtime": 665.3076, "train_tokens_per_second": 2730.367 }, { "epoch": 0.467, "grad_norm": 3.359375, "learning_rate": 1.1554747270914098e-05, "loss": 0.2903, "mean_token_accuracy": 0.9146389365196228, "num_input_tokens_seen": 1818386, "num_tokens": 1818386.0, "step": 2335, "train_runtime": 666.6267, "train_tokens_per_second": 2727.743 }, { "epoch": 0.468, "grad_norm": 30.625, "learning_rate": 1.1522745456284557e-05, "loss": 0.3631, "mean_token_accuracy": 0.8874744057655335, "num_input_tokens_seen": 1822114, "num_tokens": 1822114.0, "step": 2340, "train_runtime": 668.0065, "train_tokens_per_second": 2727.689 }, { "epoch": 0.469, "grad_norm": 8.75, "learning_rate": 1.1490727668775735e-05, "loss": 0.2962, "mean_token_accuracy": 0.9023106217384338, "num_input_tokens_seen": 1825782, "num_tokens": 1825782.0, "step": 2345, "train_runtime": 669.2954, "train_tokens_per_second": 2727.917 }, { "epoch": 0.47, "grad_norm": 4.9375, "learning_rate": 1.1458694244239067e-05, "loss": 0.323, "mean_token_accuracy": 0.9046676993370056, "num_input_tokens_seen": 1828342, "num_tokens": 1828342.0, "step": 2350, "train_runtime": 670.5872, "train_tokens_per_second": 2726.479 }, { "epoch": 0.471, "grad_norm": 11.8125, "learning_rate": 1.1426645518690015e-05, "loss": 0.3935, "mean_token_accuracy": 0.8803214073181153, "num_input_tokens_seen": 1832299, "num_tokens": 1832299.0, "step": 2355, "train_runtime": 672.0513, "train_tokens_per_second": 2726.427 }, { "epoch": 0.472, "grad_norm": 3.359375, "learning_rate": 1.1394581828304555e-05, "loss": 0.2134, "mean_token_accuracy": 0.9245081186294556, "num_input_tokens_seen": 1835406, "num_tokens": 1835406.0, "step": 2360, "train_runtime": 673.3617, "train_tokens_per_second": 2725.735 }, { "epoch": 0.473, "grad_norm": 10.875, "learning_rate": 1.136250350941562e-05, "loss": 0.3911, "mean_token_accuracy": 0.8812428951263428, "num_input_tokens_seen": 1839612, "num_tokens": 1839612.0, "step": 2365, "train_runtime": 674.707, "train_tokens_per_second": 2726.535 }, { "epoch": 0.474, "grad_norm": 10.4375, "learning_rate": 1.1330410898509594e-05, "loss": 0.5944, "mean_token_accuracy": 0.8154242634773254, "num_input_tokens_seen": 1842890, "num_tokens": 1842890.0, "step": 2370, "train_runtime": 675.9921, "train_tokens_per_second": 2726.201 }, { "epoch": 0.475, "grad_norm": 10.6875, "learning_rate": 1.129830433222278e-05, "loss": 0.4383, "mean_token_accuracy": 0.8632968544960022, "num_input_tokens_seen": 1846875, "num_tokens": 1846875.0, "step": 2375, "train_runtime": 677.4377, "train_tokens_per_second": 2726.265 }, { "epoch": 0.476, "grad_norm": 3.40625, "learning_rate": 1.1266184147337873e-05, "loss": 0.3283, "mean_token_accuracy": 0.8883636236190796, "num_input_tokens_seen": 1850510, "num_tokens": 1850510.0, "step": 2380, "train_runtime": 678.7793, "train_tokens_per_second": 2726.232 }, { "epoch": 0.477, "grad_norm": 3.859375, "learning_rate": 1.1234050680780407e-05, "loss": 0.2106, "mean_token_accuracy": 0.9282655954360962, "num_input_tokens_seen": 1856223, "num_tokens": 1856223.0, "step": 2385, "train_runtime": 680.322, "train_tokens_per_second": 2728.448 }, { "epoch": 0.478, "grad_norm": 8.375, "learning_rate": 1.1201904269615242e-05, "loss": 0.3364, "mean_token_accuracy": 0.8887192964553833, "num_input_tokens_seen": 1859714, "num_tokens": 1859714.0, "step": 2390, "train_runtime": 681.6606, "train_tokens_per_second": 2728.211 }, { "epoch": 0.479, "grad_norm": 4.21875, "learning_rate": 1.116974525104302e-05, "loss": 0.3169, "mean_token_accuracy": 0.8991916060447693, "num_input_tokens_seen": 1862226, "num_tokens": 1862226.0, "step": 2395, "train_runtime": 682.9295, "train_tokens_per_second": 2726.82 }, { "epoch": 0.48, "grad_norm": 6.34375, "learning_rate": 1.113757396239663e-05, "loss": 0.3231, "mean_token_accuracy": 0.8956782579421997, "num_input_tokens_seen": 1865332, "num_tokens": 1865332.0, "step": 2400, "train_runtime": 684.2588, "train_tokens_per_second": 2726.062 }, { "epoch": 0.481, "grad_norm": 2.984375, "learning_rate": 1.110539074113766e-05, "loss": 0.1984, "mean_token_accuracy": 0.9350772500038147, "num_input_tokens_seen": 1872063, "num_tokens": 1872063.0, "step": 2405, "train_runtime": 685.8589, "train_tokens_per_second": 2729.516 }, { "epoch": 0.482, "grad_norm": 3.75, "learning_rate": 1.1073195924852882e-05, "loss": 0.1286, "mean_token_accuracy": 0.9600299835205078, "num_input_tokens_seen": 1876068, "num_tokens": 1876068.0, "step": 2410, "train_runtime": 687.3116, "train_tokens_per_second": 2729.574 }, { "epoch": 0.483, "grad_norm": 10.9375, "learning_rate": 1.1040989851250678e-05, "loss": 0.2822, "mean_token_accuracy": 0.9010962128639222, "num_input_tokens_seen": 1879868, "num_tokens": 1879868.0, "step": 2415, "train_runtime": 688.7756, "train_tokens_per_second": 2729.289 }, { "epoch": 0.484, "grad_norm": 8.8125, "learning_rate": 1.1008772858157524e-05, "loss": 0.4418, "mean_token_accuracy": 0.8386808395385742, "num_input_tokens_seen": 1882700, "num_tokens": 1882700.0, "step": 2420, "train_runtime": 690.052, "train_tokens_per_second": 2728.345 }, { "epoch": 0.485, "grad_norm": 78.0, "learning_rate": 1.097654528351443e-05, "loss": 0.4002, "mean_token_accuracy": 0.8745073199272155, "num_input_tokens_seen": 1887781, "num_tokens": 1887781.0, "step": 2425, "train_runtime": 691.5443, "train_tokens_per_second": 2729.805 }, { "epoch": 0.486, "grad_norm": 5.96875, "learning_rate": 1.0944307465373405e-05, "loss": 0.4093, "mean_token_accuracy": 0.8656105160713196, "num_input_tokens_seen": 1890280, "num_tokens": 1890280.0, "step": 2430, "train_runtime": 692.8733, "train_tokens_per_second": 2728.176 }, { "epoch": 0.487, "grad_norm": 6.1875, "learning_rate": 1.0912059741893908e-05, "loss": 0.2581, "mean_token_accuracy": 0.9143229007720948, "num_input_tokens_seen": 1894403, "num_tokens": 1894403.0, "step": 2435, "train_runtime": 694.2312, "train_tokens_per_second": 2728.778 }, { "epoch": 0.488, "grad_norm": 2.4375, "learning_rate": 1.0879802451339298e-05, "loss": 0.1761, "mean_token_accuracy": 0.9427258849143982, "num_input_tokens_seen": 1899114, "num_tokens": 1899114.0, "step": 2440, "train_runtime": 695.6432, "train_tokens_per_second": 2730.012 }, { "epoch": 0.489, "grad_norm": 11.8125, "learning_rate": 1.0847535932073288e-05, "loss": 0.4768, "mean_token_accuracy": 0.8531417846679688, "num_input_tokens_seen": 1902441, "num_tokens": 1902441.0, "step": 2445, "train_runtime": 697.3634, "train_tokens_per_second": 2728.048 }, { "epoch": 0.49, "grad_norm": 4.6875, "learning_rate": 1.0815260522556394e-05, "loss": 0.53, "mean_token_accuracy": 0.8308880448341369, "num_input_tokens_seen": 1905926, "num_tokens": 1905926.0, "step": 2450, "train_runtime": 698.6974, "train_tokens_per_second": 2727.828 }, { "epoch": 0.491, "grad_norm": 8.8125, "learning_rate": 1.0782976561342398e-05, "loss": 0.3702, "mean_token_accuracy": 0.8827953457832336, "num_input_tokens_seen": 1909564, "num_tokens": 1909564.0, "step": 2455, "train_runtime": 700.0188, "train_tokens_per_second": 2727.875 }, { "epoch": 0.492, "grad_norm": 4.59375, "learning_rate": 1.075068438707477e-05, "loss": 0.3214, "mean_token_accuracy": 0.9007907152175904, "num_input_tokens_seen": 1913256, "num_tokens": 1913256.0, "step": 2460, "train_runtime": 701.3623, "train_tokens_per_second": 2727.914 }, { "epoch": 0.493, "grad_norm": 3.875, "learning_rate": 1.0718384338483141e-05, "loss": 0.2007, "mean_token_accuracy": 0.9343870997428894, "num_input_tokens_seen": 1917796, "num_tokens": 1917796.0, "step": 2465, "train_runtime": 702.8286, "train_tokens_per_second": 2728.682 }, { "epoch": 0.494, "grad_norm": 6.46875, "learning_rate": 1.0686076754379734e-05, "loss": 0.2874, "mean_token_accuracy": 0.903351616859436, "num_input_tokens_seen": 1921078, "num_tokens": 1921078.0, "step": 2470, "train_runtime": 704.224, "train_tokens_per_second": 2727.936 }, { "epoch": 0.495, "grad_norm": 8.4375, "learning_rate": 1.0653761973655819e-05, "loss": 0.3153, "mean_token_accuracy": 0.9074290990829468, "num_input_tokens_seen": 1925885, "num_tokens": 1925885.0, "step": 2475, "train_runtime": 705.598, "train_tokens_per_second": 2729.437 }, { "epoch": 0.496, "grad_norm": 4.625, "learning_rate": 1.0621440335278152e-05, "loss": 0.3725, "mean_token_accuracy": 0.8811562776565551, "num_input_tokens_seen": 1929442, "num_tokens": 1929442.0, "step": 2480, "train_runtime": 706.9224, "train_tokens_per_second": 2729.355 }, { "epoch": 0.497, "grad_norm": 12.0625, "learning_rate": 1.0589112178285432e-05, "loss": 0.2856, "mean_token_accuracy": 0.9045382142066956, "num_input_tokens_seen": 1932920, "num_tokens": 1932920.0, "step": 2485, "train_runtime": 708.2074, "train_tokens_per_second": 2729.314 }, { "epoch": 0.498, "grad_norm": 3.328125, "learning_rate": 1.0556777841784725e-05, "loss": 0.4299, "mean_token_accuracy": 0.8647878646850586, "num_input_tokens_seen": 1937324, "num_tokens": 1937324.0, "step": 2490, "train_runtime": 709.6296, "train_tokens_per_second": 2730.05 }, { "epoch": 0.499, "grad_norm": 8.75, "learning_rate": 1.0524437664947918e-05, "loss": 0.3427, "mean_token_accuracy": 0.8960558652877808, "num_input_tokens_seen": 1940871, "num_tokens": 1940871.0, "step": 2495, "train_runtime": 710.9772, "train_tokens_per_second": 2729.864 }, { "epoch": 0.5, "grad_norm": 2.921875, "learning_rate": 1.0492091987008167e-05, "loss": 0.3799, "mean_token_accuracy": 0.872166121006012, "num_input_tokens_seen": 1945806, "num_tokens": 1945806.0, "step": 2500, "train_runtime": 712.3921, "train_tokens_per_second": 2731.37 }, { "epoch": 0.501, "grad_norm": 5.5, "learning_rate": 1.0459741147256325e-05, "loss": 0.3892, "mean_token_accuracy": 0.8810149550437927, "num_input_tokens_seen": 1948975, "num_tokens": 1948975.0, "step": 2505, "train_runtime": 713.7453, "train_tokens_per_second": 2730.631 }, { "epoch": 0.502, "grad_norm": 6.71875, "learning_rate": 1.0427385485037398e-05, "loss": 0.3536, "mean_token_accuracy": 0.8940821766853333, "num_input_tokens_seen": 1952610, "num_tokens": 1952610.0, "step": 2510, "train_runtime": 715.0741, "train_tokens_per_second": 2730.64 }, { "epoch": 0.503, "grad_norm": 6.0, "learning_rate": 1.0395025339746965e-05, "loss": 0.3218, "mean_token_accuracy": 0.9000921010971069, "num_input_tokens_seen": 1955431, "num_tokens": 1955431.0, "step": 2515, "train_runtime": 716.4096, "train_tokens_per_second": 2729.488 }, { "epoch": 0.504, "grad_norm": 2.609375, "learning_rate": 1.0362661050827643e-05, "loss": 0.1851, "mean_token_accuracy": 0.9424342274665832, "num_input_tokens_seen": 1958754, "num_tokens": 1958754.0, "step": 2520, "train_runtime": 717.6978, "train_tokens_per_second": 2729.218 }, { "epoch": 0.505, "grad_norm": 3.25, "learning_rate": 1.0330292957765502e-05, "loss": 0.4686, "mean_token_accuracy": 0.8564263105392456, "num_input_tokens_seen": 1962567, "num_tokens": 1962567.0, "step": 2525, "train_runtime": 719.0436, "train_tokens_per_second": 2729.413 }, { "epoch": 0.506, "grad_norm": 8.125, "learning_rate": 1.0297921400086528e-05, "loss": 0.3334, "mean_token_accuracy": 0.8883252382278443, "num_input_tokens_seen": 1966912, "num_tokens": 1966912.0, "step": 2530, "train_runtime": 720.4221, "train_tokens_per_second": 2730.222 }, { "epoch": 0.507, "grad_norm": 6.8125, "learning_rate": 1.0265546717353041e-05, "loss": 0.2982, "mean_token_accuracy": 0.9091978430747986, "num_input_tokens_seen": 1971718, "num_tokens": 1971718.0, "step": 2535, "train_runtime": 721.8232, "train_tokens_per_second": 2731.58 }, { "epoch": 0.508, "grad_norm": 5.375, "learning_rate": 1.0233169249160145e-05, "loss": 0.3082, "mean_token_accuracy": 0.9036029815673828, "num_input_tokens_seen": 1976108, "num_tokens": 1976108.0, "step": 2540, "train_runtime": 723.216, "train_tokens_per_second": 2732.39 }, { "epoch": 0.509, "grad_norm": 10.5625, "learning_rate": 1.0200789335132157e-05, "loss": 0.3147, "mean_token_accuracy": 0.9058708906173706, "num_input_tokens_seen": 1979711, "num_tokens": 1979711.0, "step": 2545, "train_runtime": 724.5415, "train_tokens_per_second": 2732.364 }, { "epoch": 0.51, "grad_norm": 6.40625, "learning_rate": 1.0168407314919057e-05, "loss": 0.4523, "mean_token_accuracy": 0.8608191013336182, "num_input_tokens_seen": 1981880, "num_tokens": 1981880.0, "step": 2550, "train_runtime": 725.8505, "train_tokens_per_second": 2730.424 }, { "epoch": 0.511, "grad_norm": 4.65625, "learning_rate": 1.013602352819291e-05, "loss": 0.2402, "mean_token_accuracy": 0.9129250407218933, "num_input_tokens_seen": 1985077, "num_tokens": 1985077.0, "step": 2555, "train_runtime": 727.1583, "train_tokens_per_second": 2729.91 }, { "epoch": 0.512, "grad_norm": 3.5, "learning_rate": 1.0103638314644322e-05, "loss": 0.3501, "mean_token_accuracy": 0.8848968863487243, "num_input_tokens_seen": 1989964, "num_tokens": 1989964.0, "step": 2560, "train_runtime": 728.5984, "train_tokens_per_second": 2731.222 }, { "epoch": 0.513, "grad_norm": 7.9375, "learning_rate": 1.0071252013978852e-05, "loss": 0.2858, "mean_token_accuracy": 0.9029408931732178, "num_input_tokens_seen": 1993978, "num_tokens": 1993978.0, "step": 2565, "train_runtime": 729.9925, "train_tokens_per_second": 2731.505 }, { "epoch": 0.514, "grad_norm": 8.3125, "learning_rate": 1.0038864965913469e-05, "loss": 0.3757, "mean_token_accuracy": 0.8791045665740966, "num_input_tokens_seen": 1996440, "num_tokens": 1996440.0, "step": 2570, "train_runtime": 731.3021, "train_tokens_per_second": 2729.98 }, { "epoch": 0.515, "grad_norm": 3.078125, "learning_rate": 1.0006477510172984e-05, "loss": 0.2601, "mean_token_accuracy": 0.9146520256996155, "num_input_tokens_seen": 2001747, "num_tokens": 2001747.0, "step": 2575, "train_runtime": 732.7179, "train_tokens_per_second": 2731.948 }, { "epoch": 0.516, "grad_norm": 5.53125, "learning_rate": 9.974089986486488e-06, "loss": 0.2761, "mean_token_accuracy": 0.9190154075622559, "num_input_tokens_seen": 2006802, "num_tokens": 2006802.0, "step": 2580, "train_runtime": 734.1289, "train_tokens_per_second": 2733.583 }, { "epoch": 0.517, "grad_norm": 7.28125, "learning_rate": 9.941702734583771e-06, "loss": 0.3861, "mean_token_accuracy": 0.8763120174407959, "num_input_tokens_seen": 2011826, "num_tokens": 2011826.0, "step": 2585, "train_runtime": 735.5993, "train_tokens_per_second": 2734.948 }, { "epoch": 0.518, "grad_norm": 4.125, "learning_rate": 9.90931609419178e-06, "loss": 0.2543, "mean_token_accuracy": 0.9148181676864624, "num_input_tokens_seen": 2016638, "num_tokens": 2016638.0, "step": 2590, "train_runtime": 737.0679, "train_tokens_per_second": 2736.027 }, { "epoch": 0.519, "grad_norm": 4.3125, "learning_rate": 9.876930405031047e-06, "loss": 0.3798, "mean_token_accuracy": 0.8855784177780152, "num_input_tokens_seen": 2018793, "num_tokens": 2018793.0, "step": 2595, "train_runtime": 738.3812, "train_tokens_per_second": 2734.08 }, { "epoch": 0.52, "grad_norm": 6.34375, "learning_rate": 9.844546006812135e-06, "loss": 0.3263, "mean_token_accuracy": 0.8903733730316162, "num_input_tokens_seen": 2022236, "num_tokens": 2022236.0, "step": 2600, "train_runtime": 739.6934, "train_tokens_per_second": 2733.884 }, { "epoch": 0.521, "grad_norm": 161.0, "learning_rate": 9.812163239232051e-06, "loss": 0.1424, "mean_token_accuracy": 0.9516665697097778, "num_input_tokens_seen": 2027633, "num_tokens": 2027633.0, "step": 2605, "train_runtime": 741.161, "train_tokens_per_second": 2735.752 }, { "epoch": 0.522, "grad_norm": 7.03125, "learning_rate": 9.779782441970702e-06, "loss": 0.3092, "mean_token_accuracy": 0.9006954908370972, "num_input_tokens_seen": 2031598, "num_tokens": 2031598.0, "step": 2610, "train_runtime": 742.5235, "train_tokens_per_second": 2736.072 }, { "epoch": 0.523, "grad_norm": 17.5, "learning_rate": 9.747403954687334e-06, "loss": 0.317, "mean_token_accuracy": 0.900297999382019, "num_input_tokens_seen": 2034162, "num_tokens": 2034162.0, "step": 2615, "train_runtime": 743.8661, "train_tokens_per_second": 2734.581 }, { "epoch": 0.524, "grad_norm": 12.1875, "learning_rate": 9.715028117016955e-06, "loss": 0.2965, "mean_token_accuracy": 0.9078072428703308, "num_input_tokens_seen": 2037398, "num_tokens": 2037398.0, "step": 2620, "train_runtime": 745.2748, "train_tokens_per_second": 2733.754 }, { "epoch": 0.525, "grad_norm": 6.09375, "learning_rate": 9.682655268566783e-06, "loss": 0.3894, "mean_token_accuracy": 0.8821518540382385, "num_input_tokens_seen": 2040406, "num_tokens": 2040406.0, "step": 2625, "train_runtime": 746.6411, "train_tokens_per_second": 2732.78 }, { "epoch": 0.526, "grad_norm": 7.25, "learning_rate": 9.650285748912678e-06, "loss": 0.371, "mean_token_accuracy": 0.8767060995101928, "num_input_tokens_seen": 2044550, "num_tokens": 2044550.0, "step": 2630, "train_runtime": 747.9943, "train_tokens_per_second": 2733.377 }, { "epoch": 0.527, "grad_norm": 4.5, "learning_rate": 9.617919897595586e-06, "loss": 0.3225, "mean_token_accuracy": 0.9044132232666016, "num_input_tokens_seen": 2047204, "num_tokens": 2047204.0, "step": 2635, "train_runtime": 749.3298, "train_tokens_per_second": 2732.047 }, { "epoch": 0.528, "grad_norm": 4.5625, "learning_rate": 9.58555805411797e-06, "loss": 0.3147, "mean_token_accuracy": 0.8911243796348571, "num_input_tokens_seen": 2050930, "num_tokens": 2050930.0, "step": 2640, "train_runtime": 750.6278, "train_tokens_per_second": 2732.286 }, { "epoch": 0.529, "grad_norm": 4.46875, "learning_rate": 9.553200557940254e-06, "loss": 0.2866, "mean_token_accuracy": 0.911807405948639, "num_input_tokens_seen": 2054293, "num_tokens": 2054293.0, "step": 2645, "train_runtime": 751.9089, "train_tokens_per_second": 2732.104 }, { "epoch": 0.53, "grad_norm": 1.7578125, "learning_rate": 9.520847748477266e-06, "loss": 0.1908, "mean_token_accuracy": 0.9312750101089478, "num_input_tokens_seen": 2058384, "num_tokens": 2058384.0, "step": 2650, "train_runtime": 753.3389, "train_tokens_per_second": 2732.348 }, { "epoch": 0.531, "grad_norm": 3.390625, "learning_rate": 9.488499965094664e-06, "loss": 0.2616, "mean_token_accuracy": 0.9270828843116761, "num_input_tokens_seen": 2063011, "num_tokens": 2063011.0, "step": 2655, "train_runtime": 754.8166, "train_tokens_per_second": 2733.129 }, { "epoch": 0.532, "grad_norm": 4.3125, "learning_rate": 9.45615754710539e-06, "loss": 0.1984, "mean_token_accuracy": 0.9340363264083862, "num_input_tokens_seen": 2066558, "num_tokens": 2066558.0, "step": 2660, "train_runtime": 756.1717, "train_tokens_per_second": 2732.922 }, { "epoch": 0.533, "grad_norm": 6.28125, "learning_rate": 9.423820833766108e-06, "loss": 0.2156, "mean_token_accuracy": 0.9327269673347474, "num_input_tokens_seen": 2071480, "num_tokens": 2071480.0, "step": 2665, "train_runtime": 757.6154, "train_tokens_per_second": 2734.211 }, { "epoch": 0.534, "grad_norm": 4.15625, "learning_rate": 9.391490164273635e-06, "loss": 0.27, "mean_token_accuracy": 0.9164454817771912, "num_input_tokens_seen": 2075132, "num_tokens": 2075132.0, "step": 2670, "train_runtime": 758.927, "train_tokens_per_second": 2734.297 }, { "epoch": 0.535, "grad_norm": 3.671875, "learning_rate": 9.359165877761396e-06, "loss": 0.517, "mean_token_accuracy": 0.8415798783302307, "num_input_tokens_seen": 2078933, "num_tokens": 2078933.0, "step": 2675, "train_runtime": 760.2848, "train_tokens_per_second": 2734.414 }, { "epoch": 0.536, "grad_norm": 7.25, "learning_rate": 9.32684831329586e-06, "loss": 0.3329, "mean_token_accuracy": 0.8914009690284729, "num_input_tokens_seen": 2082080, "num_tokens": 2082080.0, "step": 2680, "train_runtime": 761.6673, "train_tokens_per_second": 2733.582 }, { "epoch": 0.537, "grad_norm": 8.0625, "learning_rate": 9.29453780987299e-06, "loss": 0.5562, "mean_token_accuracy": 0.8424725890159607, "num_input_tokens_seen": 2084521, "num_tokens": 2084521.0, "step": 2685, "train_runtime": 762.9662, "train_tokens_per_second": 2732.128 }, { "epoch": 0.538, "grad_norm": 2.859375, "learning_rate": 9.262234706414677e-06, "loss": 0.3704, "mean_token_accuracy": 0.8849583745002747, "num_input_tokens_seen": 2090224, "num_tokens": 2090224.0, "step": 2690, "train_runtime": 764.464, "train_tokens_per_second": 2734.235 }, { "epoch": 0.539, "grad_norm": 3.734375, "learning_rate": 9.229939341765188e-06, "loss": 0.2784, "mean_token_accuracy": 0.9101876974105835, "num_input_tokens_seen": 2095143, "num_tokens": 2095143.0, "step": 2695, "train_runtime": 765.8663, "train_tokens_per_second": 2735.651 }, { "epoch": 0.54, "grad_norm": 13.0, "learning_rate": 9.197652054687619e-06, "loss": 0.473, "mean_token_accuracy": 0.8612375617027282, "num_input_tokens_seen": 2099962, "num_tokens": 2099962.0, "step": 2700, "train_runtime": 767.3163, "train_tokens_per_second": 2736.762 }, { "epoch": 0.541, "grad_norm": 8.0625, "learning_rate": 9.165373183860329e-06, "loss": 0.2021, "mean_token_accuracy": 0.9331565737724304, "num_input_tokens_seen": 2104377, "num_tokens": 2104377.0, "step": 2705, "train_runtime": 768.7538, "train_tokens_per_second": 2737.387 }, { "epoch": 0.542, "grad_norm": 7.5625, "learning_rate": 9.133103067873403e-06, "loss": 0.2708, "mean_token_accuracy": 0.9133798360824585, "num_input_tokens_seen": 2108444, "num_tokens": 2108444.0, "step": 2710, "train_runtime": 770.134, "train_tokens_per_second": 2737.763 }, { "epoch": 0.543, "grad_norm": 8.5625, "learning_rate": 9.100842045225084e-06, "loss": 0.3815, "mean_token_accuracy": 0.879783546924591, "num_input_tokens_seen": 2114675, "num_tokens": 2114675.0, "step": 2715, "train_runtime": 771.7081, "train_tokens_per_second": 2740.253 }, { "epoch": 0.544, "grad_norm": 7.96875, "learning_rate": 9.06859045431824e-06, "loss": 0.4946, "mean_token_accuracy": 0.8550596237182617, "num_input_tokens_seen": 2118268, "num_tokens": 2118268.0, "step": 2720, "train_runtime": 773.0591, "train_tokens_per_second": 2740.111 }, { "epoch": 0.545, "grad_norm": 3.09375, "learning_rate": 9.036348633456791e-06, "loss": 0.2816, "mean_token_accuracy": 0.9101481914520264, "num_input_tokens_seen": 2122460, "num_tokens": 2122460.0, "step": 2725, "train_runtime": 774.4821, "train_tokens_per_second": 2740.489 }, { "epoch": 0.546, "grad_norm": 3.90625, "learning_rate": 9.004116920842188e-06, "loss": 0.3193, "mean_token_accuracy": 0.9087582945823669, "num_input_tokens_seen": 2127256, "num_tokens": 2127256.0, "step": 2730, "train_runtime": 775.9449, "train_tokens_per_second": 2741.504 }, { "epoch": 0.547, "grad_norm": 4.4375, "learning_rate": 8.971895654569842e-06, "loss": 0.2198, "mean_token_accuracy": 0.9261224389076232, "num_input_tokens_seen": 2132097, "num_tokens": 2132097.0, "step": 2735, "train_runtime": 777.4458, "train_tokens_per_second": 2742.438 }, { "epoch": 0.548, "grad_norm": 9.4375, "learning_rate": 8.939685172625588e-06, "loss": 0.5609, "mean_token_accuracy": 0.8266632080078125, "num_input_tokens_seen": 2135434, "num_tokens": 2135434.0, "step": 2740, "train_runtime": 778.8266, "train_tokens_per_second": 2741.861 }, { "epoch": 0.549, "grad_norm": 6.4375, "learning_rate": 8.907485812882137e-06, "loss": 0.2729, "mean_token_accuracy": 0.9141210079193115, "num_input_tokens_seen": 2140102, "num_tokens": 2140102.0, "step": 2745, "train_runtime": 780.283, "train_tokens_per_second": 2742.725 }, { "epoch": 0.55, "grad_norm": 2.890625, "learning_rate": 8.875297913095544e-06, "loss": 0.2716, "mean_token_accuracy": 0.9095355033874511, "num_input_tokens_seen": 2144010, "num_tokens": 2144010.0, "step": 2750, "train_runtime": 781.6408, "train_tokens_per_second": 2742.961 }, { "epoch": 0.551, "grad_norm": 25.375, "learning_rate": 8.843121810901643e-06, "loss": 0.3446, "mean_token_accuracy": 0.8885926485061646, "num_input_tokens_seen": 2148184, "num_tokens": 2148184.0, "step": 2755, "train_runtime": 782.9917, "train_tokens_per_second": 2743.559 }, { "epoch": 0.552, "grad_norm": 4.71875, "learning_rate": 8.81095784381252e-06, "loss": 0.1767, "mean_token_accuracy": 0.9396568655967712, "num_input_tokens_seen": 2152144, "num_tokens": 2152144.0, "step": 2760, "train_runtime": 784.4028, "train_tokens_per_second": 2743.672 }, { "epoch": 0.553, "grad_norm": 3.53125, "learning_rate": 8.778806349212968e-06, "loss": 0.2962, "mean_token_accuracy": 0.9033411979675293, "num_input_tokens_seen": 2155930, "num_tokens": 2155930.0, "step": 2765, "train_runtime": 785.7158, "train_tokens_per_second": 2743.906 }, { "epoch": 0.554, "grad_norm": 7.53125, "learning_rate": 8.746667664356957e-06, "loss": 0.2633, "mean_token_accuracy": 0.9189417362213135, "num_input_tokens_seen": 2159292, "num_tokens": 2159292.0, "step": 2770, "train_runtime": 787.052, "train_tokens_per_second": 2743.519 }, { "epoch": 0.555, "grad_norm": 1.7734375, "learning_rate": 8.71454212636408e-06, "loss": 0.3954, "mean_token_accuracy": 0.8854796290397644, "num_input_tokens_seen": 2164327, "num_tokens": 2164327.0, "step": 2775, "train_runtime": 788.4928, "train_tokens_per_second": 2744.891 }, { "epoch": 0.556, "grad_norm": 3.453125, "learning_rate": 8.682430072216029e-06, "loss": 0.2052, "mean_token_accuracy": 0.9303106904029846, "num_input_tokens_seen": 2168586, "num_tokens": 2168586.0, "step": 2780, "train_runtime": 789.8553, "train_tokens_per_second": 2745.548 }, { "epoch": 0.557, "grad_norm": 7.46875, "learning_rate": 8.650331838753057e-06, "loss": 0.1803, "mean_token_accuracy": 0.9357792019844056, "num_input_tokens_seen": 2171577, "num_tokens": 2171577.0, "step": 2785, "train_runtime": 791.1529, "train_tokens_per_second": 2744.826 }, { "epoch": 0.558, "grad_norm": 8.375, "learning_rate": 8.618247762670445e-06, "loss": 0.3155, "mean_token_accuracy": 0.9034866452217102, "num_input_tokens_seen": 2174292, "num_tokens": 2174292.0, "step": 2790, "train_runtime": 792.4933, "train_tokens_per_second": 2743.609 }, { "epoch": 0.559, "grad_norm": 5.15625, "learning_rate": 8.586178180514968e-06, "loss": 0.3455, "mean_token_accuracy": 0.8897446870803833, "num_input_tokens_seen": 2178060, "num_tokens": 2178060.0, "step": 2795, "train_runtime": 793.794, "train_tokens_per_second": 2743.86 }, { "epoch": 0.56, "grad_norm": 33.0, "learning_rate": 8.554123428681367e-06, "loss": 0.3634, "mean_token_accuracy": 0.8930135488510131, "num_input_tokens_seen": 2181172, "num_tokens": 2181172.0, "step": 2800, "train_runtime": 795.0978, "train_tokens_per_second": 2743.275 }, { "epoch": 0.561, "grad_norm": 4.21875, "learning_rate": 8.522083843408823e-06, "loss": 0.3064, "mean_token_accuracy": 0.9036557078361511, "num_input_tokens_seen": 2187054, "num_tokens": 2187054.0, "step": 2805, "train_runtime": 796.5412, "train_tokens_per_second": 2745.688 }, { "epoch": 0.562, "grad_norm": 10.4375, "learning_rate": 8.490059760777425e-06, "loss": 0.4711, "mean_token_accuracy": 0.8565178871154785, "num_input_tokens_seen": 2191610, "num_tokens": 2191610.0, "step": 2810, "train_runtime": 797.943, "train_tokens_per_second": 2746.575 }, { "epoch": 0.563, "grad_norm": 17.0, "learning_rate": 8.458051516704644e-06, "loss": 0.3141, "mean_token_accuracy": 0.901203989982605, "num_input_tokens_seen": 2195511, "num_tokens": 2195511.0, "step": 2815, "train_runtime": 799.2894, "train_tokens_per_second": 2746.829 }, { "epoch": 0.564, "grad_norm": 5.3125, "learning_rate": 8.426059446941817e-06, "loss": 0.3093, "mean_token_accuracy": 0.8986282587051392, "num_input_tokens_seen": 2198888, "num_tokens": 2198888.0, "step": 2820, "train_runtime": 800.6138, "train_tokens_per_second": 2746.503 }, { "epoch": 0.565, "grad_norm": 9.875, "learning_rate": 8.394083887070614e-06, "loss": 0.5256, "mean_token_accuracy": 0.8546699762344361, "num_input_tokens_seen": 2202901, "num_tokens": 2202901.0, "step": 2825, "train_runtime": 802.0483, "train_tokens_per_second": 2746.594 }, { "epoch": 0.566, "grad_norm": 4.4375, "learning_rate": 8.36212517249953e-06, "loss": 0.2962, "mean_token_accuracy": 0.9131210207939148, "num_input_tokens_seen": 2208830, "num_tokens": 2208830.0, "step": 2830, "train_runtime": 803.6175, "train_tokens_per_second": 2748.609 }, { "epoch": 0.567, "grad_norm": 2.5625, "learning_rate": 8.330183638460356e-06, "loss": 0.2926, "mean_token_accuracy": 0.9007268190383911, "num_input_tokens_seen": 2213110, "num_tokens": 2213110.0, "step": 2835, "train_runtime": 805.0622, "train_tokens_per_second": 2748.992 }, { "epoch": 0.568, "grad_norm": 10.8125, "learning_rate": 8.29825962000467e-06, "loss": 0.3701, "mean_token_accuracy": 0.8899049639701844, "num_input_tokens_seen": 2218640, "num_tokens": 2218640.0, "step": 2840, "train_runtime": 806.5229, "train_tokens_per_second": 2750.87 }, { "epoch": 0.569, "grad_norm": 4.34375, "learning_rate": 8.266353452000326e-06, "loss": 0.382, "mean_token_accuracy": 0.8881470680236816, "num_input_tokens_seen": 2221791, "num_tokens": 2221791.0, "step": 2845, "train_runtime": 807.8182, "train_tokens_per_second": 2750.36 }, { "epoch": 0.57, "grad_norm": 2.640625, "learning_rate": 8.234465469127919e-06, "loss": 0.3631, "mean_token_accuracy": 0.8875722289085388, "num_input_tokens_seen": 2227204, "num_tokens": 2227204.0, "step": 2850, "train_runtime": 809.2915, "train_tokens_per_second": 2752.042 }, { "epoch": 0.571, "grad_norm": 6.375, "learning_rate": 8.202596005877307e-06, "loss": 0.3913, "mean_token_accuracy": 0.8793536067008972, "num_input_tokens_seen": 2229754, "num_tokens": 2229754.0, "step": 2855, "train_runtime": 810.5728, "train_tokens_per_second": 2750.838 }, { "epoch": 0.572, "grad_norm": 3.875, "learning_rate": 8.170745396544072e-06, "loss": 0.3181, "mean_token_accuracy": 0.8987621665000916, "num_input_tokens_seen": 2232400, "num_tokens": 2232400.0, "step": 2860, "train_runtime": 811.8384, "train_tokens_per_second": 2749.808 }, { "epoch": 0.573, "grad_norm": 15.375, "learning_rate": 8.138913975226044e-06, "loss": 0.3917, "mean_token_accuracy": 0.882588016986847, "num_input_tokens_seen": 2235940, "num_tokens": 2235940.0, "step": 2865, "train_runtime": 813.1771, "train_tokens_per_second": 2749.635 }, { "epoch": 0.574, "grad_norm": 9.5625, "learning_rate": 8.10710207581976e-06, "loss": 0.6168, "mean_token_accuracy": 0.8131334543228149, "num_input_tokens_seen": 2239848, "num_tokens": 2239848.0, "step": 2870, "train_runtime": 814.5321, "train_tokens_per_second": 2749.858 }, { "epoch": 0.575, "grad_norm": 7.65625, "learning_rate": 8.075310032017e-06, "loss": 0.2913, "mean_token_accuracy": 0.9075251936912536, "num_input_tokens_seen": 2243107, "num_tokens": 2243107.0, "step": 2875, "train_runtime": 815.8432, "train_tokens_per_second": 2749.434 }, { "epoch": 0.576, "grad_norm": 4.8125, "learning_rate": 8.043538177301256e-06, "loss": 0.2825, "mean_token_accuracy": 0.9150280237197876, "num_input_tokens_seen": 2248224, "num_tokens": 2248224.0, "step": 2880, "train_runtime": 817.2458, "train_tokens_per_second": 2750.976 }, { "epoch": 0.577, "grad_norm": 3.921875, "learning_rate": 8.01178684494425e-06, "loss": 0.2365, "mean_token_accuracy": 0.916808819770813, "num_input_tokens_seen": 2251901, "num_tokens": 2251901.0, "step": 2885, "train_runtime": 818.5509, "train_tokens_per_second": 2751.082 }, { "epoch": 0.578, "grad_norm": 8.75, "learning_rate": 7.980056368002435e-06, "loss": 0.5782, "mean_token_accuracy": 0.8338030934333801, "num_input_tokens_seen": 2256838, "num_tokens": 2256838.0, "step": 2890, "train_runtime": 820.3061, "train_tokens_per_second": 2751.214 }, { "epoch": 0.579, "grad_norm": 2.84375, "learning_rate": 7.948347079313494e-06, "loss": 0.4847, "mean_token_accuracy": 0.8506197094917297, "num_input_tokens_seen": 2261683, "num_tokens": 2261683.0, "step": 2895, "train_runtime": 821.8113, "train_tokens_per_second": 2752.071 }, { "epoch": 0.58, "grad_norm": 4.96875, "learning_rate": 7.916659311492871e-06, "loss": 0.2977, "mean_token_accuracy": 0.9099519848823547, "num_input_tokens_seen": 2266082, "num_tokens": 2266082.0, "step": 2900, "train_runtime": 823.207, "train_tokens_per_second": 2752.749 }, { "epoch": 0.581, "grad_norm": 7.875, "learning_rate": 7.88499339693025e-06, "loss": 0.4597, "mean_token_accuracy": 0.8564813733100891, "num_input_tokens_seen": 2269436, "num_tokens": 2269436.0, "step": 2905, "train_runtime": 824.5934, "train_tokens_per_second": 2752.188 }, { "epoch": 0.582, "grad_norm": 4.53125, "learning_rate": 7.85334966778609e-06, "loss": 0.3529, "mean_token_accuracy": 0.8892533659934998, "num_input_tokens_seen": 2273402, "num_tokens": 2273402.0, "step": 2910, "train_runtime": 825.9396, "train_tokens_per_second": 2752.504 }, { "epoch": 0.583, "grad_norm": 3.5625, "learning_rate": 7.82172845598814e-06, "loss": 0.1979, "mean_token_accuracy": 0.9269068241119385, "num_input_tokens_seen": 2278761, "num_tokens": 2278761.0, "step": 2915, "train_runtime": 827.4218, "train_tokens_per_second": 2754.05 }, { "epoch": 0.584, "grad_norm": 8.75, "learning_rate": 7.790130093227943e-06, "loss": 0.3909, "mean_token_accuracy": 0.8864853501319885, "num_input_tokens_seen": 2283514, "num_tokens": 2283514.0, "step": 2920, "train_runtime": 828.8398, "train_tokens_per_second": 2755.073 }, { "epoch": 0.585, "grad_norm": 4.25, "learning_rate": 7.758554910957378e-06, "loss": 0.1853, "mean_token_accuracy": 0.9374913811683655, "num_input_tokens_seen": 2287083, "num_tokens": 2287083.0, "step": 2925, "train_runtime": 830.1861, "train_tokens_per_second": 2754.904 }, { "epoch": 0.586, "grad_norm": 7.59375, "learning_rate": 7.727003240385163e-06, "loss": 0.2752, "mean_token_accuracy": 0.911455261707306, "num_input_tokens_seen": 2290106, "num_tokens": 2290106.0, "step": 2930, "train_runtime": 831.4722, "train_tokens_per_second": 2754.278 }, { "epoch": 0.587, "grad_norm": 7.25, "learning_rate": 7.695475412473393e-06, "loss": 0.4331, "mean_token_accuracy": 0.8567111015319824, "num_input_tokens_seen": 2293430, "num_tokens": 2293430.0, "step": 2935, "train_runtime": 832.7922, "train_tokens_per_second": 2753.904 }, { "epoch": 0.588, "grad_norm": 13.125, "learning_rate": 7.663971757934064e-06, "loss": 0.2839, "mean_token_accuracy": 0.9103299260139466, "num_input_tokens_seen": 2296804, "num_tokens": 2296804.0, "step": 2940, "train_runtime": 834.1294, "train_tokens_per_second": 2753.534 }, { "epoch": 0.589, "grad_norm": 4.5, "learning_rate": 7.632492607225604e-06, "loss": 0.3914, "mean_token_accuracy": 0.8721371531486511, "num_input_tokens_seen": 2300971, "num_tokens": 2300971.0, "step": 2945, "train_runtime": 835.4696, "train_tokens_per_second": 2754.105 }, { "epoch": 0.59, "grad_norm": 6.6875, "learning_rate": 7.60103829054941e-06, "loss": 0.367, "mean_token_accuracy": 0.887525987625122, "num_input_tokens_seen": 2304674, "num_tokens": 2304674.0, "step": 2950, "train_runtime": 836.7759, "train_tokens_per_second": 2754.231 }, { "epoch": 0.591, "grad_norm": 3.796875, "learning_rate": 7.569609137846376e-06, "loss": 0.3642, "mean_token_accuracy": 0.8813823699951172, "num_input_tokens_seen": 2309715, "num_tokens": 2309715.0, "step": 2955, "train_runtime": 838.2291, "train_tokens_per_second": 2755.47 }, { "epoch": 0.592, "grad_norm": 5.28125, "learning_rate": 7.538205478793448e-06, "loss": 0.365, "mean_token_accuracy": 0.8773080229759216, "num_input_tokens_seen": 2313504, "num_tokens": 2313504.0, "step": 2960, "train_runtime": 839.6669, "train_tokens_per_second": 2755.264 }, { "epoch": 0.593, "grad_norm": 8.875, "learning_rate": 7.506827642800146e-06, "loss": 0.4273, "mean_token_accuracy": 0.8636172771453857, "num_input_tokens_seen": 2315729, "num_tokens": 2315729.0, "step": 2965, "train_runtime": 840.9277, "train_tokens_per_second": 2753.779 }, { "epoch": 0.594, "grad_norm": 4.25, "learning_rate": 7.475475959005123e-06, "loss": 0.3329, "mean_token_accuracy": 0.8950678110122681, "num_input_tokens_seen": 2319634, "num_tokens": 2319634.0, "step": 2970, "train_runtime": 842.2668, "train_tokens_per_second": 2754.037 }, { "epoch": 0.595, "grad_norm": 4.625, "learning_rate": 7.444150756272704e-06, "loss": 0.3655, "mean_token_accuracy": 0.8852131962776184, "num_input_tokens_seen": 2322721, "num_tokens": 2322721.0, "step": 2975, "train_runtime": 843.5716, "train_tokens_per_second": 2753.437 }, { "epoch": 0.596, "grad_norm": 3.953125, "learning_rate": 7.4128523631894464e-06, "loss": 0.3281, "mean_token_accuracy": 0.8951844811439514, "num_input_tokens_seen": 2325626, "num_tokens": 2325626.0, "step": 2980, "train_runtime": 844.8914, "train_tokens_per_second": 2752.574 }, { "epoch": 0.597, "grad_norm": 8.625, "learning_rate": 7.38158110806068e-06, "loss": 0.4341, "mean_token_accuracy": 0.8627596855163574, "num_input_tokens_seen": 2330890, "num_tokens": 2330890.0, "step": 2985, "train_runtime": 846.5638, "train_tokens_per_second": 2753.354 }, { "epoch": 0.598, "grad_norm": 3.546875, "learning_rate": 7.350337318907075e-06, "loss": 0.3416, "mean_token_accuracy": 0.8937448143959046, "num_input_tokens_seen": 2334948, "num_tokens": 2334948.0, "step": 2990, "train_runtime": 847.9336, "train_tokens_per_second": 2753.692 }, { "epoch": 0.599, "grad_norm": 15.375, "learning_rate": 7.319121323461198e-06, "loss": 0.4357, "mean_token_accuracy": 0.8765780925750732, "num_input_tokens_seen": 2337743, "num_tokens": 2337743.0, "step": 2995, "train_runtime": 849.2413, "train_tokens_per_second": 2752.743 }, { "epoch": 0.6, "grad_norm": 5.0625, "learning_rate": 7.287933449164068e-06, "loss": 0.4273, "mean_token_accuracy": 0.862437629699707, "num_input_tokens_seen": 2341106, "num_tokens": 2341106.0, "step": 3000, "train_runtime": 850.5899, "train_tokens_per_second": 2752.332 }, { "epoch": 0.601, "grad_norm": 10.375, "learning_rate": 7.256774023161728e-06, "loss": 0.33, "mean_token_accuracy": 0.8905609846115112, "num_input_tokens_seen": 2344263, "num_tokens": 2344263.0, "step": 3005, "train_runtime": 851.9074, "train_tokens_per_second": 2751.782 }, { "epoch": 0.602, "grad_norm": 3.15625, "learning_rate": 7.225643372301812e-06, "loss": 0.2519, "mean_token_accuracy": 0.9171380519866943, "num_input_tokens_seen": 2347474, "num_tokens": 2347474.0, "step": 3010, "train_runtime": 853.2182, "train_tokens_per_second": 2751.317 }, { "epoch": 0.603, "grad_norm": 1.703125, "learning_rate": 7.194541823130119e-06, "loss": 0.2569, "mean_token_accuracy": 0.9189850211143493, "num_input_tokens_seen": 2351794, "num_tokens": 2351794.0, "step": 3015, "train_runtime": 854.5771, "train_tokens_per_second": 2751.997 }, { "epoch": 0.604, "grad_norm": 3.125, "learning_rate": 7.163469701887182e-06, "loss": 0.1702, "mean_token_accuracy": 0.9497987151145935, "num_input_tokens_seen": 2357566, "num_tokens": 2357566.0, "step": 3020, "train_runtime": 856.1259, "train_tokens_per_second": 2753.761 }, { "epoch": 0.605, "grad_norm": 68.5, "learning_rate": 7.132427334504846e-06, "loss": 0.4578, "mean_token_accuracy": 0.8649806618690491, "num_input_tokens_seen": 2360735, "num_tokens": 2360735.0, "step": 3025, "train_runtime": 857.4583, "train_tokens_per_second": 2753.178 }, { "epoch": 0.606, "grad_norm": 4.46875, "learning_rate": 7.1014150466028605e-06, "loss": 0.2932, "mean_token_accuracy": 0.9023116111755372, "num_input_tokens_seen": 2364472, "num_tokens": 2364472.0, "step": 3030, "train_runtime": 858.8063, "train_tokens_per_second": 2753.208 }, { "epoch": 0.607, "grad_norm": 3.109375, "learning_rate": 7.070433163485451e-06, "loss": 0.3154, "mean_token_accuracy": 0.90224689245224, "num_input_tokens_seen": 2368664, "num_tokens": 2368664.0, "step": 3035, "train_runtime": 860.2512, "train_tokens_per_second": 2753.456 }, { "epoch": 0.608, "grad_norm": 7.78125, "learning_rate": 7.039482010137908e-06, "loss": 0.3535, "mean_token_accuracy": 0.8888671278953553, "num_input_tokens_seen": 2371304, "num_tokens": 2371304.0, "step": 3040, "train_runtime": 861.5774, "train_tokens_per_second": 2752.282 }, { "epoch": 0.609, "grad_norm": 3.484375, "learning_rate": 7.008561911223186e-06, "loss": 0.2422, "mean_token_accuracy": 0.9156134366989136, "num_input_tokens_seen": 2375440, "num_tokens": 2375440.0, "step": 3045, "train_runtime": 862.9755, "train_tokens_per_second": 2752.616 }, { "epoch": 0.61, "grad_norm": 5.65625, "learning_rate": 6.977673191078487e-06, "loss": 0.4059, "mean_token_accuracy": 0.8834887027740479, "num_input_tokens_seen": 2379628, "num_tokens": 2379628.0, "step": 3050, "train_runtime": 864.3662, "train_tokens_per_second": 2753.032 }, { "epoch": 0.611, "grad_norm": 3.734375, "learning_rate": 6.946816173711878e-06, "loss": 0.1923, "mean_token_accuracy": 0.9388649463653564, "num_input_tokens_seen": 2383782, "num_tokens": 2383782.0, "step": 3055, "train_runtime": 865.7884, "train_tokens_per_second": 2753.308 }, { "epoch": 0.612, "grad_norm": 2.75, "learning_rate": 6.915991182798865e-06, "loss": 0.2522, "mean_token_accuracy": 0.9157787203788758, "num_input_tokens_seen": 2388484, "num_tokens": 2388484.0, "step": 3060, "train_runtime": 867.1652, "train_tokens_per_second": 2754.359 }, { "epoch": 0.613, "grad_norm": 6.90625, "learning_rate": 6.885198541679016e-06, "loss": 0.3192, "mean_token_accuracy": 0.8873642444610595, "num_input_tokens_seen": 2391530, "num_tokens": 2391530.0, "step": 3065, "train_runtime": 868.6039, "train_tokens_per_second": 2753.303 }, { "epoch": 0.614, "grad_norm": 5.1875, "learning_rate": 6.8544385733525665e-06, "loss": 0.3743, "mean_token_accuracy": 0.8712251782417297, "num_input_tokens_seen": 2395256, "num_tokens": 2395256.0, "step": 3070, "train_runtime": 869.9307, "train_tokens_per_second": 2753.387 }, { "epoch": 0.615, "grad_norm": 3.03125, "learning_rate": 6.823711600477025e-06, "loss": 0.3604, "mean_token_accuracy": 0.9011762142181396, "num_input_tokens_seen": 2399371, "num_tokens": 2399371.0, "step": 3075, "train_runtime": 871.2661, "train_tokens_per_second": 2753.89 }, { "epoch": 0.616, "grad_norm": 3.890625, "learning_rate": 6.793017945363804e-06, "loss": 0.1941, "mean_token_accuracy": 0.9396929144859314, "num_input_tokens_seen": 2406496, "num_tokens": 2406496.0, "step": 3080, "train_runtime": 872.8676, "train_tokens_per_second": 2757.0 }, { "epoch": 0.617, "grad_norm": 7.71875, "learning_rate": 6.76235792997482e-06, "loss": 0.3346, "mean_token_accuracy": 0.877459180355072, "num_input_tokens_seen": 2409499, "num_tokens": 2409499.0, "step": 3085, "train_runtime": 874.1933, "train_tokens_per_second": 2756.254 }, { "epoch": 0.618, "grad_norm": 8.125, "learning_rate": 6.731731875919123e-06, "loss": 0.2349, "mean_token_accuracy": 0.919044840335846, "num_input_tokens_seen": 2412902, "num_tokens": 2412902.0, "step": 3090, "train_runtime": 875.5686, "train_tokens_per_second": 2755.811 }, { "epoch": 0.619, "grad_norm": 4.3125, "learning_rate": 6.7011401044495304e-06, "loss": 0.2686, "mean_token_accuracy": 0.9095120310783387, "num_input_tokens_seen": 2416741, "num_tokens": 2416741.0, "step": 3095, "train_runtime": 876.9212, "train_tokens_per_second": 2755.939 }, { "epoch": 0.62, "grad_norm": 8.0625, "learning_rate": 6.670582936459249e-06, "loss": 0.4129, "mean_token_accuracy": 0.860441792011261, "num_input_tokens_seen": 2420928, "num_tokens": 2420928.0, "step": 3100, "train_runtime": 878.2514, "train_tokens_per_second": 2756.532 }, { "epoch": 0.621, "grad_norm": 10.1875, "learning_rate": 6.6400606924785095e-06, "loss": 0.4096, "mean_token_accuracy": 0.875682020187378, "num_input_tokens_seen": 2424204, "num_tokens": 2424204.0, "step": 3105, "train_runtime": 879.5695, "train_tokens_per_second": 2756.126 }, { "epoch": 0.622, "grad_norm": 2.078125, "learning_rate": 6.609573692671209e-06, "loss": 0.2261, "mean_token_accuracy": 0.927798056602478, "num_input_tokens_seen": 2427774, "num_tokens": 2427774.0, "step": 3110, "train_runtime": 880.9303, "train_tokens_per_second": 2755.921 }, { "epoch": 0.623, "grad_norm": 3.8125, "learning_rate": 6.579122256831551e-06, "loss": 0.2674, "mean_token_accuracy": 0.9181647658348083, "num_input_tokens_seen": 2432188, "num_tokens": 2432188.0, "step": 3115, "train_runtime": 882.4167, "train_tokens_per_second": 2756.281 }, { "epoch": 0.624, "grad_norm": 7.46875, "learning_rate": 6.54870670438069e-06, "loss": 0.3714, "mean_token_accuracy": 0.8828112602233886, "num_input_tokens_seen": 2435714, "num_tokens": 2435714.0, "step": 3120, "train_runtime": 883.7891, "train_tokens_per_second": 2755.99 }, { "epoch": 0.625, "grad_norm": 3.5625, "learning_rate": 6.518327354363374e-06, "loss": 0.327, "mean_token_accuracy": 0.8874168872833252, "num_input_tokens_seen": 2438537, "num_tokens": 2438537.0, "step": 3125, "train_runtime": 885.0989, "train_tokens_per_second": 2755.101 }, { "epoch": 0.626, "grad_norm": 6.65625, "learning_rate": 6.487984525444613e-06, "loss": 0.3872, "mean_token_accuracy": 0.8769646167755127, "num_input_tokens_seen": 2442108, "num_tokens": 2442108.0, "step": 3130, "train_runtime": 886.3862, "train_tokens_per_second": 2755.129 }, { "epoch": 0.627, "grad_norm": 7.03125, "learning_rate": 6.4576785359063225e-06, "loss": 0.436, "mean_token_accuracy": 0.8662237882614136, "num_input_tokens_seen": 2445528, "num_tokens": 2445528.0, "step": 3135, "train_runtime": 887.7867, "train_tokens_per_second": 2754.635 }, { "epoch": 0.628, "grad_norm": 7.03125, "learning_rate": 6.42740970364399e-06, "loss": 0.3346, "mean_token_accuracy": 0.8945701479911804, "num_input_tokens_seen": 2448240, "num_tokens": 2448240.0, "step": 3140, "train_runtime": 889.1283, "train_tokens_per_second": 2753.529 }, { "epoch": 0.629, "grad_norm": 13.3125, "learning_rate": 6.397178346163348e-06, "loss": 0.2934, "mean_token_accuracy": 0.9032702684402466, "num_input_tokens_seen": 2451164, "num_tokens": 2451164.0, "step": 3145, "train_runtime": 890.415, "train_tokens_per_second": 2752.833 }, { "epoch": 0.63, "grad_norm": 7.25, "learning_rate": 6.36698478057703e-06, "loss": 0.3852, "mean_token_accuracy": 0.8814346790313721, "num_input_tokens_seen": 2456070, "num_tokens": 2456070.0, "step": 3150, "train_runtime": 891.8452, "train_tokens_per_second": 2753.92 }, { "epoch": 0.631, "grad_norm": 4.5, "learning_rate": 6.33682932360125e-06, "loss": 0.1961, "mean_token_accuracy": 0.9360305190086364, "num_input_tokens_seen": 2460912, "num_tokens": 2460912.0, "step": 3155, "train_runtime": 893.295, "train_tokens_per_second": 2754.871 }, { "epoch": 0.632, "grad_norm": 9.125, "learning_rate": 6.306712291552484e-06, "loss": 0.3917, "mean_token_accuracy": 0.8746399044990539, "num_input_tokens_seen": 2465142, "num_tokens": 2465142.0, "step": 3160, "train_runtime": 894.6366, "train_tokens_per_second": 2755.467 }, { "epoch": 0.633, "grad_norm": 3.203125, "learning_rate": 6.276634000344144e-06, "loss": 0.3054, "mean_token_accuracy": 0.9040336608886719, "num_input_tokens_seen": 2468040, "num_tokens": 2468040.0, "step": 3165, "train_runtime": 895.9072, "train_tokens_per_second": 2754.794 }, { "epoch": 0.634, "grad_norm": 8.25, "learning_rate": 6.246594765483274e-06, "loss": 0.3508, "mean_token_accuracy": 0.8945856809616088, "num_input_tokens_seen": 2471450, "num_tokens": 2471450.0, "step": 3170, "train_runtime": 897.2615, "train_tokens_per_second": 2754.437 }, { "epoch": 0.635, "grad_norm": 2.109375, "learning_rate": 6.216594902067233e-06, "loss": 0.4102, "mean_token_accuracy": 0.8840370416641236, "num_input_tokens_seen": 2475022, "num_tokens": 2475022.0, "step": 3175, "train_runtime": 898.562, "train_tokens_per_second": 2754.426 }, { "epoch": 0.636, "grad_norm": 6.09375, "learning_rate": 6.186634724780394e-06, "loss": 0.484, "mean_token_accuracy": 0.8442889213562011, "num_input_tokens_seen": 2478884, "num_tokens": 2478884.0, "step": 3180, "train_runtime": 899.8634, "train_tokens_per_second": 2754.734 }, { "epoch": 0.637, "grad_norm": 34.5, "learning_rate": 6.156714547890838e-06, "loss": 0.3682, "mean_token_accuracy": 0.8814856529235839, "num_input_tokens_seen": 2484246, "num_tokens": 2484246.0, "step": 3185, "train_runtime": 901.3183, "train_tokens_per_second": 2756.236 }, { "epoch": 0.638, "grad_norm": 2.578125, "learning_rate": 6.126834685247065e-06, "loss": 0.1761, "mean_token_accuracy": 0.9449016928672791, "num_input_tokens_seen": 2490012, "num_tokens": 2490012.0, "step": 3190, "train_runtime": 902.9176, "train_tokens_per_second": 2757.74 }, { "epoch": 0.639, "grad_norm": 3.6875, "learning_rate": 6.0969954502746916e-06, "loss": 0.2051, "mean_token_accuracy": 0.9335859656333924, "num_input_tokens_seen": 2494429, "num_tokens": 2494429.0, "step": 3195, "train_runtime": 904.328, "train_tokens_per_second": 2758.323 }, { "epoch": 0.64, "grad_norm": 3.390625, "learning_rate": 6.067197155973172e-06, "loss": 0.2533, "mean_token_accuracy": 0.9152822256088257, "num_input_tokens_seen": 2499062, "num_tokens": 2499062.0, "step": 3200, "train_runtime": 905.7703, "train_tokens_per_second": 2759.046 }, { "epoch": 0.641, "grad_norm": 12.4375, "learning_rate": 6.037440114912521e-06, "loss": 0.4165, "mean_token_accuracy": 0.8669069051742554, "num_input_tokens_seen": 2502264, "num_tokens": 2502264.0, "step": 3205, "train_runtime": 907.0549, "train_tokens_per_second": 2758.669 }, { "epoch": 0.642, "grad_norm": 6.875, "learning_rate": 6.00772463923001e-06, "loss": 0.3767, "mean_token_accuracy": 0.8786419272422791, "num_input_tokens_seen": 2505688, "num_tokens": 2505688.0, "step": 3210, "train_runtime": 908.3298, "train_tokens_per_second": 2758.566 }, { "epoch": 0.643, "grad_norm": 4.75, "learning_rate": 5.9780510406269245e-06, "loss": 0.305, "mean_token_accuracy": 0.8994436860084534, "num_input_tokens_seen": 2509861, "num_tokens": 2509861.0, "step": 3215, "train_runtime": 909.7082, "train_tokens_per_second": 2758.974 }, { "epoch": 0.644, "grad_norm": 2.5625, "learning_rate": 5.948419630365269e-06, "loss": 0.1769, "mean_token_accuracy": 0.9397383809089661, "num_input_tokens_seen": 2514856, "num_tokens": 2514856.0, "step": 3220, "train_runtime": 911.154, "train_tokens_per_second": 2760.078 }, { "epoch": 0.645, "grad_norm": 3.203125, "learning_rate": 5.918830719264514e-06, "loss": 0.2758, "mean_token_accuracy": 0.917659056186676, "num_input_tokens_seen": 2519273, "num_tokens": 2519273.0, "step": 3225, "train_runtime": 912.5974, "train_tokens_per_second": 2760.552 }, { "epoch": 0.646, "grad_norm": 5.3125, "learning_rate": 5.889284617698339e-06, "loss": 0.3704, "mean_token_accuracy": 0.8827759981155395, "num_input_tokens_seen": 2523240, "num_tokens": 2523240.0, "step": 3230, "train_runtime": 913.9056, "train_tokens_per_second": 2760.942 }, { "epoch": 0.647, "grad_norm": 2.234375, "learning_rate": 5.8597816355913685e-06, "loss": 0.4025, "mean_token_accuracy": 0.8680074214935303, "num_input_tokens_seen": 2528804, "num_tokens": 2528804.0, "step": 3235, "train_runtime": 915.379, "train_tokens_per_second": 2762.576 }, { "epoch": 0.648, "grad_norm": 4.96875, "learning_rate": 5.830322082415922e-06, "loss": 0.1836, "mean_token_accuracy": 0.9428686738014221, "num_input_tokens_seen": 2533210, "num_tokens": 2533210.0, "step": 3240, "train_runtime": 916.7595, "train_tokens_per_second": 2763.222 }, { "epoch": 0.649, "grad_norm": 7.125, "learning_rate": 5.800906267188773e-06, "loss": 0.3459, "mean_token_accuracy": 0.894824206829071, "num_input_tokens_seen": 2537565, "num_tokens": 2537565.0, "step": 3245, "train_runtime": 918.1283, "train_tokens_per_second": 2763.846 }, { "epoch": 0.65, "grad_norm": 10.6875, "learning_rate": 5.771534498467908e-06, "loss": 0.3937, "mean_token_accuracy": 0.8750831842422485, "num_input_tokens_seen": 2540520, "num_tokens": 2540520.0, "step": 3250, "train_runtime": 919.4092, "train_tokens_per_second": 2763.209 }, { "epoch": 0.651, "grad_norm": 3.640625, "learning_rate": 5.742207084349274e-06, "loss": 0.3339, "mean_token_accuracy": 0.8889687776565551, "num_input_tokens_seen": 2543962, "num_tokens": 2543962.0, "step": 3255, "train_runtime": 920.7381, "train_tokens_per_second": 2762.959 }, { "epoch": 0.652, "grad_norm": 10.3125, "learning_rate": 5.712924332463575e-06, "loss": 0.3176, "mean_token_accuracy": 0.8995662212371827, "num_input_tokens_seen": 2548902, "num_tokens": 2548902.0, "step": 3260, "train_runtime": 922.1963, "train_tokens_per_second": 2763.947 }, { "epoch": 0.653, "grad_norm": 4.375, "learning_rate": 5.683686549973018e-06, "loss": 0.2883, "mean_token_accuracy": 0.9100780248641968, "num_input_tokens_seen": 2553223, "num_tokens": 2553223.0, "step": 3265, "train_runtime": 923.5837, "train_tokens_per_second": 2764.474 }, { "epoch": 0.654, "grad_norm": 9.5625, "learning_rate": 5.654494043568109e-06, "loss": 0.4546, "mean_token_accuracy": 0.8544560074806213, "num_input_tokens_seen": 2557384, "num_tokens": 2557384.0, "step": 3270, "train_runtime": 925.032, "train_tokens_per_second": 2764.644 }, { "epoch": 0.655, "grad_norm": 6.5, "learning_rate": 5.625347119464422e-06, "loss": 0.2469, "mean_token_accuracy": 0.9170319080352783, "num_input_tokens_seen": 2561743, "num_tokens": 2561743.0, "step": 3275, "train_runtime": 926.4708, "train_tokens_per_second": 2765.055 }, { "epoch": 0.656, "grad_norm": 3.453125, "learning_rate": 5.596246083399402e-06, "loss": 0.2415, "mean_token_accuracy": 0.916782808303833, "num_input_tokens_seen": 2565478, "num_tokens": 2565478.0, "step": 3280, "train_runtime": 927.7785, "train_tokens_per_second": 2765.184 }, { "epoch": 0.657, "grad_norm": 3.203125, "learning_rate": 5.567191240629151e-06, "loss": 0.1579, "mean_token_accuracy": 0.9449813842773438, "num_input_tokens_seen": 2569940, "num_tokens": 2569940.0, "step": 3285, "train_runtime": 929.235, "train_tokens_per_second": 2765.651 }, { "epoch": 0.658, "grad_norm": 3.96875, "learning_rate": 5.538182895925212e-06, "loss": 0.2962, "mean_token_accuracy": 0.9031290292739869, "num_input_tokens_seen": 2574928, "num_tokens": 2574928.0, "step": 3290, "train_runtime": 930.6492, "train_tokens_per_second": 2766.808 }, { "epoch": 0.659, "grad_norm": 4.375, "learning_rate": 5.509221353571404e-06, "loss": 0.4228, "mean_token_accuracy": 0.8604089975357055, "num_input_tokens_seen": 2577678, "num_tokens": 2577678.0, "step": 3295, "train_runtime": 931.9849, "train_tokens_per_second": 2765.794 }, { "epoch": 0.66, "grad_norm": 3.234375, "learning_rate": 5.4803069173605915e-06, "loss": 0.2955, "mean_token_accuracy": 0.9032185673713684, "num_input_tokens_seen": 2583484, "num_tokens": 2583484.0, "step": 3300, "train_runtime": 933.4199, "train_tokens_per_second": 2767.762 }, { "epoch": 0.661, "grad_norm": 6.09375, "learning_rate": 5.451439890591539e-06, "loss": 0.4481, "mean_token_accuracy": 0.8796740770339966, "num_input_tokens_seen": 2587179, "num_tokens": 2587179.0, "step": 3305, "train_runtime": 934.7519, "train_tokens_per_second": 2767.771 }, { "epoch": 0.662, "grad_norm": 7.375, "learning_rate": 5.422620576065689e-06, "loss": 0.374, "mean_token_accuracy": 0.8821952700614929, "num_input_tokens_seen": 2590916, "num_tokens": 2590916.0, "step": 3310, "train_runtime": 936.1073, "train_tokens_per_second": 2767.755 }, { "epoch": 0.663, "grad_norm": 3.59375, "learning_rate": 5.3938492760840176e-06, "loss": 0.2994, "mean_token_accuracy": 0.8990121722221375, "num_input_tokens_seen": 2595320, "num_tokens": 2595320.0, "step": 3315, "train_runtime": 937.592, "train_tokens_per_second": 2768.07 }, { "epoch": 0.664, "grad_norm": 9.625, "learning_rate": 5.365126292443852e-06, "loss": 0.4401, "mean_token_accuracy": 0.8689446687698364, "num_input_tokens_seen": 2601572, "num_tokens": 2601572.0, "step": 3320, "train_runtime": 939.1836, "train_tokens_per_second": 2770.036 }, { "epoch": 0.665, "grad_norm": 3.84375, "learning_rate": 5.336451926435688e-06, "loss": 0.3809, "mean_token_accuracy": 0.8833752989768981, "num_input_tokens_seen": 2605803, "num_tokens": 2605803.0, "step": 3325, "train_runtime": 940.5566, "train_tokens_per_second": 2770.49 }, { "epoch": 0.666, "grad_norm": 5.5, "learning_rate": 5.307826478840068e-06, "loss": 0.186, "mean_token_accuracy": 0.935472822189331, "num_input_tokens_seen": 2610304, "num_tokens": 2610304.0, "step": 3330, "train_runtime": 941.9849, "train_tokens_per_second": 2771.068 }, { "epoch": 0.667, "grad_norm": 6.40625, "learning_rate": 5.279250249924384e-06, "loss": 0.4925, "mean_token_accuracy": 0.8474063634872436, "num_input_tokens_seen": 2614387, "num_tokens": 2614387.0, "step": 3335, "train_runtime": 943.4066, "train_tokens_per_second": 2771.22 }, { "epoch": 0.668, "grad_norm": 5.59375, "learning_rate": 5.2507235394397595e-06, "loss": 0.4127, "mean_token_accuracy": 0.8839301109313965, "num_input_tokens_seen": 2618490, "num_tokens": 2618490.0, "step": 3340, "train_runtime": 944.7883, "train_tokens_per_second": 2771.51 }, { "epoch": 0.669, "grad_norm": 7.125, "learning_rate": 5.222246646617886e-06, "loss": 0.4054, "mean_token_accuracy": 0.8763983726501465, "num_input_tokens_seen": 2622844, "num_tokens": 2622844.0, "step": 3345, "train_runtime": 946.1417, "train_tokens_per_second": 2772.147 }, { "epoch": 0.67, "grad_norm": 28.125, "learning_rate": 5.193819870167893e-06, "loss": 0.3226, "mean_token_accuracy": 0.907047963142395, "num_input_tokens_seen": 2626900, "num_tokens": 2626900.0, "step": 3350, "train_runtime": 947.5257, "train_tokens_per_second": 2772.379 }, { "epoch": 0.671, "grad_norm": 16.75, "learning_rate": 5.165443508273218e-06, "loss": 0.3651, "mean_token_accuracy": 0.8953990697860718, "num_input_tokens_seen": 2632345, "num_tokens": 2632345.0, "step": 3355, "train_runtime": 949.1306, "train_tokens_per_second": 2773.428 }, { "epoch": 0.672, "grad_norm": 3.8125, "learning_rate": 5.137117858588472e-06, "loss": 0.3094, "mean_token_accuracy": 0.906135892868042, "num_input_tokens_seen": 2637752, "num_tokens": 2637752.0, "step": 3360, "train_runtime": 950.6149, "train_tokens_per_second": 2774.785 }, { "epoch": 0.673, "grad_norm": 4.4375, "learning_rate": 5.10884321823631e-06, "loss": 0.2436, "mean_token_accuracy": 0.9176534175872803, "num_input_tokens_seen": 2641384, "num_tokens": 2641384.0, "step": 3365, "train_runtime": 951.9498, "train_tokens_per_second": 2774.709 }, { "epoch": 0.674, "grad_norm": 7.625, "learning_rate": 5.080619883804333e-06, "loss": 0.4407, "mean_token_accuracy": 0.8576617956161499, "num_input_tokens_seen": 2645638, "num_tokens": 2645638.0, "step": 3370, "train_runtime": 953.3831, "train_tokens_per_second": 2775.0 }, { "epoch": 0.675, "grad_norm": 108.5, "learning_rate": 5.0524481513419675e-06, "loss": 0.3547, "mean_token_accuracy": 0.891556441783905, "num_input_tokens_seen": 2649136, "num_tokens": 2649136.0, "step": 3375, "train_runtime": 954.7465, "train_tokens_per_second": 2774.701 }, { "epoch": 0.676, "grad_norm": 6.75, "learning_rate": 5.02432831635735e-06, "loss": 0.3567, "mean_token_accuracy": 0.8808706641197205, "num_input_tokens_seen": 2652614, "num_tokens": 2652614.0, "step": 3380, "train_runtime": 956.0891, "train_tokens_per_second": 2774.442 }, { "epoch": 0.677, "grad_norm": 4.09375, "learning_rate": 4.99626067381425e-06, "loss": 0.4477, "mean_token_accuracy": 0.8606111526489257, "num_input_tokens_seen": 2655188, "num_tokens": 2655188.0, "step": 3385, "train_runtime": 957.3923, "train_tokens_per_second": 2773.354 }, { "epoch": 0.678, "grad_norm": 11.125, "learning_rate": 4.96824551812895e-06, "loss": 0.4798, "mean_token_accuracy": 0.8548410296440124, "num_input_tokens_seen": 2658882, "num_tokens": 2658882.0, "step": 3390, "train_runtime": 958.7663, "train_tokens_per_second": 2773.233 }, { "epoch": 0.679, "grad_norm": 10.8125, "learning_rate": 4.9402831431671834e-06, "loss": 0.1844, "mean_token_accuracy": 0.9484425187110901, "num_input_tokens_seen": 2664346, "num_tokens": 2664346.0, "step": 3395, "train_runtime": 960.2975, "train_tokens_per_second": 2774.501 }, { "epoch": 0.68, "grad_norm": 9.9375, "learning_rate": 4.912373842241025e-06, "loss": 0.3825, "mean_token_accuracy": 0.8716008782386779, "num_input_tokens_seen": 2668864, "num_tokens": 2668864.0, "step": 3400, "train_runtime": 961.6915, "train_tokens_per_second": 2775.177 }, { "epoch": 0.681, "grad_norm": 5.5625, "learning_rate": 4.884517908105837e-06, "loss": 0.1545, "mean_token_accuracy": 0.9467914342880249, "num_input_tokens_seen": 2672352, "num_tokens": 2672352.0, "step": 3405, "train_runtime": 962.9898, "train_tokens_per_second": 2775.057 }, { "epoch": 0.682, "grad_norm": 14.9375, "learning_rate": 4.856715632957193e-06, "loss": 0.3868, "mean_token_accuracy": 0.8927870035171509, "num_input_tokens_seen": 2675610, "num_tokens": 2675610.0, "step": 3410, "train_runtime": 964.307, "train_tokens_per_second": 2774.646 }, { "epoch": 0.683, "grad_norm": 5.09375, "learning_rate": 4.828967308427795e-06, "loss": 0.183, "mean_token_accuracy": 0.939356529712677, "num_input_tokens_seen": 2679928, "num_tokens": 2679928.0, "step": 3415, "train_runtime": 965.7486, "train_tokens_per_second": 2774.975 }, { "epoch": 0.684, "grad_norm": 6.6875, "learning_rate": 4.801273225584445e-06, "loss": 0.2434, "mean_token_accuracy": 0.9167834639549255, "num_input_tokens_seen": 2682378, "num_tokens": 2682378.0, "step": 3420, "train_runtime": 967.0906, "train_tokens_per_second": 2773.657 }, { "epoch": 0.685, "grad_norm": 4.34375, "learning_rate": 4.77363367492496e-06, "loss": 0.2581, "mean_token_accuracy": 0.9097264528274536, "num_input_tokens_seen": 2685097, "num_tokens": 2685097.0, "step": 3425, "train_runtime": 968.3723, "train_tokens_per_second": 2772.794 }, { "epoch": 0.686, "grad_norm": 3.890625, "learning_rate": 4.74604894637515e-06, "loss": 0.364, "mean_token_accuracy": 0.8854797840118408, "num_input_tokens_seen": 2689244, "num_tokens": 2689244.0, "step": 3430, "train_runtime": 969.822, "train_tokens_per_second": 2772.925 }, { "epoch": 0.687, "grad_norm": 10.3125, "learning_rate": 4.718519329285771e-06, "loss": 0.386, "mean_token_accuracy": 0.8777982831001282, "num_input_tokens_seen": 2692598, "num_tokens": 2692598.0, "step": 3435, "train_runtime": 971.1764, "train_tokens_per_second": 2772.512 }, { "epoch": 0.688, "grad_norm": 4.03125, "learning_rate": 4.69104511242947e-06, "loss": 0.2297, "mean_token_accuracy": 0.9273438096046448, "num_input_tokens_seen": 2696980, "num_tokens": 2696980.0, "step": 3440, "train_runtime": 972.6552, "train_tokens_per_second": 2772.802 }, { "epoch": 0.689, "grad_norm": 6.625, "learning_rate": 4.663626583997789e-06, "loss": 0.2871, "mean_token_accuracy": 0.9031723976135254, "num_input_tokens_seen": 2699663, "num_tokens": 2699663.0, "step": 3445, "train_runtime": 973.9617, "train_tokens_per_second": 2771.837 }, { "epoch": 0.69, "grad_norm": 6.53125, "learning_rate": 4.63626403159811e-06, "loss": 0.5139, "mean_token_accuracy": 0.8457622289657593, "num_input_tokens_seen": 2702784, "num_tokens": 2702784.0, "step": 3450, "train_runtime": 975.3219, "train_tokens_per_second": 2771.171 }, { "epoch": 0.691, "grad_norm": 25.0, "learning_rate": 4.608957742250667e-06, "loss": 0.3532, "mean_token_accuracy": 0.8892346858978272, "num_input_tokens_seen": 2707049, "num_tokens": 2707049.0, "step": 3455, "train_runtime": 976.7571, "train_tokens_per_second": 2771.466 }, { "epoch": 0.692, "grad_norm": 8.6875, "learning_rate": 4.581708002385506e-06, "loss": 0.4362, "mean_token_accuracy": 0.853585398197174, "num_input_tokens_seen": 2710538, "num_tokens": 2710538.0, "step": 3460, "train_runtime": 978.1441, "train_tokens_per_second": 2771.103 }, { "epoch": 0.693, "grad_norm": 6.5625, "learning_rate": 4.554515097839511e-06, "loss": 0.1878, "mean_token_accuracy": 0.9397881865501404, "num_input_tokens_seen": 2715131, "num_tokens": 2715131.0, "step": 3465, "train_runtime": 979.5725, "train_tokens_per_second": 2771.751 }, { "epoch": 0.694, "grad_norm": 23.0, "learning_rate": 4.527379313853381e-06, "loss": 0.6032, "mean_token_accuracy": 0.8142884612083435, "num_input_tokens_seen": 2717306, "num_tokens": 2717306.0, "step": 3470, "train_runtime": 980.8865, "train_tokens_per_second": 2770.255 }, { "epoch": 0.695, "grad_norm": 10.125, "learning_rate": 4.500300935068647e-06, "loss": 0.512, "mean_token_accuracy": 0.859369158744812, "num_input_tokens_seen": 2720206, "num_tokens": 2720206.0, "step": 3475, "train_runtime": 982.1962, "train_tokens_per_second": 2769.514 }, { "epoch": 0.696, "grad_norm": 3.828125, "learning_rate": 4.473280245524696e-06, "loss": 0.2564, "mean_token_accuracy": 0.9131362438201904, "num_input_tokens_seen": 2722612, "num_tokens": 2722612.0, "step": 3480, "train_runtime": 983.4924, "train_tokens_per_second": 2768.31 }, { "epoch": 0.697, "grad_norm": 4.5, "learning_rate": 4.4463175286557654e-06, "loss": 0.373, "mean_token_accuracy": 0.8748133301734924, "num_input_tokens_seen": 2725358, "num_tokens": 2725358.0, "step": 3485, "train_runtime": 984.7799, "train_tokens_per_second": 2767.479 }, { "epoch": 0.698, "grad_norm": 4.6875, "learning_rate": 4.419413067288006e-06, "loss": 0.2823, "mean_token_accuracy": 0.9013235330581665, "num_input_tokens_seen": 2729168, "num_tokens": 2729168.0, "step": 3490, "train_runtime": 986.1178, "train_tokens_per_second": 2767.588 }, { "epoch": 0.699, "grad_norm": 3.203125, "learning_rate": 4.39256714363648e-06, "loss": 0.4262, "mean_token_accuracy": 0.8625032901763916, "num_input_tokens_seen": 2733504, "num_tokens": 2733504.0, "step": 3495, "train_runtime": 987.4796, "train_tokens_per_second": 2768.162 }, { "epoch": 0.7, "grad_norm": 10.3125, "learning_rate": 4.3657800393022255e-06, "loss": 0.3551, "mean_token_accuracy": 0.8802049040794373, "num_input_tokens_seen": 2736658, "num_tokens": 2736658.0, "step": 3500, "train_runtime": 988.7866, "train_tokens_per_second": 2767.693 }, { "epoch": 0.701, "grad_norm": 5.65625, "learning_rate": 4.339052035269291e-06, "loss": 0.3689, "mean_token_accuracy": 0.8729144096374511, "num_input_tokens_seen": 2740467, "num_tokens": 2740467.0, "step": 3505, "train_runtime": 990.1571, "train_tokens_per_second": 2767.709 }, { "epoch": 0.702, "grad_norm": 3.46875, "learning_rate": 4.312383411901796e-06, "loss": 0.4096, "mean_token_accuracy": 0.87277193069458, "num_input_tokens_seen": 2744846, "num_tokens": 2744846.0, "step": 3510, "train_runtime": 991.5939, "train_tokens_per_second": 2768.115 }, { "epoch": 0.703, "grad_norm": 8.9375, "learning_rate": 4.2857744489409725e-06, "loss": 0.5364, "mean_token_accuracy": 0.8391360402107239, "num_input_tokens_seen": 2748277, "num_tokens": 2748277.0, "step": 3515, "train_runtime": 992.9488, "train_tokens_per_second": 2767.793 }, { "epoch": 0.704, "grad_norm": 11.3125, "learning_rate": 4.259225425502256e-06, "loss": 0.5347, "mean_token_accuracy": 0.8366138219833374, "num_input_tokens_seen": 2752030, "num_tokens": 2752030.0, "step": 3520, "train_runtime": 994.2751, "train_tokens_per_second": 2767.876 }, { "epoch": 0.705, "grad_norm": 7.78125, "learning_rate": 4.2327366200723404e-06, "loss": 0.2262, "mean_token_accuracy": 0.9387398838996888, "num_input_tokens_seen": 2755937, "num_tokens": 2755937.0, "step": 3525, "train_runtime": 995.628, "train_tokens_per_second": 2768.039 }, { "epoch": 0.706, "grad_norm": 7.96875, "learning_rate": 4.206308310506255e-06, "loss": 0.4264, "mean_token_accuracy": 0.8746701240539551, "num_input_tokens_seen": 2760456, "num_tokens": 2760456.0, "step": 3530, "train_runtime": 997.0265, "train_tokens_per_second": 2768.689 }, { "epoch": 0.707, "grad_norm": 6.0625, "learning_rate": 4.179940774024469e-06, "loss": 0.4686, "mean_token_accuracy": 0.8607106566429138, "num_input_tokens_seen": 2763679, "num_tokens": 2763679.0, "step": 3535, "train_runtime": 998.4436, "train_tokens_per_second": 2767.987 }, { "epoch": 0.708, "grad_norm": 20.75, "learning_rate": 4.153634287209955e-06, "loss": 0.4304, "mean_token_accuracy": 0.8699228286743164, "num_input_tokens_seen": 2767006, "num_tokens": 2767006.0, "step": 3540, "train_runtime": 999.7717, "train_tokens_per_second": 2767.638 }, { "epoch": 0.709, "grad_norm": 2.359375, "learning_rate": 4.127389126005319e-06, "loss": 0.1548, "mean_token_accuracy": 0.9474752426147461, "num_input_tokens_seen": 2774985, "num_tokens": 2774985.0, "step": 3545, "train_runtime": 1001.4951, "train_tokens_per_second": 2770.842 }, { "epoch": 0.71, "grad_norm": 3.59375, "learning_rate": 4.101205565709876e-06, "loss": 0.2025, "mean_token_accuracy": 0.9248441815376282, "num_input_tokens_seen": 2778724, "num_tokens": 2778724.0, "step": 3550, "train_runtime": 1002.8838, "train_tokens_per_second": 2770.734 }, { "epoch": 0.711, "grad_norm": 5.84375, "learning_rate": 4.0750838809767875e-06, "loss": 0.2078, "mean_token_accuracy": 0.9335240125656128, "num_input_tokens_seen": 2784257, "num_tokens": 2784257.0, "step": 3555, "train_runtime": 1004.3557, "train_tokens_per_second": 2772.182 }, { "epoch": 0.712, "grad_norm": 5.03125, "learning_rate": 4.049024345810169e-06, "loss": 0.2667, "mean_token_accuracy": 0.9217629790306091, "num_input_tokens_seen": 2789116, "num_tokens": 2789116.0, "step": 3560, "train_runtime": 1005.7811, "train_tokens_per_second": 2773.085 }, { "epoch": 0.713, "grad_norm": 3.0625, "learning_rate": 4.0230272335622065e-06, "loss": 0.4358, "mean_token_accuracy": 0.8588399767875672, "num_input_tokens_seen": 2792473, "num_tokens": 2792473.0, "step": 3565, "train_runtime": 1007.0787, "train_tokens_per_second": 2772.845 }, { "epoch": 0.714, "grad_norm": 2.421875, "learning_rate": 3.997092816930313e-06, "loss": 0.1391, "mean_token_accuracy": 0.9536702156066894, "num_input_tokens_seen": 2796638, "num_tokens": 2796638.0, "step": 3570, "train_runtime": 1008.4241, "train_tokens_per_second": 2773.276 }, { "epoch": 0.715, "grad_norm": 5.0, "learning_rate": 3.971221367954239e-06, "loss": 0.3249, "mean_token_accuracy": 0.8901459455490113, "num_input_tokens_seen": 2801279, "num_tokens": 2801279.0, "step": 3575, "train_runtime": 1009.8029, "train_tokens_per_second": 2774.085 }, { "epoch": 0.716, "grad_norm": 5.15625, "learning_rate": 3.945413158013249e-06, "loss": 0.2546, "mean_token_accuracy": 0.919957947731018, "num_input_tokens_seen": 2805886, "num_tokens": 2805886.0, "step": 3580, "train_runtime": 1011.1918, "train_tokens_per_second": 2774.831 }, { "epoch": 0.717, "grad_norm": 4.8125, "learning_rate": 3.919668457823248e-06, "loss": 0.3787, "mean_token_accuracy": 0.8772375106811523, "num_input_tokens_seen": 2808936, "num_tokens": 2808936.0, "step": 3585, "train_runtime": 1012.5885, "train_tokens_per_second": 2774.015 }, { "epoch": 0.718, "grad_norm": 4.5625, "learning_rate": 3.893987537433961e-06, "loss": 0.343, "mean_token_accuracy": 0.8920956134796143, "num_input_tokens_seen": 2812328, "num_tokens": 2812328.0, "step": 3590, "train_runtime": 1013.9535, "train_tokens_per_second": 2773.626 }, { "epoch": 0.719, "grad_norm": 10.375, "learning_rate": 3.8683706662260945e-06, "loss": 0.4493, "mean_token_accuracy": 0.8601218342781067, "num_input_tokens_seen": 2815210, "num_tokens": 2815210.0, "step": 3595, "train_runtime": 1015.2715, "train_tokens_per_second": 2772.864 }, { "epoch": 0.72, "grad_norm": 8.75, "learning_rate": 3.842818112908498e-06, "loss": 0.5157, "mean_token_accuracy": 0.8321190714836121, "num_input_tokens_seen": 2818306, "num_tokens": 2818306.0, "step": 3600, "train_runtime": 1016.9556, "train_tokens_per_second": 2771.317 }, { "epoch": 0.721, "grad_norm": 3.359375, "learning_rate": 3.817330145515374e-06, "loss": 0.2531, "mean_token_accuracy": 0.9102673768997193, "num_input_tokens_seen": 2821372, "num_tokens": 2821372.0, "step": 3605, "train_runtime": 1018.2573, "train_tokens_per_second": 2770.785 }, { "epoch": 0.722, "grad_norm": 7.4375, "learning_rate": 3.79190703140343e-06, "loss": 0.4277, "mean_token_accuracy": 0.8665948748588562, "num_input_tokens_seen": 2823818, "num_tokens": 2823818.0, "step": 3610, "train_runtime": 1019.5831, "train_tokens_per_second": 2769.581 }, { "epoch": 0.723, "grad_norm": 2.9375, "learning_rate": 3.766549037249112e-06, "loss": 0.2295, "mean_token_accuracy": 0.9277599573135376, "num_input_tokens_seen": 2827065, "num_tokens": 2827065.0, "step": 3615, "train_runtime": 1020.9293, "train_tokens_per_second": 2769.11 }, { "epoch": 0.724, "grad_norm": 4.15625, "learning_rate": 3.741256429045771e-06, "loss": 0.3421, "mean_token_accuracy": 0.879614782333374, "num_input_tokens_seen": 2832334, "num_tokens": 2832334.0, "step": 3620, "train_runtime": 1022.3385, "train_tokens_per_second": 2770.446 }, { "epoch": 0.725, "grad_norm": 4.125, "learning_rate": 3.7160294721009026e-06, "loss": 0.4141, "mean_token_accuracy": 0.8620014786720276, "num_input_tokens_seen": 2836425, "num_tokens": 2836425.0, "step": 3625, "train_runtime": 1023.7178, "train_tokens_per_second": 2770.71 }, { "epoch": 0.726, "grad_norm": 2.5625, "learning_rate": 3.690868431033352e-06, "loss": 0.2136, "mean_token_accuracy": 0.9258363723754883, "num_input_tokens_seen": 2840516, "num_tokens": 2840516.0, "step": 3630, "train_runtime": 1025.0671, "train_tokens_per_second": 2771.054 }, { "epoch": 0.727, "grad_norm": 5.09375, "learning_rate": 3.6657735697705267e-06, "loss": 0.264, "mean_token_accuracy": 0.9151167035102844, "num_input_tokens_seen": 2845294, "num_tokens": 2845294.0, "step": 3635, "train_runtime": 1026.5403, "train_tokens_per_second": 2771.731 }, { "epoch": 0.728, "grad_norm": 15.8125, "learning_rate": 3.6407451515456537e-06, "loss": 0.4253, "mean_token_accuracy": 0.8653265714645386, "num_input_tokens_seen": 2848748, "num_tokens": 2848748.0, "step": 3640, "train_runtime": 1027.844, "train_tokens_per_second": 2771.576 }, { "epoch": 0.729, "grad_norm": 4.65625, "learning_rate": 3.6157834388949907e-06, "loss": 0.2766, "mean_token_accuracy": 0.902829909324646, "num_input_tokens_seen": 2852421, "num_tokens": 2852421.0, "step": 3645, "train_runtime": 1029.2336, "train_tokens_per_second": 2771.403 }, { "epoch": 0.73, "grad_norm": 7.59375, "learning_rate": 3.5908886936550967e-06, "loss": 0.1788, "mean_token_accuracy": 0.9357803940773011, "num_input_tokens_seen": 2855998, "num_tokens": 2855998.0, "step": 3650, "train_runtime": 1030.5568, "train_tokens_per_second": 2771.316 }, { "epoch": 0.731, "grad_norm": 11.125, "learning_rate": 3.5660611769600604e-06, "loss": 0.273, "mean_token_accuracy": 0.9125803232192993, "num_input_tokens_seen": 2860364, "num_tokens": 2860364.0, "step": 3655, "train_runtime": 1031.9322, "train_tokens_per_second": 2771.853 }, { "epoch": 0.732, "grad_norm": 22.25, "learning_rate": 3.541301149238798e-06, "loss": 0.5187, "mean_token_accuracy": 0.8467337250709533, "num_input_tokens_seen": 2864068, "num_tokens": 2864068.0, "step": 3660, "train_runtime": 1033.2731, "train_tokens_per_second": 2771.84 }, { "epoch": 0.733, "grad_norm": 4.8125, "learning_rate": 3.5166088702122738e-06, "loss": 0.2162, "mean_token_accuracy": 0.9303208470344544, "num_input_tokens_seen": 2867153, "num_tokens": 2867153.0, "step": 3665, "train_runtime": 1034.7363, "train_tokens_per_second": 2770.902 }, { "epoch": 0.734, "grad_norm": 7.8125, "learning_rate": 3.491984598890812e-06, "loss": 0.3071, "mean_token_accuracy": 0.9058497548103333, "num_input_tokens_seen": 2872084, "num_tokens": 2872084.0, "step": 3670, "train_runtime": 1036.1894, "train_tokens_per_second": 2771.775 }, { "epoch": 0.735, "grad_norm": 3.578125, "learning_rate": 3.4674285935713715e-06, "loss": 0.3966, "mean_token_accuracy": 0.8840309977531433, "num_input_tokens_seen": 2876557, "num_tokens": 2876557.0, "step": 3675, "train_runtime": 1037.7001, "train_tokens_per_second": 2772.05 }, { "epoch": 0.736, "grad_norm": 5.90625, "learning_rate": 3.442941111834822e-06, "loss": 0.3389, "mean_token_accuracy": 0.8837473750114441, "num_input_tokens_seen": 2880272, "num_tokens": 2880272.0, "step": 3680, "train_runtime": 1039.0062, "train_tokens_per_second": 2772.141 }, { "epoch": 0.737, "grad_norm": 9.375, "learning_rate": 3.418522410543266e-06, "loss": 0.2667, "mean_token_accuracy": 0.9165398716926575, "num_input_tokens_seen": 2884856, "num_tokens": 2884856.0, "step": 3685, "train_runtime": 1040.3972, "train_tokens_per_second": 2772.841 }, { "epoch": 0.738, "grad_norm": 7.375, "learning_rate": 3.3941727458373177e-06, "loss": 0.4182, "mean_token_accuracy": 0.8687753677368164, "num_input_tokens_seen": 2887162, "num_tokens": 2887162.0, "step": 3690, "train_runtime": 1041.7116, "train_tokens_per_second": 2771.556 }, { "epoch": 0.739, "grad_norm": 11.3125, "learning_rate": 3.3698923731334453e-06, "loss": 0.4597, "mean_token_accuracy": 0.8570756077766418, "num_input_tokens_seen": 2890708, "num_tokens": 2890708.0, "step": 3695, "train_runtime": 1043.078, "train_tokens_per_second": 2771.325 }, { "epoch": 0.74, "grad_norm": 7.125, "learning_rate": 3.3456815471212634e-06, "loss": 0.3472, "mean_token_accuracy": 0.8940904617309571, "num_input_tokens_seen": 2893300, "num_tokens": 2893300.0, "step": 3700, "train_runtime": 1044.3782, "train_tokens_per_second": 2770.357 }, { "epoch": 0.741, "grad_norm": 5.25, "learning_rate": 3.321540521760883e-06, "loss": 0.3555, "mean_token_accuracy": 0.8916504502296447, "num_input_tokens_seen": 2897537, "num_tokens": 2897537.0, "step": 3705, "train_runtime": 1046.1147, "train_tokens_per_second": 2769.808 }, { "epoch": 0.742, "grad_norm": 4.125, "learning_rate": 3.297469550280239e-06, "loss": 0.2841, "mean_token_accuracy": 0.8998814940452575, "num_input_tokens_seen": 2901730, "num_tokens": 2901730.0, "step": 3710, "train_runtime": 1047.4641, "train_tokens_per_second": 2770.243 }, { "epoch": 0.743, "grad_norm": 6.65625, "learning_rate": 3.2734688851724273e-06, "loss": 0.5782, "mean_token_accuracy": 0.8412968397140503, "num_input_tokens_seen": 2905951, "num_tokens": 2905951.0, "step": 3715, "train_runtime": 1048.8383, "train_tokens_per_second": 2770.638 }, { "epoch": 0.744, "grad_norm": 12.5625, "learning_rate": 3.249538778193074e-06, "loss": 0.3793, "mean_token_accuracy": 0.8889817953109741, "num_input_tokens_seen": 2909616, "num_tokens": 2909616.0, "step": 3720, "train_runtime": 1050.2841, "train_tokens_per_second": 2770.313 }, { "epoch": 0.745, "grad_norm": 4.25, "learning_rate": 3.2256794803576707e-06, "loss": 0.5671, "mean_token_accuracy": 0.8283018827438354, "num_input_tokens_seen": 2914386, "num_tokens": 2914386.0, "step": 3725, "train_runtime": 1051.7587, "train_tokens_per_second": 2770.964 }, { "epoch": 0.746, "grad_norm": 3.859375, "learning_rate": 3.201891241938969e-06, "loss": 0.2352, "mean_token_accuracy": 0.9228219985961914, "num_input_tokens_seen": 2919166, "num_tokens": 2919166.0, "step": 3730, "train_runtime": 1053.1772, "train_tokens_per_second": 2771.771 }, { "epoch": 0.747, "grad_norm": 8.5625, "learning_rate": 3.178174312464326e-06, "loss": 0.3731, "mean_token_accuracy": 0.8829388618469238, "num_input_tokens_seen": 2922949, "num_tokens": 2922949.0, "step": 3735, "train_runtime": 1054.5427, "train_tokens_per_second": 2771.769 }, { "epoch": 0.748, "grad_norm": 2.265625, "learning_rate": 3.1545289407131128e-06, "loss": 0.1556, "mean_token_accuracy": 0.9552117943763733, "num_input_tokens_seen": 2928360, "num_tokens": 2928360.0, "step": 3740, "train_runtime": 1056.0783, "train_tokens_per_second": 2772.863 }, { "epoch": 0.749, "grad_norm": 4.5625, "learning_rate": 3.130955374714094e-06, "loss": 0.4207, "mean_token_accuracy": 0.874616289138794, "num_input_tokens_seen": 2932436, "num_tokens": 2932436.0, "step": 3745, "train_runtime": 1057.5165, "train_tokens_per_second": 2772.946 }, { "epoch": 0.75, "grad_norm": 4.84375, "learning_rate": 3.107453861742815e-06, "loss": 0.3466, "mean_token_accuracy": 0.8758419036865235, "num_input_tokens_seen": 2937186, "num_tokens": 2937186.0, "step": 3750, "train_runtime": 1058.9285, "train_tokens_per_second": 2773.734 }, { "epoch": 0.751, "grad_norm": 5.75, "learning_rate": 3.0840246483190338e-06, "loss": 0.3401, "mean_token_accuracy": 0.8849533319473266, "num_input_tokens_seen": 2940894, "num_tokens": 2940894.0, "step": 3755, "train_runtime": 1060.2885, "train_tokens_per_second": 2773.673 }, { "epoch": 0.752, "grad_norm": 3.59375, "learning_rate": 3.060667980204104e-06, "loss": 0.2244, "mean_token_accuracy": 0.9195482492446899, "num_input_tokens_seen": 2944292, "num_tokens": 2944292.0, "step": 3760, "train_runtime": 1061.6077, "train_tokens_per_second": 2773.428 }, { "epoch": 0.753, "grad_norm": 7.5625, "learning_rate": 3.037384102398431e-06, "loss": 0.3041, "mean_token_accuracy": 0.9016895890235901, "num_input_tokens_seen": 2948853, "num_tokens": 2948853.0, "step": 3765, "train_runtime": 1063.0207, "train_tokens_per_second": 2774.032 }, { "epoch": 0.754, "grad_norm": 12.6875, "learning_rate": 3.014173259138867e-06, "loss": 0.4997, "mean_token_accuracy": 0.8601667523384094, "num_input_tokens_seen": 2953428, "num_tokens": 2953428.0, "step": 3770, "train_runtime": 1064.4018, "train_tokens_per_second": 2774.73 }, { "epoch": 0.755, "grad_norm": 4.46875, "learning_rate": 2.9910356938961782e-06, "loss": 0.2623, "mean_token_accuracy": 0.9154385805130005, "num_input_tokens_seen": 2957552, "num_tokens": 2957552.0, "step": 3775, "train_runtime": 1065.7976, "train_tokens_per_second": 2774.966 }, { "epoch": 0.756, "grad_norm": 4.75, "learning_rate": 2.9679716493724795e-06, "loss": 0.4466, "mean_token_accuracy": 0.860525107383728, "num_input_tokens_seen": 2961048, "num_tokens": 2961048.0, "step": 3780, "train_runtime": 1067.0795, "train_tokens_per_second": 2774.908 }, { "epoch": 0.757, "grad_norm": 4.3125, "learning_rate": 2.944981367498677e-06, "loss": 0.3373, "mean_token_accuracy": 0.8950571656227112, "num_input_tokens_seen": 2964790, "num_tokens": 2964790.0, "step": 3785, "train_runtime": 1068.4651, "train_tokens_per_second": 2774.812 }, { "epoch": 0.758, "grad_norm": 10.5, "learning_rate": 2.9220650894319557e-06, "loss": 0.3829, "mean_token_accuracy": 0.8816712498664856, "num_input_tokens_seen": 2969796, "num_tokens": 2969796.0, "step": 3790, "train_runtime": 1069.9619, "train_tokens_per_second": 2775.609 }, { "epoch": 0.759, "grad_norm": 4.1875, "learning_rate": 2.899223055553221e-06, "loss": 0.2664, "mean_token_accuracy": 0.9140034437179565, "num_input_tokens_seen": 2975932, "num_tokens": 2975932.0, "step": 3795, "train_runtime": 1071.4617, "train_tokens_per_second": 2777.451 }, { "epoch": 0.76, "grad_norm": 2.21875, "learning_rate": 2.8764555054646083e-06, "loss": 0.1013, "mean_token_accuracy": 0.9654404520988464, "num_input_tokens_seen": 2980516, "num_tokens": 2980516.0, "step": 3800, "train_runtime": 1072.9051, "train_tokens_per_second": 2777.987 }, { "epoch": 0.761, "grad_norm": 2.859375, "learning_rate": 2.853762677986932e-06, "loss": 0.2939, "mean_token_accuracy": 0.9137189149856567, "num_input_tokens_seen": 2984786, "num_tokens": 2984786.0, "step": 3805, "train_runtime": 1074.3717, "train_tokens_per_second": 2778.169 }, { "epoch": 0.762, "grad_norm": 3.765625, "learning_rate": 2.8311448111572304e-06, "loss": 0.4282, "mean_token_accuracy": 0.8652047395706177, "num_input_tokens_seen": 2989174, "num_tokens": 2989174.0, "step": 3810, "train_runtime": 1077.5519, "train_tokens_per_second": 2774.042 }, { "epoch": 0.763, "grad_norm": 3.765625, "learning_rate": 2.808602142226212e-06, "loss": 0.3479, "mean_token_accuracy": 0.8864624619483947, "num_input_tokens_seen": 2993244, "num_tokens": 2993244.0, "step": 3815, "train_runtime": 1078.9017, "train_tokens_per_second": 2774.344 }, { "epoch": 0.764, "grad_norm": 2.796875, "learning_rate": 2.786134907655814e-06, "loss": 0.19, "mean_token_accuracy": 0.9388697266578674, "num_input_tokens_seen": 2996464, "num_tokens": 2996464.0, "step": 3820, "train_runtime": 1080.2027, "train_tokens_per_second": 2773.983 }, { "epoch": 0.765, "grad_norm": 7.0625, "learning_rate": 2.7637433431166903e-06, "loss": 0.3602, "mean_token_accuracy": 0.8952722072601318, "num_input_tokens_seen": 3000014, "num_tokens": 3000014.0, "step": 3825, "train_runtime": 1081.6036, "train_tokens_per_second": 2773.672 }, { "epoch": 0.766, "grad_norm": 8.3125, "learning_rate": 2.741427683485759e-06, "loss": 0.3474, "mean_token_accuracy": 0.8827329277992249, "num_input_tokens_seen": 3002864, "num_tokens": 3002864.0, "step": 3830, "train_runtime": 1084.0228, "train_tokens_per_second": 2770.112 }, { "epoch": 0.767, "grad_norm": 8.25, "learning_rate": 2.7191881628437335e-06, "loss": 0.3934, "mean_token_accuracy": 0.88448246717453, "num_input_tokens_seen": 3006467, "num_tokens": 3006467.0, "step": 3835, "train_runtime": 1085.3565, "train_tokens_per_second": 2770.027 }, { "epoch": 0.768, "grad_norm": 7.6875, "learning_rate": 2.6970250144726563e-06, "loss": 0.3684, "mean_token_accuracy": 0.8838754773139954, "num_input_tokens_seen": 3009130, "num_tokens": 3009130.0, "step": 3840, "train_runtime": 1086.6475, "train_tokens_per_second": 2769.187 }, { "epoch": 0.769, "grad_norm": 9.0, "learning_rate": 2.674938470853472e-06, "loss": 0.4809, "mean_token_accuracy": 0.847998857498169, "num_input_tokens_seen": 3012880, "num_tokens": 3012880.0, "step": 3845, "train_runtime": 1088.0341, "train_tokens_per_second": 2769.105 }, { "epoch": 0.77, "grad_norm": 7.28125, "learning_rate": 2.652928763663567e-06, "loss": 0.5472, "mean_token_accuracy": 0.8193679928779602, "num_input_tokens_seen": 3015978, "num_tokens": 3015978.0, "step": 3850, "train_runtime": 1089.3575, "train_tokens_per_second": 2768.584 }, { "epoch": 0.771, "grad_norm": 6.46875, "learning_rate": 2.6309961237743587e-06, "loss": 0.2156, "mean_token_accuracy": 0.9317720651626586, "num_input_tokens_seen": 3019950, "num_tokens": 3019950.0, "step": 3855, "train_runtime": 1090.7808, "train_tokens_per_second": 2768.613 }, { "epoch": 0.772, "grad_norm": 3.546875, "learning_rate": 2.6091407812488567e-06, "loss": 0.3119, "mean_token_accuracy": 0.900924003124237, "num_input_tokens_seen": 3024094, "num_tokens": 3024094.0, "step": 3860, "train_runtime": 1092.1358, "train_tokens_per_second": 2768.973 }, { "epoch": 0.773, "grad_norm": 4.09375, "learning_rate": 2.5873629653392653e-06, "loss": 0.3023, "mean_token_accuracy": 0.9061660766601562, "num_input_tokens_seen": 3027760, "num_tokens": 3027760.0, "step": 3865, "train_runtime": 1093.5226, "train_tokens_per_second": 2768.813 }, { "epoch": 0.774, "grad_norm": 4.6875, "learning_rate": 2.5656629044845714e-06, "loss": 0.2418, "mean_token_accuracy": 0.9165300607681275, "num_input_tokens_seen": 3031828, "num_tokens": 3031828.0, "step": 3870, "train_runtime": 1095.3077, "train_tokens_per_second": 2768.015 }, { "epoch": 0.775, "grad_norm": 7.8125, "learning_rate": 2.5440408263081385e-06, "loss": 0.2437, "mean_token_accuracy": 0.9189160346984864, "num_input_tokens_seen": 3035197, "num_tokens": 3035197.0, "step": 3875, "train_runtime": 1096.6984, "train_tokens_per_second": 2767.577 }, { "epoch": 0.776, "grad_norm": 3.828125, "learning_rate": 2.5224969576153413e-06, "loss": 0.217, "mean_token_accuracy": 0.921456265449524, "num_input_tokens_seen": 3037548, "num_tokens": 3037548.0, "step": 3880, "train_runtime": 1098.0213, "train_tokens_per_second": 2766.383 }, { "epoch": 0.777, "grad_norm": 5.28125, "learning_rate": 2.501031524391163e-06, "loss": 0.4014, "mean_token_accuracy": 0.880898654460907, "num_input_tokens_seen": 3041844, "num_tokens": 3041844.0, "step": 3885, "train_runtime": 1099.3834, "train_tokens_per_second": 2766.864 }, { "epoch": 0.778, "grad_norm": 3.578125, "learning_rate": 2.479644751797845e-06, "loss": 0.2313, "mean_token_accuracy": 0.9285088419914246, "num_input_tokens_seen": 3047476, "num_tokens": 3047476.0, "step": 3890, "train_runtime": 1100.9235, "train_tokens_per_second": 2768.109 }, { "epoch": 0.779, "grad_norm": 6.15625, "learning_rate": 2.458336864172508e-06, "loss": 0.1587, "mean_token_accuracy": 0.9574881196022034, "num_input_tokens_seen": 3051063, "num_tokens": 3051063.0, "step": 3895, "train_runtime": 1102.327, "train_tokens_per_second": 2767.838 }, { "epoch": 0.78, "grad_norm": 3.828125, "learning_rate": 2.437108085024812e-06, "loss": 0.3234, "mean_token_accuracy": 0.8818744421005249, "num_input_tokens_seen": 3054292, "num_tokens": 3054292.0, "step": 3900, "train_runtime": 1103.634, "train_tokens_per_second": 2767.486 }, { "epoch": 0.781, "grad_norm": 6.3125, "learning_rate": 2.415958637034609e-06, "loss": 0.3635, "mean_token_accuracy": 0.8875663638114929, "num_input_tokens_seen": 3056591, "num_tokens": 3056591.0, "step": 3905, "train_runtime": 1104.9296, "train_tokens_per_second": 2766.322 }, { "epoch": 0.782, "grad_norm": 3.671875, "learning_rate": 2.3948887420495927e-06, "loss": 0.3729, "mean_token_accuracy": 0.8743338823318482, "num_input_tokens_seen": 3060838, "num_tokens": 3060838.0, "step": 3910, "train_runtime": 1106.3732, "train_tokens_per_second": 2766.551 }, { "epoch": 0.783, "grad_norm": 5.5625, "learning_rate": 2.3738986210829997e-06, "loss": 0.327, "mean_token_accuracy": 0.8890520572662354, "num_input_tokens_seen": 3064247, "num_tokens": 3064247.0, "step": 3915, "train_runtime": 1107.7318, "train_tokens_per_second": 2766.236 }, { "epoch": 0.784, "grad_norm": 9.5625, "learning_rate": 2.352988494311259e-06, "loss": 0.4669, "mean_token_accuracy": 0.8701505184173584, "num_input_tokens_seen": 3067434, "num_tokens": 3067434.0, "step": 3920, "train_runtime": 1109.13, "train_tokens_per_second": 2765.622 }, { "epoch": 0.785, "grad_norm": 7.78125, "learning_rate": 2.332158581071712e-06, "loss": 0.4112, "mean_token_accuracy": 0.8707880496978759, "num_input_tokens_seen": 3069581, "num_tokens": 3069581.0, "step": 3925, "train_runtime": 1110.427, "train_tokens_per_second": 2764.325 }, { "epoch": 0.786, "grad_norm": 7.1875, "learning_rate": 2.311409099860288e-06, "loss": 0.3337, "mean_token_accuracy": 0.8972350358963013, "num_input_tokens_seen": 3072086, "num_tokens": 3072086.0, "step": 3930, "train_runtime": 1111.7472, "train_tokens_per_second": 2763.295 }, { "epoch": 0.787, "grad_norm": 2.671875, "learning_rate": 2.2907402683292268e-06, "loss": 0.3882, "mean_token_accuracy": 0.8892125844955444, "num_input_tokens_seen": 3075338, "num_tokens": 3075338.0, "step": 3935, "train_runtime": 1113.0592, "train_tokens_per_second": 2762.96 }, { "epoch": 0.788, "grad_norm": 3.078125, "learning_rate": 2.270152303284795e-06, "loss": 0.3258, "mean_token_accuracy": 0.8842681050300598, "num_input_tokens_seen": 3078346, "num_tokens": 3078346.0, "step": 3940, "train_runtime": 1114.3647, "train_tokens_per_second": 2762.422 }, { "epoch": 0.789, "grad_norm": 3.796875, "learning_rate": 2.249645420684998e-06, "loss": 0.3456, "mean_token_accuracy": 0.8920680284500122, "num_input_tokens_seen": 3081824, "num_tokens": 3081824.0, "step": 3945, "train_runtime": 1115.7008, "train_tokens_per_second": 2762.232 }, { "epoch": 0.79, "grad_norm": 12.1875, "learning_rate": 2.2292198356373362e-06, "loss": 0.3551, "mean_token_accuracy": 0.8851681232452393, "num_input_tokens_seen": 3085328, "num_tokens": 3085328.0, "step": 3950, "train_runtime": 1117.0756, "train_tokens_per_second": 2761.969 }, { "epoch": 0.791, "grad_norm": 9.0, "learning_rate": 2.2088757623965263e-06, "loss": 0.5468, "mean_token_accuracy": 0.8371020436286927, "num_input_tokens_seen": 3089037, "num_tokens": 3089037.0, "step": 3955, "train_runtime": 1118.4156, "train_tokens_per_second": 2761.976 }, { "epoch": 0.792, "grad_norm": 3.40625, "learning_rate": 2.188613414362273e-06, "loss": 0.3415, "mean_token_accuracy": 0.8996511101722717, "num_input_tokens_seen": 3092730, "num_tokens": 3092730.0, "step": 3960, "train_runtime": 1119.758, "train_tokens_per_second": 2761.963 }, { "epoch": 0.793, "grad_norm": 2.65625, "learning_rate": 2.1684330040770183e-06, "loss": 0.2071, "mean_token_accuracy": 0.9298021793365479, "num_input_tokens_seen": 3097905, "num_tokens": 3097905.0, "step": 3965, "train_runtime": 1121.2528, "train_tokens_per_second": 2762.896 }, { "epoch": 0.794, "grad_norm": 2.546875, "learning_rate": 2.148334743223719e-06, "loss": 0.1894, "mean_token_accuracy": 0.9390358924865723, "num_input_tokens_seen": 3104334, "num_tokens": 3104334.0, "step": 3970, "train_runtime": 1122.8398, "train_tokens_per_second": 2764.717 }, { "epoch": 0.795, "grad_norm": 14.625, "learning_rate": 2.128318842623618e-06, "loss": 0.5697, "mean_token_accuracy": 0.837884783744812, "num_input_tokens_seen": 3108412, "num_tokens": 3108412.0, "step": 3975, "train_runtime": 1124.2824, "train_tokens_per_second": 2764.796 }, { "epoch": 0.796, "grad_norm": 4.46875, "learning_rate": 2.108385512234041e-06, "loss": 0.3593, "mean_token_accuracy": 0.888742995262146, "num_input_tokens_seen": 3111724, "num_tokens": 3111724.0, "step": 3980, "train_runtime": 1125.6149, "train_tokens_per_second": 2764.466 }, { "epoch": 0.797, "grad_norm": 3.375, "learning_rate": 2.088534961146197e-06, "loss": 0.3629, "mean_token_accuracy": 0.8773077130317688, "num_input_tokens_seen": 3116239, "num_tokens": 3116239.0, "step": 3985, "train_runtime": 1127.0657, "train_tokens_per_second": 2764.913 }, { "epoch": 0.798, "grad_norm": 4.03125, "learning_rate": 2.068767397582967e-06, "loss": 0.3494, "mean_token_accuracy": 0.9012076377868652, "num_input_tokens_seen": 3119224, "num_tokens": 3119224.0, "step": 3990, "train_runtime": 1128.3487, "train_tokens_per_second": 2764.415 }, { "epoch": 0.799, "grad_norm": 7.875, "learning_rate": 2.0490830288967443e-06, "loss": 0.384, "mean_token_accuracy": 0.8745863795280456, "num_input_tokens_seen": 3121855, "num_tokens": 3121855.0, "step": 3995, "train_runtime": 1129.6412, "train_tokens_per_second": 2763.581 }, { "epoch": 0.8, "grad_norm": 7.6875, "learning_rate": 2.029482061567237e-06, "loss": 0.5797, "mean_token_accuracy": 0.8229709982872009, "num_input_tokens_seen": 3124448, "num_tokens": 3124448.0, "step": 4000, "train_runtime": 1130.9352, "train_tokens_per_second": 2762.712 }, { "epoch": 0.801, "grad_norm": 9.125, "learning_rate": 2.0099647011993217e-06, "loss": 0.2546, "mean_token_accuracy": 0.9176838994026184, "num_input_tokens_seen": 3128270, "num_tokens": 3128270.0, "step": 4005, "train_runtime": 1132.3032, "train_tokens_per_second": 2762.749 }, { "epoch": 0.802, "grad_norm": 4.40625, "learning_rate": 1.990531152520869e-06, "loss": 0.28, "mean_token_accuracy": 0.9157652497291565, "num_input_tokens_seen": 3130864, "num_tokens": 3130864.0, "step": 4010, "train_runtime": 1133.5634, "train_tokens_per_second": 2761.966 }, { "epoch": 0.803, "grad_norm": 5.25, "learning_rate": 1.971181619380611e-06, "loss": 0.5143, "mean_token_accuracy": 0.851870846748352, "num_input_tokens_seen": 3134766, "num_tokens": 3134766.0, "step": 4015, "train_runtime": 1134.9203, "train_tokens_per_second": 2762.102 }, { "epoch": 0.804, "grad_norm": 4.21875, "learning_rate": 1.9519163047459978e-06, "loss": 0.3387, "mean_token_accuracy": 0.8888375878334045, "num_input_tokens_seen": 3137808, "num_tokens": 3137808.0, "step": 4020, "train_runtime": 1136.1886, "train_tokens_per_second": 2761.697 }, { "epoch": 0.805, "grad_norm": 7.46875, "learning_rate": 1.9327354107010566e-06, "loss": 0.2575, "mean_token_accuracy": 0.9188655018806458, "num_input_tokens_seen": 3142084, "num_tokens": 3142084.0, "step": 4025, "train_runtime": 1137.5543, "train_tokens_per_second": 2762.14 }, { "epoch": 0.806, "grad_norm": 4.53125, "learning_rate": 1.9136391384442964e-06, "loss": 0.1877, "mean_token_accuracy": 0.9391518592834472, "num_input_tokens_seen": 3144600, "num_tokens": 3144600.0, "step": 4030, "train_runtime": 1138.8505, "train_tokens_per_second": 2761.205 }, { "epoch": 0.807, "grad_norm": 4.625, "learning_rate": 1.894627688286571e-06, "loss": 0.507, "mean_token_accuracy": 0.8527899622917176, "num_input_tokens_seen": 3146870, "num_tokens": 3146870.0, "step": 4035, "train_runtime": 1140.1748, "train_tokens_per_second": 2759.989 }, { "epoch": 0.808, "grad_norm": 4.34375, "learning_rate": 1.875701259649002e-06, "loss": 0.2724, "mean_token_accuracy": 0.9080185890197754, "num_input_tokens_seen": 3149962, "num_tokens": 3149962.0, "step": 4040, "train_runtime": 1141.4721, "train_tokens_per_second": 2759.561 }, { "epoch": 0.809, "grad_norm": 9.6875, "learning_rate": 1.8568600510608659e-06, "loss": 0.3126, "mean_token_accuracy": 0.9075241446495056, "num_input_tokens_seen": 3154296, "num_tokens": 3154296.0, "step": 4045, "train_runtime": 1142.8476, "train_tokens_per_second": 2760.032 }, { "epoch": 0.81, "grad_norm": 6.34375, "learning_rate": 1.838104260157525e-06, "loss": 0.4273, "mean_token_accuracy": 0.8794667243957519, "num_input_tokens_seen": 3158310, "num_tokens": 3158310.0, "step": 4050, "train_runtime": 1144.2708, "train_tokens_per_second": 2760.107 }, { "epoch": 0.811, "grad_norm": 9.125, "learning_rate": 1.8194340836783565e-06, "loss": 0.1899, "mean_token_accuracy": 0.9318296313285828, "num_input_tokens_seen": 3160962, "num_tokens": 3160962.0, "step": 4055, "train_runtime": 1145.6072, "train_tokens_per_second": 2759.202 }, { "epoch": 0.812, "grad_norm": 2.65625, "learning_rate": 1.80084971746467e-06, "loss": 0.3635, "mean_token_accuracy": 0.8902220487594604, "num_input_tokens_seen": 3164042, "num_tokens": 3164042.0, "step": 4060, "train_runtime": 1146.8753, "train_tokens_per_second": 2758.837 }, { "epoch": 0.813, "grad_norm": 4.5625, "learning_rate": 1.7823513564576788e-06, "loss": 0.3988, "mean_token_accuracy": 0.8752343654632568, "num_input_tokens_seen": 3168245, "num_tokens": 3168245.0, "step": 4065, "train_runtime": 1148.2384, "train_tokens_per_second": 2759.222 }, { "epoch": 0.814, "grad_norm": 4.1875, "learning_rate": 1.7639391946964312e-06, "loss": 0.2284, "mean_token_accuracy": 0.9211568593978882, "num_input_tokens_seen": 3173158, "num_tokens": 3173158.0, "step": 4070, "train_runtime": 1149.675, "train_tokens_per_second": 2760.048 }, { "epoch": 0.815, "grad_norm": 28.0, "learning_rate": 1.7456134253157976e-06, "loss": 0.4756, "mean_token_accuracy": 0.8559181213378906, "num_input_tokens_seen": 3176866, "num_tokens": 3176866.0, "step": 4075, "train_runtime": 1151.0788, "train_tokens_per_second": 2759.903 }, { "epoch": 0.816, "grad_norm": 8.125, "learning_rate": 1.7273742405444217e-06, "loss": 0.409, "mean_token_accuracy": 0.8660372018814086, "num_input_tokens_seen": 3180980, "num_tokens": 3180980.0, "step": 4080, "train_runtime": 1152.4535, "train_tokens_per_second": 2760.181 }, { "epoch": 0.817, "grad_norm": 3.046875, "learning_rate": 1.709221831702723e-06, "loss": 0.3516, "mean_token_accuracy": 0.9014306187629699, "num_input_tokens_seen": 3184186, "num_tokens": 3184186.0, "step": 4085, "train_runtime": 1153.7844, "train_tokens_per_second": 2759.776 }, { "epoch": 0.818, "grad_norm": 5.6875, "learning_rate": 1.691156389200883e-06, "loss": 0.2816, "mean_token_accuracy": 0.9191635489463806, "num_input_tokens_seen": 3187386, "num_tokens": 3187386.0, "step": 4090, "train_runtime": 1155.0693, "train_tokens_per_second": 2759.476 }, { "epoch": 0.819, "grad_norm": 3.796875, "learning_rate": 1.6731781025368422e-06, "loss": 0.2357, "mean_token_accuracy": 0.9143314123153686, "num_input_tokens_seen": 3190595, "num_tokens": 3190595.0, "step": 4095, "train_runtime": 1156.3849, "train_tokens_per_second": 2759.112 }, { "epoch": 0.82, "grad_norm": 3.140625, "learning_rate": 1.6552871602943233e-06, "loss": 0.3959, "mean_token_accuracy": 0.8720271110534668, "num_input_tokens_seen": 3194100, "num_tokens": 3194100.0, "step": 4100, "train_runtime": 1157.7219, "train_tokens_per_second": 2758.953 }, { "epoch": 0.821, "grad_norm": 3.65625, "learning_rate": 1.6374837501408403e-06, "loss": 0.2699, "mean_token_accuracy": 0.9028887867927551, "num_input_tokens_seen": 3198534, "num_tokens": 3198534.0, "step": 4105, "train_runtime": 1159.0664, "train_tokens_per_second": 2759.578 }, { "epoch": 0.822, "grad_norm": 5.75, "learning_rate": 1.6197680588257435e-06, "loss": 0.2549, "mean_token_accuracy": 0.9016834855079651, "num_input_tokens_seen": 3201040, "num_tokens": 3201040.0, "step": 4110, "train_runtime": 1160.4743, "train_tokens_per_second": 2758.389 }, { "epoch": 0.823, "grad_norm": 5.75, "learning_rate": 1.602140272178253e-06, "loss": 0.381, "mean_token_accuracy": 0.8673596024513245, "num_input_tokens_seen": 3203860, "num_tokens": 3203860.0, "step": 4115, "train_runtime": 1161.8533, "train_tokens_per_second": 2757.543 }, { "epoch": 0.824, "grad_norm": 2.984375, "learning_rate": 1.5846005751055116e-06, "loss": 0.3252, "mean_token_accuracy": 0.8958292722702026, "num_input_tokens_seen": 3210026, "num_tokens": 3210026.0, "step": 4120, "train_runtime": 1163.5715, "train_tokens_per_second": 2758.77 }, { "epoch": 0.825, "grad_norm": 3.40625, "learning_rate": 1.5671491515906355e-06, "loss": 0.5474, "mean_token_accuracy": 0.8256628632545471, "num_input_tokens_seen": 3214301, "num_tokens": 3214301.0, "step": 4125, "train_runtime": 1166.5108, "train_tokens_per_second": 2755.483 }, { "epoch": 0.826, "grad_norm": 3.9375, "learning_rate": 1.5497861846908024e-06, "loss": 0.2627, "mean_token_accuracy": 0.9129902482032776, "num_input_tokens_seen": 3218558, "num_tokens": 3218558.0, "step": 4130, "train_runtime": 1167.8292, "train_tokens_per_second": 2756.018 }, { "epoch": 0.827, "grad_norm": 3.390625, "learning_rate": 1.5325118565353237e-06, "loss": 0.3232, "mean_token_accuracy": 0.8975612878799438, "num_input_tokens_seen": 3222119, "num_tokens": 3222119.0, "step": 4135, "train_runtime": 1169.1655, "train_tokens_per_second": 2755.913 }, { "epoch": 0.828, "grad_norm": 3.21875, "learning_rate": 1.51532634832372e-06, "loss": 0.214, "mean_token_accuracy": 0.9324235439300537, "num_input_tokens_seen": 3225800, "num_tokens": 3225800.0, "step": 4140, "train_runtime": 1170.4893, "train_tokens_per_second": 2755.942 }, { "epoch": 0.829, "grad_norm": 7.46875, "learning_rate": 1.498229840323847e-06, "loss": 0.4219, "mean_token_accuracy": 0.8757883906364441, "num_input_tokens_seen": 3229824, "num_tokens": 3229824.0, "step": 4145, "train_runtime": 1171.9453, "train_tokens_per_second": 2755.951 }, { "epoch": 0.83, "grad_norm": 13.375, "learning_rate": 1.4812225118699775e-06, "loss": 0.4358, "mean_token_accuracy": 0.8765614271163941, "num_input_tokens_seen": 3233038, "num_tokens": 3233038.0, "step": 4150, "train_runtime": 1173.2494, "train_tokens_per_second": 2755.627 }, { "epoch": 0.831, "grad_norm": 9.625, "learning_rate": 1.464304541360946e-06, "loss": 0.3037, "mean_token_accuracy": 0.893539571762085, "num_input_tokens_seen": 3239115, "num_tokens": 3239115.0, "step": 4155, "train_runtime": 1174.8329, "train_tokens_per_second": 2757.086 }, { "epoch": 0.832, "grad_norm": 15.5, "learning_rate": 1.4474761062582509e-06, "loss": 0.4396, "mean_token_accuracy": 0.8415980219841004, "num_input_tokens_seen": 3243082, "num_tokens": 3243082.0, "step": 4160, "train_runtime": 1176.2951, "train_tokens_per_second": 2757.031 }, { "epoch": 0.833, "grad_norm": 3.234375, "learning_rate": 1.4307373830842174e-06, "loss": 0.2215, "mean_token_accuracy": 0.9287956357002258, "num_input_tokens_seen": 3247464, "num_tokens": 3247464.0, "step": 4165, "train_runtime": 1177.7673, "train_tokens_per_second": 2757.305 }, { "epoch": 0.834, "grad_norm": 9.25, "learning_rate": 1.4140885474201315e-06, "loss": 0.31, "mean_token_accuracy": 0.8973621487617492, "num_input_tokens_seen": 3249828, "num_tokens": 3249828.0, "step": 4170, "train_runtime": 1179.0868, "train_tokens_per_second": 2756.225 }, { "epoch": 0.835, "grad_norm": 5.375, "learning_rate": 1.3975297739043992e-06, "loss": 0.2982, "mean_token_accuracy": 0.9112296462059021, "num_input_tokens_seen": 3253974, "num_tokens": 3253974.0, "step": 4175, "train_runtime": 1180.4537, "train_tokens_per_second": 2756.545 }, { "epoch": 0.836, "grad_norm": 4.25, "learning_rate": 1.3810612362307208e-06, "loss": 0.382, "mean_token_accuracy": 0.8847106218338012, "num_input_tokens_seen": 3261154, "num_tokens": 3261154.0, "step": 4180, "train_runtime": 1182.2176, "train_tokens_per_second": 2758.506 }, { "epoch": 0.837, "grad_norm": 2.359375, "learning_rate": 1.3646831071462606e-06, "loss": 0.2438, "mean_token_accuracy": 0.9227196335792541, "num_input_tokens_seen": 3265131, "num_tokens": 3265131.0, "step": 4185, "train_runtime": 1183.5835, "train_tokens_per_second": 2758.682 }, { "epoch": 0.838, "grad_norm": 9.375, "learning_rate": 1.3483955584498476e-06, "loss": 0.2765, "mean_token_accuracy": 0.9162901043891907, "num_input_tokens_seen": 3269764, "num_tokens": 3269764.0, "step": 4190, "train_runtime": 1184.9951, "train_tokens_per_second": 2759.306 }, { "epoch": 0.839, "grad_norm": 5.65625, "learning_rate": 1.3321987609901553e-06, "loss": 0.3241, "mean_token_accuracy": 0.8902681350708008, "num_input_tokens_seen": 3273445, "num_tokens": 3273445.0, "step": 4195, "train_runtime": 1186.3131, "train_tokens_per_second": 2759.343 }, { "epoch": 0.84, "grad_norm": 8.0, "learning_rate": 1.3160928846639275e-06, "loss": 0.5166, "mean_token_accuracy": 0.8576383709907531, "num_input_tokens_seen": 3276394, "num_tokens": 3276394.0, "step": 4200, "train_runtime": 1187.5905, "train_tokens_per_second": 2758.858 }, { "epoch": 0.841, "grad_norm": 6.75, "learning_rate": 1.3000780984141881e-06, "loss": 0.1689, "mean_token_accuracy": 0.9443981409072876, "num_input_tokens_seen": 3281139, "num_tokens": 3281139.0, "step": 4205, "train_runtime": 1189.0566, "train_tokens_per_second": 2759.447 }, { "epoch": 0.842, "grad_norm": 2.4375, "learning_rate": 1.2841545702284618e-06, "loss": 0.3304, "mean_token_accuracy": 0.8920741438865661, "num_input_tokens_seen": 3284768, "num_tokens": 3284768.0, "step": 4210, "train_runtime": 1190.3934, "train_tokens_per_second": 2759.397 }, { "epoch": 0.843, "grad_norm": 4.84375, "learning_rate": 1.2683224671370286e-06, "loss": 0.1813, "mean_token_accuracy": 0.937644624710083, "num_input_tokens_seen": 3288846, "num_tokens": 3288846.0, "step": 4215, "train_runtime": 1191.7584, "train_tokens_per_second": 2759.658 }, { "epoch": 0.844, "grad_norm": 5.375, "learning_rate": 1.252581955211155e-06, "loss": 0.2701, "mean_token_accuracy": 0.9016116261482239, "num_input_tokens_seen": 3291606, "num_tokens": 3291606.0, "step": 4220, "train_runtime": 1193.0287, "train_tokens_per_second": 2759.033 }, { "epoch": 0.845, "grad_norm": 3.546875, "learning_rate": 1.2369331995613664e-06, "loss": 0.3509, "mean_token_accuracy": 0.8878743886947632, "num_input_tokens_seen": 3295924, "num_tokens": 3295924.0, "step": 4225, "train_runtime": 1194.4242, "train_tokens_per_second": 2759.425 }, { "epoch": 0.846, "grad_norm": 3.6875, "learning_rate": 1.2213763643357002e-06, "loss": 0.2355, "mean_token_accuracy": 0.9221621990203858, "num_input_tokens_seen": 3300246, "num_tokens": 3300246.0, "step": 4230, "train_runtime": 1195.8116, "train_tokens_per_second": 2759.838 }, { "epoch": 0.847, "grad_norm": 6.75, "learning_rate": 1.2059116127179993e-06, "loss": 0.4343, "mean_token_accuracy": 0.8666230320930481, "num_input_tokens_seen": 3303692, "num_tokens": 3303692.0, "step": 4235, "train_runtime": 1197.1787, "train_tokens_per_second": 2759.565 }, { "epoch": 0.848, "grad_norm": 4.84375, "learning_rate": 1.1905391069261918e-06, "loss": 0.4503, "mean_token_accuracy": 0.8683754920959472, "num_input_tokens_seen": 3307054, "num_tokens": 3307054.0, "step": 4240, "train_runtime": 1198.4639, "train_tokens_per_second": 2759.411 }, { "epoch": 0.849, "grad_norm": 7.3125, "learning_rate": 1.1752590082105863e-06, "loss": 0.3762, "mean_token_accuracy": 0.8772651672363281, "num_input_tokens_seen": 3310443, "num_tokens": 3310443.0, "step": 4245, "train_runtime": 1199.7531, "train_tokens_per_second": 2759.27 }, { "epoch": 0.85, "grad_norm": 8.75, "learning_rate": 1.1600714768521903e-06, "loss": 0.3702, "mean_token_accuracy": 0.8974141120910645, "num_input_tokens_seen": 3312412, "num_tokens": 3312412.0, "step": 4250, "train_runtime": 1201.0519, "train_tokens_per_second": 2757.926 }, { "epoch": 0.851, "grad_norm": 6.90625, "learning_rate": 1.144976672161019e-06, "loss": 0.4951, "mean_token_accuracy": 0.840575349330902, "num_input_tokens_seen": 3316174, "num_tokens": 3316174.0, "step": 4255, "train_runtime": 1202.3897, "train_tokens_per_second": 2757.986 }, { "epoch": 0.852, "grad_norm": 5.5625, "learning_rate": 1.1299747524744309e-06, "loss": 0.2914, "mean_token_accuracy": 0.9068589806556702, "num_input_tokens_seen": 3319298, "num_tokens": 3319298.0, "step": 4260, "train_runtime": 1203.7656, "train_tokens_per_second": 2757.429 }, { "epoch": 0.853, "grad_norm": 4.53125, "learning_rate": 1.1150658751554667e-06, "loss": 0.331, "mean_token_accuracy": 0.8985235929489136, "num_input_tokens_seen": 3323042, "num_tokens": 3323042.0, "step": 4265, "train_runtime": 1205.0924, "train_tokens_per_second": 2757.5 }, { "epoch": 0.854, "grad_norm": 8.3125, "learning_rate": 1.100250196591195e-06, "loss": 0.3027, "mean_token_accuracy": 0.8970885396003723, "num_input_tokens_seen": 3328312, "num_tokens": 3328312.0, "step": 4270, "train_runtime": 1207.61, "train_tokens_per_second": 2756.115 }, { "epoch": 0.855, "grad_norm": 10.1875, "learning_rate": 1.08552787219107e-06, "loss": 0.5174, "mean_token_accuracy": 0.8433267951011658, "num_input_tokens_seen": 3332487, "num_tokens": 3332487.0, "step": 4275, "train_runtime": 1209.1442, "train_tokens_per_second": 2756.071 }, { "epoch": 0.856, "grad_norm": 5.15625, "learning_rate": 1.0708990563853127e-06, "loss": 0.3414, "mean_token_accuracy": 0.8825830101966858, "num_input_tokens_seen": 3337270, "num_tokens": 3337270.0, "step": 4280, "train_runtime": 1210.5924, "train_tokens_per_second": 2756.725 }, { "epoch": 0.857, "grad_norm": 4.46875, "learning_rate": 1.0563639026232742e-06, "loss": 0.5392, "mean_token_accuracy": 0.8398568868637085, "num_input_tokens_seen": 3341314, "num_tokens": 3341314.0, "step": 4285, "train_runtime": 1212.0296, "train_tokens_per_second": 2756.792 }, { "epoch": 0.858, "grad_norm": 3.015625, "learning_rate": 1.041922563371842e-06, "loss": 0.2923, "mean_token_accuracy": 0.914969265460968, "num_input_tokens_seen": 3345196, "num_tokens": 3345196.0, "step": 4290, "train_runtime": 1213.404, "train_tokens_per_second": 2756.869 }, { "epoch": 0.859, "grad_norm": 10.0, "learning_rate": 1.027575190113832e-06, "loss": 0.1727, "mean_token_accuracy": 0.9402429223060608, "num_input_tokens_seen": 3349747, "num_tokens": 3349747.0, "step": 4295, "train_runtime": 1214.885, "train_tokens_per_second": 2757.254 }, { "epoch": 0.86, "grad_norm": 7.96875, "learning_rate": 1.0133219333463983e-06, "loss": 0.3092, "mean_token_accuracy": 0.9057885646820069, "num_input_tokens_seen": 3354104, "num_tokens": 3354104.0, "step": 4300, "train_runtime": 1216.3196, "train_tokens_per_second": 2757.585 }, { "epoch": 0.861, "grad_norm": 11.0, "learning_rate": 9.991629425794624e-07, "loss": 0.3803, "mean_token_accuracy": 0.8866862058639526, "num_input_tokens_seen": 3358988, "num_tokens": 3358988.0, "step": 4305, "train_runtime": 1217.7779, "train_tokens_per_second": 2758.293 }, { "epoch": 0.862, "grad_norm": 3.640625, "learning_rate": 9.85098366334134e-07, "loss": 0.3917, "mean_token_accuracy": 0.8799158096313476, "num_input_tokens_seen": 3363230, "num_tokens": 3363230.0, "step": 4310, "train_runtime": 1219.1788, "train_tokens_per_second": 2758.603 }, { "epoch": 0.863, "grad_norm": 3.59375, "learning_rate": 9.711283521411674e-07, "loss": 0.235, "mean_token_accuracy": 0.9245396137237549, "num_input_tokens_seen": 3366784, "num_tokens": 3366784.0, "step": 4315, "train_runtime": 1220.5122, "train_tokens_per_second": 2758.501 }, { "epoch": 0.864, "grad_norm": 5.03125, "learning_rate": 9.57253046539396e-07, "loss": 0.2146, "mean_token_accuracy": 0.921067762374878, "num_input_tokens_seen": 3371146, "num_tokens": 3371146.0, "step": 4320, "train_runtime": 1221.8876, "train_tokens_per_second": 2758.966 }, { "epoch": 0.865, "grad_norm": 3.796875, "learning_rate": 9.434725950742119e-07, "loss": 0.2464, "mean_token_accuracy": 0.9166476488113403, "num_input_tokens_seen": 3377438, "num_tokens": 3377438.0, "step": 4325, "train_runtime": 1223.4895, "train_tokens_per_second": 2760.496 }, { "epoch": 0.866, "grad_norm": 19.25, "learning_rate": 9.297871422960336e-07, "loss": 0.4238, "mean_token_accuracy": 0.8603226780891419, "num_input_tokens_seen": 3381378, "num_tokens": 3381378.0, "step": 4330, "train_runtime": 1224.8749, "train_tokens_per_second": 2760.591 }, { "epoch": 0.867, "grad_norm": 6.25, "learning_rate": 9.161968317587788e-07, "loss": 0.4611, "mean_token_accuracy": 0.8517452478408813, "num_input_tokens_seen": 3385069, "num_tokens": 3385069.0, "step": 4335, "train_runtime": 1226.2239, "train_tokens_per_second": 2760.563 }, { "epoch": 0.868, "grad_norm": 6.5, "learning_rate": 9.027018060183801e-07, "loss": 0.2582, "mean_token_accuracy": 0.9111624598503113, "num_input_tokens_seen": 3388908, "num_tokens": 3388908.0, "step": 4340, "train_runtime": 1227.5572, "train_tokens_per_second": 2760.693 }, { "epoch": 0.869, "grad_norm": 3.5, "learning_rate": 8.893022066312674e-07, "loss": 0.2533, "mean_token_accuracy": 0.9163471460342407, "num_input_tokens_seen": 3394274, "num_tokens": 3394274.0, "step": 4345, "train_runtime": 1228.9573, "train_tokens_per_second": 2761.914 }, { "epoch": 0.87, "grad_norm": 7.75, "learning_rate": 8.759981741529e-07, "loss": 0.3648, "mean_token_accuracy": 0.8872136235237121, "num_input_tokens_seen": 3399078, "num_tokens": 3399078.0, "step": 4350, "train_runtime": 1230.4475, "train_tokens_per_second": 2762.473 }, { "epoch": 0.871, "grad_norm": 20.875, "learning_rate": 8.627898481362817e-07, "loss": 0.2885, "mean_token_accuracy": 0.9121281147003174, "num_input_tokens_seen": 3402331, "num_tokens": 3402331.0, "step": 4355, "train_runtime": 1231.7871, "train_tokens_per_second": 2762.11 }, { "epoch": 0.872, "grad_norm": 3.75, "learning_rate": 8.496773671305025e-07, "loss": 0.3259, "mean_token_accuracy": 0.8926708817481994, "num_input_tokens_seen": 3406052, "num_tokens": 3406052.0, "step": 4360, "train_runtime": 1233.1178, "train_tokens_per_second": 2762.147 }, { "epoch": 0.873, "grad_norm": 9.5625, "learning_rate": 8.366608686792854e-07, "loss": 0.2881, "mean_token_accuracy": 0.9154354572296143, "num_input_tokens_seen": 3409564, "num_tokens": 3409564.0, "step": 4365, "train_runtime": 1234.4054, "train_tokens_per_second": 2762.11 }, { "epoch": 0.874, "grad_norm": 7.84375, "learning_rate": 8.237404893195377e-07, "loss": 0.4914, "mean_token_accuracy": 0.8531810402870178, "num_input_tokens_seen": 3413720, "num_tokens": 3413720.0, "step": 4370, "train_runtime": 1235.823, "train_tokens_per_second": 2762.305 }, { "epoch": 0.875, "grad_norm": 7.3125, "learning_rate": 8.109163645799267e-07, "loss": 0.5133, "mean_token_accuracy": 0.8549060225486755, "num_input_tokens_seen": 3416424, "num_tokens": 3416424.0, "step": 4375, "train_runtime": 1237.1474, "train_tokens_per_second": 2761.534 }, { "epoch": 0.876, "grad_norm": 12.0, "learning_rate": 7.981886289794516e-07, "loss": 0.4984, "mean_token_accuracy": 0.8646793961524963, "num_input_tokens_seen": 3420612, "num_tokens": 3420612.0, "step": 4380, "train_runtime": 1238.5326, "train_tokens_per_second": 2761.826 }, { "epoch": 0.877, "grad_norm": 4.875, "learning_rate": 7.855574160260371e-07, "loss": 0.104, "mean_token_accuracy": 0.9641417503356934, "num_input_tokens_seen": 3425219, "num_tokens": 3425219.0, "step": 4385, "train_runtime": 1240.0198, "train_tokens_per_second": 2762.229 }, { "epoch": 0.878, "grad_norm": 4.59375, "learning_rate": 7.730228582151278e-07, "loss": 0.2469, "mean_token_accuracy": 0.9180797100067138, "num_input_tokens_seen": 3428686, "num_tokens": 3428686.0, "step": 4390, "train_runtime": 1241.3309, "train_tokens_per_second": 2762.105 }, { "epoch": 0.879, "grad_norm": 4.34375, "learning_rate": 7.60585087028305e-07, "loss": 0.3915, "mean_token_accuracy": 0.8775163769721985, "num_input_tokens_seen": 3432033, "num_tokens": 3432033.0, "step": 4395, "train_runtime": 1242.6663, "train_tokens_per_second": 2761.83 }, { "epoch": 0.88, "grad_norm": 12.8125, "learning_rate": 7.482442329319062e-07, "loss": 0.4014, "mean_token_accuracy": 0.8672354817390442, "num_input_tokens_seen": 3435452, "num_tokens": 3435452.0, "step": 4400, "train_runtime": 1243.9518, "train_tokens_per_second": 2761.724 }, { "epoch": 0.881, "grad_norm": 25.625, "learning_rate": 7.360004253756459e-07, "loss": 0.3877, "mean_token_accuracy": 0.8788410305976868, "num_input_tokens_seen": 3440478, "num_tokens": 3440478.0, "step": 4405, "train_runtime": 1245.3575, "train_tokens_per_second": 2762.643 }, { "epoch": 0.882, "grad_norm": 3.359375, "learning_rate": 7.238537927912747e-07, "loss": 0.2825, "mean_token_accuracy": 0.9077800393104554, "num_input_tokens_seen": 3445112, "num_tokens": 3445112.0, "step": 4410, "train_runtime": 1246.7461, "train_tokens_per_second": 2763.283 }, { "epoch": 0.883, "grad_norm": 2.6875, "learning_rate": 7.118044625912213e-07, "loss": 0.2981, "mean_token_accuracy": 0.9064082026481628, "num_input_tokens_seen": 3449588, "num_tokens": 3449588.0, "step": 4415, "train_runtime": 1248.1623, "train_tokens_per_second": 2763.734 }, { "epoch": 0.884, "grad_norm": 4.3125, "learning_rate": 6.99852561167258e-07, "loss": 0.2932, "mean_token_accuracy": 0.9034970879554749, "num_input_tokens_seen": 3454142, "num_tokens": 3454142.0, "step": 4420, "train_runtime": 1249.5061, "train_tokens_per_second": 2764.406 }, { "epoch": 0.885, "grad_norm": 8.875, "learning_rate": 6.879982138891717e-07, "loss": 0.2817, "mean_token_accuracy": 0.9137148976325988, "num_input_tokens_seen": 3458206, "num_tokens": 3458206.0, "step": 4425, "train_runtime": 1250.8594, "train_tokens_per_second": 2764.664 }, { "epoch": 0.886, "grad_norm": 4.65625, "learning_rate": 6.76241545103461e-07, "loss": 0.283, "mean_token_accuracy": 0.9084842920303344, "num_input_tokens_seen": 3462382, "num_tokens": 3462382.0, "step": 4430, "train_runtime": 1252.2563, "train_tokens_per_second": 2764.915 }, { "epoch": 0.887, "grad_norm": 3.40625, "learning_rate": 6.645826781320141e-07, "loss": 0.3824, "mean_token_accuracy": 0.8809847354888916, "num_input_tokens_seen": 3466148, "num_tokens": 3466148.0, "step": 4435, "train_runtime": 1253.5884, "train_tokens_per_second": 2764.981 }, { "epoch": 0.888, "grad_norm": 3.859375, "learning_rate": 6.530217352708301e-07, "loss": 0.2274, "mean_token_accuracy": 0.9234096288681031, "num_input_tokens_seen": 3470490, "num_tokens": 3470490.0, "step": 4440, "train_runtime": 1254.9356, "train_tokens_per_second": 2765.472 }, { "epoch": 0.889, "grad_norm": 5.03125, "learning_rate": 6.415588377887305e-07, "loss": 0.3274, "mean_token_accuracy": 0.9057687282562256, "num_input_tokens_seen": 3473649, "num_tokens": 3473649.0, "step": 4445, "train_runtime": 1256.2729, "train_tokens_per_second": 2765.043 }, { "epoch": 0.89, "grad_norm": 3.671875, "learning_rate": 6.30194105926083e-07, "loss": 0.3549, "mean_token_accuracy": 0.8828863024711608, "num_input_tokens_seen": 3476210, "num_tokens": 3476210.0, "step": 4450, "train_runtime": 1257.5313, "train_tokens_per_second": 2764.313 }, { "epoch": 0.891, "grad_norm": 4.4375, "learning_rate": 6.1892765889355e-07, "loss": 0.2696, "mean_token_accuracy": 0.9169727206230164, "num_input_tokens_seen": 3481121, "num_tokens": 3481121.0, "step": 4455, "train_runtime": 1258.997, "train_tokens_per_second": 2764.995 }, { "epoch": 0.892, "grad_norm": 3.65625, "learning_rate": 6.077596148708275e-07, "loss": 0.2815, "mean_token_accuracy": 0.9089458584785461, "num_input_tokens_seen": 3485468, "num_tokens": 3485468.0, "step": 4460, "train_runtime": 1260.4402, "train_tokens_per_second": 2765.278 }, { "epoch": 0.893, "grad_norm": 4.625, "learning_rate": 5.966900910054141e-07, "loss": 0.2129, "mean_token_accuracy": 0.936927330493927, "num_input_tokens_seen": 3488824, "num_tokens": 3488824.0, "step": 4465, "train_runtime": 1261.8281, "train_tokens_per_second": 2764.896 }, { "epoch": 0.894, "grad_norm": 10.0625, "learning_rate": 5.857192034113757e-07, "loss": 0.5388, "mean_token_accuracy": 0.8270596981048584, "num_input_tokens_seen": 3492118, "num_tokens": 3492118.0, "step": 4470, "train_runtime": 1263.2066, "train_tokens_per_second": 2764.487 }, { "epoch": 0.895, "grad_norm": 8.125, "learning_rate": 5.748470671681328e-07, "loss": 0.4706, "mean_token_accuracy": 0.8553109645843506, "num_input_tokens_seen": 3497334, "num_tokens": 3497334.0, "step": 4475, "train_runtime": 1264.663, "train_tokens_per_second": 2765.428 }, { "epoch": 0.896, "grad_norm": 7.6875, "learning_rate": 5.640737963192511e-07, "loss": 0.4297, "mean_token_accuracy": 0.849378764629364, "num_input_tokens_seen": 3501056, "num_tokens": 3501056.0, "step": 4480, "train_runtime": 1266.035, "train_tokens_per_second": 2765.371 }, { "epoch": 0.897, "grad_norm": 2.640625, "learning_rate": 5.533995038712403e-07, "loss": 0.2321, "mean_token_accuracy": 0.9289917469024658, "num_input_tokens_seen": 3505334, "num_tokens": 3505334.0, "step": 4485, "train_runtime": 1267.4241, "train_tokens_per_second": 2765.715 }, { "epoch": 0.898, "grad_norm": 5.53125, "learning_rate": 5.428243017923817e-07, "loss": 0.2007, "mean_token_accuracy": 0.9219006896018982, "num_input_tokens_seen": 3509056, "num_tokens": 3509056.0, "step": 4490, "train_runtime": 1268.7687, "train_tokens_per_second": 2765.718 }, { "epoch": 0.899, "grad_norm": 5.125, "learning_rate": 5.323483010115382e-07, "loss": 0.1808, "mean_token_accuracy": 0.9405884981155396, "num_input_tokens_seen": 3513378, "num_tokens": 3513378.0, "step": 4495, "train_runtime": 1270.1909, "train_tokens_per_second": 2766.024 }, { "epoch": 0.9, "grad_norm": 4.90625, "learning_rate": 5.219716114170026e-07, "loss": 0.2806, "mean_token_accuracy": 0.9014737129211425, "num_input_tokens_seen": 3516328, "num_tokens": 3516328.0, "step": 4500, "train_runtime": 1271.4925, "train_tokens_per_second": 2765.512 }, { "epoch": 0.901, "grad_norm": 2.734375, "learning_rate": 5.116943418553355e-07, "loss": 0.1718, "mean_token_accuracy": 0.9447907686233521, "num_input_tokens_seen": 3519761, "num_tokens": 3519761.0, "step": 4505, "train_runtime": 1272.8123, "train_tokens_per_second": 2765.342 }, { "epoch": 0.902, "grad_norm": 5.5, "learning_rate": 5.01516600130233e-07, "loss": 0.3149, "mean_token_accuracy": 0.9040312170982361, "num_input_tokens_seen": 3523718, "num_tokens": 3523718.0, "step": 4510, "train_runtime": 1274.2318, "train_tokens_per_second": 2765.366 }, { "epoch": 0.903, "grad_norm": 9.3125, "learning_rate": 4.914384930013927e-07, "loss": 0.4205, "mean_token_accuracy": 0.8609397649765015, "num_input_tokens_seen": 3527364, "num_tokens": 3527364.0, "step": 4515, "train_runtime": 1275.547, "train_tokens_per_second": 2765.374 }, { "epoch": 0.904, "grad_norm": 12.875, "learning_rate": 4.814601261833851e-07, "loss": 0.4838, "mean_token_accuracy": 0.8454326868057251, "num_input_tokens_seen": 3531368, "num_tokens": 3531368.0, "step": 4520, "train_runtime": 1276.9391, "train_tokens_per_second": 2765.494 }, { "epoch": 0.905, "grad_norm": 3.484375, "learning_rate": 4.715816043445609e-07, "loss": 0.3284, "mean_token_accuracy": 0.8839154005050659, "num_input_tokens_seen": 3537425, "num_tokens": 3537425.0, "step": 4525, "train_runtime": 1278.4741, "train_tokens_per_second": 2766.912 }, { "epoch": 0.906, "grad_norm": 10.375, "learning_rate": 4.618030311059352e-07, "loss": 0.3502, "mean_token_accuracy": 0.8906478762626648, "num_input_tokens_seen": 3541488, "num_tokens": 3541488.0, "step": 4530, "train_runtime": 1279.8609, "train_tokens_per_second": 2767.088 }, { "epoch": 0.907, "grad_norm": 3.75, "learning_rate": 4.521245090401172e-07, "loss": 0.2222, "mean_token_accuracy": 0.923599374294281, "num_input_tokens_seen": 3546970, "num_tokens": 3546970.0, "step": 4535, "train_runtime": 1281.3065, "train_tokens_per_second": 2768.245 }, { "epoch": 0.908, "grad_norm": 8.875, "learning_rate": 4.425461396702213e-07, "loss": 0.4568, "mean_token_accuracy": 0.8606929302215576, "num_input_tokens_seen": 3551586, "num_tokens": 3551586.0, "step": 4540, "train_runtime": 1282.7264, "train_tokens_per_second": 2768.779 }, { "epoch": 0.909, "grad_norm": 6.40625, "learning_rate": 4.3306802346881116e-07, "loss": 0.371, "mean_token_accuracy": 0.8819148659706115, "num_input_tokens_seen": 3555934, "num_tokens": 3555934.0, "step": 4545, "train_runtime": 1284.1163, "train_tokens_per_second": 2769.168 }, { "epoch": 0.91, "grad_norm": 11.9375, "learning_rate": 4.2369025985684264e-07, "loss": 0.5315, "mean_token_accuracy": 0.8568836450576782, "num_input_tokens_seen": 3558974, "num_tokens": 3558974.0, "step": 4550, "train_runtime": 1285.3774, "train_tokens_per_second": 2768.816 }, { "epoch": 0.911, "grad_norm": 3.875, "learning_rate": 4.1441294720261373e-07, "loss": 0.139, "mean_token_accuracy": 0.9573936820030212, "num_input_tokens_seen": 3562835, "num_tokens": 3562835.0, "step": 4555, "train_runtime": 1286.714, "train_tokens_per_second": 2768.941 }, { "epoch": 0.912, "grad_norm": 7.6875, "learning_rate": 4.0523618282074964e-07, "loss": 0.4052, "mean_token_accuracy": 0.8707888722419739, "num_input_tokens_seen": 3565812, "num_tokens": 3565812.0, "step": 4560, "train_runtime": 1288.0302, "train_tokens_per_second": 2768.423 }, { "epoch": 0.913, "grad_norm": 4.84375, "learning_rate": 3.961600629711615e-07, "loss": 0.3342, "mean_token_accuracy": 0.8865555167198181, "num_input_tokens_seen": 3569359, "num_tokens": 3569359.0, "step": 4565, "train_runtime": 1289.4482, "train_tokens_per_second": 2768.129 }, { "epoch": 0.914, "grad_norm": 3.421875, "learning_rate": 3.87184682858055e-07, "loss": 0.3485, "mean_token_accuracy": 0.8865175604820251, "num_input_tokens_seen": 3574150, "num_tokens": 3574150.0, "step": 4570, "train_runtime": 1290.8687, "train_tokens_per_second": 2768.794 }, { "epoch": 0.915, "grad_norm": 12.6875, "learning_rate": 3.783101366289199e-07, "loss": 0.3078, "mean_token_accuracy": 0.9024839997291565, "num_input_tokens_seen": 3576668, "num_tokens": 3576668.0, "step": 4575, "train_runtime": 1292.1544, "train_tokens_per_second": 2767.988 }, { "epoch": 0.916, "grad_norm": 2.390625, "learning_rate": 3.695365173735466e-07, "loss": 0.1868, "mean_token_accuracy": 0.9391541838645935, "num_input_tokens_seen": 3580920, "num_tokens": 3580920.0, "step": 4580, "train_runtime": 1293.5528, "train_tokens_per_second": 2768.283 }, { "epoch": 0.917, "grad_norm": 6.9375, "learning_rate": 3.608639171230488e-07, "loss": 0.4104, "mean_token_accuracy": 0.8768238306045533, "num_input_tokens_seen": 3585010, "num_tokens": 3585010.0, "step": 4585, "train_runtime": 1294.9712, "train_tokens_per_second": 2768.409 }, { "epoch": 0.918, "grad_norm": 4.4375, "learning_rate": 3.522924268489003e-07, "loss": 0.1987, "mean_token_accuracy": 0.9406271934509277, "num_input_tokens_seen": 3588922, "num_tokens": 3588922.0, "step": 4590, "train_runtime": 1296.3316, "train_tokens_per_second": 2768.521 }, { "epoch": 0.919, "grad_norm": 2.140625, "learning_rate": 3.438221364619776e-07, "loss": 0.4303, "mean_token_accuracy": 0.8612835645675659, "num_input_tokens_seen": 3593372, "num_tokens": 3593372.0, "step": 4595, "train_runtime": 1299.4422, "train_tokens_per_second": 2765.319 }, { "epoch": 0.92, "grad_norm": 7.75, "learning_rate": 3.3545313481161743e-07, "loss": 0.2485, "mean_token_accuracy": 0.9181795239448547, "num_input_tokens_seen": 3597940, "num_tokens": 3597940.0, "step": 4600, "train_runtime": 1300.8358, "train_tokens_per_second": 2765.868 }, { "epoch": 0.921, "grad_norm": 3.59375, "learning_rate": 3.271855096846899e-07, "loss": 0.2238, "mean_token_accuracy": 0.9324809789657593, "num_input_tokens_seen": 3601112, "num_tokens": 3601112.0, "step": 4605, "train_runtime": 1302.1664, "train_tokens_per_second": 2765.478 }, { "epoch": 0.922, "grad_norm": 4.6875, "learning_rate": 3.190193478046677e-07, "loss": 0.3876, "mean_token_accuracy": 0.8800474524497985, "num_input_tokens_seen": 3604940, "num_tokens": 3604940.0, "step": 4610, "train_runtime": 1304.1337, "train_tokens_per_second": 2764.241 }, { "epoch": 0.923, "grad_norm": 3.0, "learning_rate": 3.1095473483072733e-07, "loss": 0.2931, "mean_token_accuracy": 0.8981198072433472, "num_input_tokens_seen": 3608688, "num_tokens": 3608688.0, "step": 4615, "train_runtime": 1305.4649, "train_tokens_per_second": 2764.293 }, { "epoch": 0.924, "grad_norm": 2.59375, "learning_rate": 3.029917553568407e-07, "loss": 0.2703, "mean_token_accuracy": 0.91173095703125, "num_input_tokens_seen": 3612594, "num_tokens": 3612594.0, "step": 4620, "train_runtime": 1306.8564, "train_tokens_per_second": 2764.339 }, { "epoch": 0.925, "grad_norm": 7.9375, "learning_rate": 2.951304929108956e-07, "loss": 0.4021, "mean_token_accuracy": 0.8664975762367249, "num_input_tokens_seen": 3616509, "num_tokens": 3616509.0, "step": 4625, "train_runtime": 1308.2611, "train_tokens_per_second": 2764.363 }, { "epoch": 0.926, "grad_norm": 4.09375, "learning_rate": 2.873710299538146e-07, "loss": 0.2611, "mean_token_accuracy": 0.9080349802970886, "num_input_tokens_seen": 3620500, "num_tokens": 3620500.0, "step": 4630, "train_runtime": 1309.6005, "train_tokens_per_second": 2764.584 }, { "epoch": 0.927, "grad_norm": 3.6875, "learning_rate": 2.7971344787869114e-07, "loss": 0.4147, "mean_token_accuracy": 0.8747751712799072, "num_input_tokens_seen": 3623661, "num_tokens": 3623661.0, "step": 4635, "train_runtime": 1310.9318, "train_tokens_per_second": 2764.187 }, { "epoch": 0.928, "grad_norm": 5.25, "learning_rate": 2.721578270099412e-07, "loss": 0.3883, "mean_token_accuracy": 0.8808615326881408, "num_input_tokens_seen": 3626928, "num_tokens": 3626928.0, "step": 4640, "train_runtime": 1312.2278, "train_tokens_per_second": 2763.947 }, { "epoch": 0.929, "grad_norm": 2.890625, "learning_rate": 2.647042466024485e-07, "loss": 0.4727, "mean_token_accuracy": 0.8509864211082458, "num_input_tokens_seen": 3631629, "num_tokens": 3631629.0, "step": 4645, "train_runtime": 1313.6579, "train_tokens_per_second": 2764.517 }, { "epoch": 0.93, "grad_norm": 3.0625, "learning_rate": 2.5735278484074865e-07, "loss": 0.3311, "mean_token_accuracy": 0.8910769581794739, "num_input_tokens_seen": 3636756, "num_tokens": 3636756.0, "step": 4650, "train_runtime": 1315.0779, "train_tokens_per_second": 2765.43 }, { "epoch": 0.931, "grad_norm": 5.625, "learning_rate": 2.5010351883819283e-07, "loss": 0.3618, "mean_token_accuracy": 0.8791554808616638, "num_input_tokens_seen": 3640802, "num_tokens": 3640802.0, "step": 4655, "train_runtime": 1316.3957, "train_tokens_per_second": 2765.735 }, { "epoch": 0.932, "grad_norm": 11.0625, "learning_rate": 2.429565246361532e-07, "loss": 0.3421, "mean_token_accuracy": 0.8925309658050538, "num_input_tokens_seen": 3645070, "num_tokens": 3645070.0, "step": 4660, "train_runtime": 1317.7591, "train_tokens_per_second": 2766.113 }, { "epoch": 0.933, "grad_norm": 3.40625, "learning_rate": 2.359118772032176e-07, "loss": 0.2245, "mean_token_accuracy": 0.9245673060417176, "num_input_tokens_seen": 3647833, "num_tokens": 3647833.0, "step": 4665, "train_runtime": 1319.0701, "train_tokens_per_second": 2765.458 }, { "epoch": 0.934, "grad_norm": 8.5625, "learning_rate": 2.289696504344019e-07, "loss": 0.3244, "mean_token_accuracy": 0.8961198806762696, "num_input_tokens_seen": 3652992, "num_tokens": 3652992.0, "step": 4670, "train_runtime": 1320.5402, "train_tokens_per_second": 2766.286 }, { "epoch": 0.935, "grad_norm": 13.1875, "learning_rate": 2.2212991715038324e-07, "loss": 0.5056, "mean_token_accuracy": 0.8507928848266602, "num_input_tokens_seen": 3656629, "num_tokens": 3656629.0, "step": 4675, "train_runtime": 1321.9046, "train_tokens_per_second": 2766.182 }, { "epoch": 0.936, "grad_norm": 5.53125, "learning_rate": 2.1539274909672337e-07, "loss": 0.4333, "mean_token_accuracy": 0.8660982489585877, "num_input_tokens_seen": 3659950, "num_tokens": 3659950.0, "step": 4680, "train_runtime": 1323.2381, "train_tokens_per_second": 2765.904 }, { "epoch": 0.937, "grad_norm": 3.109375, "learning_rate": 2.0875821694313014e-07, "loss": 0.348, "mean_token_accuracy": 0.8969164967536927, "num_input_tokens_seen": 3665184, "num_tokens": 3665184.0, "step": 4685, "train_runtime": 1324.6288, "train_tokens_per_second": 2766.952 }, { "epoch": 0.938, "grad_norm": 3.765625, "learning_rate": 2.0222639028270486e-07, "loss": 0.2715, "mean_token_accuracy": 0.9105570435523986, "num_input_tokens_seen": 3669736, "num_tokens": 3669736.0, "step": 4690, "train_runtime": 1326.035, "train_tokens_per_second": 2767.45 }, { "epoch": 0.939, "grad_norm": 6.875, "learning_rate": 1.9579733763121943e-07, "loss": 0.3247, "mean_token_accuracy": 0.9014999747276307, "num_input_tokens_seen": 3673236, "num_tokens": 3673236.0, "step": 4695, "train_runtime": 1327.3693, "train_tokens_per_second": 2767.305 }, { "epoch": 0.94, "grad_norm": 0.94140625, "learning_rate": 1.8947112642639376e-07, "loss": 0.1714, "mean_token_accuracy": 0.9449493765830994, "num_input_tokens_seen": 3677934, "num_tokens": 3677934.0, "step": 4700, "train_runtime": 1328.7976, "train_tokens_per_second": 2767.866 }, { "epoch": 0.941, "grad_norm": 10.625, "learning_rate": 1.8324782302718835e-07, "loss": 0.345, "mean_token_accuracy": 0.9009698987007141, "num_input_tokens_seen": 3680845, "num_tokens": 3680845.0, "step": 4705, "train_runtime": 1330.1241, "train_tokens_per_second": 2767.294 }, { "epoch": 0.942, "grad_norm": 6.03125, "learning_rate": 1.7712749271311392e-07, "loss": 0.3427, "mean_token_accuracy": 0.8751938581466675, "num_input_tokens_seen": 3683048, "num_tokens": 3683048.0, "step": 4710, "train_runtime": 1331.4413, "train_tokens_per_second": 2766.211 }, { "epoch": 0.943, "grad_norm": 6.78125, "learning_rate": 1.7111019968353625e-07, "loss": 0.3346, "mean_token_accuracy": 0.892684280872345, "num_input_tokens_seen": 3688354, "num_tokens": 3688354.0, "step": 4715, "train_runtime": 1332.8496, "train_tokens_per_second": 2767.269 }, { "epoch": 0.944, "grad_norm": 7.6875, "learning_rate": 1.6519600705701465e-07, "loss": 0.3262, "mean_token_accuracy": 0.8959339618682861, "num_input_tokens_seen": 3691408, "num_tokens": 3691408.0, "step": 4720, "train_runtime": 1334.1594, "train_tokens_per_second": 2766.842 }, { "epoch": 0.945, "grad_norm": 11.375, "learning_rate": 1.5938497687062905e-07, "loss": 0.2425, "mean_token_accuracy": 0.9197401881217957, "num_input_tokens_seen": 3695685, "num_tokens": 3695685.0, "step": 4725, "train_runtime": 1335.5599, "train_tokens_per_second": 2767.143 }, { "epoch": 0.946, "grad_norm": 5.09375, "learning_rate": 1.5367717007933826e-07, "loss": 0.3202, "mean_token_accuracy": 0.8985082626342773, "num_input_tokens_seen": 3700178, "num_tokens": 3700178.0, "step": 4730, "train_runtime": 1336.9275, "train_tokens_per_second": 2767.673 }, { "epoch": 0.947, "grad_norm": 2.71875, "learning_rate": 1.4807264655533282e-07, "loss": 0.1401, "mean_token_accuracy": 0.9508221030235291, "num_input_tokens_seen": 3703296, "num_tokens": 3703296.0, "step": 4735, "train_runtime": 1338.2674, "train_tokens_per_second": 2767.232 }, { "epoch": 0.948, "grad_norm": 59.25, "learning_rate": 1.4257146508741436e-07, "loss": 0.4523, "mean_token_accuracy": 0.8644691944122315, "num_input_tokens_seen": 3707122, "num_tokens": 3707122.0, "step": 4740, "train_runtime": 1339.5855, "train_tokens_per_second": 2767.365 }, { "epoch": 0.949, "grad_norm": 5.78125, "learning_rate": 1.3717368338037163e-07, "loss": 0.3879, "mean_token_accuracy": 0.879356038570404, "num_input_tokens_seen": 3711393, "num_tokens": 3711393.0, "step": 4745, "train_runtime": 1340.9836, "train_tokens_per_second": 2767.665 }, { "epoch": 0.95, "grad_norm": 2.546875, "learning_rate": 1.318793580543809e-07, "loss": 0.2836, "mean_token_accuracy": 0.9083822846412659, "num_input_tokens_seen": 3716510, "num_tokens": 3716510.0, "step": 4750, "train_runtime": 1342.3817, "train_tokens_per_second": 2768.594 }, { "epoch": 0.951, "grad_norm": 3.734375, "learning_rate": 1.2668854464441104e-07, "loss": 0.2541, "mean_token_accuracy": 0.9137631893157959, "num_input_tokens_seen": 3721748, "num_tokens": 3721748.0, "step": 4755, "train_runtime": 1343.9428, "train_tokens_per_second": 2769.276 }, { "epoch": 0.952, "grad_norm": 5.21875, "learning_rate": 1.2160129759963723e-07, "loss": 0.2997, "mean_token_accuracy": 0.9140471458435059, "num_input_tokens_seen": 3725954, "num_tokens": 3725954.0, "step": 4760, "train_runtime": 1345.3258, "train_tokens_per_second": 2769.555 }, { "epoch": 0.953, "grad_norm": 3.96875, "learning_rate": 1.1661767028287363e-07, "loss": 0.3018, "mean_token_accuracy": 0.9101715207099914, "num_input_tokens_seen": 3729389, "num_tokens": 3729389.0, "step": 4765, "train_runtime": 1346.599, "train_tokens_per_second": 2769.487 }, { "epoch": 0.954, "grad_norm": 9.3125, "learning_rate": 1.1173771497001273e-07, "loss": 0.5378, "mean_token_accuracy": 0.8494227528572083, "num_input_tokens_seen": 3734016, "num_tokens": 3734016.0, "step": 4770, "train_runtime": 1348.0483, "train_tokens_per_second": 2769.942 }, { "epoch": 0.955, "grad_norm": 3.375, "learning_rate": 1.0696148284947694e-07, "loss": 0.1957, "mean_token_accuracy": 0.9320483207702637, "num_input_tokens_seen": 3737731, "num_tokens": 3737731.0, "step": 4775, "train_runtime": 1349.4449, "train_tokens_per_second": 2769.829 }, { "epoch": 0.956, "grad_norm": 7.21875, "learning_rate": 1.0228902402168118e-07, "loss": 0.4868, "mean_token_accuracy": 0.8542607307434082, "num_input_tokens_seen": 3741408, "num_tokens": 3741408.0, "step": 4780, "train_runtime": 1350.81, "train_tokens_per_second": 2769.751 }, { "epoch": 0.957, "grad_norm": 8.9375, "learning_rate": 9.772038749850665e-08, "loss": 0.3628, "mean_token_accuracy": 0.8856673121452332, "num_input_tokens_seen": 3745718, "num_tokens": 3745718.0, "step": 4785, "train_runtime": 1352.2252, "train_tokens_per_second": 2770.04 }, { "epoch": 0.958, "grad_norm": 7.1875, "learning_rate": 9.32556212027902e-08, "loss": 0.297, "mean_token_accuracy": 0.9029954671859741, "num_input_tokens_seen": 3750096, "num_tokens": 3750096.0, "step": 4790, "train_runtime": 1353.6296, "train_tokens_per_second": 2770.4 }, { "epoch": 0.959, "grad_norm": 3.984375, "learning_rate": 8.889477196781571e-08, "loss": 0.431, "mean_token_accuracy": 0.8582552194595336, "num_input_tokens_seen": 3753561, "num_tokens": 3753561.0, "step": 4795, "train_runtime": 1354.9588, "train_tokens_per_second": 2770.24 }, { "epoch": 0.96, "grad_norm": 19.875, "learning_rate": 8.463788553683017e-08, "loss": 0.38, "mean_token_accuracy": 0.866929292678833, "num_input_tokens_seen": 3756966, "num_tokens": 3756966.0, "step": 4800, "train_runtime": 1356.2793, "train_tokens_per_second": 2770.053 }, { "epoch": 0.961, "grad_norm": 15.875, "learning_rate": 8.04850065625551e-08, "loss": 0.3553, "mean_token_accuracy": 0.8842318892478943, "num_input_tokens_seen": 3760197, "num_tokens": 3760197.0, "step": 4805, "train_runtime": 1357.6444, "train_tokens_per_second": 2769.648 }, { "epoch": 0.962, "grad_norm": 3.703125, "learning_rate": 7.643617860672914e-08, "loss": 0.3149, "mean_token_accuracy": 0.8938192963600159, "num_input_tokens_seen": 3764648, "num_tokens": 3764648.0, "step": 4810, "train_runtime": 1359.0039, "train_tokens_per_second": 2770.152 }, { "epoch": 0.963, "grad_norm": 3.84375, "learning_rate": 7.24914441396396e-08, "loss": 0.4607, "mean_token_accuracy": 0.850116515159607, "num_input_tokens_seen": 3767578, "num_tokens": 3767578.0, "step": 4815, "train_runtime": 1360.3554, "train_tokens_per_second": 2769.554 }, { "epoch": 0.964, "grad_norm": 3.984375, "learning_rate": 6.865084453968495e-08, "loss": 0.2135, "mean_token_accuracy": 0.9284858226776123, "num_input_tokens_seen": 3771834, "num_tokens": 3771834.0, "step": 4820, "train_runtime": 1361.768, "train_tokens_per_second": 2769.807 }, { "epoch": 0.965, "grad_norm": 3.5, "learning_rate": 6.491442009293858e-08, "loss": 0.2481, "mean_token_accuracy": 0.9242266654968262, "num_input_tokens_seen": 3775898, "num_tokens": 3775898.0, "step": 4825, "train_runtime": 1363.1581, "train_tokens_per_second": 2769.963 }, { "epoch": 0.966, "grad_norm": 7.625, "learning_rate": 6.12822099927235e-08, "loss": 0.3509, "mean_token_accuracy": 0.8809558510780334, "num_input_tokens_seen": 3778390, "num_tokens": 3778390.0, "step": 4830, "train_runtime": 1364.4736, "train_tokens_per_second": 2769.119 }, { "epoch": 0.967, "grad_norm": 6.8125, "learning_rate": 5.7754252339204955e-08, "loss": 0.3834, "mean_token_accuracy": 0.8722980260848999, "num_input_tokens_seen": 3780526, "num_tokens": 3780526.0, "step": 4835, "train_runtime": 1365.7642, "train_tokens_per_second": 2768.066 }, { "epoch": 0.968, "grad_norm": 8.1875, "learning_rate": 5.4330584138989615e-08, "loss": 0.2597, "mean_token_accuracy": 0.9220953583717346, "num_input_tokens_seen": 3785218, "num_tokens": 3785218.0, "step": 4840, "train_runtime": 1367.2167, "train_tokens_per_second": 2768.557 }, { "epoch": 0.969, "grad_norm": 2.9375, "learning_rate": 5.1011241304738115e-08, "loss": 0.3319, "mean_token_accuracy": 0.8915521621704101, "num_input_tokens_seen": 3788864, "num_tokens": 3788864.0, "step": 4845, "train_runtime": 1368.628, "train_tokens_per_second": 2768.367 }, { "epoch": 0.97, "grad_norm": 6.46875, "learning_rate": 4.779625865478421e-08, "loss": 0.3051, "mean_token_accuracy": 0.9099111318588257, "num_input_tokens_seen": 3792066, "num_tokens": 3792066.0, "step": 4850, "train_runtime": 1369.989, "train_tokens_per_second": 2767.954 }, { "epoch": 0.971, "grad_norm": 4.625, "learning_rate": 4.468566991277512e-08, "loss": 0.3534, "mean_token_accuracy": 0.8895323872566223, "num_input_tokens_seen": 3795584, "num_tokens": 3795584.0, "step": 4855, "train_runtime": 1371.2963, "train_tokens_per_second": 2767.88 }, { "epoch": 0.972, "grad_norm": 3.59375, "learning_rate": 4.1679507707315106e-08, "loss": 0.2661, "mean_token_accuracy": 0.9140796542167664, "num_input_tokens_seen": 3799306, "num_tokens": 3799306.0, "step": 4860, "train_runtime": 1372.7045, "train_tokens_per_second": 2767.752 }, { "epoch": 0.973, "grad_norm": 3.28125, "learning_rate": 3.877780357162353e-08, "loss": 0.3433, "mean_token_accuracy": 0.8848061919212341, "num_input_tokens_seen": 3804233, "num_tokens": 3804233.0, "step": 4865, "train_runtime": 1374.0963, "train_tokens_per_second": 2768.534 }, { "epoch": 0.974, "grad_norm": 2.625, "learning_rate": 3.598058794320402e-08, "loss": 0.1661, "mean_token_accuracy": 0.9452144384384156, "num_input_tokens_seen": 3809110, "num_tokens": 3809110.0, "step": 4870, "train_runtime": 1375.5158, "train_tokens_per_second": 2769.223 }, { "epoch": 0.975, "grad_norm": 8.5, "learning_rate": 3.3287890163523626e-08, "loss": 0.1777, "mean_token_accuracy": 0.9447686910629273, "num_input_tokens_seen": 3812215, "num_tokens": 3812215.0, "step": 4875, "train_runtime": 1376.8515, "train_tokens_per_second": 2768.792 }, { "epoch": 0.976, "grad_norm": 2.90625, "learning_rate": 3.0699738477708576e-08, "loss": 0.2827, "mean_token_accuracy": 0.9084138751029969, "num_input_tokens_seen": 3816532, "num_tokens": 3816532.0, "step": 4880, "train_runtime": 1378.23, "train_tokens_per_second": 2769.155 }, { "epoch": 0.977, "grad_norm": 3.890625, "learning_rate": 2.8216160034244544e-08, "loss": 0.3084, "mean_token_accuracy": 0.8940139293670655, "num_input_tokens_seen": 3820711, "num_tokens": 3820711.0, "step": 4885, "train_runtime": 1379.6114, "train_tokens_per_second": 2769.411 }, { "epoch": 0.978, "grad_norm": 3.578125, "learning_rate": 2.583718088469689e-08, "loss": 0.359, "mean_token_accuracy": 0.8913275122642517, "num_input_tokens_seen": 3826262, "num_tokens": 3826262.0, "step": 4890, "train_runtime": 1381.0397, "train_tokens_per_second": 2770.566 }, { "epoch": 0.979, "grad_norm": 4.3125, "learning_rate": 2.3562825983427517e-08, "loss": 0.4303, "mean_token_accuracy": 0.8677053451538086, "num_input_tokens_seen": 3831138, "num_tokens": 3831138.0, "step": 4895, "train_runtime": 1382.5155, "train_tokens_per_second": 2771.136 }, { "epoch": 0.98, "grad_norm": 6.65625, "learning_rate": 2.1393119187345103e-08, "loss": 0.2842, "mean_token_accuracy": 0.9086182832717895, "num_input_tokens_seen": 3834940, "num_tokens": 3834940.0, "step": 4900, "train_runtime": 1383.9095, "train_tokens_per_second": 2771.092 }, { "epoch": 0.981, "grad_norm": 3.859375, "learning_rate": 1.93280832556475e-08, "loss": 0.3515, "mean_token_accuracy": 0.8928469777107239, "num_input_tokens_seen": 3839433, "num_tokens": 3839433.0, "step": 4905, "train_runtime": 1385.2642, "train_tokens_per_second": 2771.625 }, { "epoch": 0.982, "grad_norm": 13.1875, "learning_rate": 1.7367739849584174e-08, "loss": 0.383, "mean_token_accuracy": 0.8779733657836915, "num_input_tokens_seen": 3844366, "num_tokens": 3844366.0, "step": 4910, "train_runtime": 1387.3354, "train_tokens_per_second": 2771.043 }, { "epoch": 0.983, "grad_norm": 1.8125, "learning_rate": 1.5512109532229703e-08, "loss": 0.2245, "mean_token_accuracy": 0.9237821459770202, "num_input_tokens_seen": 3847635, "num_tokens": 3847635.0, "step": 4915, "train_runtime": 1388.6822, "train_tokens_per_second": 2770.71 }, { "epoch": 0.984, "grad_norm": 3.078125, "learning_rate": 1.376121176826728e-08, "loss": 0.2092, "mean_token_accuracy": 0.927723503112793, "num_input_tokens_seen": 3852818, "num_tokens": 3852818.0, "step": 4920, "train_runtime": 1390.1046, "train_tokens_per_second": 2771.603 }, { "epoch": 0.985, "grad_norm": 22.375, "learning_rate": 1.2115064923787778e-08, "loss": 0.4569, "mean_token_accuracy": 0.8628705501556396, "num_input_tokens_seen": 3857131, "num_tokens": 3857131.0, "step": 4925, "train_runtime": 1391.4966, "train_tokens_per_second": 2771.93 }, { "epoch": 0.986, "grad_norm": 6.4375, "learning_rate": 1.057368626609101e-08, "loss": 0.4318, "mean_token_accuracy": 0.8611314177513123, "num_input_tokens_seen": 3861248, "num_tokens": 3861248.0, "step": 4930, "train_runtime": 1392.9115, "train_tokens_per_second": 2772.07 }, { "epoch": 0.987, "grad_norm": 5.71875, "learning_rate": 9.137091963510314e-09, "loss": 0.4943, "mean_token_accuracy": 0.8540239095687866, "num_input_tokens_seen": 3864018, "num_tokens": 3864018.0, "step": 4935, "train_runtime": 1394.2017, "train_tokens_per_second": 2771.491 }, { "epoch": 0.988, "grad_norm": 9.75, "learning_rate": 7.80529708523936e-09, "loss": 0.3102, "mean_token_accuracy": 0.9093350291252136, "num_input_tokens_seen": 3868492, "num_tokens": 3868492.0, "step": 4940, "train_runtime": 1395.615, "train_tokens_per_second": 2771.89 }, { "epoch": 0.989, "grad_norm": 9.625, "learning_rate": 6.5783156011778315e-09, "loss": 0.4137, "mean_token_accuracy": 0.8717213630676269, "num_input_tokens_seen": 3872020, "num_tokens": 3872020.0, "step": 4945, "train_runtime": 1396.9653, "train_tokens_per_second": 2771.737 }, { "epoch": 0.99, "grad_norm": 8.4375, "learning_rate": 5.456160381779319e-09, "loss": 0.3845, "mean_token_accuracy": 0.8815066099166871, "num_input_tokens_seen": 3876594, "num_tokens": 3876594.0, "step": 4950, "train_runtime": 1398.3941, "train_tokens_per_second": 2772.176 }, { "epoch": 0.991, "grad_norm": 3.796875, "learning_rate": 4.438843197922538e-09, "loss": 0.3931, "mean_token_accuracy": 0.8743783950805664, "num_input_tokens_seen": 3881378, "num_tokens": 3881378.0, "step": 4955, "train_runtime": 1399.8253, "train_tokens_per_second": 2772.759 }, { "epoch": 0.992, "grad_norm": 6.96875, "learning_rate": 3.526374720782544e-09, "loss": 0.2961, "mean_token_accuracy": 0.9024595260620117, "num_input_tokens_seen": 3883846, "num_tokens": 3883846.0, "step": 4960, "train_runtime": 1401.1084, "train_tokens_per_second": 2771.981 }, { "epoch": 0.993, "grad_norm": 4.46875, "learning_rate": 2.7187645217219283e-09, "loss": 0.368, "mean_token_accuracy": 0.8791983842849731, "num_input_tokens_seen": 3888084, "num_tokens": 3888084.0, "step": 4965, "train_runtime": 1402.5101, "train_tokens_per_second": 2772.233 }, { "epoch": 0.994, "grad_norm": 6.6875, "learning_rate": 2.0160210721886788e-09, "loss": 0.287, "mean_token_accuracy": 0.9243991255760193, "num_input_tokens_seen": 3893206, "num_tokens": 3893206.0, "step": 4970, "train_runtime": 1403.9233, "train_tokens_per_second": 2773.09 }, { "epoch": 0.995, "grad_norm": 4.75, "learning_rate": 1.4181517436306913e-09, "loss": 0.1955, "mean_token_accuracy": 0.929287564754486, "num_input_tokens_seen": 3898484, "num_tokens": 3898484.0, "step": 4975, "train_runtime": 1405.3792, "train_tokens_per_second": 2773.973 }, { "epoch": 0.996, "grad_norm": 3.796875, "learning_rate": 9.251628074136154e-10, "loss": 0.1841, "mean_token_accuracy": 0.9349937677383423, "num_input_tokens_seen": 3903312, "num_tokens": 3903312.0, "step": 4980, "train_runtime": 1406.8081, "train_tokens_per_second": 2774.587 }, { "epoch": 0.997, "grad_norm": 18.25, "learning_rate": 5.370594347575697e-10, "loss": 0.3325, "mean_token_accuracy": 0.8981282353401184, "num_input_tokens_seen": 3907772, "num_tokens": 3907772.0, "step": 4985, "train_runtime": 1408.1649, "train_tokens_per_second": 2775.081 }, { "epoch": 0.998, "grad_norm": 6.71875, "learning_rate": 2.538456966838521e-10, "loss": 0.398, "mean_token_accuracy": 0.8777669787406921, "num_input_tokens_seen": 3911352, "num_tokens": 3911352.0, "step": 4990, "train_runtime": 1409.5005, "train_tokens_per_second": 2774.992 }, { "epoch": 0.999, "grad_norm": 8.8125, "learning_rate": 7.552456397053042e-11, "loss": 0.344, "mean_token_accuracy": 0.8929438829421997, "num_input_tokens_seen": 3914579, "num_tokens": 3914579.0, "step": 4995, "train_runtime": 1410.7981, "train_tokens_per_second": 2774.727 }, { "epoch": 1.0, "grad_norm": 5.21875, "learning_rate": 2.0979071224669357e-12, "loss": 0.411, "mean_token_accuracy": 0.882614254951477, "num_input_tokens_seen": 3918726, "num_tokens": 3918726.0, "step": 5000, "train_runtime": 1412.2173, "train_tokens_per_second": 2774.875 }, { "epoch": 1.0, "num_input_tokens_seen": 3918726, "step": 5000, "total_flos": 1.8215420210362368e+17, "train_loss": 0.36460426968336107, "train_runtime": 1412.4023, "train_samples_per_second": 3.54, "train_steps_per_second": 3.54, "train_tokens_per_second": 5607.003 } ], "logging_steps": 5, "max_steps": 5000, "num_input_tokens_seen": 3918726, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8215420210362368e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }