diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,10171 +1,11975 @@ { - "best_global_step": 4300, - "best_metric": 41.374914604150135, - "best_model_checkpoint": "./output/string-repetition-tiny/checkpoint-2500", - "epoch": 1.8390307226308957, + "best_global_step": 6100, + "best_metric": 98.58235971529302, + "best_model_checkpoint": "./output/string-repetition-tiny/checkpoint-3500", + "epoch": 2.163565556036348, "eval_steps": 100, - "global_step": 8500, + "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, - "eval_loss": 5.421990871429443, - "eval_runtime": 48.4066, - "eval_samples_per_second": 0.661, - "eval_steps_per_second": 0.021, + "eval_loss": 4.988397121429443, + "eval_runtime": 2.0878, + "eval_samples_per_second": 15.327, + "eval_steps_per_second": 0.479, "num_input_tokens_seen": 0, "step": 0 }, { "epoch": 0, - "eval_byte_accuracy": 0.00556379821958457, - "eval_chrf": 0.23324520427624654, - "eval_sacrebleu": 0.051034814527341744, + "eval_byte_accuracy": 0.29080118694362017, + "eval_chrf": 1.9898598915486052, + "eval_sacrebleu": 0.027097937214200497, "eval_word_accuracy": 0.0, "num_input_tokens_seen": 0, - "perplexity": 226.3292667344946, + "perplexity": 146.70109090640656, "step": 0 }, { "epoch": 0.0021635655560363477, - "grad_norm": 4.25, - "learning_rate": 0.00029729999999999996, - "loss": 3.9446, + "grad_norm": 18.625, + "learning_rate": 1.0799999999999998e-05, + "loss": 4.9798, "num_input_tokens_seen": 655360, "step": 10, - "train_runtime": 53.3476, - "train_tokens_per_second": 12284.72 + "train_runtime": 7.4881, + "train_tokens_per_second": 87520.259 }, { "epoch": 0.004327131112072695, - "grad_norm": 2.890625, - "learning_rate": 0.00029429999999999994, - "loss": 2.6375, + "grad_norm": 16.5, + "learning_rate": 2.28e-05, + "loss": 4.7276, "num_input_tokens_seen": 1310720, "step": 20, - "train_runtime": 56.802, - "train_tokens_per_second": 23075.253 + "train_runtime": 11.2856, + "train_tokens_per_second": 116140.595 }, { "epoch": 0.006490696668109044, - "grad_norm": 2.3125, - "learning_rate": 0.0002913, - "loss": 2.0097, + "grad_norm": 16.375, + "learning_rate": 3.48e-05, + "loss": 4.0073, "num_input_tokens_seen": 1966080, "step": 30, - "train_runtime": 60.2174, - "train_tokens_per_second": 32649.681 + "train_runtime": 15.0717, + "train_tokens_per_second": 130448.408 }, { "epoch": 0.00865426222414539, - "grad_norm": 3.78125, - "learning_rate": 0.00028829999999999996, - "loss": 1.7175, + "grad_norm": 10.8125, + "learning_rate": 4.68e-05, + "loss": 2.5584, "num_input_tokens_seen": 2621440, "step": 40, - "train_runtime": 63.6481, - "train_tokens_per_second": 41186.473 + "train_runtime": 18.8656, + "train_tokens_per_second": 138953.652 }, { "epoch": 0.010817827780181739, - "grad_norm": 1.8046875, - "learning_rate": 0.00028529999999999994, - "loss": 1.4755, + "grad_norm": 13.0625, + "learning_rate": 5.88e-05, + "loss": 1.5522, "num_input_tokens_seen": 3276800, "step": 50, - "train_runtime": 67.154, - "train_tokens_per_second": 48795.295 + "train_runtime": 22.6701, + "train_tokens_per_second": 144542.984 }, { "epoch": 0.012981393336218087, - "grad_norm": 3.015625, - "learning_rate": 0.0002823, - "loss": 1.35, + "grad_norm": 10.0, + "learning_rate": 7.079999999999999e-05, + "loss": 1.2928, "num_input_tokens_seen": 3932160, "step": 60, - "train_runtime": 70.6232, - "train_tokens_per_second": 55678.034 + "train_runtime": 26.4611, + "train_tokens_per_second": 148601.312 }, { "epoch": 0.015144958892254435, - "grad_norm": 1.6015625, - "learning_rate": 0.0002793, - "loss": 1.2561, + "grad_norm": 11.5625, + "learning_rate": 8.28e-05, + "loss": 1.1674, "num_input_tokens_seen": 4587520, "step": 70, - "train_runtime": 74.0517, - "train_tokens_per_second": 61950.218 + "train_runtime": 30.2488, + "train_tokens_per_second": 151659.614 }, { "epoch": 0.01730852444829078, - "grad_norm": 2.53125, - "learning_rate": 0.0002763, - "loss": 1.1825, + "grad_norm": 9.375, + "learning_rate": 9.479999999999999e-05, + "loss": 1.0741, "num_input_tokens_seen": 5242880, "step": 80, - "train_runtime": 77.5175, - "train_tokens_per_second": 67634.8 + "train_runtime": 34.0502, + "train_tokens_per_second": 153975.057 }, { "epoch": 0.01947209000432713, - "grad_norm": 1.609375, - "learning_rate": 0.0002733, - "loss": 1.1201, + "grad_norm": 26.375, + "learning_rate": 0.00010679999999999998, + "loss": 1.0159, "num_input_tokens_seen": 5898240, "step": 90, - "train_runtime": 80.9279, - "train_tokens_per_second": 72882.663 + "train_runtime": 37.8468, + "train_tokens_per_second": 155845.247 }, { "epoch": 0.021635655560363478, - "grad_norm": 1.6796875, - "learning_rate": 0.00027029999999999996, - "loss": 1.0553, + "grad_norm": 3.5625, + "learning_rate": 0.0001188, + "loss": 0.9462, "num_input_tokens_seen": 6553600, "step": 100, - "train_runtime": 84.3638, - "train_tokens_per_second": 77682.564 + "train_runtime": 41.6439, + "train_tokens_per_second": 157372.435 }, { "epoch": 0.021635655560363478, - "eval_loss": 1.2029575109481812, - "eval_runtime": 22.7471, - "eval_samples_per_second": 1.407, - "eval_steps_per_second": 0.044, + "eval_loss": 0.9727591276168823, + "eval_runtime": 0.0828, + "eval_samples_per_second": 386.585, + "eval_steps_per_second": 12.081, "num_input_tokens_seen": 6553600, "step": 100 }, { "epoch": 0.021635655560363478, - "eval_byte_accuracy": 0.6672848664688428, - "eval_chrf": 4.816504308156234, - "eval_sacrebleu": 0.09253840735529666, - "eval_word_accuracy": 0.6195054945054945, + "eval_byte_accuracy": 0.7091988130563798, + "eval_chrf": 0.0, + "eval_sacrebleu": 0.0, + "eval_word_accuracy": 0.5659340659340659, "num_input_tokens_seen": 6553600, - "perplexity": 3.3299507395272667, + "perplexity": 2.6452329350572574, "step": 100 }, { "epoch": 0.023799221116399826, - "grad_norm": 1.7734375, - "learning_rate": 0.0002673, - "loss": 1.0688, + "grad_norm": 2.28125, + "learning_rate": 0.00013079999999999998, + "loss": 0.9063, "num_input_tokens_seen": 7204864, "step": 110, - "train_runtime": 110.5539, - "train_tokens_per_second": 65170.578 + "train_runtime": 45.5286, + "train_tokens_per_second": 158249.227 }, { "epoch": 0.025962786672436174, - "grad_norm": 1.4609375, - "learning_rate": 0.0002643, - "loss": 1.0217, + "grad_norm": 1.6640625, + "learning_rate": 0.00014279999999999997, + "loss": 0.8488, "num_input_tokens_seen": 7860224, "step": 120, - "train_runtime": 113.9776, - "train_tokens_per_second": 68962.871 + "train_runtime": 49.3213, + "train_tokens_per_second": 159367.885 }, { "epoch": 0.028126352228472522, - "grad_norm": 3.21875, - "learning_rate": 0.00026129999999999995, - "loss": 1.0099, + "grad_norm": 3.40625, + "learning_rate": 0.0001548, + "loss": 0.8492, "num_input_tokens_seen": 8515584, "step": 130, - "train_runtime": 117.3679, - "train_tokens_per_second": 72554.63 + "train_runtime": 53.131, + "train_tokens_per_second": 160275.187 }, { "epoch": 0.03028991778450887, - "grad_norm": 2.171875, - "learning_rate": 0.0002583, - "loss": 0.9889, + "grad_norm": 2.90625, + "learning_rate": 0.0001668, + "loss": 0.8508, "num_input_tokens_seen": 9170944, "step": 140, - "train_runtime": 120.7467, - "train_tokens_per_second": 75951.932 + "train_runtime": 56.9152, + "train_tokens_per_second": 161133.514 }, { "epoch": 0.032453483340545215, - "grad_norm": 3.3125, - "learning_rate": 0.00025529999999999997, - "loss": 0.9708, + "grad_norm": 1.03125, + "learning_rate": 0.00017879999999999998, + "loss": 0.8418, "num_input_tokens_seen": 9826304, "step": 150, - "train_runtime": 124.1577, - "train_tokens_per_second": 79143.736 + "train_runtime": 60.7146, + "train_tokens_per_second": 161844.175 }, { "epoch": 0.03461704889658156, - "grad_norm": 2.109375, - "learning_rate": 0.00025229999999999995, - "loss": 0.9574, + "grad_norm": 9.8125, + "learning_rate": 0.00019079999999999998, + "loss": 0.8424, "num_input_tokens_seen": 10481664, "step": 160, - "train_runtime": 127.6363, - "train_tokens_per_second": 82121.315 + "train_runtime": 64.5125, + "train_tokens_per_second": 162474.904 }, { "epoch": 0.03678061445261791, - "grad_norm": 1.7734375, - "learning_rate": 0.0002493, - "loss": 0.9417, + "grad_norm": 2.125, + "learning_rate": 0.0002028, + "loss": 0.8222, "num_input_tokens_seen": 11137024, "step": 170, - "train_runtime": 131.1407, - "train_tokens_per_second": 84924.235 + "train_runtime": 68.3125, + "train_tokens_per_second": 163030.526 }, { "epoch": 0.03894418000865426, - "grad_norm": 3.4375, - "learning_rate": 0.00024629999999999997, - "loss": 0.9035, + "grad_norm": 2.71875, + "learning_rate": 0.00021479999999999996, + "loss": 0.7886, "num_input_tokens_seen": 11792384, "step": 180, - "train_runtime": 134.5838, - "train_tokens_per_second": 87621.102 + "train_runtime": 72.1123, + "train_tokens_per_second": 163528.046 }, { "epoch": 0.04110774556469061, - "grad_norm": 2.296875, - "learning_rate": 0.0002433, - "loss": 0.8973, + "grad_norm": 0.76171875, + "learning_rate": 0.00022679999999999998, + "loss": 0.7946, "num_input_tokens_seen": 12447744, "step": 190, - "train_runtime": 138.0405, - "train_tokens_per_second": 90174.553 + "train_runtime": 75.9133, + "train_tokens_per_second": 163973.238 }, { "epoch": 0.043271311120726956, - "grad_norm": 2.03125, - "learning_rate": 0.00024029999999999999, - "loss": 0.8891, + "grad_norm": 1.28125, + "learning_rate": 0.0002388, + "loss": 0.7727, "num_input_tokens_seen": 13103104, "step": 200, - "train_runtime": 141.4844, - "train_tokens_per_second": 92611.638 + "train_runtime": 79.7148, + "train_tokens_per_second": 164374.865 }, { "epoch": 0.043271311120726956, - "eval_loss": 0.9748592972755432, - "eval_runtime": 22.0001, - "eval_samples_per_second": 1.455, - "eval_steps_per_second": 0.045, + "eval_loss": 0.8417919874191284, + "eval_runtime": 1.5809, + "eval_samples_per_second": 20.242, + "eval_steps_per_second": 0.633, "num_input_tokens_seen": 13103104, "step": 200 }, { "epoch": 0.043271311120726956, - "eval_byte_accuracy": 0.7206973293768546, - "eval_chrf": 8.703590487535086, - "eval_sacrebleu": 0.2699079660103599, - "eval_word_accuracy": 0.6510989010989011, + "eval_byte_accuracy": 0.7440652818991098, + "eval_chrf": 6.6820302805599665, + "eval_sacrebleu": 0.4581175341354607, + "eval_word_accuracy": 0.6263736263736264, "num_input_tokens_seen": 13103104, - "perplexity": 2.650794210774719, + "perplexity": 2.320521598664858, "step": 200 }, { "epoch": 0.045434876676763304, - "grad_norm": 2.6875, - "learning_rate": 0.0002373, - "loss": 0.8641, + "grad_norm": 3.46875, + "learning_rate": 0.00025079999999999997, + "loss": 0.7692, "num_input_tokens_seen": 13758464, "step": 210, - "train_runtime": 166.9485, - "train_tokens_per_second": 82411.44 + "train_runtime": 85.1122, + "train_tokens_per_second": 161650.863 }, { "epoch": 0.04759844223279965, - "grad_norm": 2.578125, - "learning_rate": 0.00023429999999999998, - "loss": 0.8039, + "grad_norm": 0.90234375, + "learning_rate": 0.0002628, + "loss": 0.7411, "num_input_tokens_seen": 14413824, "step": 220, - "train_runtime": 170.4084, - "train_tokens_per_second": 84583.985 + "train_runtime": 88.9079, + "train_tokens_per_second": 162120.908 }, { "epoch": 0.049762007788836, - "grad_norm": 2.59375, - "learning_rate": 0.00023129999999999998, - "loss": 0.8067, + "grad_norm": 0.7109375, + "learning_rate": 0.0002748, + "loss": 0.7572, "num_input_tokens_seen": 15069184, "step": 230, - "train_runtime": 173.8297, - "train_tokens_per_second": 86689.358 + "train_runtime": 92.7125, + "train_tokens_per_second": 162536.706 }, { "epoch": 0.05192557334487235, - "grad_norm": 3.203125, - "learning_rate": 0.0002283, - "loss": 0.7948, + "grad_norm": 1.125, + "learning_rate": 0.0002868, + "loss": 0.7636, "num_input_tokens_seen": 15724544, "step": 240, - "train_runtime": 177.2908, - "train_tokens_per_second": 88693.527 + "train_runtime": 96.5159, + "train_tokens_per_second": 162921.839 }, { "epoch": 0.0540891389009087, - "grad_norm": 4.375, - "learning_rate": 0.00022529999999999997, - "loss": 0.7461, + "grad_norm": 4.34375, + "learning_rate": 0.0002988, + "loss": 0.7582, "num_input_tokens_seen": 16379904, "step": 250, - "train_runtime": 180.7162, - "train_tokens_per_second": 90638.818 + "train_runtime": 100.3175, + "train_tokens_per_second": 163280.687 }, { "epoch": 0.056252704456945045, - "grad_norm": 5.625, - "learning_rate": 0.00022229999999999998, - "loss": 0.6889, + "grad_norm": 1.203125, + "learning_rate": 0.00031079999999999997, + "loss": 0.7522, "num_input_tokens_seen": 17035264, "step": 260, - "train_runtime": 184.1442, - "train_tokens_per_second": 92510.436 + "train_runtime": 104.1187, + "train_tokens_per_second": 163613.953 }, { "epoch": 0.05841627001298139, - "grad_norm": 4.21875, - "learning_rate": 0.00021929999999999996, - "loss": 0.6434, + "grad_norm": 1.953125, + "learning_rate": 0.0003228, + "loss": 0.749, "num_input_tokens_seen": 17690624, "step": 270, - "train_runtime": 187.6232, - "train_tokens_per_second": 94288.05 + "train_runtime": 107.9263, + "train_tokens_per_second": 163913.881 }, { "epoch": 0.06057983556901774, - "grad_norm": 5.46875, - "learning_rate": 0.00021629999999999997, - "loss": 0.6231, + "grad_norm": 1.5078125, + "learning_rate": 0.0003348, + "loss": 0.7412, "num_input_tokens_seen": 18345984, "step": 280, - "train_runtime": 191.0696, - "train_tokens_per_second": 96017.307 + "train_runtime": 111.7314, + "train_tokens_per_second": 164197.152 }, { "epoch": 0.06274340112505408, - "grad_norm": 3.046875, - "learning_rate": 0.00021329999999999998, - "loss": 0.5308, + "grad_norm": 2.421875, + "learning_rate": 0.0003467999999999999, + "loss": 0.6982, "num_input_tokens_seen": 19001344, "step": 290, - "train_runtime": 194.54, - "train_tokens_per_second": 97673.215 + "train_runtime": 115.5403, + "train_tokens_per_second": 164456.388 }, { "epoch": 0.06490696668109043, - "grad_norm": 5.3125, - "learning_rate": 0.00021029999999999996, - "loss": 0.5001, + "grad_norm": 0.9296875, + "learning_rate": 0.00035879999999999994, + "loss": 0.6894, "num_input_tokens_seen": 19656704, "step": 300, - "train_runtime": 198.007, - "train_tokens_per_second": 99272.781 + "train_runtime": 119.3511, + "train_tokens_per_second": 164696.484 }, { "epoch": 0.06490696668109043, - "eval_loss": 0.5745899677276611, - "eval_runtime": 22.8914, - "eval_samples_per_second": 1.398, - "eval_steps_per_second": 0.044, + "eval_loss": 0.7334468364715576, + "eval_runtime": 1.9947, + "eval_samples_per_second": 16.042, + "eval_steps_per_second": 0.501, "num_input_tokens_seen": 19656704, "step": 300 }, { "epoch": 0.06490696668109043, - "eval_byte_accuracy": 0.8445845697329377, - "eval_chrf": 9.178896775049383, - "eval_sacrebleu": 1.9389595133516544, - "eval_word_accuracy": 0.7774725274725275, + "eval_byte_accuracy": 0.771513353115727, + "eval_chrf": 12.160441031342272, + "eval_sacrebleu": 0.710868857582003, + "eval_word_accuracy": 0.6510989010989011, "num_input_tokens_seen": 19656704, - "perplexity": 1.7764019954167647, + "perplexity": 2.0822454120550398, "step": 300 }, { "epoch": 0.06707053223712678, - "grad_norm": 3.453125, - "learning_rate": 0.00020729999999999997, - "loss": 0.4656, + "grad_norm": 2.640625, + "learning_rate": 0.00037079999999999996, + "loss": 0.6499, "num_input_tokens_seen": 20312064, "step": 310, - "train_runtime": 224.3825, - "train_tokens_per_second": 90524.295 + "train_runtime": 125.165, + "train_tokens_per_second": 162282.302 }, { "epoch": 0.06923409779316313, - "grad_norm": 5.28125, - "learning_rate": 0.0002043, - "loss": 0.4398, + "grad_norm": 3.328125, + "learning_rate": 0.0003828, + "loss": 0.6507, "num_input_tokens_seen": 20967424, "step": 320, - "train_runtime": 227.7856, - "train_tokens_per_second": 92048.963 + "train_runtime": 128.9715, + "train_tokens_per_second": 162574.135 }, { "epoch": 0.07139766334919947, - "grad_norm": 4.03125, - "learning_rate": 0.0002013, - "loss": 0.4109, + "grad_norm": 3.890625, + "learning_rate": 0.0003948, + "loss": 0.6283, "num_input_tokens_seen": 21622784, "step": 330, - "train_runtime": 231.2085, - "train_tokens_per_second": 93520.72 + "train_runtime": 132.7747, + "train_tokens_per_second": 162853.208 }, { "epoch": 0.07356122890523582, - "grad_norm": 6.59375, - "learning_rate": 0.0001983, - "loss": 0.359, + "grad_norm": 2.234375, + "learning_rate": 0.00040679999999999997, + "loss": 0.5841, "num_input_tokens_seen": 22278144, "step": 340, - "train_runtime": 234.6039, - "train_tokens_per_second": 94960.692 + "train_runtime": 136.5774, + "train_tokens_per_second": 163117.394 }, { "epoch": 0.07572479446127217, - "grad_norm": 7.0625, - "learning_rate": 0.00019529999999999998, - "loss": 0.3624, + "grad_norm": 4.53125, + "learning_rate": 0.00041879999999999993, + "loss": 0.5767, "num_input_tokens_seen": 22933504, "step": 350, - "train_runtime": 238.0262, - "train_tokens_per_second": 96348.666 + "train_runtime": 140.3834, + "train_tokens_per_second": 163363.348 }, { "epoch": 0.07788836001730852, - "grad_norm": 3.71875, - "learning_rate": 0.00019229999999999999, - "loss": 0.3229, + "grad_norm": 1.6171875, + "learning_rate": 0.00043079999999999995, + "loss": 0.5292, "num_input_tokens_seen": 23588864, "step": 360, - "train_runtime": 241.4464, - "train_tokens_per_second": 97698.128 + "train_runtime": 144.187, + "train_tokens_per_second": 163599.098 }, { "epoch": 0.08005192557334487, - "grad_norm": 3.4375, - "learning_rate": 0.0001893, - "loss": 0.3278, + "grad_norm": 1.7890625, + "learning_rate": 0.0004428, + "loss": 0.5157, "num_input_tokens_seen": 24244224, "step": 370, - "train_runtime": 244.8476, - "train_tokens_per_second": 99017.601 + "train_runtime": 147.9913, + "train_tokens_per_second": 163821.98 }, { "epoch": 0.08221549112938122, - "grad_norm": 3.15625, - "learning_rate": 0.00018629999999999997, - "loss": 0.3145, + "grad_norm": 2.4375, + "learning_rate": 0.00045479999999999994, + "loss": 0.4898, "num_input_tokens_seen": 24899584, "step": 380, - "train_runtime": 248.2475, - "train_tokens_per_second": 100301.461 + "train_runtime": 151.7906, + "train_tokens_per_second": 164039.076 }, { "epoch": 0.08437905668541756, - "grad_norm": 3.859375, - "learning_rate": 0.00018329999999999998, - "loss": 0.2913, + "grad_norm": 2.78125, + "learning_rate": 0.00046679999999999996, + "loss": 0.4582, "num_input_tokens_seen": 25554944, "step": 390, - "train_runtime": 251.6505, - "train_tokens_per_second": 101549.353 + "train_runtime": 155.5937, + "train_tokens_per_second": 164241.491 }, { "epoch": 0.08654262224145391, - "grad_norm": 3.96875, - "learning_rate": 0.00018029999999999996, - "loss": 0.2765, + "grad_norm": 3.453125, + "learning_rate": 0.0004788, + "loss": 0.431, "num_input_tokens_seen": 26210304, "step": 400, - "train_runtime": 255.1075, - "train_tokens_per_second": 102742.195 + "train_runtime": 159.3994, + "train_tokens_per_second": 164431.67 }, { "epoch": 0.08654262224145391, - "eval_loss": 0.3843859136104584, - "eval_runtime": 21.2135, - "eval_samples_per_second": 1.508, - "eval_steps_per_second": 0.047, + "eval_loss": 0.4691297113895416, + "eval_runtime": 1.7697, + "eval_samples_per_second": 18.082, + "eval_steps_per_second": 0.565, "num_input_tokens_seen": 26210304, "step": 400 }, { "epoch": 0.08654262224145391, - "eval_byte_accuracy": 0.9009643916913946, - "eval_chrf": 16.239382668347154, - "eval_sacrebleu": 8.024979815502066, - "eval_word_accuracy": 0.8516483516483516, + "eval_byte_accuracy": 0.8479228486646885, + "eval_chrf": 25.30382858102524, + "eval_sacrebleu": 6.954022634991616, + "eval_word_accuracy": 0.7348901098901099, "num_input_tokens_seen": 26210304, - "perplexity": 1.468712128329079, + "perplexity": 1.5986023422380802, "step": 400 }, { "epoch": 0.08870618779749026, - "grad_norm": 3.609375, - "learning_rate": 0.00017729999999999997, - "loss": 0.2725, + "grad_norm": 1.328125, + "learning_rate": 0.0004907999999999999, + "loss": 0.4126, "num_input_tokens_seen": 26865664, "step": 410, - "train_runtime": 279.8326, - "train_tokens_per_second": 96006.207 + "train_runtime": 164.9942, + "train_tokens_per_second": 162827.884 }, { "epoch": 0.09086975335352661, - "grad_norm": 3.640625, - "learning_rate": 0.00017429999999999998, - "loss": 0.2587, + "grad_norm": 1.7421875, + "learning_rate": 0.0005028, + "loss": 0.4102, "num_input_tokens_seen": 27521024, "step": 420, - "train_runtime": 283.3448, - "train_tokens_per_second": 97129.088 + "train_runtime": 168.7954, + "train_tokens_per_second": 163043.709 }, { "epoch": 0.09303331890956296, - "grad_norm": 3.09375, - "learning_rate": 0.00017129999999999996, - "loss": 0.2441, + "grad_norm": 2.65625, + "learning_rate": 0.0005147999999999999, + "loss": 0.3765, "num_input_tokens_seen": 28176384, "step": 430, - "train_runtime": 286.8017, - "train_tokens_per_second": 98243.419 + "train_runtime": 172.5853, + "train_tokens_per_second": 163260.64 }, { "epoch": 0.0951968844655993, - "grad_norm": 4.34375, - "learning_rate": 0.0001683, - "loss": 0.2553, + "grad_norm": 5.875, + "learning_rate": 0.0005267999999999999, + "loss": 0.3727, "num_input_tokens_seen": 28831744, "step": 440, - "train_runtime": 290.3024, - "train_tokens_per_second": 99316.256 + "train_runtime": 176.393, + "train_tokens_per_second": 163451.711 }, { "epoch": 0.09736045002163565, - "grad_norm": 4.4375, - "learning_rate": 0.0001653, - "loss": 0.2408, + "grad_norm": 3.4375, + "learning_rate": 0.0005388, + "loss": 0.3512, "num_input_tokens_seen": 29487104, "step": 450, - "train_runtime": 293.8253, - "train_tokens_per_second": 100355.9 + "train_runtime": 180.1886, + "train_tokens_per_second": 163645.825 }, { "epoch": 0.099524015577672, - "grad_norm": 2.53125, - "learning_rate": 0.0001623, - "loss": 0.2199, + "grad_norm": 2.515625, + "learning_rate": 0.0005507999999999999, + "loss": 0.3391, "num_input_tokens_seen": 30142464, "step": 460, - "train_runtime": 297.287, - "train_tokens_per_second": 101391.806 + "train_runtime": 184.0009, + "train_tokens_per_second": 163816.961 }, { "epoch": 0.10168758113370835, - "grad_norm": 3.65625, - "learning_rate": 0.0001593, - "loss": 0.219, + "grad_norm": 1.921875, + "learning_rate": 0.0005627999999999999, + "loss": 0.3332, "num_input_tokens_seen": 30797824, "step": 470, - "train_runtime": 300.7519, - "train_tokens_per_second": 102402.744 + "train_runtime": 187.7962, + "train_tokens_per_second": 163995.993 }, { "epoch": 0.1038511466897447, - "grad_norm": 3.34375, - "learning_rate": 0.0001563, - "loss": 0.2167, + "grad_norm": 4.65625, + "learning_rate": 0.0005747999999999999, + "loss": 0.3333, "num_input_tokens_seen": 31453184, "step": 480, - "train_runtime": 304.211, - "train_tokens_per_second": 103392.663 + "train_runtime": 191.5972, + "train_tokens_per_second": 164163.059 }, { "epoch": 0.10601471224578105, - "grad_norm": 3.65625, - "learning_rate": 0.00015329999999999999, - "loss": 0.2102, + "grad_norm": 2.234375, + "learning_rate": 0.0005868, + "loss": 0.311, "num_input_tokens_seen": 32108544, "step": 490, - "train_runtime": 307.6045, - "train_tokens_per_second": 104382.57 + "train_runtime": 195.3941, + "train_tokens_per_second": 164327.106 }, { "epoch": 0.1081782778018174, - "grad_norm": 4.15625, - "learning_rate": 0.0001503, - "loss": 0.1954, + "grad_norm": 3.078125, + "learning_rate": 0.0005987999999999999, + "loss": 0.2887, "num_input_tokens_seen": 32763904, "step": 500, - "train_runtime": 311.009, - "train_tokens_per_second": 105347.123 + "train_runtime": 199.2107, + "train_tokens_per_second": 164468.582 + }, + { + "epoch": 0.1081782778018174, + "eval_loss": 0.31848961114883423, + "eval_runtime": 1.8473, + "eval_samples_per_second": 17.322, + "eval_steps_per_second": 0.541, + "num_input_tokens_seen": 32763904, + "step": 500 + }, + { + "epoch": 0.1081782778018174, + "eval_byte_accuracy": 0.9031899109792285, + "eval_chrf": 43.27469201356104, + "eval_sacrebleu": 23.601674532788458, + "eval_word_accuracy": 0.8255494505494505, + "num_input_tokens_seen": 32763904, + "perplexity": 1.375049335926744, + "step": 500 }, { "epoch": 0.1081782778018174, - "eval_loss": 0.27605682611465454, - "eval_runtime": 20.0871, - "eval_samples_per_second": 1.593, - "eval_steps_per_second": 0.05, + "eval_loss": 0.41804632544517517, + "eval_runtime": 2.9141, + "eval_samples_per_second": 10.981, + "eval_steps_per_second": 0.343, "num_input_tokens_seen": 32763904, "step": 500 }, { "epoch": 0.1081782778018174, - "eval_byte_accuracy": 0.9313798219584569, - "eval_chrf": 21.222450284196043, - "eval_sacrebleu": 10.631200397266909, - "eval_word_accuracy": 0.8832417582417582, + "eval_byte_accuracy": 0.8816765578635015, + "eval_chrf": 33.679806731632475, + "eval_sacrebleu": 17.00545584726125, + "eval_word_accuracy": 0.7912087912087912, "num_input_tokens_seen": 32763904, - "perplexity": 1.317922754328964, + "perplexity": 1.51899104070853, "step": 500 }, { "epoch": 0.11034184335785374, - "grad_norm": 3.84375, - "learning_rate": 0.00014729999999999998, - "loss": 0.1989, + "grad_norm": 3.140625, + "learning_rate": 0.0005994315789473684, + "loss": 0.358, "num_input_tokens_seen": 33419264, "step": 510, - "train_runtime": 334.6759, - "train_tokens_per_second": 99855.599 + "train_runtime": 8.3489, + "train_tokens_per_second": 4002858.123 }, { "epoch": 0.11250540891389009, - "grad_norm": 4.5, - "learning_rate": 0.00014429999999999998, - "loss": 0.1815, + "grad_norm": 1.5859375, + "learning_rate": 0.0005987999999999999, + "loss": 0.3043, "num_input_tokens_seen": 34074624, "step": 520, - "train_runtime": 338.187, - "train_tokens_per_second": 100756.748 + "train_runtime": 12.1656, + "train_tokens_per_second": 2800901.822 }, { "epoch": 0.11466897446992644, - "grad_norm": 3.25, - "learning_rate": 0.0001413, - "loss": 0.174, + "grad_norm": 3.078125, + "learning_rate": 0.0005981684210526315, + "loss": 0.3086, "num_input_tokens_seen": 34729984, "step": 530, - "train_runtime": 341.6188, - "train_tokens_per_second": 101662.981 + "train_runtime": 15.9521, + "train_tokens_per_second": 2177143.961 }, { "epoch": 0.11683254002596279, - "grad_norm": 3.34375, - "learning_rate": 0.0001383, - "loss": 0.1971, + "grad_norm": 3.78125, + "learning_rate": 0.0005975368421052631, + "loss": 0.2929, "num_input_tokens_seen": 35385344, "step": 540, - "train_runtime": 345.0306, - "train_tokens_per_second": 102557.126 + "train_runtime": 19.7607, + "train_tokens_per_second": 1790696.752 }, { "epoch": 0.11899610558199913, - "grad_norm": 2.296875, - "learning_rate": 0.00013529999999999998, - "loss": 0.1836, + "grad_norm": 2.609375, + "learning_rate": 0.0005969052631578947, + "loss": 0.2783, "num_input_tokens_seen": 36040704, "step": 550, - "train_runtime": 348.4866, - "train_tokens_per_second": 103420.618 + "train_runtime": 23.5553, + "train_tokens_per_second": 1530045.774 }, { "epoch": 0.12115967113803548, - "grad_norm": 2.390625, - "learning_rate": 0.0001323, - "loss": 0.1533, + "grad_norm": 1.171875, + "learning_rate": 0.0005962736842105263, + "loss": 0.2455, "num_input_tokens_seen": 36696064, "step": 560, - "train_runtime": 351.9756, - "train_tokens_per_second": 104257.4 + "train_runtime": 27.3702, + "train_tokens_per_second": 1340728.723 }, { "epoch": 0.12332323669407183, - "grad_norm": 3.375, - "learning_rate": 0.0001293, - "loss": 0.1886, + "grad_norm": 2.21875, + "learning_rate": 0.0005956421052631579, + "loss": 0.2591, "num_input_tokens_seen": 37351424, "step": 570, - "train_runtime": 355.4474, - "train_tokens_per_second": 105082.849 + "train_runtime": 31.1633, + "train_tokens_per_second": 1198571.413 }, { "epoch": 0.12548680225010816, - "grad_norm": 3.21875, - "learning_rate": 0.00012629999999999998, - "loss": 0.1742, + "grad_norm": 2.203125, + "learning_rate": 0.0005950105263157894, + "loss": 0.245, "num_input_tokens_seen": 38006784, "step": 580, - "train_runtime": 358.8927, - "train_tokens_per_second": 105900.131 + "train_runtime": 34.9733, + "train_tokens_per_second": 1086737.214 }, { "epoch": 0.12765036780614453, - "grad_norm": 2.421875, - "learning_rate": 0.0001233, - "loss": 0.16, + "grad_norm": 1.453125, + "learning_rate": 0.000594378947368421, + "loss": 0.2374, "num_input_tokens_seen": 38662144, "step": 590, - "train_runtime": 362.374, - "train_tokens_per_second": 106691.283 + "train_runtime": 38.778, + "train_tokens_per_second": 997012.875 }, { "epoch": 0.12981393336218086, - "grad_norm": 3.703125, - "learning_rate": 0.0001203, - "loss": 0.159, + "grad_norm": 1.9140625, + "learning_rate": 0.0005937473684210525, + "loss": 0.2287, "num_input_tokens_seen": 39317504, "step": 600, - "train_runtime": 365.8372, - "train_tokens_per_second": 107472.671 + "train_runtime": 42.5907, + "train_tokens_per_second": 923146.805 }, { "epoch": 0.12981393336218086, - "eval_loss": 0.20776523649692535, - "eval_runtime": 20.9696, - "eval_samples_per_second": 1.526, - "eval_steps_per_second": 0.048, + "eval_loss": 0.21300141513347626, + "eval_runtime": 1.6985, + "eval_samples_per_second": 18.84, + "eval_steps_per_second": 0.589, "num_input_tokens_seen": 39317504, "step": 600 }, { "epoch": 0.12981393336218086, - "eval_byte_accuracy": 0.951780415430267, - "eval_chrf": 23.247478740295417, - "eval_sacrebleu": 10.306722695379342, - "eval_word_accuracy": 0.9052197802197802, + "eval_byte_accuracy": 0.9362017804154302, + "eval_chrf": 58.769769707171605, + "eval_sacrebleu": 45.27561934180984, + "eval_word_accuracy": 0.8736263736263736, "num_input_tokens_seen": 39317504, - "perplexity": 1.2309241595579792, + "perplexity": 1.2373864023092827, "step": 600 }, { "epoch": 0.13197749891821722, - "grad_norm": 3.421875, - "learning_rate": 0.00011729999999999999, - "loss": 0.1489, + "grad_norm": 2.46875, + "learning_rate": 0.0005931157894736842, + "loss": 0.2159, "num_input_tokens_seen": 39972864, "step": 610, - "train_runtime": 390.2326, - "train_tokens_per_second": 102433.431 + "train_runtime": 48.1225, + "train_tokens_per_second": 830647.665 }, { "epoch": 0.13414106447425356, - "grad_norm": 3.515625, - "learning_rate": 0.00011429999999999999, - "loss": 0.1559, + "grad_norm": 1.703125, + "learning_rate": 0.0005924842105263157, + "loss": 0.2167, "num_input_tokens_seen": 40628224, "step": 620, - "train_runtime": 393.6228, - "train_tokens_per_second": 103216.142 + "train_runtime": 51.9415, + "train_tokens_per_second": 782191.417 }, { "epoch": 0.13630463003028992, - "grad_norm": 2.953125, - "learning_rate": 0.0001113, - "loss": 0.1548, + "grad_norm": 3.65625, + "learning_rate": 0.0005918526315789474, + "loss": 0.2096, "num_input_tokens_seen": 41283584, "step": 630, - "train_runtime": 397.0454, - "train_tokens_per_second": 103976.988 + "train_runtime": 55.7509, + "train_tokens_per_second": 740500.215 }, { "epoch": 0.13846819558632625, - "grad_norm": 3.15625, - "learning_rate": 0.00010829999999999999, - "loss": 0.1594, + "grad_norm": 2.421875, + "learning_rate": 0.0005912210526315788, + "loss": 0.2046, "num_input_tokens_seen": 41938944, "step": 640, - "train_runtime": 400.5466, - "train_tokens_per_second": 104704.282 + "train_runtime": 59.5559, + "train_tokens_per_second": 704194.506 }, { "epoch": 0.14063176114236262, - "grad_norm": 3.25, - "learning_rate": 0.00010529999999999998, - "loss": 0.1575, + "grad_norm": 3.15625, + "learning_rate": 0.0005905894736842105, + "loss": 0.2014, "num_input_tokens_seen": 42594304, "step": 650, - "train_runtime": 403.9782, - "train_tokens_per_second": 105437.143 + "train_runtime": 63.3661, + "train_tokens_per_second": 672194.358 }, { "epoch": 0.14279532669839895, - "grad_norm": 2.75, - "learning_rate": 0.00010229999999999999, - "loss": 0.1546, + "grad_norm": 2.34375, + "learning_rate": 0.000589957894736842, + "loss": 0.198, "num_input_tokens_seen": 43249664, "step": 660, - "train_runtime": 407.3829, - "train_tokens_per_second": 106164.664 + "train_runtime": 67.1728, + "train_tokens_per_second": 643856.801 }, { "epoch": 0.1449588922544353, "grad_norm": 2.21875, - "learning_rate": 9.93e-05, - "loss": 0.1469, + "learning_rate": 0.0005893263157894736, + "loss": 0.1932, "num_input_tokens_seen": 43905024, "step": 670, - "train_runtime": 410.8075, - "train_tokens_per_second": 106874.943 + "train_runtime": 70.9897, + "train_tokens_per_second": 618470.148 }, { "epoch": 0.14712245781047165, - "grad_norm": 2.921875, - "learning_rate": 9.63e-05, - "loss": 0.1473, + "grad_norm": 2.984375, + "learning_rate": 0.0005886947368421052, + "loss": 0.1864, "num_input_tokens_seen": 44560384, "step": 680, - "train_runtime": 414.239, - "train_tokens_per_second": 107571.685 + "train_runtime": 74.8037, + "train_tokens_per_second": 595697.625 }, { "epoch": 0.149286023366508, - "grad_norm": 3.015625, - "learning_rate": 9.329999999999999e-05, - "loss": 0.1441, + "grad_norm": 3.0625, + "learning_rate": 0.0005880631578947368, + "loss": 0.1815, "num_input_tokens_seen": 45215744, "step": 690, - "train_runtime": 417.7097, - "train_tokens_per_second": 108246.818 + "train_runtime": 78.6205, + "train_tokens_per_second": 575113.667 }, { "epoch": 0.15144958892254434, - "grad_norm": 3.3125, - "learning_rate": 9.029999999999999e-05, - "loss": 0.1393, + "grad_norm": 2.34375, + "learning_rate": 0.0005874315789473684, + "loss": 0.1814, "num_input_tokens_seen": 45871104, "step": 700, - "train_runtime": 421.2015, - "train_tokens_per_second": 108905.373 + "train_runtime": 82.4441, + "train_tokens_per_second": 556390.233 }, { "epoch": 0.15144958892254434, - "eval_loss": 0.17350037395954132, - "eval_runtime": 21.7487, - "eval_samples_per_second": 1.471, - "eval_steps_per_second": 0.046, + "eval_loss": 0.1680818498134613, + "eval_runtime": 1.3101, + "eval_samples_per_second": 24.426, + "eval_steps_per_second": 0.763, "num_input_tokens_seen": 45871104, "step": 700 }, { "epoch": 0.15144958892254434, - "eval_byte_accuracy": 0.9643916913946587, - "eval_chrf": 26.748052803704724, - "eval_sacrebleu": 13.976459661617579, - "eval_word_accuracy": 0.9340659340659341, + "eval_byte_accuracy": 0.9506676557863502, + "eval_chrf": 67.20800143820422, + "eval_sacrebleu": 55.06406011918448, + "eval_word_accuracy": 0.8997252747252747, "num_input_tokens_seen": 45871104, - "perplexity": 1.1894611315798644, + "perplexity": 1.183033437751315, "step": 700 }, { "epoch": 0.1536131544785807, - "grad_norm": 2.75, - "learning_rate": 8.729999999999998e-05, - "loss": 0.1507, + "grad_norm": 1.921875, + "learning_rate": 0.0005868, + "loss": 0.1903, "num_input_tokens_seen": 46526464, "step": 710, - "train_runtime": 446.365, - "train_tokens_per_second": 104234.123 + "train_runtime": 87.5833, + "train_tokens_per_second": 531225.494 }, { "epoch": 0.15577672003461704, - "grad_norm": 2.140625, - "learning_rate": 8.43e-05, - "loss": 0.1402, + "grad_norm": 2.265625, + "learning_rate": 0.0005861684210526315, + "loss": 0.175, "num_input_tokens_seen": 47181824, "step": 720, - "train_runtime": 449.8481, - "train_tokens_per_second": 104883.896 + "train_runtime": 91.4013, + "train_tokens_per_second": 516205.405 }, { "epoch": 0.1579402855906534, - "grad_norm": 2.375, - "learning_rate": 8.13e-05, - "loss": 0.1353, + "grad_norm": 2.4375, + "learning_rate": 0.0005855368421052631, + "loss": 0.1664, "num_input_tokens_seen": 47837184, "step": 730, - "train_runtime": 453.308, - "train_tokens_per_second": 105529.095 + "train_runtime": 95.225, + "train_tokens_per_second": 502359.361 }, { "epoch": 0.16010385114668974, - "grad_norm": 2.609375, - "learning_rate": 7.829999999999999e-05, - "loss": 0.1384, + "grad_norm": 3.46875, + "learning_rate": 0.0005849052631578946, + "loss": 0.1707, "num_input_tokens_seen": 48492544, "step": 740, - "train_runtime": 456.7381, - "train_tokens_per_second": 106171.456 + "train_runtime": 99.0437, + "train_tokens_per_second": 489607.502 }, { "epoch": 0.1622674167027261, - "grad_norm": 2.734375, - "learning_rate": 7.529999999999999e-05, - "loss": 0.1405, + "grad_norm": 2.296875, + "learning_rate": 0.0005842736842105263, + "loss": 0.1741, "num_input_tokens_seen": 49147904, "step": 750, - "train_runtime": 460.1649, - "train_tokens_per_second": 106804.984 + "train_runtime": 102.8647, + "train_tokens_per_second": 477791.681 }, { "epoch": 0.16443098225876243, - "grad_norm": 2.40625, - "learning_rate": 7.23e-05, - "loss": 0.1257, + "grad_norm": 4.125, + "learning_rate": 0.0005836421052631578, + "loss": 0.1601, "num_input_tokens_seen": 49803264, "step": 760, - "train_runtime": 463.5927, - "train_tokens_per_second": 107428.92 + "train_runtime": 106.6841, + "train_tokens_per_second": 466829.3 }, { "epoch": 0.1665945478147988, - "grad_norm": 2.09375, - "learning_rate": 6.93e-05, - "loss": 0.137, + "grad_norm": 1.484375, + "learning_rate": 0.0005830105263157895, + "loss": 0.1658, "num_input_tokens_seen": 50458624, "step": 770, - "train_runtime": 467.0576, - "train_tokens_per_second": 108035.112 + "train_runtime": 110.5067, + "train_tokens_per_second": 456611.574 }, { "epoch": 0.16875811337083513, - "grad_norm": 1.6953125, - "learning_rate": 6.63e-05, - "loss": 0.125, + "grad_norm": 1.40625, + "learning_rate": 0.000582378947368421, + "loss": 0.1514, "num_input_tokens_seen": 51113984, "step": 780, - "train_runtime": 470.5498, - "train_tokens_per_second": 108626.089 + "train_runtime": 114.331, + "train_tokens_per_second": 447070.255 }, { "epoch": 0.1709216789268715, - "grad_norm": 2.25, - "learning_rate": 6.33e-05, - "loss": 0.125, + "grad_norm": 1.875, + "learning_rate": 0.0005817473684210526, + "loss": 0.1582, "num_input_tokens_seen": 51769344, "step": 790, - "train_runtime": 474.3406, - "train_tokens_per_second": 109139.601 + "train_runtime": 118.1477, + "train_tokens_per_second": 438174.843 }, { "epoch": 0.17308524448290782, - "grad_norm": 1.9296875, - "learning_rate": 6.0299999999999995e-05, - "loss": 0.119, + "grad_norm": 1.546875, + "learning_rate": 0.0005811157894736841, + "loss": 0.1502, "num_input_tokens_seen": 52424704, "step": 800, - "train_runtime": 477.7448, - "train_tokens_per_second": 109733.707 + "train_runtime": 121.9652, + "train_tokens_per_second": 429833.159 }, { "epoch": 0.17308524448290782, - "eval_loss": 0.15284772217273712, - "eval_runtime": 21.1783, - "eval_samples_per_second": 1.511, - "eval_steps_per_second": 0.047, + "eval_loss": 0.1423906683921814, + "eval_runtime": 1.704, + "eval_samples_per_second": 18.779, + "eval_steps_per_second": 0.587, "num_input_tokens_seen": 52424704, "step": 800 }, { "epoch": 0.17308524448290782, - "eval_byte_accuracy": 0.9666172106824926, - "eval_chrf": 26.118122824864205, - "eval_sacrebleu": 13.014692647134785, - "eval_word_accuracy": 0.9354395604395604, + "eval_byte_accuracy": 0.9577151335311572, + "eval_chrf": 68.88294130098063, + "eval_sacrebleu": 58.6583351753512, + "eval_word_accuracy": 0.9120879120879121, "num_input_tokens_seen": 52424704, - "perplexity": 1.165147539297294, + "perplexity": 1.1530270117687667, "step": 800 }, { "epoch": 0.17524881003894419, - "grad_norm": 1.953125, - "learning_rate": 5.73e-05, - "loss": 0.1239, + "grad_norm": 1.6328125, + "learning_rate": 0.0005804842105263157, + "loss": 0.1441, "num_input_tokens_seen": 53080064, "step": 810, - "train_runtime": 502.4383, - "train_tokens_per_second": 105644.939 + "train_runtime": 127.4996, + "train_tokens_per_second": 416315.372 }, { "epoch": 0.17741237559498052, - "grad_norm": 2.203125, - "learning_rate": 5.429999999999999e-05, - "loss": 0.1257, + "grad_norm": 1.9375, + "learning_rate": 0.0005798526315789473, + "loss": 0.1451, "num_input_tokens_seen": 53735424, "step": 820, - "train_runtime": 505.9473, - "train_tokens_per_second": 106207.55 + "train_runtime": 131.3126, + "train_tokens_per_second": 409217.56 }, { "epoch": 0.17957594115101688, - "grad_norm": 2.296875, - "learning_rate": 5.13e-05, - "loss": 0.1272, + "grad_norm": 2.609375, + "learning_rate": 0.0005792210526315789, + "loss": 0.148, "num_input_tokens_seen": 54390784, "step": 830, - "train_runtime": 509.427, - "train_tokens_per_second": 106768.556 + "train_runtime": 135.1268, + "train_tokens_per_second": 402516.49 }, { "epoch": 0.18173950670705322, - "grad_norm": 2.171875, - "learning_rate": 4.8299999999999995e-05, - "loss": 0.1243, + "grad_norm": 1.1640625, + "learning_rate": 0.0005785894736842105, + "loss": 0.1378, "num_input_tokens_seen": 55046144, "step": 840, - "train_runtime": 512.859, - "train_tokens_per_second": 107331.925 + "train_runtime": 138.9294, + "train_tokens_per_second": 396216.745 }, { "epoch": 0.18390307226308958, - "grad_norm": 1.9375, - "learning_rate": 4.5299999999999997e-05, - "loss": 0.1304, + "grad_norm": 1.7109375, + "learning_rate": 0.000577957894736842, + "loss": 0.1417, "num_input_tokens_seen": 55701504, "step": 850, - "train_runtime": 516.3279, - "train_tokens_per_second": 107880.089 + "train_runtime": 142.7317, + "train_tokens_per_second": 390253.256 }, { "epoch": 0.1860666378191259, - "grad_norm": 1.5, - "learning_rate": 4.229999999999999e-05, - "loss": 0.1257, + "grad_norm": 2.25, + "learning_rate": 0.0005773263157894736, + "loss": 0.1471, "num_input_tokens_seen": 56356864, "step": 860, - "train_runtime": 519.8294, - "train_tokens_per_second": 108414.148 + "train_runtime": 146.5477, + "train_tokens_per_second": 384563.353 }, { "epoch": 0.18823020337516227, - "grad_norm": 1.890625, - "learning_rate": 3.93e-05, - "loss": 0.1229, + "grad_norm": 1.4140625, + "learning_rate": 0.0005766947368421052, + "loss": 0.1388, "num_input_tokens_seen": 57012224, "step": 870, - "train_runtime": 523.3269, - "train_tokens_per_second": 108941.904 + "train_runtime": 150.3727, + "train_tokens_per_second": 379139.434 }, { "epoch": 0.1903937689311986, - "grad_norm": 1.6484375, - "learning_rate": 3.6299999999999995e-05, - "loss": 0.1184, + "grad_norm": 1.5, + "learning_rate": 0.0005760631578947368, + "loss": 0.131, "num_input_tokens_seen": 57667584, "step": 880, - "train_runtime": 526.8154, - "train_tokens_per_second": 109464.499 + "train_runtime": 154.1907, + "train_tokens_per_second": 374001.588 }, { "epoch": 0.19255733448723497, - "grad_norm": 1.9921875, - "learning_rate": 3.3299999999999996e-05, - "loss": 0.1226, + "grad_norm": 1.515625, + "learning_rate": 0.0005754315789473684, + "loss": 0.1321, "num_input_tokens_seen": 58322944, "step": 890, - "train_runtime": 530.2727, - "train_tokens_per_second": 109986.692 + "train_runtime": 158.0167, + "train_tokens_per_second": 369093.521 }, { "epoch": 0.1947209000432713, - "grad_norm": 1.984375, - "learning_rate": 3.0299999999999998e-05, - "loss": 0.1368, + "grad_norm": 1.9609375, + "learning_rate": 0.0005747999999999999, + "loss": 0.1588, "num_input_tokens_seen": 58978304, "step": 900, - "train_runtime": 533.7125, - "train_tokens_per_second": 110505.748 + "train_runtime": 161.8381, + "train_tokens_per_second": 364427.771 }, { "epoch": 0.1947209000432713, - "eval_loss": 0.13770230114459991, - "eval_runtime": 18.3861, - "eval_samples_per_second": 1.74, - "eval_steps_per_second": 0.054, + "eval_loss": 0.12399701774120331, + "eval_runtime": 1.6341, + "eval_samples_per_second": 19.583, + "eval_steps_per_second": 0.612, "num_input_tokens_seen": 58978304, "step": 900 }, { "epoch": 0.1947209000432713, - "eval_byte_accuracy": 0.9695845697329377, - "eval_chrf": 26.897492377246113, - "eval_sacrebleu": 12.517042044908205, - "eval_word_accuracy": 0.9436813186813187, + "eval_byte_accuracy": 0.9636498516320475, + "eval_chrf": 74.82778143236575, + "eval_sacrebleu": 62.86084965698353, + "eval_word_accuracy": 0.9189560439560439, "num_input_tokens_seen": 58978304, - "perplexity": 1.147633850131229, + "perplexity": 1.13201249503992, "step": 900 }, { "epoch": 0.19688446559930767, - "grad_norm": 1.6015625, - "learning_rate": 2.7299999999999996e-05, - "loss": 0.1268, + "grad_norm": 2.78125, + "learning_rate": 0.0005741684210526316, + "loss": 0.1457, "num_input_tokens_seen": 59633664, "step": 910, - "train_runtime": 555.5178, - "train_tokens_per_second": 107347.889 + "train_runtime": 167.3034, + "train_tokens_per_second": 356440.14 }, { "epoch": 0.199048031155344, - "grad_norm": 2.078125, - "learning_rate": 2.4299999999999998e-05, - "loss": 0.1211, + "grad_norm": 2.15625, + "learning_rate": 0.000573536842105263, + "loss": 0.1314, "num_input_tokens_seen": 60289024, "step": 920, - "train_runtime": 558.9387, - "train_tokens_per_second": 107863.385 + "train_runtime": 171.1208, + "train_tokens_per_second": 352318.563 }, { "epoch": 0.20121159671138036, - "grad_norm": 2.25, - "learning_rate": 2.1299999999999996e-05, - "loss": 0.1249, + "grad_norm": 1.7578125, + "learning_rate": 0.0005729052631578947, + "loss": 0.1392, "num_input_tokens_seen": 60944384, "step": 930, - "train_runtime": 562.3982, - "train_tokens_per_second": 108365.188 + "train_runtime": 174.9323, + "train_tokens_per_second": 348388.345 }, { "epoch": 0.2033751622674167, - "grad_norm": 1.625, - "learning_rate": 1.8299999999999998e-05, - "loss": 0.1231, + "grad_norm": 1.6171875, + "learning_rate": 0.0005722736842105262, + "loss": 0.1306, "num_input_tokens_seen": 61599744, "step": 940, - "train_runtime": 565.866, - "train_tokens_per_second": 108859.239 + "train_runtime": 178.748, + "train_tokens_per_second": 344617.844 }, { "epoch": 0.20553872782345306, - "grad_norm": 1.734375, - "learning_rate": 1.53e-05, - "loss": 0.1153, + "grad_norm": 2.203125, + "learning_rate": 0.0005716421052631578, + "loss": 0.127, "num_input_tokens_seen": 62255104, "step": 950, - "train_runtime": 569.2873, - "train_tokens_per_second": 109356.208 + "train_runtime": 182.5636, + "train_tokens_per_second": 341005.098 }, { "epoch": 0.2077022933794894, - "grad_norm": 1.6484375, - "learning_rate": 1.2299999999999999e-05, - "loss": 0.125, + "grad_norm": 2.0625, + "learning_rate": 0.0005710105263157894, + "loss": 0.1286, "num_input_tokens_seen": 62910464, "step": 960, - "train_runtime": 572.7259, - "train_tokens_per_second": 109843.923 + "train_runtime": 186.3727, + "train_tokens_per_second": 337551.932 }, { "epoch": 0.20986585893552576, - "grad_norm": 1.390625, - "learning_rate": 9.299999999999999e-06, - "loss": 0.1147, + "grad_norm": 1.359375, + "learning_rate": 0.000570378947368421, + "loss": 0.1128, "num_input_tokens_seen": 63565824, "step": 970, - "train_runtime": 576.1634, - "train_tokens_per_second": 110326.043 + "train_runtime": 190.1949, + "train_tokens_per_second": 334214.163 }, { "epoch": 0.2120294244915621, - "grad_norm": 1.734375, - "learning_rate": 6.3e-06, - "loss": 0.1184, + "grad_norm": 2.28125, + "learning_rate": 0.0005697473684210526, + "loss": 0.1255, "num_input_tokens_seen": 64221184, "step": 980, - "train_runtime": 579.5823, - "train_tokens_per_second": 110805.988 + "train_runtime": 193.9964, + "train_tokens_per_second": 331043.206 }, { "epoch": 0.21419299004759845, - "grad_norm": 1.3984375, - "learning_rate": 3.2999999999999993e-06, - "loss": 0.1246, + "grad_norm": 1.8125, + "learning_rate": 0.0005691157894736842, + "loss": 0.1216, "num_input_tokens_seen": 64876544, "step": 990, - "train_runtime": 583.0424, - "train_tokens_per_second": 111272.425 + "train_runtime": 197.8194, + "train_tokens_per_second": 327958.403 }, { "epoch": 0.2163565556036348, - "grad_norm": 1.359375, - "learning_rate": 3e-07, - "loss": 0.1192, + "grad_norm": 1.3671875, + "learning_rate": 0.0005684842105263157, + "loss": 0.1215, "num_input_tokens_seen": 65531904, "step": 1000, - "train_runtime": 586.545, - "train_tokens_per_second": 111725.279 - }, - { - "epoch": 0.2163565556036348, - "eval_loss": 0.13286983966827393, - "eval_runtime": 20.4041, - "eval_samples_per_second": 1.568, - "eval_steps_per_second": 0.049, - "num_input_tokens_seen": 65531904, - "step": 1000 - }, - { - "epoch": 0.2163565556036348, - "eval_byte_accuracy": 0.9725519287833828, - "eval_chrf": 27.454226627757045, - "eval_sacrebleu": 13.628568400407401, - "eval_word_accuracy": 0.9478021978021978, - "num_input_tokens_seen": 65531904, - "perplexity": 1.1421013323676128, - "step": 1000 + "train_runtime": 201.6448, + "train_tokens_per_second": 324986.864 }, { "epoch": 0.2163565556036348, - "eval_loss": 0.13286983966827393, - "eval_runtime": 22.4405, - "eval_samples_per_second": 1.426, - "eval_steps_per_second": 0.045, + "eval_loss": 0.10376948118209839, + "eval_runtime": 1.7832, + "eval_samples_per_second": 17.945, + "eval_steps_per_second": 0.561, "num_input_tokens_seen": 65531904, "step": 1000 }, { "epoch": 0.2163565556036348, - "eval_byte_accuracy": 0.9725519287833828, - "eval_chrf": 27.454226627757045, - "eval_sacrebleu": 13.628568400407401, - "eval_word_accuracy": 0.9478021978021978, + "eval_byte_accuracy": 0.9721810089020771, + "eval_chrf": 77.58364870895588, + "eval_sacrebleu": 66.59314384617089, + "eval_word_accuracy": 0.9326923076923077, "num_input_tokens_seen": 65531904, - "perplexity": 1.1421013323676128, + "perplexity": 1.109344700609595, "step": 1000 }, { "epoch": 0.21852012115967115, - "grad_norm": 9.25, - "learning_rate": 0.00026973, - "loss": 0.6229, + "grad_norm": 1.3828125, + "learning_rate": 0.0005678526315789473, + "loss": 0.117, "num_input_tokens_seen": 66187264, "step": 1010, - "train_runtime": 27.6423, - "train_tokens_per_second": 2394423.769 + "train_runtime": 207.2888, + "train_tokens_per_second": 319299.811 }, { "epoch": 0.22068368671570748, - "grad_norm": 5.125, - "learning_rate": 0.00026943, - "loss": 0.3566, + "grad_norm": 2.375, + "learning_rate": 0.0005672210526315789, + "loss": 0.1187, "num_input_tokens_seen": 66842624, "step": 1020, - "train_runtime": 31.1195, - "train_tokens_per_second": 2147932.099 + "train_runtime": 211.106, + "train_tokens_per_second": 316630.621 }, { "epoch": 0.22284725227174385, - "grad_norm": 4.65625, - "learning_rate": 0.00026912999999999997, - "loss": 0.2129, + "grad_norm": 1.3671875, + "learning_rate": 0.0005665894736842105, + "loss": 0.1187, "num_input_tokens_seen": 67497984, "step": 1030, - "train_runtime": 34.5347, - "train_tokens_per_second": 1954500.193 + "train_runtime": 214.9217, + "train_tokens_per_second": 314058.489 }, { "epoch": 0.22501081782778018, - "grad_norm": 4.375, - "learning_rate": 0.00026882999999999996, - "loss": 0.181, + "grad_norm": 1.1328125, + "learning_rate": 0.000565957894736842, + "loss": 0.1259, "num_input_tokens_seen": 68153344, "step": 1040, - "train_runtime": 38.0075, - "train_tokens_per_second": 1793154.986 + "train_runtime": 218.7377, + "train_tokens_per_second": 311575.604 }, { "epoch": 0.22717438338381654, - "grad_norm": 5.0, - "learning_rate": 0.00026852999999999995, - "loss": 0.1686, + "grad_norm": 2.203125, + "learning_rate": 0.0005653263157894737, + "loss": 0.1243, "num_input_tokens_seen": 68808704, "step": 1050, - "train_runtime": 41.5089, - "train_tokens_per_second": 1657684.95 + "train_runtime": 222.5611, + "train_tokens_per_second": 309167.656 }, { "epoch": 0.22933794893985288, - "grad_norm": 4.96875, - "learning_rate": 0.00026823, - "loss": 0.1398, + "grad_norm": 2.0625, + "learning_rate": 0.0005646947368421052, + "loss": 0.1136, "num_input_tokens_seen": 69464064, "step": 1060, - "train_runtime": 44.9978, - "train_tokens_per_second": 1543722.438 + "train_runtime": 226.3782, + "train_tokens_per_second": 306849.57 }, { "epoch": 0.23150151449588924, - "grad_norm": 4.90625, - "learning_rate": 0.00026793, - "loss": 0.1304, + "grad_norm": 1.59375, + "learning_rate": 0.0005640631578947368, + "loss": 0.1111, "num_input_tokens_seen": 70119424, "step": 1070, - "train_runtime": 48.4421, - "train_tokens_per_second": 1447490.641 + "train_runtime": 230.1891, + "train_tokens_per_second": 304616.596 }, { "epoch": 0.23366508005192557, - "grad_norm": 3.59375, - "learning_rate": 0.00026763, - "loss": 0.134, + "grad_norm": 2.765625, + "learning_rate": 0.0005634315789473683, + "loss": 0.1161, "num_input_tokens_seen": 70774784, "step": 1080, - "train_runtime": 51.8834, - "train_tokens_per_second": 1364112.211 + "train_runtime": 234.0032, + "train_tokens_per_second": 302452.263 }, { "epoch": 0.23582864560796193, - "grad_norm": 4.03125, - "learning_rate": 0.00026733, - "loss": 0.129, + "grad_norm": 1.046875, + "learning_rate": 0.0005627999999999999, + "loss": 0.114, "num_input_tokens_seen": 71430144, "step": 1090, - "train_runtime": 55.3888, - "train_tokens_per_second": 1289612.493 + "train_runtime": 237.8211, + "train_tokens_per_second": 300352.379 }, { "epoch": 0.23799221116399827, - "grad_norm": 4.4375, - "learning_rate": 0.00026702999999999997, - "loss": 0.125, + "grad_norm": 2.09375, + "learning_rate": 0.0005621684210526315, + "loss": 0.114, "num_input_tokens_seen": 72085504, "step": 1100, - "train_runtime": 58.9002, - "train_tokens_per_second": 1223858.259 + "train_runtime": 241.6412, + "train_tokens_per_second": 298316.23 }, { "epoch": 0.23799221116399827, - "eval_loss": 0.12945495545864105, - "eval_runtime": 23.1791, - "eval_samples_per_second": 1.381, - "eval_steps_per_second": 0.043, + "eval_loss": 0.10388417541980743, + "eval_runtime": 1.1455, + "eval_samples_per_second": 27.934, + "eval_steps_per_second": 0.873, "num_input_tokens_seen": 72085504, "step": 1100 }, { "epoch": 0.23799221116399827, - "eval_byte_accuracy": 0.9729228486646885, - "eval_chrf": 30.022419954694445, - "eval_sacrebleu": 14.576924110393575, - "eval_word_accuracy": 0.945054945054945, + "eval_byte_accuracy": 0.9699554896142433, + "eval_chrf": 77.89753777813905, + "eval_sacrebleu": 69.6119771947039, + "eval_word_accuracy": 0.9326923076923077, "num_input_tokens_seen": 72085504, - "perplexity": 1.1382078402579139, + "perplexity": 1.1094719433512532, "step": 1100 }, { "epoch": 0.24015577672003463, - "grad_norm": 4.40625, - "learning_rate": 0.00026672999999999996, - "loss": 0.1318, + "grad_norm": 2.03125, + "learning_rate": 0.0005615368421052631, + "loss": 0.1178, "num_input_tokens_seen": 72740864, "step": 1110, - "train_runtime": 85.5674, - "train_tokens_per_second": 850100.188 + "train_runtime": 246.6141, + "train_tokens_per_second": 294958.283 }, { "epoch": 0.24231934227607096, - "grad_norm": 6.15625, - "learning_rate": 0.00026642999999999995, - "loss": 0.1315, + "grad_norm": 2.25, + "learning_rate": 0.0005609052631578947, + "loss": 0.1074, "num_input_tokens_seen": 73396224, "step": 1120, - "train_runtime": 88.9878, - "train_tokens_per_second": 824789.593 + "train_runtime": 250.4279, + "train_tokens_per_second": 293083.197 }, { "epoch": 0.2444829078321073, - "grad_norm": 4.0, - "learning_rate": 0.00026613, - "loss": 0.1221, + "grad_norm": 1.8203125, + "learning_rate": 0.0005602736842105263, + "loss": 0.1, "num_input_tokens_seen": 74051584, "step": 1130, - "train_runtime": 92.4161, - "train_tokens_per_second": 801284.737 + "train_runtime": 254.2431, + "train_tokens_per_second": 291262.949 }, { "epoch": 0.24664647338814366, - "grad_norm": 3.328125, - "learning_rate": 0.00026583, - "loss": 0.1168, + "grad_norm": 1.6171875, + "learning_rate": 0.0005596421052631578, + "loss": 0.1025, "num_input_tokens_seen": 74706944, "step": 1140, - "train_runtime": 95.9086, - "train_tokens_per_second": 778939.217 + "train_runtime": 258.0624, + "train_tokens_per_second": 289491.737 }, { "epoch": 0.24881003894418, - "grad_norm": 3.1875, - "learning_rate": 0.00026553, - "loss": 0.1164, + "grad_norm": 1.3515625, + "learning_rate": 0.0005590105263157894, + "loss": 0.1037, "num_input_tokens_seen": 75362304, "step": 1150, - "train_runtime": 99.3847, - "train_tokens_per_second": 758288.807 + "train_runtime": 261.8777, + "train_tokens_per_second": 287776.701 }, { "epoch": 0.25097360450021633, - "grad_norm": 4.65625, - "learning_rate": 0.00026523, - "loss": 0.1123, + "grad_norm": 1.765625, + "learning_rate": 0.000558378947368421, + "loss": 0.1021, "num_input_tokens_seen": 76017664, "step": 1160, - "train_runtime": 102.8214, - "train_tokens_per_second": 739317.604 + "train_runtime": 265.6992, + "train_tokens_per_second": 286104.222 }, { "epoch": 0.2531371700562527, - "grad_norm": 5.09375, - "learning_rate": 0.00026492999999999997, - "loss": 0.1274, + "grad_norm": 1.5234375, + "learning_rate": 0.0005577473684210526, + "loss": 0.0998, "num_input_tokens_seen": 76673024, "step": 1170, - "train_runtime": 106.258, - "train_tokens_per_second": 721573.971 + "train_runtime": 269.5112, + "train_tokens_per_second": 284489.237 }, { "epoch": 0.25530073561228905, - "grad_norm": 3.796875, - "learning_rate": 0.00026462999999999997, - "loss": 0.1099, + "grad_norm": 1.3515625, + "learning_rate": 0.0005571157894736842, + "loss": 0.0996, "num_input_tokens_seen": 77328384, "step": 1180, - "train_runtime": 109.752, - "train_tokens_per_second": 704573.522 + "train_runtime": 273.33, + "train_tokens_per_second": 282912.16 }, { "epoch": 0.2574643011683254, - "grad_norm": 4.21875, - "learning_rate": 0.00026432999999999996, - "loss": 0.111, + "grad_norm": 1.125, + "learning_rate": 0.0005564842105263158, + "loss": 0.0978, "num_input_tokens_seen": 77979648, "step": 1190, - "train_runtime": 113.2352, - "train_tokens_per_second": 688651.791 + "train_runtime": 277.1245, + "train_tokens_per_second": 281388.469 }, { "epoch": 0.2596278667243617, - "grad_norm": 3.765625, - "learning_rate": 0.00026402999999999995, - "loss": 0.122, + "grad_norm": 1.65625, + "learning_rate": 0.0005558526315789473, + "loss": 0.1089, "num_input_tokens_seen": 78635008, "step": 1200, - "train_runtime": 116.7247, - "train_tokens_per_second": 673679.199 + "train_runtime": 280.9373, + "train_tokens_per_second": 279902.327 }, { "epoch": 0.2596278667243617, - "eval_loss": 0.09579379111528397, - "eval_runtime": 24.403, - "eval_samples_per_second": 1.311, - "eval_steps_per_second": 0.041, + "eval_loss": 0.08458718657493591, + "eval_runtime": 1.7632, + "eval_samples_per_second": 18.149, + "eval_steps_per_second": 0.567, "num_input_tokens_seen": 78635008, "step": 1200 }, { "epoch": 0.2596278667243617, - "eval_byte_accuracy": 0.9807121661721068, - "eval_chrf": 35.350115677636715, - "eval_sacrebleu": 19.230749155439835, - "eval_word_accuracy": 0.9629120879120879, + "eval_byte_accuracy": 0.9755192878338279, + "eval_chrf": 80.88240618865393, + "eval_sacrebleu": 71.88855321657775, + "eval_word_accuracy": 0.9395604395604396, "num_input_tokens_seen": 78635008, - "perplexity": 1.1005321010967388, + "perplexity": 1.0882677224312234, "step": 1200 }, { "epoch": 0.2617914322803981, - "grad_norm": 3.234375, - "learning_rate": 0.00026373, - "loss": 0.0989, + "grad_norm": 1.2734375, + "learning_rate": 0.000555221052631579, + "loss": 0.0951, "num_input_tokens_seen": 79290368, "step": 1210, - "train_runtime": 144.613, - "train_tokens_per_second": 548293.454 + "train_runtime": 286.5287, + "train_tokens_per_second": 276727.531 }, { "epoch": 0.26395499783643445, - "grad_norm": 3.78125, - "learning_rate": 0.00026343, - "loss": 0.0998, + "grad_norm": 2.0, + "learning_rate": 0.0005545894736842104, + "loss": 0.0971, "num_input_tokens_seen": 79945728, "step": 1220, - "train_runtime": 148.0856, - "train_tokens_per_second": 539861.575 + "train_runtime": 290.3417, + "train_tokens_per_second": 275350.474 }, { "epoch": 0.2661185633924708, - "grad_norm": 3.359375, - "learning_rate": 0.00026313, - "loss": 0.1016, + "grad_norm": 1.1171875, + "learning_rate": 0.0005539578947368421, + "loss": 0.0993, "num_input_tokens_seen": 80601088, "step": 1230, - "train_runtime": 151.5232, - "train_tokens_per_second": 531938.975 + "train_runtime": 294.1684, + "train_tokens_per_second": 273996.464 }, { "epoch": 0.2682821289485071, - "grad_norm": 2.84375, - "learning_rate": 0.00026283, - "loss": 0.0964, + "grad_norm": 1.2890625, + "learning_rate": 0.0005533263157894736, + "loss": 0.0951, "num_input_tokens_seen": 81256448, "step": 1240, - "train_runtime": 154.9994, - "train_tokens_per_second": 524237.094 + "train_runtime": 297.9859, + "train_tokens_per_second": 272685.574 }, { "epoch": 0.2704456945045435, - "grad_norm": 4.0625, - "learning_rate": 0.00026252999999999997, - "loss": 0.098, + "grad_norm": 1.359375, + "learning_rate": 0.0005526947368421052, + "loss": 0.0968, "num_input_tokens_seen": 81911808, "step": 1250, - "train_runtime": 158.5169, - "train_tokens_per_second": 516738.677 + "train_runtime": 301.7986, + "train_tokens_per_second": 271412.157 }, { "epoch": 0.27260926006057984, - "grad_norm": 2.5625, - "learning_rate": 0.00026222999999999996, - "loss": 0.0947, + "grad_norm": 1.4140625, + "learning_rate": 0.0005520631578947368, + "loss": 0.0978, "num_input_tokens_seen": 82563072, "step": 1260, - "train_runtime": 161.9714, - "train_tokens_per_second": 509738.516 + "train_runtime": 305.5977, + "train_tokens_per_second": 270169.191 }, { "epoch": 0.2747728256166162, - "grad_norm": 4.71875, - "learning_rate": 0.00026192999999999995, - "loss": 0.0959, + "grad_norm": 1.640625, + "learning_rate": 0.0005514315789473684, + "loss": 0.0992, "num_input_tokens_seen": 83218432, "step": 1270, - "train_runtime": 165.4822, - "train_tokens_per_second": 502884.564 + "train_runtime": 309.4192, + "train_tokens_per_second": 268950.483 }, { "epoch": 0.2769363911726525, - "grad_norm": 3.8125, - "learning_rate": 0.00026162999999999995, - "loss": 0.0913, + "grad_norm": 2.046875, + "learning_rate": 0.0005507999999999999, + "loss": 0.0943, "num_input_tokens_seen": 83873792, "step": 1280, - "train_runtime": 169.0182, - "train_tokens_per_second": 496241.272 + "train_runtime": 313.2275, + "train_tokens_per_second": 267772.763 }, { "epoch": 0.27909995672868887, - "grad_norm": 2.734375, - "learning_rate": 0.00026133, - "loss": 0.0982, + "grad_norm": 1.328125, + "learning_rate": 0.0005501684210526315, + "loss": 0.093, "num_input_tokens_seen": 84529152, "step": 1290, - "train_runtime": 172.4628, - "train_tokens_per_second": 490129.632 + "train_runtime": 317.0294, + "train_tokens_per_second": 266628.739 }, { "epoch": 0.28126352228472523, - "grad_norm": 3.6875, - "learning_rate": 0.00026103, - "loss": 0.0956, + "grad_norm": 1.8828125, + "learning_rate": 0.0005495368421052631, + "loss": 0.0966, "num_input_tokens_seen": 85180416, "step": 1300, - "train_runtime": 175.9278, - "train_tokens_per_second": 484178.242 + "train_runtime": 320.8229, + "train_tokens_per_second": 265506.073 }, { "epoch": 0.28126352228472523, - "eval_loss": 0.07295849919319153, - "eval_runtime": 22.518, - "eval_samples_per_second": 1.421, - "eval_steps_per_second": 0.044, + "eval_loss": 0.06946705281734467, + "eval_runtime": 1.5133, + "eval_samples_per_second": 21.145, + "eval_steps_per_second": 0.661, "num_input_tokens_seen": 85180416, "step": 1300 }, { "epoch": 0.28126352228472523, - "eval_byte_accuracy": 0.9859050445103857, - "eval_chrf": 34.14051625323379, - "eval_sacrebleu": 18.706393322015977, - "eval_word_accuracy": 0.967032967032967, + "eval_byte_accuracy": 0.9799703264094956, + "eval_chrf": 84.70162551567762, + "eval_sacrebleu": 77.62774508830094, + "eval_word_accuracy": 0.9532967032967034, "num_input_tokens_seen": 85180416, - "perplexity": 1.0756858941558747, + "perplexity": 1.0719367433272844, "step": 1300 }, { "epoch": 0.2834270878407616, - "grad_norm": 3.625, - "learning_rate": 0.00026073, - "loss": 0.0882, + "grad_norm": 2.328125, + "learning_rate": 0.0005489052631578947, + "loss": 0.0902, "num_input_tokens_seen": 85835776, "step": 1310, - "train_runtime": 201.9369, - "train_tokens_per_second": 425062.332 + "train_runtime": 326.1743, + "train_tokens_per_second": 263159.235 }, { "epoch": 0.2855906533967979, - "grad_norm": 3.453125, - "learning_rate": 0.00026042999999999997, - "loss": 0.0928, + "grad_norm": 0.9375, + "learning_rate": 0.0005482736842105263, + "loss": 0.0901, "num_input_tokens_seen": 86491136, "step": 1320, - "train_runtime": 205.4466, - "train_tokens_per_second": 420990.774 + "train_runtime": 329.9891, + "train_tokens_per_second": 262103.012 }, { "epoch": 0.28775421895283426, - "grad_norm": 2.765625, - "learning_rate": 0.00026012999999999996, - "loss": 0.0854, + "grad_norm": 1.09375, + "learning_rate": 0.0005476421052631579, + "loss": 0.0925, "num_input_tokens_seen": 87146496, "step": 1330, - "train_runtime": 208.9298, - "train_tokens_per_second": 417108.996 + "train_runtime": 333.7894, + "train_tokens_per_second": 261082.25 }, { "epoch": 0.2899177845088706, - "grad_norm": 2.921875, - "learning_rate": 0.00025982999999999996, - "loss": 0.0804, + "grad_norm": 1.7890625, + "learning_rate": 0.0005470105263157895, + "loss": 0.0848, "num_input_tokens_seen": 87801856, "step": 1340, - "train_runtime": 212.4392, - "train_tokens_per_second": 413303.487 + "train_runtime": 337.6083, + "train_tokens_per_second": 260070.162 }, { "epoch": 0.292081350064907, - "grad_norm": 3.671875, - "learning_rate": 0.00025952999999999995, - "loss": 0.0844, + "grad_norm": 1.5, + "learning_rate": 0.000546378947368421, + "loss": 0.0891, "num_input_tokens_seen": 88457216, "step": 1350, - "train_runtime": 215.9304, - "train_tokens_per_second": 409656.147 + "train_runtime": 341.4242, + "train_tokens_per_second": 259083.017 }, { "epoch": 0.2942449156209433, - "grad_norm": 2.890625, - "learning_rate": 0.00025923, - "loss": 0.0863, + "grad_norm": 1.359375, + "learning_rate": 0.0005457473684210525, + "loss": 0.0909, "num_input_tokens_seen": 89112576, "step": 1360, - "train_runtime": 219.4125, - "train_tokens_per_second": 406141.839 + "train_runtime": 345.2394, + "train_tokens_per_second": 258118.218 }, { "epoch": 0.29640848117697965, - "grad_norm": 3.34375, - "learning_rate": 0.00025893, - "loss": 0.0835, + "grad_norm": 1.75, + "learning_rate": 0.0005451157894736842, + "loss": 0.0877, "num_input_tokens_seen": 89767936, "step": 1370, - "train_runtime": 222.9179, - "train_tokens_per_second": 402695.125 + "train_runtime": 349.0489, + "train_tokens_per_second": 257178.658 }, { "epoch": 0.298572046733016, - "grad_norm": 3.765625, - "learning_rate": 0.00025863, - "loss": 0.0872, + "grad_norm": 1.3984375, + "learning_rate": 0.0005444842105263157, + "loss": 0.0924, "num_input_tokens_seen": 90423296, "step": 1380, - "train_runtime": 226.428, - "train_tokens_per_second": 399346.766 + "train_runtime": 352.8617, + "train_tokens_per_second": 256257.015 }, { "epoch": 0.3007356122890524, - "grad_norm": 3.359375, - "learning_rate": 0.00025833, - "loss": 0.081, + "grad_norm": 1.1484375, + "learning_rate": 0.0005438526315789473, + "loss": 0.0905, "num_input_tokens_seen": 91078656, "step": 1390, - "train_runtime": 229.9185, - "train_tokens_per_second": 396134.601 + "train_runtime": 356.6816, + "train_tokens_per_second": 255350.05 }, { "epoch": 0.3028991778450887, - "grad_norm": 2.5, - "learning_rate": 0.00025802999999999997, - "loss": 0.0804, + "grad_norm": 1.5234375, + "learning_rate": 0.0005432210526315789, + "loss": 0.0863, "num_input_tokens_seen": 91734016, "step": 1400, - "train_runtime": 233.3677, - "train_tokens_per_second": 393087.948 + "train_runtime": 360.4899, + "train_tokens_per_second": 254470.434 }, { "epoch": 0.3028991778450887, - "eval_loss": 0.0870237722992897, - "eval_runtime": 25.2014, - "eval_samples_per_second": 1.27, - "eval_steps_per_second": 0.04, + "eval_loss": 0.07211805135011673, + "eval_runtime": 1.4771, + "eval_samples_per_second": 21.665, + "eval_steps_per_second": 0.677, "num_input_tokens_seen": 91734016, "step": 1400 }, { "epoch": 0.3028991778450887, - "eval_byte_accuracy": 0.982566765578635, - "eval_chrf": 34.29604038703655, - "eval_sacrebleu": 18.31973645097744, - "eval_word_accuracy": 0.9642857142857143, + "eval_byte_accuracy": 0.9814540059347181, + "eval_chrf": 85.4092761981197, + "eval_sacrebleu": 78.3597595743492, + "eval_word_accuracy": 0.9546703296703297, "num_input_tokens_seen": 91734016, - "perplexity": 1.0909226131488896, + "perplexity": 1.0747822160666487, "step": 1400 }, { "epoch": 0.30506274340112505, - "grad_norm": 2.34375, - "learning_rate": 0.00025772999999999996, - "loss": 0.074, + "grad_norm": 1.421875, + "learning_rate": 0.0005425894736842105, + "loss": 0.0863, "num_input_tokens_seen": 92389376, "step": 1410, - "train_runtime": 262.0703, - "train_tokens_per_second": 352536.602 + "train_runtime": 365.7965, + "train_tokens_per_second": 252570.395 }, { "epoch": 0.3072263089571614, - "grad_norm": 3.328125, - "learning_rate": 0.00025742999999999995, - "loss": 0.0833, + "grad_norm": 0.96484375, + "learning_rate": 0.000541957894736842, + "loss": 0.0933, "num_input_tokens_seen": 93044736, "step": 1420, - "train_runtime": 265.5304, - "train_tokens_per_second": 350410.908 + "train_runtime": 369.6021, + "train_tokens_per_second": 251742.984 }, { "epoch": 0.30938987451319777, - "grad_norm": 2.859375, - "learning_rate": 0.00025712999999999995, - "loss": 0.0802, + "grad_norm": 1.34375, + "learning_rate": 0.0005413263157894736, + "loss": 0.0903, "num_input_tokens_seen": 93700096, "step": 1430, - "train_runtime": 269.0082, - "train_tokens_per_second": 348316.847 + "train_runtime": 373.4103, + "train_tokens_per_second": 250930.673 }, { "epoch": 0.3115534400692341, - "grad_norm": 1.71875, - "learning_rate": 0.00025683, - "loss": 0.0821, + "grad_norm": 1.140625, + "learning_rate": 0.0005406947368421052, + "loss": 0.0905, "num_input_tokens_seen": 94355456, "step": 1440, - "train_runtime": 272.5366, - "train_tokens_per_second": 346212.108 + "train_runtime": 377.2166, + "train_tokens_per_second": 250136.041 }, { "epoch": 0.31371700562527044, - "grad_norm": 3.984375, - "learning_rate": 0.00025653, - "loss": 0.0857, + "grad_norm": 1.46875, + "learning_rate": 0.0005400631578947368, + "loss": 0.0849, "num_input_tokens_seen": 95010816, "step": 1450, - "train_runtime": 276.051, - "train_tokens_per_second": 344178.478 + "train_runtime": 381.0327, + "train_tokens_per_second": 249350.817 }, { "epoch": 0.3158805711813068, - "grad_norm": 3.375, - "learning_rate": 0.00025623, - "loss": 0.1028, + "grad_norm": 1.46875, + "learning_rate": 0.0005394315789473684, + "loss": 0.1062, "num_input_tokens_seen": 95666176, "step": 1460, - "train_runtime": 279.5309, - "train_tokens_per_second": 342238.286 + "train_runtime": 384.8359, + "train_tokens_per_second": 248589.508 }, { "epoch": 0.31804413673734316, - "grad_norm": 2.34375, - "learning_rate": 0.00025592999999999997, - "loss": 0.0772, + "grad_norm": 1.390625, + "learning_rate": 0.0005388, + "loss": 0.0876, "num_input_tokens_seen": 96321536, "step": 1470, - "train_runtime": 283.0667, - "train_tokens_per_second": 340278.556 + "train_runtime": 388.6531, + "train_tokens_per_second": 247834.231 }, { "epoch": 0.32020770229337947, - "grad_norm": 3.28125, - "learning_rate": 0.00025562999999999996, - "loss": 0.0724, + "grad_norm": 1.1796875, + "learning_rate": 0.0005381684210526316, + "loss": 0.0798, "num_input_tokens_seen": 96976896, "step": 1480, - "train_runtime": 286.6049, - "train_tokens_per_second": 338364.408 + "train_runtime": 392.4638, + "train_tokens_per_second": 247097.674 }, { "epoch": 0.32237126784941583, - "grad_norm": 3.34375, - "learning_rate": 0.00025532999999999996, - "loss": 0.0763, + "grad_norm": 1.6875, + "learning_rate": 0.0005375368421052632, + "loss": 0.0886, "num_input_tokens_seen": 97632256, "step": 1490, - "train_runtime": 290.0522, - "train_tokens_per_second": 336602.314 + "train_runtime": 396.2816, + "train_tokens_per_second": 246370.891 }, { "epoch": 0.3245348334054522, - "grad_norm": 2.5, - "learning_rate": 0.00025502999999999995, - "loss": 0.072, + "grad_norm": 1.140625, + "learning_rate": 0.0005369052631578947, + "loss": 0.0813, "num_input_tokens_seen": 98287616, "step": 1500, - "train_runtime": 293.5264, - "train_tokens_per_second": 334851.018 + "train_runtime": 400.0922, + "train_tokens_per_second": 245662.41 }, { "epoch": 0.3245348334054522, - "eval_loss": 0.06156960874795914, - "eval_runtime": 24.3706, - "eval_samples_per_second": 1.313, - "eval_steps_per_second": 0.041, + "eval_loss": 0.06404151022434235, + "eval_runtime": 1.651, + "eval_samples_per_second": 19.382, + "eval_steps_per_second": 0.606, "num_input_tokens_seen": 98287616, "step": 1500 }, { "epoch": 0.3245348334054522, - "eval_byte_accuracy": 0.9885014836795252, - "eval_chrf": 34.872865597020905, - "eval_sacrebleu": 17.175409427705006, - "eval_word_accuracy": 0.9739010989010989, + "eval_byte_accuracy": 0.9847922848664689, + "eval_chrf": 89.0715735988111, + "eval_sacrebleu": 85.97753690689314, + "eval_word_accuracy": 0.9684065934065934, "num_input_tokens_seen": 98287616, - "perplexity": 1.0635045231706255, + "perplexity": 1.0661366534146515, "step": 1500 }, { "epoch": 0.32669839896148856, - "grad_norm": 2.40625, - "learning_rate": 0.00025472999999999994, - "loss": 0.0831, + "grad_norm": 1.5625, + "learning_rate": 0.0005362736842105263, + "loss": 0.083, "num_input_tokens_seen": 98942976, "step": 1510, - "train_runtime": 321.5471, - "train_tokens_per_second": 307709.109 + "train_runtime": 405.6165, + "train_tokens_per_second": 243932.311 }, { "epoch": 0.32886196451752486, - "grad_norm": 1.9375, - "learning_rate": 0.00025443, - "loss": 0.0718, + "grad_norm": 1.1875, + "learning_rate": 0.0005356421052631578, + "loss": 0.0794, "num_input_tokens_seen": 99598336, "step": 1520, - "train_runtime": 325.0302, - "train_tokens_per_second": 306427.983 + "train_runtime": 409.4361, + "train_tokens_per_second": 243257.336 }, { "epoch": 0.3310255300735612, - "grad_norm": 1.5546875, - "learning_rate": 0.00025413, - "loss": 0.069, + "grad_norm": 1.046875, + "learning_rate": 0.0005350105263157894, + "loss": 0.0795, "num_input_tokens_seen": 100253696, "step": 1530, - "train_runtime": 328.5015, - "train_tokens_per_second": 305184.931 + "train_runtime": 413.2546, + "train_tokens_per_second": 242595.488 }, { "epoch": 0.3331890956295976, - "grad_norm": 2.734375, - "learning_rate": 0.00025383, - "loss": 0.0849, + "grad_norm": 1.109375, + "learning_rate": 0.000534378947368421, + "loss": 0.0851, "num_input_tokens_seen": 100909056, "step": 1540, - "train_runtime": 332.0025, - "train_tokens_per_second": 303940.672 + "train_runtime": 417.0767, + "train_tokens_per_second": 241943.636 }, { "epoch": 0.33535266118563395, - "grad_norm": 2.859375, - "learning_rate": 0.00025352999999999997, - "loss": 0.0749, + "grad_norm": 1.2734375, + "learning_rate": 0.0005337473684210526, + "loss": 0.0857, "num_input_tokens_seen": 101564416, "step": 1550, - "train_runtime": 335.4887, - "train_tokens_per_second": 302735.69 + "train_runtime": 420.8922, + "train_tokens_per_second": 241307.443 }, { "epoch": 0.33751622674167026, - "grad_norm": 2.859375, - "learning_rate": 0.00025322999999999996, - "loss": 0.0792, + "grad_norm": 1.7265625, + "learning_rate": 0.0005331157894736841, + "loss": 0.0895, "num_input_tokens_seen": 102219776, "step": 1560, - "train_runtime": 339.024, - "train_tokens_per_second": 301511.894 + "train_runtime": 424.7232, + "train_tokens_per_second": 240673.856 }, { "epoch": 0.3396797922977066, - "grad_norm": 3.6875, - "learning_rate": 0.00025292999999999995, - "loss": 0.0704, + "grad_norm": 0.8203125, + "learning_rate": 0.0005324842105263157, + "loss": 0.0832, "num_input_tokens_seen": 102875136, "step": 1570, - "train_runtime": 342.5025, - "train_tokens_per_second": 300363.146 + "train_runtime": 428.5459, + "train_tokens_per_second": 240056.294 }, { "epoch": 0.341843357853743, - "grad_norm": 2.28125, - "learning_rate": 0.00025262999999999994, - "loss": 0.0679, + "grad_norm": 0.99609375, + "learning_rate": 0.0005318526315789473, + "loss": 0.0818, "num_input_tokens_seen": 103530496, "step": 1580, - "train_runtime": 345.9877, - "train_tokens_per_second": 299231.745 + "train_runtime": 432.3726, + "train_tokens_per_second": 239447.421 }, { "epoch": 0.34400692340977934, - "grad_norm": 1.828125, - "learning_rate": 0.00025233, - "loss": 0.0662, + "grad_norm": 1.484375, + "learning_rate": 0.0005312210526315789, + "loss": 0.0823, "num_input_tokens_seen": 104185856, "step": 1590, - "train_runtime": 349.5162, - "train_tokens_per_second": 298085.939 + "train_runtime": 436.1733, + "train_tokens_per_second": 238863.469 }, { "epoch": 0.34617048896581565, - "grad_norm": 2.40625, - "learning_rate": 0.00025203, - "loss": 0.0687, + "grad_norm": 0.6015625, + "learning_rate": 0.0005305894736842105, + "loss": 0.081, "num_input_tokens_seen": 104841216, "step": 1600, - "train_runtime": 352.9737, - "train_tokens_per_second": 297022.765 + "train_runtime": 439.9906, + "train_tokens_per_second": 238280.565 }, { "epoch": 0.34617048896581565, - "eval_loss": 0.0666755959391594, - "eval_runtime": 22.5299, - "eval_samples_per_second": 1.42, - "eval_steps_per_second": 0.044, + "eval_loss": 0.06596406549215317, + "eval_runtime": 1.454, + "eval_samples_per_second": 22.009, + "eval_steps_per_second": 0.688, "num_input_tokens_seen": 104841216, "step": 1600 }, { "epoch": 0.34617048896581565, - "eval_byte_accuracy": 0.987759643916914, - "eval_chrf": 36.36666043851783, - "eval_sacrebleu": 19.949244247197115, - "eval_word_accuracy": 0.9752747252747253, + "eval_byte_accuracy": 0.983679525222552, + "eval_chrf": 88.0318341339548, + "eval_sacrebleu": 83.92707672444418, + "eval_word_accuracy": 0.9656593406593407, "num_input_tokens_seen": 104841216, - "perplexity": 1.068948650638414, + "perplexity": 1.0681883316543295, "step": 1600 }, { "epoch": 0.348334054521852, - "grad_norm": 4.28125, - "learning_rate": 0.00025173, - "loss": 0.0681, + "grad_norm": 0.9609375, + "learning_rate": 0.0005299578947368421, + "loss": 0.0782, "num_input_tokens_seen": 105492480, "step": 1610, - "train_runtime": 378.9555, - "train_tokens_per_second": 278376.971 + "train_runtime": 445.2599, + "train_tokens_per_second": 236923.37 }, { "epoch": 0.35049762007788837, - "grad_norm": 3.703125, - "learning_rate": 0.00025142999999999997, - "loss": 0.0708, + "grad_norm": 1.03125, + "learning_rate": 0.0005293263157894737, + "loss": 0.0809, "num_input_tokens_seen": 106147840, "step": 1620, - "train_runtime": 382.4277, - "train_tokens_per_second": 277563.15 + "train_runtime": 449.0622, + "train_tokens_per_second": 236376.719 }, { "epoch": 0.35266118563392473, - "grad_norm": 1.78125, - "learning_rate": 0.00025112999999999996, - "loss": 0.0703, + "grad_norm": 5.84375, + "learning_rate": 0.0005286947368421053, + "loss": 0.0895, "num_input_tokens_seen": 106803200, "step": 1630, - "train_runtime": 385.8818, - "train_tokens_per_second": 276776.981 + "train_runtime": 452.8883, + "train_tokens_per_second": 235826.785 }, { "epoch": 0.35482475118996104, - "grad_norm": 3.078125, - "learning_rate": 0.00025082999999999995, - "loss": 0.0692, + "grad_norm": 1.046875, + "learning_rate": 0.0005280631578947368, + "loss": 0.0932, "num_input_tokens_seen": 107458560, "step": 1640, - "train_runtime": 389.378, - "train_tokens_per_second": 275974.947 + "train_runtime": 456.7037, + "train_tokens_per_second": 235291.625 }, { "epoch": 0.3569883167459974, - "grad_norm": 2.703125, - "learning_rate": 0.00025052999999999995, - "loss": 0.0806, + "grad_norm": 1.21875, + "learning_rate": 0.0005274315789473684, + "loss": 0.0863, "num_input_tokens_seen": 108113920, "step": 1650, - "train_runtime": 392.881, - "train_tokens_per_second": 275182.365 + "train_runtime": 460.5239, + "train_tokens_per_second": 234762.901 }, { "epoch": 0.35915188230203376, - "grad_norm": 2.734375, - "learning_rate": 0.00025022999999999994, - "loss": 0.0584, + "grad_norm": 1.453125, + "learning_rate": 0.0005267999999999999, + "loss": 0.0716, "num_input_tokens_seen": 108765184, "step": 1660, - "train_runtime": 396.3085, - "train_tokens_per_second": 274445.736 + "train_runtime": 464.3225, + "train_tokens_per_second": 234244.92 }, { "epoch": 0.3613154478580701, - "grad_norm": 3.21875, - "learning_rate": 0.00024993, - "loss": 0.0618, + "grad_norm": 1.1796875, + "learning_rate": 0.0005261684210526315, + "loss": 0.0744, "num_input_tokens_seen": 109420544, "step": 1670, - "train_runtime": 399.7537, - "train_tokens_per_second": 273719.885 + "train_runtime": 468.1425, + "train_tokens_per_second": 233733.396 }, { "epoch": 0.36347901341410643, - "grad_norm": 1.8828125, - "learning_rate": 0.00024963, - "loss": 0.0618, + "grad_norm": 1.1171875, + "learning_rate": 0.0005255368421052631, + "loss": 0.0738, "num_input_tokens_seen": 110071808, "step": 1680, - "train_runtime": 403.1738, - "train_tokens_per_second": 273013.326 + "train_runtime": 471.9326, + "train_tokens_per_second": 233236.282 }, { "epoch": 0.3656425789701428, - "grad_norm": 1.9921875, - "learning_rate": 0.00024932999999999997, - "loss": 0.0632, + "grad_norm": 1.2890625, + "learning_rate": 0.0005249052631578947, + "loss": 0.079, "num_input_tokens_seen": 110727168, "step": 1690, - "train_runtime": 406.6482, - "train_tokens_per_second": 272292.272 + "train_runtime": 475.748, + "train_tokens_per_second": 232743.306 }, { "epoch": 0.36780614452617916, - "grad_norm": 2.296875, - "learning_rate": 0.00024902999999999997, - "loss": 0.0676, + "grad_norm": 1.0078125, + "learning_rate": 0.0005242736842105262, + "loss": 0.079, "num_input_tokens_seen": 111382528, "step": 1700, - "train_runtime": 410.145, - "train_tokens_per_second": 271568.683 + "train_runtime": 479.5698, + "train_tokens_per_second": 232255.089 }, { "epoch": 0.36780614452617916, - "eval_loss": 0.06032148003578186, - "eval_runtime": 24.2219, - "eval_samples_per_second": 1.321, - "eval_steps_per_second": 0.041, + "eval_loss": 0.05933792144060135, + "eval_runtime": 1.8241, + "eval_samples_per_second": 17.543, + "eval_steps_per_second": 0.548, "num_input_tokens_seen": 111382528, "step": 1700 }, { "epoch": 0.36780614452617916, - "eval_byte_accuracy": 0.9870178041543026, - "eval_chrf": 33.56203013459989, - "eval_sacrebleu": 18.274014834801072, - "eval_word_accuracy": 0.9697802197802198, + "eval_byte_accuracy": 0.9821958456973294, + "eval_chrf": 85.84022704344234, + "eval_sacrebleu": 79.27134876780914, + "eval_word_accuracy": 0.9560439560439561, "num_input_tokens_seen": 111382528, - "perplexity": 1.06217796067231, + "perplexity": 1.0611337600099267, "step": 1700 }, { "epoch": 0.36996971008221546, - "grad_norm": 1.84375, - "learning_rate": 0.00024872999999999996, - "loss": 0.0601, + "grad_norm": 0.8984375, + "learning_rate": 0.0005236421052631578, + "loss": 0.0775, "num_input_tokens_seen": 112037888, "step": 1710, - "train_runtime": 437.8276, - "train_tokens_per_second": 255895.015 + "train_runtime": 485.2237, + "train_tokens_per_second": 230899.473 }, { "epoch": 0.3721332756382518, - "grad_norm": 2.796875, - "learning_rate": 0.00024842999999999995, - "loss": 0.0653, + "grad_norm": 1.796875, + "learning_rate": 0.0005230105263157894, + "loss": 0.0844, "num_input_tokens_seen": 112693248, "step": 1720, - "train_runtime": 441.3492, - "train_tokens_per_second": 255338.056 + "train_runtime": 489.0379, + "train_tokens_per_second": 230438.671 }, { "epoch": 0.3742968411942882, - "grad_norm": 2.421875, - "learning_rate": 0.00024812999999999994, - "loss": 0.0623, + "grad_norm": 1.5, + "learning_rate": 0.000522378947368421, + "loss": 0.0771, "num_input_tokens_seen": 113348608, "step": 1730, - "train_runtime": 444.8046, - "train_tokens_per_second": 254827.875 + "train_runtime": 492.8493, + "train_tokens_per_second": 229986.364 }, { "epoch": 0.37646040675032455, - "grad_norm": 1.4609375, - "learning_rate": 0.00024782999999999994, - "loss": 0.0614, + "grad_norm": 1.046875, + "learning_rate": 0.0005217473684210526, + "loss": 0.0761, "num_input_tokens_seen": 114003968, "step": 1740, - "train_runtime": 448.2719, - "train_tokens_per_second": 254318.78 + "train_runtime": 496.6631, + "train_tokens_per_second": 229539.85 }, { "epoch": 0.37862397230636086, - "grad_norm": 2.0625, - "learning_rate": 0.00024753, - "loss": 0.0592, + "grad_norm": 1.4375, + "learning_rate": 0.0005211157894736842, + "loss": 0.0772, "num_input_tokens_seen": 114659328, "step": 1750, - "train_runtime": 451.7596, - "train_tokens_per_second": 253806.048 + "train_runtime": 500.4771, + "train_tokens_per_second": 229100.028 }, { "epoch": 0.3807875378623972, - "grad_norm": 2.8125, - "learning_rate": 0.00024723, - "loss": 0.0647, + "grad_norm": 0.84765625, + "learning_rate": 0.0005204842105263158, + "loss": 0.0767, "num_input_tokens_seen": 115314688, "step": 1760, - "train_runtime": 455.2643, - "train_tokens_per_second": 253291.733 + "train_runtime": 504.2884, + "train_tokens_per_second": 228668.122 }, { "epoch": 0.3829511034184336, - "grad_norm": 2.671875, - "learning_rate": 0.00024692999999999997, - "loss": 0.064, + "grad_norm": 1.1484375, + "learning_rate": 0.0005198526315789474, + "loss": 0.074, "num_input_tokens_seen": 115970048, "step": 1770, - "train_runtime": 459.0917, - "train_tokens_per_second": 252607.594 + "train_runtime": 508.1019, + "train_tokens_per_second": 228241.718 }, { "epoch": 0.38511466897446994, - "grad_norm": 1.546875, - "learning_rate": 0.00024663, - "loss": 0.0559, + "grad_norm": 1.2578125, + "learning_rate": 0.0005192210526315789, + "loss": 0.0737, "num_input_tokens_seen": 116625408, "step": 1780, - "train_runtime": 462.6029, - "train_tokens_per_second": 252106.963 + "train_runtime": 511.9131, + "train_tokens_per_second": 227822.652 }, { "epoch": 0.38727823453050625, - "grad_norm": 2.75, - "learning_rate": 0.00024633, - "loss": 0.0603, + "grad_norm": 1.2890625, + "learning_rate": 0.0005185894736842105, + "loss": 0.0741, "num_input_tokens_seen": 117280768, "step": 1790, - "train_runtime": 466.0637, - "train_tokens_per_second": 251641.092 + "train_runtime": 515.7164, + "train_tokens_per_second": 227413.314 }, { "epoch": 0.3894418000865426, - "grad_norm": 1.4765625, - "learning_rate": 0.00024603, - "loss": 0.0568, + "grad_norm": 1.203125, + "learning_rate": 0.0005179578947368421, + "loss": 0.0703, "num_input_tokens_seen": 117936128, "step": 1800, - "train_runtime": 469.5105, - "train_tokens_per_second": 251189.546 + "train_runtime": 519.5147, + "train_tokens_per_second": 227012.128 }, { "epoch": 0.3894418000865426, - "eval_loss": 0.055454593151807785, - "eval_runtime": 21.4521, - "eval_samples_per_second": 1.492, - "eval_steps_per_second": 0.047, + "eval_loss": 0.05443021282553673, + "eval_runtime": 1.9889, + "eval_samples_per_second": 16.089, + "eval_steps_per_second": 0.503, "num_input_tokens_seen": 117936128, "step": 1800 }, { "epoch": 0.3894418000865426, - "eval_byte_accuracy": 0.9862759643916914, - "eval_chrf": 35.4921586947006, - "eval_sacrebleu": 19.41181540826222, - "eval_word_accuracy": 0.9752747252747253, + "eval_byte_accuracy": 0.9847922848664689, + "eval_chrf": 87.69637221255437, + "eval_sacrebleu": 84.14156310397632, + "eval_word_accuracy": 0.9642857142857143, "num_input_tokens_seen": 117936128, - "perplexity": 1.0570210199897423, + "perplexity": 1.0559387828582605, "step": 1800 }, { "epoch": 0.391605365642579, - "grad_norm": 2.90625, - "learning_rate": 0.00024573, - "loss": 0.0632, + "grad_norm": 1.40625, + "learning_rate": 0.0005173263157894736, + "loss": 0.0786, "num_input_tokens_seen": 118591488, "step": 1810, - "train_runtime": 494.4227, - "train_tokens_per_second": 239858.515 + "train_runtime": 525.3311, + "train_tokens_per_second": 225746.181 }, { "epoch": 0.39376893119861534, - "grad_norm": 2.8125, - "learning_rate": 0.00024543, - "loss": 0.0601, + "grad_norm": 0.67578125, + "learning_rate": 0.0005166947368421052, + "loss": 0.0729, "num_input_tokens_seen": 119246848, "step": 1820, - "train_runtime": 497.8408, - "train_tokens_per_second": 239528.081 + "train_runtime": 529.1476, + "train_tokens_per_second": 225356.508 }, { "epoch": 0.39593249675465164, - "grad_norm": 2.328125, - "learning_rate": 0.00024513, - "loss": 0.0657, + "grad_norm": 1.1640625, + "learning_rate": 0.0005160631578947368, + "loss": 0.0694, "num_input_tokens_seen": 119902208, "step": 1830, - "train_runtime": 501.2829, - "train_tokens_per_second": 239190.719 + "train_runtime": 532.9676, + "train_tokens_per_second": 224970.907 }, { "epoch": 0.398096062310688, - "grad_norm": 2.34375, - "learning_rate": 0.00024482999999999997, - "loss": 0.0595, + "grad_norm": 0.8984375, + "learning_rate": 0.0005154315789473684, + "loss": 0.0746, "num_input_tokens_seen": 120557568, "step": 1840, - "train_runtime": 504.7693, - "train_tokens_per_second": 238836.986 + "train_runtime": 536.7692, + "train_tokens_per_second": 224598.51 }, { "epoch": 0.40025962786672437, - "grad_norm": 2.515625, - "learning_rate": 0.00024453, - "loss": 0.0583, + "grad_norm": 1.2734375, + "learning_rate": 0.0005147999999999999, + "loss": 0.0704, "num_input_tokens_seen": 121212928, "step": 1850, - "train_runtime": 508.2436, - "train_tokens_per_second": 238493.767 + "train_runtime": 540.578, + "train_tokens_per_second": 224228.364 }, { "epoch": 0.4024231934227607, - "grad_norm": 2.5625, - "learning_rate": 0.00024423, - "loss": 0.0595, + "grad_norm": 1.6640625, + "learning_rate": 0.0005141684210526315, + "loss": 0.0699, "num_input_tokens_seen": 121868288, "step": 1860, - "train_runtime": 511.6764, - "train_tokens_per_second": 238174.546 + "train_runtime": 544.3824, + "train_tokens_per_second": 223865.241 }, { "epoch": 0.40458675897879703, - "grad_norm": 1.8046875, - "learning_rate": 0.00024393, - "loss": 0.0612, + "grad_norm": 1.0625, + "learning_rate": 0.0005135368421052631, + "loss": 0.072, "num_input_tokens_seen": 122523648, "step": 1870, - "train_runtime": 515.1872, - "train_tokens_per_second": 237823.548 + "train_runtime": 548.1984, + "train_tokens_per_second": 223502.362 }, { "epoch": 0.4067503245348334, - "grad_norm": 3.25, - "learning_rate": 0.00024363, - "loss": 0.0563, + "grad_norm": 1.28125, + "learning_rate": 0.0005129052631578947, + "loss": 0.0701, "num_input_tokens_seen": 123179008, "step": 1880, - "train_runtime": 518.6146, - "train_tokens_per_second": 237515.504 + "train_runtime": 552.006, + "train_tokens_per_second": 223147.947 }, { "epoch": 0.40891389009086976, - "grad_norm": 2.59375, - "learning_rate": 0.00024333, - "loss": 0.0568, + "grad_norm": 1.0546875, + "learning_rate": 0.0005122736842105263, + "loss": 0.0673, "num_input_tokens_seen": 123834368, "step": 1890, - "train_runtime": 522.0823, - "train_tokens_per_second": 237193.185 + "train_runtime": 555.8224, + "train_tokens_per_second": 222794.848 }, { "epoch": 0.4110774556469061, - "grad_norm": 1.875, - "learning_rate": 0.00024302999999999998, - "loss": 0.0607, + "grad_norm": 1.3046875, + "learning_rate": 0.0005116421052631579, + "loss": 0.074, "num_input_tokens_seen": 124489728, "step": 1900, - "train_runtime": 525.5175, - "train_tokens_per_second": 236889.77 + "train_runtime": 559.6214, + "train_tokens_per_second": 222453.48 }, { "epoch": 0.4110774556469061, - "eval_loss": 0.04269232600927353, - "eval_runtime": 24.8304, - "eval_samples_per_second": 1.289, - "eval_steps_per_second": 0.04, + "eval_loss": 0.05228148028254509, + "eval_runtime": 1.8822, + "eval_samples_per_second": 17.002, + "eval_steps_per_second": 0.531, "num_input_tokens_seen": 124489728, "step": 1900 }, { "epoch": 0.4110774556469061, - "eval_byte_accuracy": 0.9922106824925816, - "eval_chrf": 36.916380880297666, - "eval_sacrebleu": 18.970379455177397, - "eval_word_accuracy": 0.9793956043956044, + "eval_byte_accuracy": 0.9888724035608308, + "eval_chrf": 91.12158224407392, + "eval_sacrebleu": 89.04307319092496, + "eval_word_accuracy": 0.9739010989010989, "num_input_tokens_seen": 124489728, - "perplexity": 1.0436167517186612, + "perplexity": 1.0536722887489947, "step": 1900 }, { "epoch": 0.4132410212029424, - "grad_norm": 1.390625, - "learning_rate": 0.00024273, - "loss": 0.0529, + "grad_norm": 0.9453125, + "learning_rate": 0.0005110105263157895, + "loss": 0.069, "num_input_tokens_seen": 125145088, "step": 1910, - "train_runtime": 553.8076, - "train_tokens_per_second": 225972.13 + "train_runtime": 565.3393, + "train_tokens_per_second": 221362.787 }, { "epoch": 0.4154045867589788, - "grad_norm": 1.921875, - "learning_rate": 0.00024243, - "loss": 0.0558, + "grad_norm": 1.65625, + "learning_rate": 0.000510378947368421, + "loss": 0.0655, "num_input_tokens_seen": 125800448, "step": 1920, - "train_runtime": 557.2378, - "train_tokens_per_second": 225757.211 + "train_runtime": 569.151, + "train_tokens_per_second": 221031.744 }, { "epoch": 0.41756815231501515, - "grad_norm": 1.6953125, - "learning_rate": 0.00024213, - "loss": 0.0597, + "grad_norm": 1.265625, + "learning_rate": 0.0005097473684210526, + "loss": 0.0741, "num_input_tokens_seen": 126455808, "step": 1930, - "train_runtime": 560.7172, - "train_tokens_per_second": 225525.121 + "train_runtime": 572.9578, + "train_tokens_per_second": 220707.034 }, { "epoch": 0.4197317178710515, - "grad_norm": 2.625, - "learning_rate": 0.00024182999999999998, - "loss": 0.0571, + "grad_norm": 0.90625, + "learning_rate": 0.0005091157894736842, + "loss": 0.0697, "num_input_tokens_seen": 127111168, "step": 1940, - "train_runtime": 564.2127, - "train_tokens_per_second": 225289.447 + "train_runtime": 576.7661, + "train_tokens_per_second": 220385.973 }, { "epoch": 0.4218952834270878, - "grad_norm": 2.453125, - "learning_rate": 0.00024153, - "loss": 0.0565, + "grad_norm": 0.78515625, + "learning_rate": 0.0005084842105263157, + "loss": 0.0714, "num_input_tokens_seen": 127766528, "step": 1950, - "train_runtime": 567.6756, - "train_tokens_per_second": 225069.625 + "train_runtime": 580.5827, + "train_tokens_per_second": 220066.025 }, { "epoch": 0.4240588489831242, - "grad_norm": 1.921875, - "learning_rate": 0.00024123, - "loss": 0.0489, + "grad_norm": 1.1953125, + "learning_rate": 0.0005078526315789474, + "loss": 0.0673, "num_input_tokens_seen": 128421888, "step": 1960, - "train_runtime": 571.1934, - "train_tokens_per_second": 224830.834 + "train_runtime": 584.3962, + "train_tokens_per_second": 219751.41 }, { "epoch": 0.42622241453916054, - "grad_norm": 1.5390625, - "learning_rate": 0.00024092999999999999, - "loss": 0.0531, + "grad_norm": 1.1640625, + "learning_rate": 0.0005072210526315789, + "loss": 0.0672, "num_input_tokens_seen": 129077248, "step": 1970, - "train_runtime": 574.6993, - "train_tokens_per_second": 224599.632 + "train_runtime": 588.2047, + "train_tokens_per_second": 219442.719 }, { "epoch": 0.4283859800951969, - "grad_norm": 1.984375, - "learning_rate": 0.00024062999999999998, - "loss": 0.0568, + "grad_norm": 1.0234375, + "learning_rate": 0.0005065894736842105, + "loss": 0.0721, "num_input_tokens_seen": 129732608, "step": 1980, - "train_runtime": 578.1888, - "train_tokens_per_second": 224377.578 + "train_runtime": 592.02, + "train_tokens_per_second": 219135.502 }, { "epoch": 0.4305495456512332, - "grad_norm": 1.390625, - "learning_rate": 0.00024033, - "loss": 0.0527, + "grad_norm": 0.8671875, + "learning_rate": 0.000505957894736842, + "loss": 0.0687, "num_input_tokens_seen": 130387968, "step": 1990, - "train_runtime": 581.7201, - "train_tokens_per_second": 224142.095 + "train_runtime": 595.8415, + "train_tokens_per_second": 218829.967 }, { "epoch": 0.4327131112072696, - "grad_norm": 1.90625, - "learning_rate": 0.00024003, - "loss": 0.0498, + "grad_norm": 1.828125, + "learning_rate": 0.0005053263157894736, + "loss": 0.067, "num_input_tokens_seen": 131043328, "step": 2000, - "train_runtime": 585.1607, - "train_tokens_per_second": 223944.18 + "train_runtime": 599.6411, + "train_tokens_per_second": 218536.272 }, { "epoch": 0.4327131112072696, - "eval_loss": 0.04264825955033302, - "eval_runtime": 24.7584, - "eval_samples_per_second": 1.292, - "eval_steps_per_second": 0.04, + "eval_loss": 0.056328125298023224, + "eval_runtime": 2.0705, + "eval_samples_per_second": 15.455, + "eval_steps_per_second": 0.483, "num_input_tokens_seen": 131043328, "step": 2000 }, { "epoch": 0.4327131112072696, - "eval_byte_accuracy": 0.9914688427299704, - "eval_chrf": 36.328078456405585, - "eval_sacrebleu": 19.433329503471203, - "eval_word_accuracy": 0.978021978021978, + "eval_byte_accuracy": 0.9862759643916914, + "eval_chrf": 88.37329071319488, + "eval_sacrebleu": 86.52596851533677, + "eval_word_accuracy": 0.9684065934065934, "num_input_tokens_seen": 131043328, - "perplexity": 1.043570764237182, + "perplexity": 1.0579447652317715, "step": 2000 }, { "epoch": 0.43487667676330594, - "grad_norm": 1.828125, - "learning_rate": 0.00023972999999999998, - "loss": 0.0521, + "grad_norm": 1.7421875, + "learning_rate": 0.0005046947368421052, + "loss": 0.0674, "num_input_tokens_seen": 131698688, "step": 2010, - "train_runtime": 613.5639, - "train_tokens_per_second": 214645.426 + "train_runtime": 605.5878, + "train_tokens_per_second": 217472.475 }, { "epoch": 0.4370402423193423, - "grad_norm": 1.3984375, - "learning_rate": 0.00023942999999999998, - "loss": 0.052, + "grad_norm": 1.03125, + "learning_rate": 0.0005040631578947368, + "loss": 0.0645, "num_input_tokens_seen": 132354048, "step": 2020, - "train_runtime": 617.0811, - "train_tokens_per_second": 214484.034 + "train_runtime": 609.4048, + "train_tokens_per_second": 217185.764 }, { "epoch": 0.4392038078753786, - "grad_norm": 2.46875, - "learning_rate": 0.00023913, - "loss": 0.0482, + "grad_norm": 0.94140625, + "learning_rate": 0.0005034315789473684, + "loss": 0.064, "num_input_tokens_seen": 133009408, "step": 2030, - "train_runtime": 620.5281, - "train_tokens_per_second": 214348.733 + "train_runtime": 613.218, + "train_tokens_per_second": 216903.949 }, { "epoch": 0.44136737343141497, - "grad_norm": 1.6328125, - "learning_rate": 0.00023883, - "loss": 0.0539, + "grad_norm": 1.21875, + "learning_rate": 0.0005028, + "loss": 0.0673, "num_input_tokens_seen": 133664768, "step": 2040, - "train_runtime": 624.0021, - "train_tokens_per_second": 214205.64 + "train_runtime": 617.0395, + "train_tokens_per_second": 216622.719 }, { "epoch": 0.44353093898745133, - "grad_norm": 2.328125, - "learning_rate": 0.00023852999999999998, - "loss": 0.0472, + "grad_norm": 0.9140625, + "learning_rate": 0.0005021684210526316, + "loss": 0.0627, "num_input_tokens_seen": 134320128, "step": 2050, - "train_runtime": 627.5095, - "train_tokens_per_second": 214052.738 + "train_runtime": 620.8539, + "train_tokens_per_second": 216347.415 }, { "epoch": 0.4456945045434877, - "grad_norm": 2.5625, - "learning_rate": 0.00023823, - "loss": 0.0521, + "grad_norm": 1.2421875, + "learning_rate": 0.0005015368421052631, + "loss": 0.0622, "num_input_tokens_seen": 134975488, "step": 2060, - "train_runtime": 630.9726, - "train_tokens_per_second": 213916.548 + "train_runtime": 624.6588, + "train_tokens_per_second": 216078.733 }, { "epoch": 0.447858070099524, - "grad_norm": 1.3046875, - "learning_rate": 0.00023793, - "loss": 0.0498, + "grad_norm": 1.0390625, + "learning_rate": 0.0005009052631578947, + "loss": 0.0686, "num_input_tokens_seen": 135630848, "step": 2070, - "train_runtime": 634.4375, - "train_tokens_per_second": 213781.256 + "train_runtime": 628.4714, + "train_tokens_per_second": 215810.681 }, { "epoch": 0.45002163565556036, - "grad_norm": 1.65625, - "learning_rate": 0.00023762999999999999, - "loss": 0.0499, + "grad_norm": 0.91015625, + "learning_rate": 0.0005002736842105263, + "loss": 0.0657, "num_input_tokens_seen": 136286208, "step": 2080, - "train_runtime": 637.9511, - "train_tokens_per_second": 213631.107 + "train_runtime": 632.28, + "train_tokens_per_second": 215547.249 }, { "epoch": 0.4521852012115967, - "grad_norm": 2.546875, - "learning_rate": 0.00023732999999999998, - "loss": 0.0499, + "grad_norm": 1.7109375, + "learning_rate": 0.0004996421052631578, + "loss": 0.0664, "num_input_tokens_seen": 136941568, "step": 2090, - "train_runtime": 641.4639, - "train_tokens_per_second": 213482.881 + "train_runtime": 636.0935, + "train_tokens_per_second": 215285.273 }, { "epoch": 0.4543487667676331, - "grad_norm": 1.4453125, - "learning_rate": 0.00023703, - "loss": 0.0501, + "grad_norm": 1.1328125, + "learning_rate": 0.0004990105263157895, + "loss": 0.0735, "num_input_tokens_seen": 137596928, "step": 2100, - "train_runtime": 644.9163, - "train_tokens_per_second": 213356.262 + "train_runtime": 639.9004, + "train_tokens_per_second": 215028.649 }, { "epoch": 0.4543487667676331, - "eval_loss": 0.03759818896651268, - "eval_runtime": 24.0523, - "eval_samples_per_second": 1.33, - "eval_steps_per_second": 0.042, + "eval_loss": 0.05029554292559624, + "eval_runtime": 1.8825, + "eval_samples_per_second": 16.998, + "eval_steps_per_second": 0.531, "num_input_tokens_seen": 137596928, "step": 2100 }, { "epoch": 0.4543487667676331, - "eval_byte_accuracy": 0.9929525222551929, - "eval_chrf": 39.30341162146411, - "eval_sacrebleu": 23.58206339534761, - "eval_word_accuracy": 0.9807692307692307, + "eval_byte_accuracy": 0.9866468842729971, + "eval_chrf": 88.29631077643761, + "eval_sacrebleu": 84.17696197936166, + "eval_word_accuracy": 0.9642857142857143, "num_input_tokens_seen": 137596928, - "perplexity": 1.0383139430497423, + "perplexity": 1.0515818380279285, "step": 2100 }, { "epoch": 0.4565123323236694, - "grad_norm": 2.203125, - "learning_rate": 0.00023673, - "loss": 0.0457, + "grad_norm": 0.9609375, + "learning_rate": 0.000498378947368421, + "loss": 0.0646, "num_input_tokens_seen": 138252288, "step": 2110, - "train_runtime": 672.4818, - "train_tokens_per_second": 205585.188 + "train_runtime": 645.6128, + "train_tokens_per_second": 214141.172 }, { "epoch": 0.45867589787970575, - "grad_norm": 2.375, - "learning_rate": 0.00023642999999999998, - "loss": 0.0551, + "grad_norm": 1.2109375, + "learning_rate": 0.0004977473684210526, + "loss": 0.066, "num_input_tokens_seen": 138907648, "step": 2120, - "train_runtime": 675.9962, - "train_tokens_per_second": 205485.844 + "train_runtime": 649.4219, + "train_tokens_per_second": 213894.317 }, { "epoch": 0.4608394634357421, - "grad_norm": 1.953125, - "learning_rate": 0.00023612999999999998, - "loss": 0.0477, + "grad_norm": 1.1640625, + "learning_rate": 0.0004971157894736841, + "loss": 0.0638, "num_input_tokens_seen": 139563008, "step": 2130, - "train_runtime": 679.5327, - "train_tokens_per_second": 205380.865 + "train_runtime": 653.2367, + "train_tokens_per_second": 213648.434 }, { "epoch": 0.4630030289917785, - "grad_norm": 1.546875, - "learning_rate": 0.00023583, - "loss": 0.0464, + "grad_norm": 0.80078125, + "learning_rate": 0.0004964842105263157, + "loss": 0.0636, "num_input_tokens_seen": 140218368, "step": 2140, - "train_runtime": 683.0548, - "train_tokens_per_second": 205281.291 + "train_runtime": 657.0303, + "train_tokens_per_second": 213412.331 }, { "epoch": 0.4651665945478148, - "grad_norm": 4.25, - "learning_rate": 0.00023553, - "loss": 0.0471, + "grad_norm": 1.6796875, + "learning_rate": 0.0004958526315789473, + "loss": 0.0626, "num_input_tokens_seen": 140873728, "step": 2150, - "train_runtime": 686.5927, - "train_tokens_per_second": 205178.03 + "train_runtime": 660.8452, + "train_tokens_per_second": 213172.068 }, { "epoch": 0.46733016010385114, - "grad_norm": 2.140625, - "learning_rate": 0.00023522999999999998, - "loss": 0.0481, + "grad_norm": 1.2890625, + "learning_rate": 0.0004952210526315789, + "loss": 0.0634, "num_input_tokens_seen": 141529088, "step": 2160, - "train_runtime": 690.137, - "train_tokens_per_second": 205073.915 + "train_runtime": 664.6606, + "train_tokens_per_second": 212934.373 }, { "epoch": 0.4694937256598875, - "grad_norm": 2.453125, - "learning_rate": 0.00023492999999999998, - "loss": 0.0479, + "grad_norm": 1.3203125, + "learning_rate": 0.0004945894736842105, + "loss": 0.0607, "num_input_tokens_seen": 142184448, "step": 2170, - "train_runtime": 693.6614, - "train_tokens_per_second": 204976.731 + "train_runtime": 668.4544, + "train_tokens_per_second": 212706.285 }, { "epoch": 0.47165729121592387, - "grad_norm": 1.9609375, - "learning_rate": 0.00023463, - "loss": 0.0484, + "grad_norm": 0.9375, + "learning_rate": 0.0004939578947368421, + "loss": 0.062, "num_input_tokens_seen": 142839808, "step": 2180, - "train_runtime": 697.1743, - "train_tokens_per_second": 204883.935 + "train_runtime": 672.2623, + "train_tokens_per_second": 212476.311 }, { "epoch": 0.4738208567719602, - "grad_norm": 3.078125, - "learning_rate": 0.00023433, - "loss": 0.0484, + "grad_norm": 1.4765625, + "learning_rate": 0.0004933263157894737, + "loss": 0.0611, "num_input_tokens_seen": 143495168, "step": 2190, - "train_runtime": 700.7155, - "train_tokens_per_second": 204783.769 + "train_runtime": 676.0777, + "train_tokens_per_second": 212246.565 }, { "epoch": 0.47598442232799654, - "grad_norm": 2.1875, - "learning_rate": 0.00023402999999999998, - "loss": 0.0463, + "grad_norm": 0.921875, + "learning_rate": 0.0004926947368421052, + "loss": 0.06, "num_input_tokens_seen": 144150528, "step": 2200, - "train_runtime": 704.2131, - "train_tokens_per_second": 204697.32 + "train_runtime": 679.8922, + "train_tokens_per_second": 212019.69 }, { "epoch": 0.47598442232799654, - "eval_loss": 0.03342563286423683, - "eval_runtime": 23.5085, - "eval_samples_per_second": 1.361, - "eval_steps_per_second": 0.043, + "eval_loss": 0.044721540063619614, + "eval_runtime": 1.9205, + "eval_samples_per_second": 16.663, + "eval_steps_per_second": 0.521, "num_input_tokens_seen": 144150528, "step": 2200 }, { "epoch": 0.47598442232799654, - "eval_byte_accuracy": 0.9925816023738873, - "eval_chrf": 35.574330208408014, - "eval_sacrebleu": 18.432535464931984, - "eval_word_accuracy": 0.9821428571428571, + "eval_byte_accuracy": 0.9896142433234422, + "eval_chrf": 92.35975049219142, + "eval_sacrebleu": 89.92764168467501, + "eval_word_accuracy": 0.9752747252747253, "num_input_tokens_seen": 144150528, - "perplexity": 1.0339905459516716, + "perplexity": 1.0457366236080974, "step": 2200 }, { "epoch": 0.4781479878840329, - "grad_norm": 2.0625, - "learning_rate": 0.00023372999999999997, - "loss": 0.0447, + "grad_norm": 1.0390625, + "learning_rate": 0.0004920631578947368, + "loss": 0.0589, "num_input_tokens_seen": 144805888, "step": 2210, - "train_runtime": 731.2525, - "train_tokens_per_second": 198024.455 + "train_runtime": 685.64, + "train_tokens_per_second": 211198.131 }, { "epoch": 0.48031155344006926, - "grad_norm": 2.265625, - "learning_rate": 0.00023343, - "loss": 0.0451, + "grad_norm": 0.91796875, + "learning_rate": 0.0004914315789473684, + "loss": 0.0596, "num_input_tokens_seen": 145461248, "step": 2220, - "train_runtime": 734.7597, - "train_tokens_per_second": 197971.192 + "train_runtime": 689.4556, + "train_tokens_per_second": 210979.865 }, { "epoch": 0.48247511899610557, - "grad_norm": 1.875, - "learning_rate": 0.00023312999999999999, - "loss": 0.051, + "grad_norm": 0.94140625, + "learning_rate": 0.0004907999999999999, + "loss": 0.0605, "num_input_tokens_seen": 146116608, "step": 2230, - "train_runtime": 738.3073, - "train_tokens_per_second": 197907.578 + "train_runtime": 693.2702, + "train_tokens_per_second": 210764.3 }, { "epoch": 0.48463868455214193, - "grad_norm": 1.8046875, - "learning_rate": 0.00023282999999999998, - "loss": 0.0475, + "grad_norm": 1.2421875, + "learning_rate": 0.0004901684210526316, + "loss": 0.0596, "num_input_tokens_seen": 146771968, "step": 2240, - "train_runtime": 741.7778, - "train_tokens_per_second": 197865.145 + "train_runtime": 697.0767, + "train_tokens_per_second": 210553.54 }, { "epoch": 0.4868022501081783, - "grad_norm": 1.8984375, - "learning_rate": 0.00023252999999999997, - "loss": 0.0475, + "grad_norm": 1.359375, + "learning_rate": 0.0004895368421052631, + "loss": 0.0624, "num_input_tokens_seen": 147427328, "step": 2250, - "train_runtime": 745.266, - "train_tokens_per_second": 197818.393 + "train_runtime": 700.8957, + "train_tokens_per_second": 210341.331 }, { "epoch": 0.4889658156642146, - "grad_norm": 1.90625, - "learning_rate": 0.00023223, - "loss": 0.0416, + "grad_norm": 0.8671875, + "learning_rate": 0.0004889052631578948, + "loss": 0.0576, "num_input_tokens_seen": 148074496, "step": 2260, - "train_runtime": 748.7612, - "train_tokens_per_second": 197759.295 + "train_runtime": 704.6655, + "train_tokens_per_second": 210134.435 }, { "epoch": 0.49112938122025096, - "grad_norm": 1.7109375, - "learning_rate": 0.00023192999999999998, - "loss": 0.0433, + "grad_norm": 0.8515625, + "learning_rate": 0.00048827368421052624, + "loss": 0.0554, "num_input_tokens_seen": 148729856, "step": 2270, - "train_runtime": 752.2694, - "train_tokens_per_second": 197708.239 + "train_runtime": 708.4862, + "train_tokens_per_second": 209926.25 }, { "epoch": 0.4932929467762873, - "grad_norm": 1.890625, - "learning_rate": 0.00023162999999999998, - "loss": 0.0435, + "grad_norm": 1.46875, + "learning_rate": 0.0004876421052631579, + "loss": 0.0587, "num_input_tokens_seen": 149385216, "step": 2280, - "train_runtime": 755.8071, - "train_tokens_per_second": 197649.916 + "train_runtime": 712.2958, + "train_tokens_per_second": 209723.561 }, { "epoch": 0.4954565123323237, - "grad_norm": 1.984375, - "learning_rate": 0.00023132999999999997, - "loss": 0.0494, + "grad_norm": 0.89453125, + "learning_rate": 0.0004870105263157894, + "loss": 0.0612, "num_input_tokens_seen": 150040576, "step": 2290, - "train_runtime": 759.328, - "train_tokens_per_second": 197596.529 + "train_runtime": 716.1094, + "train_tokens_per_second": 209521.872 }, { "epoch": 0.49762007788836, - "grad_norm": 1.3828125, - "learning_rate": 0.00023103, - "loss": 0.0455, + "grad_norm": 1.109375, + "learning_rate": 0.000486378947368421, + "loss": 0.0593, "num_input_tokens_seen": 150695936, "step": 2300, - "train_runtime": 762.8374, - "train_tokens_per_second": 197546.61 + "train_runtime": 719.9311, + "train_tokens_per_second": 209319.935 }, { "epoch": 0.49762007788836, - "eval_loss": 0.03758781775832176, - "eval_runtime": 24.8142, - "eval_samples_per_second": 1.29, - "eval_steps_per_second": 0.04, + "eval_loss": 0.046136274933815, + "eval_runtime": 2.1122, + "eval_samples_per_second": 15.15, + "eval_steps_per_second": 0.473, "num_input_tokens_seen": 150695936, "step": 2300 }, { "epoch": 0.49762007788836, - "eval_byte_accuracy": 0.9933234421364985, - "eval_chrf": 37.716401918882354, - "eval_sacrebleu": 23.661666903937192, - "eval_word_accuracy": 0.9835164835164835, + "eval_byte_accuracy": 0.9881305637982196, + "eval_chrf": 91.02875313945079, + "eval_sacrebleu": 88.18216573136273, + "eval_word_accuracy": 0.9725274725274725, "num_input_tokens_seen": 150695936, - "perplexity": 1.0383031745355127, + "perplexity": 1.0472171106759662, "step": 2300 }, { "epoch": 0.49978364344439635, - "grad_norm": 2.296875, - "learning_rate": 0.00023072999999999998, - "loss": 0.0454, + "grad_norm": 0.84765625, + "learning_rate": 0.0004857473684210526, + "loss": 0.0572, "num_input_tokens_seen": 151351296, "step": 2310, - "train_runtime": 791.1554, - "train_tokens_per_second": 191304.131 + "train_runtime": 725.8732, + "train_tokens_per_second": 208509.263 }, { "epoch": 0.5019472090004327, - "grad_norm": 1.2578125, - "learning_rate": 0.00023042999999999997, - "loss": 0.0446, + "grad_norm": 0.83203125, + "learning_rate": 0.0004851157894736842, + "loss": 0.0607, "num_input_tokens_seen": 152006656, "step": 2320, - "train_runtime": 794.6593, - "train_tokens_per_second": 191285.328 + "train_runtime": 729.6707, + "train_tokens_per_second": 208322.262 }, { "epoch": 0.504110774556469, - "grad_norm": 2.25, - "learning_rate": 0.00023013, - "loss": 0.0543, + "grad_norm": 1.65625, + "learning_rate": 0.00048448421052631576, + "loss": 0.0657, "num_input_tokens_seen": 152662016, "step": 2330, - "train_runtime": 798.1086, - "train_tokens_per_second": 191279.758 + "train_runtime": 733.4793, + "train_tokens_per_second": 208134.061 }, { "epoch": 0.5062743401125054, - "grad_norm": 1.7421875, - "learning_rate": 0.00022983, - "loss": 0.04, + "grad_norm": 0.81640625, + "learning_rate": 0.0004838526315789473, + "loss": 0.057, "num_input_tokens_seen": 153317376, "step": 2340, - "train_runtime": 801.5578, - "train_tokens_per_second": 191274.255 + "train_runtime": 737.291, + "train_tokens_per_second": 207946.884 }, { "epoch": 0.5084379056685417, - "grad_norm": 2.0, - "learning_rate": 0.00022952999999999998, - "loss": 0.0415, + "grad_norm": 0.984375, + "learning_rate": 0.00048322105263157893, + "loss": 0.0573, "num_input_tokens_seen": 153968640, "step": 2350, - "train_runtime": 805.0238, - "train_tokens_per_second": 191259.743 + "train_runtime": 741.0801, + "train_tokens_per_second": 207762.487 }, { "epoch": 0.5106014712245781, - "grad_norm": 2.28125, - "learning_rate": 0.00022922999999999997, - "loss": 0.04, + "grad_norm": 0.83984375, + "learning_rate": 0.00048258947368421046, + "loss": 0.0568, "num_input_tokens_seen": 154624000, "step": 2360, - "train_runtime": 808.5327, - "train_tokens_per_second": 191240.259 + "train_runtime": 744.8993, + "train_tokens_per_second": 207577.062 }, { "epoch": 0.5127650367806145, - "grad_norm": 1.8359375, - "learning_rate": 0.00022893, - "loss": 0.0481, + "grad_norm": 0.875, + "learning_rate": 0.00048195789473684205, + "loss": 0.059, "num_input_tokens_seen": 155279360, "step": 2370, - "train_runtime": 812.0094, - "train_tokens_per_second": 191228.533 + "train_runtime": 748.7137, + "train_tokens_per_second": 207394.832 }, { "epoch": 0.5149286023366508, - "grad_norm": 2.28125, - "learning_rate": 0.00022862999999999998, - "loss": 0.0412, + "grad_norm": 1.1015625, + "learning_rate": 0.00048132631578947364, + "loss": 0.0597, "num_input_tokens_seen": 155934720, "step": 2380, - "train_runtime": 815.5377, - "train_tokens_per_second": 191204.801 + "train_runtime": 752.5173, + "train_tokens_per_second": 207217.454 }, { "epoch": 0.5170921678926872, - "grad_norm": 1.6953125, - "learning_rate": 0.00022832999999999998, - "loss": 0.0454, + "grad_norm": 0.8359375, + "learning_rate": 0.0004806947368421052, + "loss": 0.0583, "num_input_tokens_seen": 156590080, "step": 2390, - "train_runtime": 819.0293, - "train_tokens_per_second": 191189.833 + "train_runtime": 756.326, + "train_tokens_per_second": 207040.443 }, { "epoch": 0.5192557334487234, "grad_norm": 1.359375, - "learning_rate": 0.00022802999999999997, - "loss": 0.0394, + "learning_rate": 0.0004800631578947368, + "loss": 0.0588, "num_input_tokens_seen": 157245440, "step": 2400, - "train_runtime": 822.5352, - "train_tokens_per_second": 191171.685 + "train_runtime": 760.1418, + "train_tokens_per_second": 206863.304 }, { "epoch": 0.5192557334487234, - "eval_loss": 0.0345010943710804, - "eval_runtime": 26.2871, - "eval_samples_per_second": 1.217, - "eval_steps_per_second": 0.038, + "eval_loss": 0.04235806316137314, + "eval_runtime": 1.7624, + "eval_samples_per_second": 18.157, + "eval_steps_per_second": 0.567, "num_input_tokens_seen": 157245440, "step": 2400 }, { "epoch": 0.5192557334487234, - "eval_byte_accuracy": 0.9936943620178041, - "eval_chrf": 38.40417404231697, - "eval_sacrebleu": 22.35942527066413, - "eval_word_accuracy": 0.9862637362637363, + "eval_byte_accuracy": 0.9873887240356083, + "eval_chrf": 90.22196022995097, + "eval_sacrebleu": 86.9543619692272, + "eval_word_accuracy": 0.9697802197802198, "num_input_tokens_seen": 157245440, - "perplexity": 1.0351031611624593, + "perplexity": 1.0432679677071313, "step": 2400 }, { "epoch": 0.5214192990047598, - "grad_norm": 1.9921875, - "learning_rate": 0.00022773, - "loss": 0.0423, + "grad_norm": 1.3359375, + "learning_rate": 0.00047943157894736834, + "loss": 0.0579, "num_input_tokens_seen": 157900800, "step": 2410, - "train_runtime": 852.3148, - "train_tokens_per_second": 185261.117 + "train_runtime": 765.7372, + "train_tokens_per_second": 206207.57 }, { "epoch": 0.5235828645607962, - "grad_norm": 2.796875, - "learning_rate": 0.00022742999999999998, - "loss": 0.0459, + "grad_norm": 1.1875, + "learning_rate": 0.0004788, + "loss": 0.0588, "num_input_tokens_seen": 158556160, "step": 2420, - "train_runtime": 855.7897, - "train_tokens_per_second": 185274.667 + "train_runtime": 769.5519, + "train_tokens_per_second": 206036.994 }, { "epoch": 0.5257464301168325, - "grad_norm": 1.796875, - "learning_rate": 0.00022712999999999998, - "loss": 0.04, + "grad_norm": 1.25, + "learning_rate": 0.0004781684210526315, + "loss": 0.0547, "num_input_tokens_seen": 159211520, "step": 2430, - "train_runtime": 859.2718, - "train_tokens_per_second": 185286.562 + "train_runtime": 773.369, + "train_tokens_per_second": 205867.465 }, { "epoch": 0.5279099956728689, - "grad_norm": 1.3125, - "learning_rate": 0.00022682999999999997, - "loss": 0.0397, + "grad_norm": 1.3984375, + "learning_rate": 0.0004775368421052631, + "loss": 0.0592, "num_input_tokens_seen": 159866880, "step": 2440, - "train_runtime": 862.7419, - "train_tokens_per_second": 185300.926 + "train_runtime": 777.1861, + "train_tokens_per_second": 205699.607 }, { "epoch": 0.5300735612289053, - "grad_norm": 1.6171875, - "learning_rate": 0.00022653, - "loss": 0.0409, + "grad_norm": 1.3125, + "learning_rate": 0.0004769052631578947, + "loss": 0.0546, "num_input_tokens_seen": 160522240, "step": 2450, - "train_runtime": 866.2979, - "train_tokens_per_second": 185296.804 + "train_runtime": 781.0098, + "train_tokens_per_second": 205531.662 }, { "epoch": 0.5322371267849416, - "grad_norm": 1.7421875, - "learning_rate": 0.00022622999999999998, - "loss": 0.0422, + "grad_norm": 0.9375, + "learning_rate": 0.0004762736842105263, + "loss": 0.0537, "num_input_tokens_seen": 161177600, "step": 2460, - "train_runtime": 869.827, - "train_tokens_per_second": 185298.46 + "train_runtime": 784.8261, + "train_tokens_per_second": 205367.271 }, { "epoch": 0.534400692340978, - "grad_norm": 1.546875, - "learning_rate": 0.00022592999999999997, - "loss": 0.0382, + "grad_norm": 0.75390625, + "learning_rate": 0.00047564210526315786, + "loss": 0.0544, "num_input_tokens_seen": 161832960, "step": 2470, - "train_runtime": 873.2951, - "train_tokens_per_second": 185313.023 + "train_runtime": 788.6389, + "train_tokens_per_second": 205205.391 }, { "epoch": 0.5365642578970142, - "grad_norm": 1.6171875, - "learning_rate": 0.00022562999999999997, - "loss": 0.0397, + "grad_norm": 1.28125, + "learning_rate": 0.0004750105263157894, + "loss": 0.0526, "num_input_tokens_seen": 162488320, "step": 2480, - "train_runtime": 876.7487, - "train_tokens_per_second": 185330.547 + "train_runtime": 792.463, + "train_tokens_per_second": 205042.153 }, { "epoch": 0.5387278234530506, - "grad_norm": 1.5390625, - "learning_rate": 0.00022532999999999999, - "loss": 0.0405, + "grad_norm": 1.3828125, + "learning_rate": 0.00047437894736842103, + "loss": 0.058, "num_input_tokens_seen": 163139584, "step": 2490, - "train_runtime": 880.1711, - "train_tokens_per_second": 185349.859 + "train_runtime": 796.2564, + "train_tokens_per_second": 204883.23 }, { "epoch": 0.540891389009087, - "grad_norm": 1.3125, - "learning_rate": 0.00022502999999999998, - "loss": 0.034, + "grad_norm": 0.796875, + "learning_rate": 0.00047374736842105257, + "loss": 0.0521, "num_input_tokens_seen": 163794944, "step": 2500, - "train_runtime": 883.631, - "train_tokens_per_second": 185365.764 + "train_runtime": 800.0752, + "train_tokens_per_second": 204724.427 }, { "epoch": 0.540891389009087, - "eval_loss": 0.02886604145169258, - "eval_runtime": 24.7992, - "eval_samples_per_second": 1.29, - "eval_steps_per_second": 0.04, + "eval_loss": 0.0381394699215889, + "eval_runtime": 1.7699, + "eval_samples_per_second": 18.08, + "eval_steps_per_second": 0.565, "num_input_tokens_seen": 163794944, "step": 2500 }, { "epoch": 0.540891389009087, - "eval_byte_accuracy": 0.9955489614243324, - "eval_chrf": 39.85825386624454, - "eval_sacrebleu": 24.128339489135165, - "eval_word_accuracy": 0.9848901098901099, + "eval_byte_accuracy": 0.9888724035608308, + "eval_chrf": 90.06189872438668, + "eval_sacrebleu": 86.93698181397612, + "eval_word_accuracy": 0.9711538461538461, "num_input_tokens_seen": 163794944, - "perplexity": 1.0292867034869075, + "perplexity": 1.0388761147451957, "step": 2500 }, { "epoch": 0.5430549545651233, - "grad_norm": 1.234375, - "learning_rate": 0.00022472999999999997, - "loss": 0.0387, + "grad_norm": 0.91796875, + "learning_rate": 0.00047311578947368415, + "loss": 0.056, "num_input_tokens_seen": 164450304, "step": 2510, - "train_runtime": 912.1048, - "train_tokens_per_second": 180297.6 + "train_runtime": 805.7206, + "train_tokens_per_second": 204103.396 }, { "epoch": 0.5452185201211597, - "grad_norm": 1.765625, - "learning_rate": 0.00022442999999999996, - "loss": 0.0371, + "grad_norm": 0.9140625, + "learning_rate": 0.00047248421052631574, + "loss": 0.0519, "num_input_tokens_seen": 165105664, "step": 2520, - "train_runtime": 915.6304, - "train_tokens_per_second": 180319.116 + "train_runtime": 809.5424, + "train_tokens_per_second": 203949.384 }, { "epoch": 0.547382085677196, - "grad_norm": 1.8671875, - "learning_rate": 0.00022412999999999998, - "loss": 0.041, + "grad_norm": 0.953125, + "learning_rate": 0.0004718526315789473, + "loss": 0.0548, "num_input_tokens_seen": 165761024, "step": 2530, - "train_runtime": 919.0703, - "train_tokens_per_second": 180357.292 + "train_runtime": 813.357, + "train_tokens_per_second": 203798.62 }, { "epoch": 0.5495456512332324, - "grad_norm": 1.546875, - "learning_rate": 0.00022382999999999998, - "loss": 0.0407, + "grad_norm": 0.83203125, + "learning_rate": 0.0004712210526315789, + "loss": 0.0538, "num_input_tokens_seen": 166412288, "step": 2540, - "train_runtime": 922.4976, - "train_tokens_per_second": 180393.2 + "train_runtime": 817.142, + "train_tokens_per_second": 203651.614 }, { "epoch": 0.5517092167892688, - "grad_norm": 1.4453125, - "learning_rate": 0.00022352999999999997, - "loss": 0.0398, + "grad_norm": 0.6171875, + "learning_rate": 0.0004705894736842105, + "loss": 0.0555, "num_input_tokens_seen": 167067648, "step": 2550, - "train_runtime": 925.9525, - "train_tokens_per_second": 180427.891 + "train_runtime": 820.9524, + "train_tokens_per_second": 203504.673 }, { "epoch": 0.553872782345305, - "grad_norm": 1.2890625, - "learning_rate": 0.00022323, - "loss": 0.0362, + "grad_norm": 0.86328125, + "learning_rate": 0.0004699578947368421, + "loss": 0.0521, "num_input_tokens_seen": 167723008, "step": 2560, - "train_runtime": 929.4541, - "train_tokens_per_second": 180453.24 + "train_runtime": 824.76, + "train_tokens_per_second": 203359.777 }, { "epoch": 0.5560363479013414, - "grad_norm": 1.7734375, - "learning_rate": 0.00022292999999999998, - "loss": 0.0389, + "grad_norm": 1.0, + "learning_rate": 0.0004693263157894736, + "loss": 0.0531, "num_input_tokens_seen": 168378368, "step": 2570, - "train_runtime": 932.9661, - "train_tokens_per_second": 180476.401 + "train_runtime": 828.5662, + "train_tokens_per_second": 203216.547 }, { "epoch": 0.5581999134573777, - "grad_norm": 1.4453125, - "learning_rate": 0.00022262999999999997, - "loss": 0.0409, + "grad_norm": 0.640625, + "learning_rate": 0.0004686947368421052, + "loss": 0.056, "num_input_tokens_seen": 169033728, "step": 2580, - "train_runtime": 936.5053, - "train_tokens_per_second": 180494.149 + "train_runtime": 832.3752, + "train_tokens_per_second": 203073.956 }, { "epoch": 0.5603634790134141, - "grad_norm": 2.515625, - "learning_rate": 0.00022232999999999997, - "loss": 0.039, + "grad_norm": 1.015625, + "learning_rate": 0.0004680631578947368, + "loss": 0.0533, "num_input_tokens_seen": 169684992, "step": 2590, - "train_runtime": 939.9908, - "train_tokens_per_second": 180517.71 + "train_runtime": 836.161, + "train_tokens_per_second": 202933.408 }, { "epoch": 0.5625270445694505, - "grad_norm": 1.1796875, - "learning_rate": 0.00022203, - "loss": 0.0433, + "grad_norm": 0.9921875, + "learning_rate": 0.0004674315789473684, + "loss": 0.0574, "num_input_tokens_seen": 170340352, "step": 2600, - "train_runtime": 943.4525, - "train_tokens_per_second": 180549.996 + "train_runtime": 839.9698, + "train_tokens_per_second": 202793.423 }, { "epoch": 0.5625270445694505, - "eval_loss": 0.026349857449531555, - "eval_runtime": 23.7062, - "eval_samples_per_second": 1.35, - "eval_steps_per_second": 0.042, + "eval_loss": 0.03629929572343826, + "eval_runtime": 1.8211, + "eval_samples_per_second": 17.572, + "eval_steps_per_second": 0.549, "num_input_tokens_seen": 170340352, "step": 2600 }, { "epoch": 0.5625270445694505, - "eval_byte_accuracy": 0.9962908011869436, - "eval_chrf": 39.12453434585508, - "eval_sacrebleu": 22.22162428452978, - "eval_word_accuracy": 0.9876373626373627, + "eval_byte_accuracy": 0.990727002967359, + "eval_chrf": 94.42432169397047, + "eval_sacrebleu": 92.86889065123009, + "eval_word_accuracy": 0.9821428571428571, "num_input_tokens_seen": 170340352, - "perplexity": 1.0267000843195373, + "perplexity": 1.0369661595878308, "step": 2600 }, { "epoch": 0.5646906101254868, - "grad_norm": 1.8515625, - "learning_rate": 0.00022172999999999998, - "loss": 0.0478, + "grad_norm": 1.046875, + "learning_rate": 0.00046679999999999996, + "loss": 0.0553, "num_input_tokens_seen": 170995712, "step": 2610, - "train_runtime": 970.6327, - "train_tokens_per_second": 176169.331 + "train_runtime": 845.6193, + "train_tokens_per_second": 202213.575 }, { "epoch": 0.5668541756815232, - "grad_norm": 1.2109375, - "learning_rate": 0.00022142999999999997, - "loss": 0.0436, + "grad_norm": 1.1015625, + "learning_rate": 0.00046616842105263155, + "loss": 0.0575, "num_input_tokens_seen": 171651072, "step": 2620, - "train_runtime": 974.0862, - "train_tokens_per_second": 176217.538 + "train_runtime": 849.4317, + "train_tokens_per_second": 202077.555 }, { "epoch": 0.5690177412375595, - "grad_norm": 1.9453125, - "learning_rate": 0.00022112999999999996, - "loss": 0.0422, + "grad_norm": 0.859375, + "learning_rate": 0.00046553684210526314, + "loss": 0.0576, "num_input_tokens_seen": 172306432, "step": 2630, - "train_runtime": 977.6045, - "train_tokens_per_second": 176253.714 + "train_runtime": 853.258, + "train_tokens_per_second": 201939.431 }, { "epoch": 0.5711813067935958, - "grad_norm": 2.109375, - "learning_rate": 0.00022082999999999998, - "loss": 0.0376, + "grad_norm": 0.98828125, + "learning_rate": 0.00046490526315789467, + "loss": 0.0529, "num_input_tokens_seen": 172961792, "step": 2640, - "train_runtime": 981.1145, - "train_tokens_per_second": 176291.147 + "train_runtime": 857.0844, + "train_tokens_per_second": 201802.519 }, { "epoch": 0.5733448723496322, - "grad_norm": 1.765625, - "learning_rate": 0.00022052999999999998, - "loss": 0.0379, + "grad_norm": 0.8359375, + "learning_rate": 0.00046427368421052625, + "loss": 0.0499, "num_input_tokens_seen": 173617152, "step": 2650, - "train_runtime": 984.618, - "train_tokens_per_second": 176329.46 + "train_runtime": 860.9023, + "train_tokens_per_second": 201668.821 }, { "epoch": 0.5755084379056685, - "grad_norm": 1.703125, - "learning_rate": 0.00022022999999999997, - "loss": 0.035, + "grad_norm": 1.2734375, + "learning_rate": 0.00046364210526315784, + "loss": 0.0496, "num_input_tokens_seen": 174268416, "step": 2660, - "train_runtime": 988.071, - "train_tokens_per_second": 176372.356 + "train_runtime": 864.7001, + "train_tokens_per_second": 201536.254 }, { "epoch": 0.5776720034617049, - "grad_norm": 1.375, - "learning_rate": 0.00021992999999999996, - "loss": 0.0365, + "grad_norm": 0.7734375, + "learning_rate": 0.0004630105263157894, + "loss": 0.0516, "num_input_tokens_seen": 174923776, "step": 2670, - "train_runtime": 991.5176, - "train_tokens_per_second": 176420.242 + "train_runtime": 868.5118, + "train_tokens_per_second": 201406.34 }, { "epoch": 0.5798355690177412, - "grad_norm": 1.9765625, - "learning_rate": 0.00021962999999999998, - "loss": 0.0428, + "grad_norm": 1.109375, + "learning_rate": 0.000462378947368421, + "loss": 0.0584, "num_input_tokens_seen": 175579136, "step": 2680, - "train_runtime": 994.9781, - "train_tokens_per_second": 176465.328 + "train_runtime": 872.3297, + "train_tokens_per_second": 201276.128 }, { "epoch": 0.5819991345737776, - "grad_norm": 1.859375, - "learning_rate": 0.00021932999999999998, - "loss": 0.0405, + "grad_norm": 0.96875, + "learning_rate": 0.0004617473684210526, + "loss": 0.0504, "num_input_tokens_seen": 176234496, "step": 2690, - "train_runtime": 998.5196, - "train_tokens_per_second": 176495.775 + "train_runtime": 876.1365, + "train_tokens_per_second": 201149.593 }, { "epoch": 0.584162700129814, - "grad_norm": 1.6796875, - "learning_rate": 0.00021902999999999997, - "loss": 0.0444, + "grad_norm": 0.91015625, + "learning_rate": 0.0004611157894736842, + "loss": 0.0586, "num_input_tokens_seen": 176889856, "step": 2700, - "train_runtime": 1002.041, - "train_tokens_per_second": 176529.554 + "train_runtime": 879.9402, + "train_tokens_per_second": 201024.85 }, { "epoch": 0.584162700129814, - "eval_loss": 0.03541192412376404, - "eval_runtime": 25.907, - "eval_samples_per_second": 1.235, - "eval_steps_per_second": 0.039, + "eval_loss": 0.03447730839252472, + "eval_runtime": 1.7192, + "eval_samples_per_second": 18.614, + "eval_steps_per_second": 0.582, "num_input_tokens_seen": 176889856, "step": 2700 }, { "epoch": 0.584162700129814, - "eval_byte_accuracy": 0.9936943620178041, - "eval_chrf": 37.58257718246753, - "eval_sacrebleu": 21.76257523383724, - "eval_word_accuracy": 0.9835164835164835, + "eval_byte_accuracy": 0.990727002967359, + "eval_chrf": 93.44542123318921, + "eval_sacrebleu": 92.43982388879606, + "eval_word_accuracy": 0.9766483516483516, "num_input_tokens_seen": 176889856, - "perplexity": 1.0360463934155333, + "perplexity": 1.0350785405136793, "step": 2700 }, { "epoch": 0.5863262656858503, - "grad_norm": 1.40625, - "learning_rate": 0.00021872999999999996, - "loss": 0.0349, + "grad_norm": 0.76953125, + "learning_rate": 0.0004604842105263157, + "loss": 0.0491, "num_input_tokens_seen": 177545216, "step": 2710, - "train_runtime": 1031.4663, - "train_tokens_per_second": 172128.954 + "train_runtime": 885.4951, + "train_tokens_per_second": 200503.892 }, { "epoch": 0.5884898312418866, - "grad_norm": 1.4921875, - "learning_rate": 0.00021842999999999998, - "loss": 0.0379, + "grad_norm": 1.1875, + "learning_rate": 0.00045985263157894736, + "loss": 0.053, "num_input_tokens_seen": 178200576, "step": 2720, - "train_runtime": 1034.9738, - "train_tokens_per_second": 172178.828 + "train_runtime": 889.3102, + "train_tokens_per_second": 200380.684 }, { "epoch": 0.590653396797923, - "grad_norm": 1.421875, - "learning_rate": 0.00021812999999999997, - "loss": 0.0333, + "grad_norm": 0.95703125, + "learning_rate": 0.0004592210526315789, + "loss": 0.0509, "num_input_tokens_seen": 178855936, "step": 2730, - "train_runtime": 1038.5031, - "train_tokens_per_second": 172224.755 + "train_runtime": 893.1217, + "train_tokens_per_second": 200259.314 }, { "epoch": 0.5928169623539593, - "grad_norm": 1.40625, - "learning_rate": 0.00021782999999999997, - "loss": 0.0355, + "grad_norm": 1.2265625, + "learning_rate": 0.0004585894736842105, + "loss": 0.0489, "num_input_tokens_seen": 179503104, "step": 2740, - "train_runtime": 1041.9584, - "train_tokens_per_second": 172274.739 + "train_runtime": 896.8967, + "train_tokens_per_second": 200137.981 }, { "epoch": 0.5949805279099957, - "grad_norm": 1.09375, - "learning_rate": 0.00021752999999999996, - "loss": 0.0354, + "grad_norm": 0.9453125, + "learning_rate": 0.00045795789473684206, + "loss": 0.0476, "num_input_tokens_seen": 180158464, "step": 2750, - "train_runtime": 1045.48, - "train_tokens_per_second": 172321.298 + "train_runtime": 900.7088, + "train_tokens_per_second": 200018.543 }, { "epoch": 0.597144093466032, - "grad_norm": 0.96875, - "learning_rate": 0.00021722999999999998, - "loss": 0.033, + "grad_norm": 0.80078125, + "learning_rate": 0.00045732631578947365, + "loss": 0.0494, "num_input_tokens_seen": 180813824, "step": 2760, - "train_runtime": 1048.9736, - "train_tokens_per_second": 172372.147 + "train_runtime": 904.5344, + "train_tokens_per_second": 199897.123 }, { "epoch": 0.5993076590220684, - "grad_norm": 1.1328125, - "learning_rate": 0.00021692999999999997, - "loss": 0.0398, + "grad_norm": 0.88671875, + "learning_rate": 0.00045669473684210524, + "loss": 0.0529, "num_input_tokens_seen": 181469184, "step": 2770, - "train_runtime": 1052.4931, - "train_tokens_per_second": 172418.4 + "train_runtime": 908.3444, + "train_tokens_per_second": 199780.16 }, { "epoch": 0.6014712245781048, - "grad_norm": 1.5390625, - "learning_rate": 0.00021662999999999996, - "loss": 0.0324, + "grad_norm": 0.8984375, + "learning_rate": 0.0004560631578947368, + "loss": 0.0503, "num_input_tokens_seen": 182124544, "step": 2780, - "train_runtime": 1055.9569, - "train_tokens_per_second": 172473.46 + "train_runtime": 912.1602, + "train_tokens_per_second": 199662.888 }, { "epoch": 0.6036347901341411, - "grad_norm": 1.6875, - "learning_rate": 0.00021632999999999996, - "loss": 0.0329, + "grad_norm": 1.1875, + "learning_rate": 0.0004554315789473684, + "loss": 0.0477, "num_input_tokens_seen": 182779904, "step": 2790, - "train_runtime": 1059.4531, - "train_tokens_per_second": 172522.884 + "train_runtime": 915.9796, + "train_tokens_per_second": 199545.819 }, { "epoch": 0.6057983556901774, - "grad_norm": 1.296875, - "learning_rate": 0.00021602999999999998, - "loss": 0.0418, + "grad_norm": 1.1484375, + "learning_rate": 0.00045479999999999994, + "loss": 0.0546, "num_input_tokens_seen": 183435264, "step": 2800, - "train_runtime": 1062.9237, - "train_tokens_per_second": 172576.13 + "train_runtime": 919.7965, + "train_tokens_per_second": 199430.278 }, { "epoch": 0.6057983556901774, - "eval_loss": 0.029509739950299263, - "eval_runtime": 26.4538, - "eval_samples_per_second": 1.21, - "eval_steps_per_second": 0.038, + "eval_loss": 0.03884509950876236, + "eval_runtime": 1.7475, + "eval_samples_per_second": 18.312, + "eval_steps_per_second": 0.572, "num_input_tokens_seen": 183435264, "step": 2800 }, { "epoch": 0.6057983556901774, - "eval_byte_accuracy": 0.994807121661721, - "eval_chrf": 40.03666382004858, - "eval_sacrebleu": 25.076596217882095, - "eval_word_accuracy": 0.9862637362637363, + "eval_byte_accuracy": 0.9918397626112759, + "eval_chrf": 93.07953774652597, + "eval_sacrebleu": 91.3351215187772, + "eval_word_accuracy": 0.9807692307692307, "num_input_tokens_seen": 183435264, - "perplexity": 1.029949467079658, + "perplexity": 1.0396094351650327, "step": 2800 }, { "epoch": 0.6079619212462137, - "grad_norm": 1.828125, - "learning_rate": 0.00021572999999999997, - "loss": 0.034, + "grad_norm": 0.63671875, + "learning_rate": 0.00045416842105263153, + "loss": 0.0481, "num_input_tokens_seen": 184090624, "step": 2810, - "train_runtime": 1092.8526, - "train_tokens_per_second": 168449.64 + "train_runtime": 925.3841, + "train_tokens_per_second": 198934.282 }, { "epoch": 0.6101254868022501, - "grad_norm": 1.3671875, - "learning_rate": 0.00021542999999999996, - "loss": 0.0357, + "grad_norm": 0.859375, + "learning_rate": 0.0004535368421052631, + "loss": 0.0503, "num_input_tokens_seen": 184745984, "step": 2820, - "train_runtime": 1096.3098, - "train_tokens_per_second": 168516.225 + "train_runtime": 929.1957, + "train_tokens_per_second": 198823.555 }, { "epoch": 0.6122890523582865, - "grad_norm": 1.2265625, - "learning_rate": 0.00021512999999999998, - "loss": 0.0327, + "grad_norm": 0.63671875, + "learning_rate": 0.0004529052631578947, + "loss": 0.048, "num_input_tokens_seen": 185401344, "step": 2830, - "train_runtime": 1099.7882, - "train_tokens_per_second": 168579.128 + "train_runtime": 933.0099, + "train_tokens_per_second": 198713.165 }, { "epoch": 0.6144526179143228, - "grad_norm": 1.15625, - "learning_rate": 0.00021482999999999997, - "loss": 0.0337, + "grad_norm": 1.109375, + "learning_rate": 0.0004522736842105263, + "loss": 0.0513, "num_input_tokens_seen": 186056704, "step": 2840, - "train_runtime": 1103.2864, - "train_tokens_per_second": 168638.627 + "train_runtime": 936.8218, + "train_tokens_per_second": 198604.161 }, { "epoch": 0.6166161834703592, - "grad_norm": 1.734375, - "learning_rate": 0.00021452999999999997, - "loss": 0.0346, + "grad_norm": 0.8828125, + "learning_rate": 0.0004516421052631579, + "loss": 0.05, "num_input_tokens_seen": 186712064, "step": 2850, - "train_runtime": 1106.8079, - "train_tokens_per_second": 168694.195 + "train_runtime": 940.6403, + "train_tokens_per_second": 198494.647 }, { "epoch": 0.6187797490263955, - "grad_norm": 2.0625, - "learning_rate": 0.00021422999999999996, - "loss": 0.0348, + "grad_norm": 0.70703125, + "learning_rate": 0.00045101052631578946, + "loss": 0.0503, "num_input_tokens_seen": 187367424, "step": 2860, - "train_runtime": 1110.3261, - "train_tokens_per_second": 168749.901 + "train_runtime": 944.4565, + "train_tokens_per_second": 198386.503 }, { "epoch": 0.6209433145824318, - "grad_norm": 1.1875, - "learning_rate": 0.00021392999999999998, - "loss": 0.0375, + "grad_norm": 1.0625, + "learning_rate": 0.000450378947368421, + "loss": 0.0535, "num_input_tokens_seen": 188022784, "step": 2870, - "train_runtime": 1113.8592, - "train_tokens_per_second": 168803.003 + "train_runtime": 948.2753, + "train_tokens_per_second": 198278.689 }, { "epoch": 0.6231068801384682, - "grad_norm": 1.5390625, - "learning_rate": 0.00021362999999999997, - "loss": 0.0343, + "grad_norm": 1.1875, + "learning_rate": 0.0004497473684210526, + "loss": 0.0528, "num_input_tokens_seen": 188678144, "step": 2880, - "train_runtime": 1117.3538, - "train_tokens_per_second": 168861.599 + "train_runtime": 952.0987, + "train_tokens_per_second": 198170.787 }, { "epoch": 0.6252704456945045, - "grad_norm": 1.578125, - "learning_rate": 0.00021332999999999996, - "loss": 0.0349, + "grad_norm": 1.1015625, + "learning_rate": 0.00044911578947368417, + "loss": 0.051, "num_input_tokens_seen": 189333504, "step": 2890, - "train_runtime": 1120.882, - "train_tokens_per_second": 168914.757 + "train_runtime": 955.9177, + "train_tokens_per_second": 198064.645 }, { "epoch": 0.6274340112505409, - "grad_norm": 1.34375, - "learning_rate": 0.00021302999999999996, - "loss": 0.036, + "grad_norm": 0.75, + "learning_rate": 0.00044848421052631575, + "loss": 0.0486, "num_input_tokens_seen": 189988864, "step": 2900, - "train_runtime": 1124.3669, - "train_tokens_per_second": 168974.086 + "train_runtime": 959.7442, + "train_tokens_per_second": 197957.814 }, { "epoch": 0.6274340112505409, - "eval_loss": 0.03246085345745087, - "eval_runtime": 24.7904, - "eval_samples_per_second": 1.291, - "eval_steps_per_second": 0.04, + "eval_loss": 0.0350818894803524, + "eval_runtime": 1.9312, + "eval_samples_per_second": 16.57, + "eval_steps_per_second": 0.518, "num_input_tokens_seen": 189988864, "step": 2900 }, { "epoch": 0.6274340112505409, - "eval_byte_accuracy": 0.9940652818991098, - "eval_chrf": 40.392273871958885, - "eval_sacrebleu": 25.58872424611773, - "eval_word_accuracy": 0.9862637362637363, + "eval_byte_accuracy": 0.9918397626112759, + "eval_chrf": 93.43306198468976, + "eval_sacrebleu": 91.46567212681035, + "eval_word_accuracy": 0.9793956043956044, "num_input_tokens_seen": 189988864, - "perplexity": 1.032993454230247, + "perplexity": 1.0357045186318887, "step": 2900 }, { "epoch": 0.6295975768065772, - "grad_norm": 1.75, - "learning_rate": 0.00021272999999999998, - "loss": 0.0377, + "grad_norm": 0.6796875, + "learning_rate": 0.00044785263157894734, + "loss": 0.0485, "num_input_tokens_seen": 190644224, "step": 2910, - "train_runtime": 1152.6364, - "train_tokens_per_second": 165398.401 + "train_runtime": 965.5126, + "train_tokens_per_second": 197453.898 }, { "epoch": 0.6317611423626136, - "grad_norm": 1.2734375, - "learning_rate": 0.00021242999999999997, - "loss": 0.033, + "grad_norm": 1.015625, + "learning_rate": 0.0004472210526315789, + "loss": 0.0508, "num_input_tokens_seen": 191299584, "step": 2920, - "train_runtime": 1156.1263, - "train_tokens_per_second": 165465.988 + "train_runtime": 969.3212, + "train_tokens_per_second": 197354.183 }, { "epoch": 0.63392470791865, - "grad_norm": 1.6796875, - "learning_rate": 0.00021212999999999996, - "loss": 0.0319, + "grad_norm": 0.91796875, + "learning_rate": 0.0004465894736842105, + "loss": 0.0486, "num_input_tokens_seen": 191954944, "step": 2930, - "train_runtime": 1159.6173, - "train_tokens_per_second": 165533.013 + "train_runtime": 973.1287, + "train_tokens_per_second": 197255.457 }, { "epoch": 0.6360882734746863, - "grad_norm": 1.5546875, - "learning_rate": 0.00021182999999999996, - "loss": 0.031, + "grad_norm": 0.75, + "learning_rate": 0.00044595789473684204, + "loss": 0.0504, "num_input_tokens_seen": 192610304, "step": 2940, - "train_runtime": 1163.1162, - "train_tokens_per_second": 165598.5 + "train_runtime": 976.9568, + "train_tokens_per_second": 197153.346 }, { "epoch": 0.6382518390307226, - "grad_norm": 1.5390625, - "learning_rate": 0.00021152999999999998, - "loss": 0.034, + "grad_norm": 0.76953125, + "learning_rate": 0.00044532631578947363, + "loss": 0.049, "num_input_tokens_seen": 193265664, "step": 2950, - "train_runtime": 1166.6346, - "train_tokens_per_second": 165660.84 + "train_runtime": 980.7648, + "train_tokens_per_second": 197056.071 }, { "epoch": 0.6404154045867589, - "grad_norm": 1.7578125, - "learning_rate": 0.00021122999999999997, - "loss": 0.037, + "grad_norm": 1.2109375, + "learning_rate": 0.0004446947368421052, + "loss": 0.053, "num_input_tokens_seen": 193921024, "step": 2960, - "train_runtime": 1170.1462, - "train_tokens_per_second": 165723.76 + "train_runtime": 984.5799, + "train_tokens_per_second": 196958.138 }, { "epoch": 0.6425789701427953, - "grad_norm": 1.875, - "learning_rate": 0.00021092999999999996, - "loss": 0.037, + "grad_norm": 0.71875, + "learning_rate": 0.0004440631578947368, + "loss": 0.0513, "num_input_tokens_seen": 194576384, "step": 2970, - "train_runtime": 1173.6558, - "train_tokens_per_second": 165786.589 + "train_runtime": 988.3971, + "train_tokens_per_second": 196860.531 }, { "epoch": 0.6447425356988317, - "grad_norm": 1.15625, - "learning_rate": 0.00021062999999999995, - "loss": 0.0361, + "grad_norm": 1.265625, + "learning_rate": 0.0004434315789473684, + "loss": 0.0486, "num_input_tokens_seen": 195231744, "step": 2980, - "train_runtime": 1177.1644, - "train_tokens_per_second": 165849.179 + "train_runtime": 992.2137, + "train_tokens_per_second": 196763.799 }, { "epoch": 0.646906101254868, - "grad_norm": 1.3828125, - "learning_rate": 0.00021032999999999997, - "loss": 0.0336, + "grad_norm": 0.80078125, + "learning_rate": 0.0004428, + "loss": 0.0489, "num_input_tokens_seen": 195887104, "step": 2990, - "train_runtime": 1180.656, - "train_tokens_per_second": 165913.782 + "train_runtime": 996.0352, + "train_tokens_per_second": 196666.855 }, { "epoch": 0.6490696668109044, - "grad_norm": 1.3671875, - "learning_rate": 0.00021002999999999997, - "loss": 0.0335, + "grad_norm": 0.67578125, + "learning_rate": 0.00044216842105263156, + "loss": 0.0482, "num_input_tokens_seen": 196542464, "step": 3000, - "train_runtime": 1184.0733, - "train_tokens_per_second": 165988.425 + "train_runtime": 999.8291, + "train_tokens_per_second": 196576.059 }, { "epoch": 0.6490696668109044, - "eval_loss": 0.02560480311512947, - "eval_runtime": 23.5203, - "eval_samples_per_second": 1.361, - "eval_steps_per_second": 0.043, + "eval_loss": 0.03645855933427811, + "eval_runtime": 1.6979, + "eval_samples_per_second": 18.847, + "eval_steps_per_second": 0.589, "num_input_tokens_seen": 196542464, "step": 3000 }, { "epoch": 0.6490696668109044, - "eval_byte_accuracy": 0.9951780415430267, - "eval_chrf": 38.93681514257257, - "eval_sacrebleu": 23.254554029771697, - "eval_word_accuracy": 0.9862637362637363, + "eval_byte_accuracy": 0.9903560830860534, + "eval_chrf": 92.59396939397627, + "eval_sacrebleu": 88.68557212958412, + "eval_word_accuracy": 0.978021978021978, "num_input_tokens_seen": 196542464, - "perplexity": 1.0259354218644945, + "perplexity": 1.037131323714694, "step": 3000 }, { "epoch": 0.6512332323669408, - "grad_norm": 1.1015625, - "learning_rate": 0.00020972999999999999, - "loss": 0.0325, + "grad_norm": 0.85546875, + "learning_rate": 0.00044153684210526315, + "loss": 0.0494, "num_input_tokens_seen": 197197824, "step": 3010, - "train_runtime": 1211.2085, - "train_tokens_per_second": 162810.798 + "train_runtime": 1005.4045, + "train_tokens_per_second": 196137.805 }, { "epoch": 0.6533967979229771, - "grad_norm": 1.5703125, - "learning_rate": 0.00020943, - "loss": 0.034, + "grad_norm": 1.015625, + "learning_rate": 0.0004409052631578947, + "loss": 0.0502, "num_input_tokens_seen": 197853184, "step": 3020, - "train_runtime": 1214.7134, - "train_tokens_per_second": 162880.543 + "train_runtime": 1009.221, + "train_tokens_per_second": 196045.449 }, { "epoch": 0.6555603634790134, - "grad_norm": 1.640625, - "learning_rate": 0.00020913, - "loss": 0.0385, + "grad_norm": 1.0390625, + "learning_rate": 0.00044027368421052627, + "loss": 0.0537, "num_input_tokens_seen": 198508544, "step": 3030, - "train_runtime": 1218.2286, - "train_tokens_per_second": 162948.513 + "train_runtime": 1013.0394, + "train_tokens_per_second": 195953.429 }, { "epoch": 0.6577239290350497, - "grad_norm": 1.375, - "learning_rate": 0.00020883, - "loss": 0.0325, + "grad_norm": 0.81640625, + "learning_rate": 0.00043964210526315785, + "loss": 0.0479, "num_input_tokens_seen": 199163904, "step": 3040, - "train_runtime": 1221.7006, - "train_tokens_per_second": 163021.857 + "train_runtime": 1016.8572, + "train_tokens_per_second": 195862.212 }, { "epoch": 0.6598874945910861, - "grad_norm": 1.703125, - "learning_rate": 0.00020852999999999998, - "loss": 0.0326, + "grad_norm": 0.7265625, + "learning_rate": 0.00043901052631578944, + "loss": 0.046, "num_input_tokens_seen": 199819264, "step": 3050, - "train_runtime": 1225.1799, - "train_tokens_per_second": 163093.809 + "train_runtime": 1020.671, + "train_tokens_per_second": 195772.461 }, { "epoch": 0.6620510601471224, - "grad_norm": 2.265625, - "learning_rate": 0.00020823, - "loss": 0.0319, + "grad_norm": 0.859375, + "learning_rate": 0.000438378947368421, + "loss": 0.0456, "num_input_tokens_seen": 200474624, "step": 3060, - "train_runtime": 1228.6518, - "train_tokens_per_second": 163166.343 + "train_runtime": 1024.488, + "train_tokens_per_second": 195682.737 }, { "epoch": 0.6642146257031588, - "grad_norm": 1.171875, - "learning_rate": 0.00020793, - "loss": 0.0356, + "grad_norm": 0.984375, + "learning_rate": 0.0004377473684210526, + "loss": 0.048, "num_input_tokens_seen": 201129984, "step": 3070, - "train_runtime": 1232.0823, - "train_tokens_per_second": 163243.957 + "train_runtime": 1028.3086, + "train_tokens_per_second": 195593.021 }, { "epoch": 0.6663781912591952, - "grad_norm": 1.46875, - "learning_rate": 0.00020763, - "loss": 0.0325, + "grad_norm": 1.0390625, + "learning_rate": 0.0004371157894736842, + "loss": 0.0495, "num_input_tokens_seen": 201785344, "step": 3080, - "train_runtime": 1235.5476, - "train_tokens_per_second": 163316.526 + "train_runtime": 1032.1272, + "train_tokens_per_second": 195504.331 }, { "epoch": 0.6685417568152315, - "grad_norm": 1.1953125, - "learning_rate": 0.00020733, - "loss": 0.0313, + "grad_norm": 0.796875, + "learning_rate": 0.00043648421052631573, + "loss": 0.0493, "num_input_tokens_seen": 202440704, "step": 3090, - "train_runtime": 1239.0389, - "train_tokens_per_second": 163385.268 + "train_runtime": 1035.9438, + "train_tokens_per_second": 195416.689 }, { "epoch": 0.6707053223712679, - "grad_norm": 1.3125, - "learning_rate": 0.00020703, - "loss": 0.0296, + "grad_norm": 0.92578125, + "learning_rate": 0.0004358526315789473, + "loss": 0.046, "num_input_tokens_seen": 203096064, "step": 3100, - "train_runtime": 1242.5526, - "train_tokens_per_second": 163450.674 + "train_runtime": 1039.7635, + "train_tokens_per_second": 195329.092 }, { "epoch": 0.6707053223712679, - "eval_loss": 0.022955331951379776, - "eval_runtime": 27.7072, - "eval_samples_per_second": 1.155, - "eval_steps_per_second": 0.036, + "eval_loss": 0.03350326791405678, + "eval_runtime": 1.8809, + "eval_samples_per_second": 17.013, + "eval_steps_per_second": 0.532, "num_input_tokens_seen": 203096064, "step": 3100 }, { "epoch": 0.6707053223712679, - "eval_byte_accuracy": 0.9966617210682492, - "eval_chrf": 39.1323808390993, - "eval_sacrebleu": 23.889620809402665, - "eval_word_accuracy": 0.9903846153846154, + "eval_byte_accuracy": 0.9918397626112759, + "eval_chrf": 93.82444893287017, + "eval_sacrebleu": 91.25242885956851, + "eval_word_accuracy": 0.9807692307692307, "num_input_tokens_seen": 203096064, - "perplexity": 1.0232208332484907, + "perplexity": 1.034070822975335, "step": 3100 }, { "epoch": 0.6728688879273041, - "grad_norm": 1.375, - "learning_rate": 0.00020673, - "loss": 0.0319, + "grad_norm": 0.71484375, + "learning_rate": 0.0004352210526315789, + "loss": 0.0462, "num_input_tokens_seen": 203751424, "step": 3110, - "train_runtime": 1273.7393, - "train_tokens_per_second": 159963.203 + "train_runtime": 1045.4737, + "train_tokens_per_second": 194889.098 }, { "epoch": 0.6750324534833405, - "grad_norm": 0.96875, - "learning_rate": 0.00020643, - "loss": 0.0317, + "grad_norm": 0.74609375, + "learning_rate": 0.0004345894736842105, + "loss": 0.048, "num_input_tokens_seen": 204406784, "step": 3120, - "train_runtime": 1277.2239, - "train_tokens_per_second": 160039.898 + "train_runtime": 1049.2876, + "train_tokens_per_second": 194805.295 }, { "epoch": 0.6771960190393769, - "grad_norm": 1.2109375, - "learning_rate": 0.00020613, - "loss": 0.03, + "grad_norm": 1.03125, + "learning_rate": 0.0004339578947368421, + "loss": 0.0468, "num_input_tokens_seen": 205062144, "step": 3130, - "train_runtime": 1280.6722, - "train_tokens_per_second": 160120.714 + "train_runtime": 1053.1102, + "train_tokens_per_second": 194720.49 }, { "epoch": 0.6793595845954132, - "grad_norm": 1.0703125, - "learning_rate": 0.00020583, - "loss": 0.0323, + "grad_norm": 1.125, + "learning_rate": 0.00043332631578947366, + "loss": 0.048, "num_input_tokens_seen": 205717504, "step": 3140, - "train_runtime": 1284.153, - "train_tokens_per_second": 160197.037 + "train_runtime": 1056.9229, + "train_tokens_per_second": 194638.144 }, { "epoch": 0.6815231501514496, - "grad_norm": 1.6171875, - "learning_rate": 0.00020553, - "loss": 0.03, + "grad_norm": 0.91796875, + "learning_rate": 0.00043269473684210525, + "loss": 0.0455, "num_input_tokens_seen": 206372864, "step": 3150, - "train_runtime": 1287.6625, - "train_tokens_per_second": 160269.369 + "train_runtime": 1060.7403, + "train_tokens_per_second": 194555.501 }, { "epoch": 0.683686715707486, - "grad_norm": 0.87890625, - "learning_rate": 0.00020522999999999998, - "loss": 0.0282, + "grad_norm": 0.89453125, + "learning_rate": 0.0004320631578947368, + "loss": 0.0442, "num_input_tokens_seen": 207028224, "step": 3160, - "train_runtime": 1291.1639, - "train_tokens_per_second": 160342.328 + "train_runtime": 1064.5576, + "train_tokens_per_second": 194473.488 }, { "epoch": 0.6858502812635223, - "grad_norm": 2.625, - "learning_rate": 0.00020493, - "loss": 0.0314, + "grad_norm": 1.4921875, + "learning_rate": 0.00043143157894736837, + "loss": 0.0472, "num_input_tokens_seen": 207683584, "step": 3170, - "train_runtime": 1294.6195, - "train_tokens_per_second": 160420.563 + "train_runtime": 1068.3716, + "train_tokens_per_second": 194392.647 }, { "epoch": 0.6880138468195587, - "grad_norm": 1.3125, - "learning_rate": 0.00020463, - "loss": 0.0312, + "grad_norm": 0.61328125, + "learning_rate": 0.00043079999999999995, + "loss": 0.0466, "num_input_tokens_seen": 208338944, "step": 3180, - "train_runtime": 1298.089, - "train_tokens_per_second": 160496.655 + "train_runtime": 1072.1806, + "train_tokens_per_second": 194313.287 }, { "epoch": 0.6901774123755949, - "grad_norm": 1.6171875, - "learning_rate": 0.00020433, - "loss": 0.0282, + "grad_norm": 1.1171875, + "learning_rate": 0.00043016842105263154, + "loss": 0.0433, "num_input_tokens_seen": 208994304, "step": 3190, - "train_runtime": 1301.6091, - "train_tokens_per_second": 160566.104 + "train_runtime": 1075.9997, + "train_tokens_per_second": 194232.684 }, { "epoch": 0.6923409779316313, - "grad_norm": 1.359375, - "learning_rate": 0.00020402999999999998, - "loss": 0.0286, + "grad_norm": 1.0546875, + "learning_rate": 0.00042953684210526313, + "loss": 0.0475, "num_input_tokens_seen": 209649664, "step": 3200, - "train_runtime": 1305.0539, - "train_tokens_per_second": 160644.454 + "train_runtime": 1079.7767, + "train_tokens_per_second": 194160.206 }, { "epoch": 0.6923409779316313, - "eval_loss": 0.02385401539504528, - "eval_runtime": 26.4763, - "eval_samples_per_second": 1.209, - "eval_steps_per_second": 0.038, + "eval_loss": 0.03217955678701401, + "eval_runtime": 1.7478, + "eval_samples_per_second": 18.309, + "eval_steps_per_second": 0.572, "num_input_tokens_seen": 209649664, "step": 3200 }, { "epoch": 0.6923409779316313, - "eval_byte_accuracy": 0.9966617210682492, - "eval_chrf": 40.01441753803777, - "eval_sacrebleu": 23.17121626781889, - "eval_word_accuracy": 0.989010989010989, + "eval_byte_accuracy": 0.9922106824925816, + "eval_chrf": 93.56755109473936, + "eval_sacrebleu": 92.12472781620804, + "eval_word_accuracy": 0.9807692307692307, "num_input_tokens_seen": 209649664, - "perplexity": 1.0241407981872583, + "perplexity": 1.0327029174764155, "step": 3200 }, { "epoch": 0.6945045434876677, - "grad_norm": 1.53125, - "learning_rate": 0.00020373, - "loss": 0.0291, + "grad_norm": 0.65234375, + "learning_rate": 0.0004289052631578947, + "loss": 0.0452, "num_input_tokens_seen": 210305024, "step": 3210, - "train_runtime": 1335.0231, - "train_tokens_per_second": 157529.127 + "train_runtime": 1085.3612, + "train_tokens_per_second": 193765.005 }, { "epoch": 0.696668109043704, - "grad_norm": 1.0703125, - "learning_rate": 0.00020343, - "loss": 0.0379, + "grad_norm": 0.6796875, + "learning_rate": 0.0004282736842105263, + "loss": 0.0485, "num_input_tokens_seen": 210960384, "step": 3220, - "train_runtime": 1338.5012, - "train_tokens_per_second": 157609.413 + "train_runtime": 1089.1789, + "train_tokens_per_second": 193687.542 }, { "epoch": 0.6988316745997404, - "grad_norm": 1.0390625, - "learning_rate": 0.00020313, - "loss": 0.0319, + "grad_norm": 0.498046875, + "learning_rate": 0.00042764210526315783, + "loss": 0.0449, "num_input_tokens_seen": 211615744, "step": 3230, - "train_runtime": 1342.0219, - "train_tokens_per_second": 157684.265 + "train_runtime": 1092.9778, + "train_tokens_per_second": 193613.946 }, { "epoch": 0.7009952401557767, - "grad_norm": 1.6328125, - "learning_rate": 0.00020282999999999998, - "loss": 0.0326, + "grad_norm": 0.59375, + "learning_rate": 0.0004270105263157895, + "loss": 0.0459, "num_input_tokens_seen": 212271104, "step": 3240, - "train_runtime": 1345.5147, - "train_tokens_per_second": 157762.012 + "train_runtime": 1096.795, + "train_tokens_per_second": 193537.622 }, { "epoch": 0.7031588057118131, - "grad_norm": 1.59375, - "learning_rate": 0.00020253, - "loss": 0.0311, + "grad_norm": 0.81640625, + "learning_rate": 0.000426378947368421, + "loss": 0.0432, "num_input_tokens_seen": 212926464, "step": 3250, - "train_runtime": 1348.9672, - "train_tokens_per_second": 157844.06 + "train_runtime": 1100.6057, + "train_tokens_per_second": 193462.988 }, { "epoch": 0.7053223712678495, - "grad_norm": 1.5, - "learning_rate": 0.00020223, - "loss": 0.0286, + "grad_norm": 0.96875, + "learning_rate": 0.0004257473684210526, + "loss": 0.0419, "num_input_tokens_seen": 213581824, "step": 3260, - "train_runtime": 1352.4636, - "train_tokens_per_second": 157920.567 + "train_runtime": 1104.4101, + "train_tokens_per_second": 193389.955 }, { "epoch": 0.7074859368238857, - "grad_norm": 1.40625, - "learning_rate": 0.00020192999999999999, - "loss": 0.0301, + "grad_norm": 0.9609375, + "learning_rate": 0.0004251157894736842, + "loss": 0.0461, "num_input_tokens_seen": 214237184, "step": 3270, - "train_runtime": 1355.9021, - "train_tokens_per_second": 158003.426 + "train_runtime": 1108.2216, + "train_tokens_per_second": 193316.195 }, { "epoch": 0.7096495023799221, - "grad_norm": 1.7109375, - "learning_rate": 0.00020162999999999998, - "loss": 0.0312, + "grad_norm": 0.84765625, + "learning_rate": 0.00042448421052631576, + "loss": 0.0453, "num_input_tokens_seen": 214892544, "step": 3280, - "train_runtime": 1359.3505, - "train_tokens_per_second": 158084.722 + "train_runtime": 1112.0361, + "train_tokens_per_second": 193242.412 }, { "epoch": 0.7118130679359584, - "grad_norm": 0.87109375, - "learning_rate": 0.00020133, - "loss": 0.0323, + "grad_norm": 0.69140625, + "learning_rate": 0.00042385263157894735, + "loss": 0.0491, "num_input_tokens_seen": 215547904, "step": 3290, - "train_runtime": 1362.8441, - "train_tokens_per_second": 158160.351 + "train_runtime": 1115.8415, + "train_tokens_per_second": 193170.723 }, { "epoch": 0.7139766334919948, - "grad_norm": 1.21875, - "learning_rate": 0.00020103, - "loss": 0.0282, + "grad_norm": 0.71484375, + "learning_rate": 0.0004232210526315789, + "loss": 0.0456, "num_input_tokens_seen": 216203264, "step": 3300, - "train_runtime": 1366.3222, - "train_tokens_per_second": 158237.397 + "train_runtime": 1119.6565, + "train_tokens_per_second": 193097.858 }, { "epoch": 0.7139766334919948, - "eval_loss": 0.025436140596866608, - "eval_runtime": 25.3687, - "eval_samples_per_second": 1.261, - "eval_steps_per_second": 0.039, + "eval_loss": 0.032814886420965195, + "eval_runtime": 1.5657, + "eval_samples_per_second": 20.439, + "eval_steps_per_second": 0.639, "num_input_tokens_seen": 216203264, "step": 3300 }, { "epoch": 0.7139766334919948, - "eval_byte_accuracy": 0.995919881305638, - "eval_chrf": 39.616154045771374, - "eval_sacrebleu": 22.11652869007284, - "eval_word_accuracy": 0.9876373626373627, + "eval_byte_accuracy": 0.990727002967359, + "eval_chrf": 92.9667765910088, + "eval_sacrebleu": 91.00055209302626, + "eval_word_accuracy": 0.9793956043956044, "num_input_tokens_seen": 216203264, - "perplexity": 1.025762399604263, + "perplexity": 1.0333592327091379, "step": 3300 }, { "epoch": 0.7161401990480312, - "grad_norm": 1.265625, - "learning_rate": 0.00020072999999999998, - "loss": 0.031, + "grad_norm": 0.8984375, + "learning_rate": 0.0004225894736842105, + "loss": 0.0481, "num_input_tokens_seen": 216858624, "step": 3310, - "train_runtime": 1395.2072, - "train_tokens_per_second": 155431.127 + "train_runtime": 1125.0552, + "train_tokens_per_second": 192753.759 }, { "epoch": 0.7183037646040675, - "grad_norm": 1.859375, - "learning_rate": 0.00020043, - "loss": 0.0358, + "grad_norm": 1.1875, + "learning_rate": 0.00042195789473684206, + "loss": 0.0485, "num_input_tokens_seen": 217513984, "step": 3320, - "train_runtime": 1398.7123, - "train_tokens_per_second": 155510.165 + "train_runtime": 1128.8657, + "train_tokens_per_second": 192683.671 }, { "epoch": 0.7204673301601039, - "grad_norm": 1.1328125, - "learning_rate": 0.00020013, - "loss": 0.0294, + "grad_norm": 0.8203125, + "learning_rate": 0.00042132631578947364, + "loss": 0.045, "num_input_tokens_seen": 218169344, "step": 3330, - "train_runtime": 1402.2031, - "train_tokens_per_second": 155590.397 + "train_runtime": 1132.6806, + "train_tokens_per_second": 192613.289 }, { "epoch": 0.7226308957161403, - "grad_norm": 1.328125, - "learning_rate": 0.00019983, - "loss": 0.0267, + "grad_norm": 0.96484375, + "learning_rate": 0.00042069473684210523, + "loss": 0.0433, "num_input_tokens_seen": 218824704, "step": 3340, - "train_runtime": 1405.7206, - "train_tokens_per_second": 155667.28 + "train_runtime": 1136.5047, + "train_tokens_per_second": 192541.834 }, { "epoch": 0.7247944612721765, - "grad_norm": 1.1875, - "learning_rate": 0.00019952999999999998, - "loss": 0.033, + "grad_norm": 0.66796875, + "learning_rate": 0.0004200631578947368, + "loss": 0.0425, "num_input_tokens_seen": 219480064, "step": 3350, - "train_runtime": 1409.2099, - "train_tokens_per_second": 155746.896 + "train_runtime": 1140.3158, + "train_tokens_per_second": 192473.056 }, { "epoch": 0.7269580268282129, - "grad_norm": 1.3828125, - "learning_rate": 0.00019923, - "loss": 0.0302, + "grad_norm": 0.609375, + "learning_rate": 0.0004194315789473684, + "loss": 0.0448, "num_input_tokens_seen": 220135424, "step": 3360, - "train_runtime": 1412.6924, - "train_tokens_per_second": 155826.859 + "train_runtime": 1144.1264, + "train_tokens_per_second": 192404.804 }, { "epoch": 0.7291215923842492, - "grad_norm": 1.4375, - "learning_rate": 0.00019893, - "loss": 0.0311, + "grad_norm": 0.76171875, + "learning_rate": 0.00041879999999999993, + "loss": 0.0478, "num_input_tokens_seen": 220790784, "step": 3370, - "train_runtime": 1416.2124, - "train_tokens_per_second": 155902.314 + "train_runtime": 1147.9385, + "train_tokens_per_second": 192336.766 }, { "epoch": 0.7312851579402856, - "grad_norm": 1.296875, - "learning_rate": 0.00019863, - "loss": 0.0279, + "grad_norm": 0.734375, + "learning_rate": 0.0004181684210526316, + "loss": 0.0436, "num_input_tokens_seen": 221446144, "step": 3380, - "train_runtime": 1419.7407, - "train_tokens_per_second": 155976.474 + "train_runtime": 1151.7449, + "train_tokens_per_second": 192270.137 }, { "epoch": 0.733448723496322, - "grad_norm": 0.9765625, - "learning_rate": 0.00019832999999999998, - "loss": 0.0257, + "grad_norm": 0.71875, + "learning_rate": 0.0004175368421052631, + "loss": 0.0415, "num_input_tokens_seen": 222101504, "step": 3390, - "train_runtime": 1423.2379, - "train_tokens_per_second": 156053.671 + "train_runtime": 1155.5548, + "train_tokens_per_second": 192203.356 }, { "epoch": 0.7356122890523583, - "grad_norm": 1.6484375, - "learning_rate": 0.00019803, - "loss": 0.0276, + "grad_norm": 1.2890625, + "learning_rate": 0.0004169052631578947, + "loss": 0.041, "num_input_tokens_seen": 222756864, "step": 3400, - "train_runtime": 1426.7086, - "train_tokens_per_second": 156133.402 + "train_runtime": 1159.3705, + "train_tokens_per_second": 192136.047 }, { "epoch": 0.7356122890523583, - "eval_loss": 0.026543285697698593, - "eval_runtime": 30.8543, - "eval_samples_per_second": 1.037, - "eval_steps_per_second": 0.032, + "eval_loss": 0.03272933140397072, + "eval_runtime": 1.4683, + "eval_samples_per_second": 21.794, + "eval_steps_per_second": 0.681, "num_input_tokens_seen": 222756864, "step": 3400 }, { "epoch": 0.7356122890523583, - "eval_byte_accuracy": 0.995919881305638, - "eval_chrf": 40.13433174156631, - "eval_sacrebleu": 24.091544753238622, - "eval_word_accuracy": 0.989010989010989, + "eval_byte_accuracy": 0.9918397626112759, + "eval_chrf": 94.39060549038757, + "eval_sacrebleu": 92.48709733402343, + "eval_word_accuracy": 0.9793956043956044, "num_input_tokens_seen": 222756864, - "perplexity": 1.026898696326207, + "perplexity": 1.0332708274242337, "step": 3400 }, { "epoch": 0.7377758546083947, - "grad_norm": 1.5859375, - "learning_rate": 0.00019773, - "loss": 0.0279, + "grad_norm": 0.9609375, + "learning_rate": 0.0004162736842105263, + "loss": 0.0415, "num_input_tokens_seen": 223412224, "step": 3410, - "train_runtime": 1461.0104, - "train_tokens_per_second": 152916.244 + "train_runtime": 1164.6558, + "train_tokens_per_second": 191826.826 }, { "epoch": 0.7399394201644309, - "grad_norm": 1.0234375, - "learning_rate": 0.00019742999999999999, - "loss": 0.0287, + "grad_norm": 0.75390625, + "learning_rate": 0.00041564210526315787, + "loss": 0.0435, "num_input_tokens_seen": 224067584, "step": 3420, - "train_runtime": 1464.4636, - "train_tokens_per_second": 153003.181 + "train_runtime": 1168.4717, + "train_tokens_per_second": 191761.234 }, { "epoch": 0.7421029857204673, - "grad_norm": 1.265625, - "learning_rate": 0.00019712999999999998, - "loss": 0.0286, + "grad_norm": 0.7421875, + "learning_rate": 0.00041501052631578945, + "loss": 0.0429, "num_input_tokens_seen": 224722944, "step": 3430, - "train_runtime": 1467.9702, - "train_tokens_per_second": 153084.133 + "train_runtime": 1172.2982, + "train_tokens_per_second": 191694.351 }, { "epoch": 0.7442665512765037, - "grad_norm": 1.9921875, - "learning_rate": 0.00019683, - "loss": 0.0302, + "grad_norm": 1.1875, + "learning_rate": 0.000414378947368421, + "loss": 0.0444, "num_input_tokens_seen": 225378304, "step": 3440, - "train_runtime": 1471.4411, - "train_tokens_per_second": 153168.412 + "train_runtime": 1176.1143, + "train_tokens_per_second": 191629.6 }, { "epoch": 0.74643011683254, - "grad_norm": 1.0625, - "learning_rate": 0.00019653, - "loss": 0.0285, + "grad_norm": 0.76171875, + "learning_rate": 0.0004137473684210526, + "loss": 0.0467, "num_input_tokens_seen": 226033664, "step": 3450, - "train_runtime": 1474.8996, - "train_tokens_per_second": 153253.595 + "train_runtime": 1179.9278, + "train_tokens_per_second": 191565.67 }, { "epoch": 0.7485936823885764, - "grad_norm": 1.140625, - "learning_rate": 0.00019622999999999998, - "loss": 0.0266, + "grad_norm": 0.765625, + "learning_rate": 0.00041311578947368416, + "loss": 0.042, "num_input_tokens_seen": 226684928, "step": 3460, - "train_runtime": 1478.3876, - "train_tokens_per_second": 153332.537 + "train_runtime": 1183.7227, + "train_tokens_per_second": 191501.723 }, { "epoch": 0.7507572479446127, - "grad_norm": 1.0625, - "learning_rate": 0.00019592999999999998, - "loss": 0.0345, + "grad_norm": 0.6796875, + "learning_rate": 0.0004124842105263158, + "loss": 0.0433, "num_input_tokens_seen": 227340288, "step": 3470, - "train_runtime": 1481.906, - "train_tokens_per_second": 153410.74 + "train_runtime": 1187.5357, + "train_tokens_per_second": 191438.691 }, { "epoch": 0.7529208135006491, - "grad_norm": 1.1484375, - "learning_rate": 0.00019563, - "loss": 0.0255, + "grad_norm": 0.7265625, + "learning_rate": 0.00041185263157894733, + "loss": 0.0386, "num_input_tokens_seen": 227995648, "step": 3480, - "train_runtime": 1485.4058, - "train_tokens_per_second": 153490.475 + "train_runtime": 1191.3433, + "train_tokens_per_second": 191376.943 }, { "epoch": 0.7550843790566855, - "grad_norm": 1.15625, - "learning_rate": 0.00019533, - "loss": 0.0286, + "grad_norm": 0.64453125, + "learning_rate": 0.0004112210526315789, + "loss": 0.042, "num_input_tokens_seen": 228651008, "step": 3490, - "train_runtime": 1488.8839, - "train_tokens_per_second": 153572.089 + "train_runtime": 1195.1587, + "train_tokens_per_second": 191314.349 }, { "epoch": 0.7572479446127217, - "grad_norm": 1.8671875, - "learning_rate": 0.00019502999999999998, - "loss": 0.0303, + "grad_norm": 0.7890625, + "learning_rate": 0.0004105894736842105, + "loss": 0.0452, "num_input_tokens_seen": 229306368, "step": 3500, - "train_runtime": 1492.3837, - "train_tokens_per_second": 153651.084 + "train_runtime": 1198.972, + "train_tokens_per_second": 191252.485 }, { "epoch": 0.7572479446127217, - "eval_loss": 0.018595121800899506, - "eval_runtime": 25.382, - "eval_samples_per_second": 1.261, - "eval_steps_per_second": 0.039, + "eval_loss": 0.02840009331703186, + "eval_runtime": 1.49, + "eval_samples_per_second": 21.476, + "eval_steps_per_second": 0.671, "num_input_tokens_seen": 229306368, "step": 3500 }, { "epoch": 0.7572479446127217, - "eval_byte_accuracy": 0.9974035608308606, - "eval_chrf": 38.45583225823713, - "eval_sacrebleu": 21.724855449932402, - "eval_word_accuracy": 0.9931318681318682, + "eval_byte_accuracy": 0.9944362017804155, + "eval_chrf": 96.0273311209701, + "eval_sacrebleu": 94.71325789004388, + "eval_word_accuracy": 0.9876373626373627, "num_input_tokens_seen": 229306368, - "perplexity": 1.0187690877110538, + "perplexity": 1.0288072209830508, "step": 3500 }, { "epoch": 0.7594115101687581, - "grad_norm": 1.0234375, - "learning_rate": 0.00019472999999999997, - "loss": 0.0269, + "grad_norm": 0.8203125, + "learning_rate": 0.00040995789473684204, + "loss": 0.0414, "num_input_tokens_seen": 229961728, "step": 3510, - "train_runtime": 1521.3869, - "train_tokens_per_second": 151152.69 + "train_runtime": 1204.34, + "train_tokens_per_second": 190944.188 }, { "epoch": 0.7615750757247944, - "grad_norm": 0.93359375, - "learning_rate": 0.00019443, - "loss": 0.0251, + "grad_norm": 0.9453125, + "learning_rate": 0.0004093263157894737, + "loss": 0.0403, "num_input_tokens_seen": 230617088, "step": 3520, - "train_runtime": 1524.8792, - "train_tokens_per_second": 151236.299 + "train_runtime": 1208.1589, + "train_tokens_per_second": 190883.08 }, { "epoch": 0.7637386412808308, - "grad_norm": 0.83984375, - "learning_rate": 0.00019412999999999999, - "loss": 0.0274, + "grad_norm": 0.97265625, + "learning_rate": 0.0004086947368421052, + "loss": 0.0409, "num_input_tokens_seen": 231272448, "step": 3530, - "train_runtime": 1528.3391, - "train_tokens_per_second": 151322.729 + "train_runtime": 1211.9754, + "train_tokens_per_second": 190822.731 }, { "epoch": 0.7659022068368672, - "grad_norm": 1.359375, - "learning_rate": 0.00019382999999999998, - "loss": 0.0279, + "grad_norm": 0.88671875, + "learning_rate": 0.00040806315789473685, + "loss": 0.0452, "num_input_tokens_seen": 231927808, "step": 3540, - "train_runtime": 1531.8037, - "train_tokens_per_second": 151408.311 + "train_runtime": 1215.7804, + "train_tokens_per_second": 190764.55 }, { "epoch": 0.7680657723929035, - "grad_norm": 1.0234375, - "learning_rate": 0.00019352999999999997, - "loss": 0.0252, + "grad_norm": 0.85546875, + "learning_rate": 0.0004074315789473684, + "loss": 0.0409, "num_input_tokens_seen": 232583168, "step": 3550, - "train_runtime": 1535.2546, - "train_tokens_per_second": 151494.856 + "train_runtime": 1219.5993, + "train_tokens_per_second": 190704.58 }, { "epoch": 0.7702293379489399, - "grad_norm": 1.1953125, - "learning_rate": 0.00019323, - "loss": 0.0281, + "grad_norm": 1.015625, + "learning_rate": 0.00040679999999999997, + "loss": 0.04, "num_input_tokens_seen": 233238528, "step": 3560, - "train_runtime": 1538.721, - "train_tokens_per_second": 151579.475 + "train_runtime": 1223.4146, + "train_tokens_per_second": 190645.526 }, { "epoch": 0.7723929035049762, - "grad_norm": 1.4609375, - "learning_rate": 0.00019292999999999998, - "loss": 0.0345, + "grad_norm": 0.9375, + "learning_rate": 0.00040616842105263155, + "loss": 0.0435, "num_input_tokens_seen": 233893888, "step": 3570, - "train_runtime": 1542.2131, - "train_tokens_per_second": 151661.196 + "train_runtime": 1227.2262, + "train_tokens_per_second": 190587.425 }, { "epoch": 0.7745564690610125, - "grad_norm": 0.70703125, - "learning_rate": 0.00019262999999999998, - "loss": 0.0286, + "grad_norm": 0.80078125, + "learning_rate": 0.0004055368421052631, + "loss": 0.0438, "num_input_tokens_seen": 234549248, "step": 3580, - "train_runtime": 1545.6775, - "train_tokens_per_second": 151745.273 + "train_runtime": 1231.0532, + "train_tokens_per_second": 190527.305 }, { "epoch": 0.7767200346170489, - "grad_norm": 1.0859375, - "learning_rate": 0.00019233, - "loss": 0.0276, + "grad_norm": 0.515625, + "learning_rate": 0.0004049052631578947, + "loss": 0.042, "num_input_tokens_seen": 235204608, "step": 3590, - "train_runtime": 1549.1841, - "train_tokens_per_second": 151824.826 + "train_runtime": 1234.8666, + "train_tokens_per_second": 190469.65 }, { "epoch": 0.7788836001730852, - "grad_norm": 1.0859375, - "learning_rate": 0.00019203, - "loss": 0.0277, + "grad_norm": 0.70703125, + "learning_rate": 0.00040427368421052626, + "loss": 0.0447, "num_input_tokens_seen": 235859968, "step": 3600, - "train_runtime": 1552.6563, - "train_tokens_per_second": 151907.392 + "train_runtime": 1238.6912, + "train_tokens_per_second": 190410.628 }, { "epoch": 0.7788836001730852, - "eval_loss": 0.021270038560032845, - "eval_runtime": 25.3392, - "eval_samples_per_second": 1.263, - "eval_steps_per_second": 0.039, + "eval_loss": 0.03048066981136799, + "eval_runtime": 1.5257, + "eval_samples_per_second": 20.974, + "eval_steps_per_second": 0.655, "num_input_tokens_seen": 235859968, "step": 3600 }, { "epoch": 0.7788836001730852, - "eval_byte_accuracy": 0.9966617210682492, - "eval_chrf": 38.63481409183779, - "eval_sacrebleu": 19.997296473650657, - "eval_word_accuracy": 0.9917582417582418, + "eval_byte_accuracy": 0.9933234421364985, + "eval_chrf": 94.5949510570737, + "eval_sacrebleu": 92.65942721026246, + "eval_word_accuracy": 0.9821428571428571, "num_input_tokens_seen": 235859968, - "perplexity": 1.021497858207355, + "perplexity": 1.0309499613989452, "step": 3600 }, { "epoch": 0.7810471657291216, - "grad_norm": 1.109375, - "learning_rate": 0.00019172999999999998, - "loss": 0.0263, + "grad_norm": 0.57421875, + "learning_rate": 0.0004036421052631579, + "loss": 0.0394, "num_input_tokens_seen": 236515328, "step": 3610, - "train_runtime": 1581.4446, - "train_tokens_per_second": 149556.51 + "train_runtime": 1244.0566, + "train_tokens_per_second": 190116.21 }, { "epoch": 0.783210731285158, - "grad_norm": 1.5546875, - "learning_rate": 0.00019142999999999997, - "loss": 0.0272, + "grad_norm": 0.7890625, + "learning_rate": 0.00040301052631578943, + "loss": 0.0396, "num_input_tokens_seen": 237170688, "step": 3620, - "train_runtime": 1584.9207, - "train_tokens_per_second": 149641.989 + "train_runtime": 1247.873, + "train_tokens_per_second": 190059.955 }, { "epoch": 0.7853742968411943, - "grad_norm": 1.4609375, - "learning_rate": 0.00019113, - "loss": 0.0269, + "grad_norm": 0.61328125, + "learning_rate": 0.000402378947368421, + "loss": 0.0422, "num_input_tokens_seen": 237821952, "step": 3630, - "train_runtime": 1588.4018, - "train_tokens_per_second": 149724.051 + "train_runtime": 1251.6719, + "train_tokens_per_second": 190003.424 }, { "epoch": 0.7875378623972307, - "grad_norm": 1.3203125, - "learning_rate": 0.00019083, - "loss": 0.0307, + "grad_norm": 0.74609375, + "learning_rate": 0.0004017473684210526, + "loss": 0.0433, "num_input_tokens_seen": 238477312, "step": 3640, - "train_runtime": 1591.8229, - "train_tokens_per_second": 149813.969 + "train_runtime": 1255.4702, + "train_tokens_per_second": 189950.593 }, { "epoch": 0.789701427953267, - "grad_norm": 0.87890625, - "learning_rate": 0.00019052999999999998, - "loss": 0.0269, + "grad_norm": 0.90625, + "learning_rate": 0.00040111578947368414, + "loss": 0.0405, "num_input_tokens_seen": 239132672, "step": 3650, - "train_runtime": 1595.2979, - "train_tokens_per_second": 149898.447 + "train_runtime": 1259.2841, + "train_tokens_per_second": 189895.733 }, { "epoch": 0.7918649935093033, - "grad_norm": 1.34375, - "learning_rate": 0.00019022999999999997, - "loss": 0.026, + "grad_norm": 0.81640625, + "learning_rate": 0.0004004842105263158, + "loss": 0.0427, "num_input_tokens_seen": 239788032, "step": 3660, - "train_runtime": 1598.7995, - "train_tokens_per_second": 149980.05 + "train_runtime": 1263.0992, + "train_tokens_per_second": 189841.019 }, { "epoch": 0.7940285590653396, - "grad_norm": 1.234375, - "learning_rate": 0.00018993, - "loss": 0.0266, + "grad_norm": 0.92578125, + "learning_rate": 0.0003998526315789473, + "loss": 0.0447, "num_input_tokens_seen": 240443392, "step": 3670, - "train_runtime": 1602.3126, - "train_tokens_per_second": 150060.224 + "train_runtime": 1266.9137, + "train_tokens_per_second": 189786.715 }, { "epoch": 0.796192124621376, - "grad_norm": 1.1640625, - "learning_rate": 0.00018962999999999999, - "loss": 0.026, + "grad_norm": 0.77734375, + "learning_rate": 0.00039922105263157895, + "loss": 0.04, "num_input_tokens_seen": 241098752, "step": 3680, - "train_runtime": 1605.8065, - "train_tokens_per_second": 150141.849 + "train_runtime": 1270.7206, + "train_tokens_per_second": 189733.882 }, { "epoch": 0.7983556901774124, - "grad_norm": 1.0859375, - "learning_rate": 0.00018932999999999998, - "loss": 0.0284, + "grad_norm": 0.83984375, + "learning_rate": 0.0003985894736842105, + "loss": 0.0432, "num_input_tokens_seen": 241754112, "step": 3690, - "train_runtime": 1609.29, - "train_tokens_per_second": 150224.078 + "train_runtime": 1274.5299, + "train_tokens_per_second": 189680.998 }, { "epoch": 0.8005192557334487, - "grad_norm": 1.0390625, - "learning_rate": 0.00018902999999999997, - "loss": 0.0542, + "grad_norm": 0.70703125, + "learning_rate": 0.0003979578947368421, + "loss": 0.0751, "num_input_tokens_seen": 242409472, "step": 3700, - "train_runtime": 1612.7756, - "train_tokens_per_second": 150305.762 + "train_runtime": 1278.3442, + "train_tokens_per_second": 189627.701 }, { "epoch": 0.8005192557334487, - "eval_loss": 0.020173341035842896, - "eval_runtime": 27.2733, - "eval_samples_per_second": 1.173, - "eval_steps_per_second": 0.037, + "eval_loss": 0.02744060382246971, + "eval_runtime": 1.8855, + "eval_samples_per_second": 16.971, + "eval_steps_per_second": 0.53, "num_input_tokens_seen": 242409472, "step": 3700 }, { "epoch": 0.8005192557334487, - "eval_byte_accuracy": 0.9977744807121661, - "eval_chrf": 38.512777966820224, - "eval_sacrebleu": 19.56653941561593, - "eval_word_accuracy": 0.9917582417582418, + "eval_byte_accuracy": 0.994807121661721, + "eval_chrf": 95.85159781134789, + "eval_sacrebleu": 93.59317163487975, + "eval_word_accuracy": 0.9848901098901099, "num_input_tokens_seen": 242409472, - "perplexity": 1.0203781981117432, + "perplexity": 1.0278205646814604, "step": 3700 }, { "epoch": 0.8026828212894851, - "grad_norm": 1.2578125, - "learning_rate": 0.00018873, - "loss": 0.0256, + "grad_norm": 0.8515625, + "learning_rate": 0.00039732631578947366, + "loss": 0.0393, "num_input_tokens_seen": 243064832, "step": 3710, - "train_runtime": 1643.5004, - "train_tokens_per_second": 147894.595 + "train_runtime": 1284.0643, + "train_tokens_per_second": 189293.349 }, { "epoch": 0.8048463868455215, - "grad_norm": 0.7109375, - "learning_rate": 0.00018842999999999998, - "loss": 0.0255, + "grad_norm": 0.9140625, + "learning_rate": 0.0003966947368421052, + "loss": 0.0396, "num_input_tokens_seen": 243720192, "step": 3720, - "train_runtime": 1646.9603, - "train_tokens_per_second": 147981.826 + "train_runtime": 1287.8863, + "train_tokens_per_second": 189240.46 }, { "epoch": 0.8070099524015578, - "grad_norm": 0.92578125, - "learning_rate": 0.00018812999999999998, - "loss": 0.0242, + "grad_norm": 0.59375, + "learning_rate": 0.00039606315789473683, + "loss": 0.0359, "num_input_tokens_seen": 244375552, "step": 3730, - "train_runtime": 1650.4131, - "train_tokens_per_second": 148069.323 + "train_runtime": 1291.7103, + "train_tokens_per_second": 189187.584 }, { "epoch": 0.8091735179575941, - "grad_norm": 0.8828125, - "learning_rate": 0.00018782999999999997, - "loss": 0.0245, + "grad_norm": 0.875, + "learning_rate": 0.00039543157894736836, + "loss": 0.0384, "num_input_tokens_seen": 245030912, "step": 3740, - "train_runtime": 1653.8836, - "train_tokens_per_second": 148154.873 + "train_runtime": 1295.5185, + "train_tokens_per_second": 189137.327 }, { "epoch": 0.8113370835136304, - "grad_norm": 1.09375, - "learning_rate": 0.00018753, - "loss": 0.0248, + "grad_norm": 0.86328125, + "learning_rate": 0.0003948, + "loss": 0.0383, "num_input_tokens_seen": 245686272, "step": 3750, - "train_runtime": 1657.3145, - "train_tokens_per_second": 148243.601 + "train_runtime": 1299.337, + "train_tokens_per_second": 189085.878 }, { "epoch": 0.8135006490696668, - "grad_norm": 1.1875, - "learning_rate": 0.00018722999999999998, - "loss": 0.0261, + "grad_norm": 0.66015625, + "learning_rate": 0.00039416842105263153, + "loss": 0.0401, "num_input_tokens_seen": 246337536, "step": 3760, - "train_runtime": 1660.7804, - "train_tokens_per_second": 148326.372 + "train_runtime": 1303.1231, + "train_tokens_per_second": 189036.279 }, { "epoch": 0.8156642146257032, - "grad_norm": 0.91796875, - "learning_rate": 0.00018692999999999997, - "loss": 0.0282, + "grad_norm": 0.75, + "learning_rate": 0.0003935368421052632, + "loss": 0.0424, "num_input_tokens_seen": 246988800, "step": 3770, - "train_runtime": 1664.2428, - "train_tokens_per_second": 148409.118 + "train_runtime": 1306.9192, + "train_tokens_per_second": 188985.517 }, { "epoch": 0.8178277801817395, - "grad_norm": 1.1328125, - "learning_rate": 0.00018662999999999997, - "loss": 0.0243, + "grad_norm": 0.89453125, + "learning_rate": 0.0003929052631578947, + "loss": 0.0392, "num_input_tokens_seen": 247644160, "step": 3780, - "train_runtime": 1667.6909, - "train_tokens_per_second": 148495.244 + "train_runtime": 1310.7374, + "train_tokens_per_second": 188934.986 }, { "epoch": 0.8199913457377759, - "grad_norm": 1.0390625, - "learning_rate": 0.00018632999999999999, - "loss": 0.0235, + "grad_norm": 0.80078125, + "learning_rate": 0.00039227368421052624, + "loss": 0.0377, "num_input_tokens_seen": 248299520, "step": 3790, - "train_runtime": 1671.189, - "train_tokens_per_second": 148576.566 + "train_runtime": 1314.5417, + "train_tokens_per_second": 188886.762 }, { "epoch": 0.8221549112938122, - "grad_norm": 1.109375, - "learning_rate": 0.00018602999999999998, - "loss": 0.0238, + "grad_norm": 0.875, + "learning_rate": 0.0003916421052631579, + "loss": 0.0386, "num_input_tokens_seen": 248954880, "step": 3800, - "train_runtime": 1674.6684, - "train_tokens_per_second": 148659.209 + "train_runtime": 1318.3632, + "train_tokens_per_second": 188836.345 }, { "epoch": 0.8221549112938122, - "eval_loss": 0.020359355956315994, - "eval_runtime": 25.3535, - "eval_samples_per_second": 1.262, - "eval_steps_per_second": 0.039, + "eval_loss": 0.02655559405684471, + "eval_runtime": 1.8736, + "eval_samples_per_second": 17.079, + "eval_steps_per_second": 0.534, "num_input_tokens_seen": 248954880, "step": 3800 }, { "epoch": 0.8221549112938122, - "eval_byte_accuracy": 0.9966617210682492, - "eval_chrf": 37.91166713749895, - "eval_sacrebleu": 20.269849183930177, - "eval_word_accuracy": 0.9917582417582418, - "num_input_tokens_seen": 248954880, - "perplexity": 1.0205680213355461, + "eval_byte_accuracy": 0.9940652818991098, + "eval_chrf": 94.92109122001662, + "eval_sacrebleu": 94.00295930011256, + "eval_word_accuracy": 0.9835164835164835, + "num_input_tokens_seen": 248954880, + "perplexity": 1.026911335841954, "step": 3800 }, { "epoch": 0.8243184768498486, - "grad_norm": 1.1328125, - "learning_rate": 0.00018572999999999997, - "loss": 0.0237, + "grad_norm": 0.8984375, + "learning_rate": 0.0003910105263157894, + "loss": 0.038, "num_input_tokens_seen": 249610240, "step": 3810, - "train_runtime": 1703.5499, - "train_tokens_per_second": 146523.584 + "train_runtime": 1324.0597, + "train_tokens_per_second": 188518.873 }, { "epoch": 0.8264820424058849, - "grad_norm": 1.046875, - "learning_rate": 0.00018543, - "loss": 0.024, + "grad_norm": 0.7421875, + "learning_rate": 0.00039037894736842105, + "loss": 0.0379, "num_input_tokens_seen": 250265600, "step": 3820, - "train_runtime": 1707.0114, - "train_tokens_per_second": 146610.385 + "train_runtime": 1327.8807, + "train_tokens_per_second": 188469.945 }, { "epoch": 0.8286456079619212, - "grad_norm": 0.765625, - "learning_rate": 0.00018512999999999998, - "loss": 0.0241, + "grad_norm": 0.515625, + "learning_rate": 0.0003897473684210526, + "loss": 0.0359, "num_input_tokens_seen": 250920960, "step": 3830, - "train_runtime": 1710.4712, - "train_tokens_per_second": 146696.977 + "train_runtime": 1331.6986, + "train_tokens_per_second": 188421.738 }, { "epoch": 0.8308091735179576, - "grad_norm": 1.4453125, - "learning_rate": 0.00018482999999999998, - "loss": 0.0247, + "grad_norm": 0.97265625, + "learning_rate": 0.0003891157894736842, + "loss": 0.041, "num_input_tokens_seen": 251576320, "step": 3840, - "train_runtime": 1713.9255, - "train_tokens_per_second": 146783.7 + "train_runtime": 1335.5172, + "train_tokens_per_second": 188373.698 }, { "epoch": 0.8329727390739939, - "grad_norm": 0.890625, - "learning_rate": 0.00018452999999999997, - "loss": 0.0226, + "grad_norm": 0.9140625, + "learning_rate": 0.00038848421052631576, + "loss": 0.0386, "num_input_tokens_seen": 252231680, "step": 3850, - "train_runtime": 1717.4087, - "train_tokens_per_second": 146867.593 + "train_runtime": 1339.3407, + "train_tokens_per_second": 188325.258 }, { "epoch": 0.8351363046300303, - "grad_norm": 1.0, - "learning_rate": 0.00018423, - "loss": 0.0245, + "grad_norm": 0.859375, + "learning_rate": 0.0003878526315789473, + "loss": 0.0411, "num_input_tokens_seen": 252887040, "step": 3860, - "train_runtime": 1720.9084, - "train_tokens_per_second": 146949.735 + "train_runtime": 1343.1562, + "train_tokens_per_second": 188278.21 }, { "epoch": 0.8372998701860667, - "grad_norm": 1.8203125, - "learning_rate": 0.00018392999999999998, - "loss": 0.0254, + "grad_norm": 0.9609375, + "learning_rate": 0.00038722105263157893, + "loss": 0.0393, "num_input_tokens_seen": 253542400, "step": 3870, - "train_runtime": 1724.3513, - "train_tokens_per_second": 147036.394 + "train_runtime": 1346.9779, + "train_tokens_per_second": 188230.558 }, { "epoch": 0.839463435742103, - "grad_norm": 0.98046875, - "learning_rate": 0.00018362999999999997, - "loss": 0.051, + "grad_norm": 0.71484375, + "learning_rate": 0.00038658947368421046, + "loss": 0.0748, "num_input_tokens_seen": 254197760, "step": 3880, - "train_runtime": 1727.821, - "train_tokens_per_second": 147120.425 + "train_runtime": 1350.7997, + "train_tokens_per_second": 188183.161 }, { "epoch": 0.8416270012981394, - "grad_norm": 1.0859375, - "learning_rate": 0.00018332999999999997, - "loss": 0.0258, + "grad_norm": 0.5703125, + "learning_rate": 0.0003859578947368421, + "loss": 0.0418, "num_input_tokens_seen": 254853120, "step": 3890, - "train_runtime": 1731.2786, - "train_tokens_per_second": 147205.143 + "train_runtime": 1354.6168, + "train_tokens_per_second": 188136.694 }, { "epoch": 0.8437905668541756, - "grad_norm": 0.94921875, - "learning_rate": 0.00018303, - "loss": 0.0234, + "grad_norm": 0.77734375, + "learning_rate": 0.00038532631578947363, + "loss": 0.0395, "num_input_tokens_seen": 255508480, "step": 3900, - "train_runtime": 1734.776, - "train_tokens_per_second": 147286.147 + "train_runtime": 1358.4315, + "train_tokens_per_second": 188090.809 }, { "epoch": 0.8437905668541756, - "eval_loss": 0.01944277621805668, - "eval_runtime": 26.2502, - "eval_samples_per_second": 1.219, - "eval_steps_per_second": 0.038, + "eval_loss": 0.026101363822817802, + "eval_runtime": 1.759, + "eval_samples_per_second": 18.192, + "eval_steps_per_second": 0.568, "num_input_tokens_seen": 255508480, "step": 3900 }, { "epoch": 0.8437905668541756, - "eval_byte_accuracy": 0.9970326409495549, - "eval_chrf": 38.475870640535014, - "eval_sacrebleu": 19.5141245397596, - "eval_word_accuracy": 0.9903846153846154, + "eval_byte_accuracy": 0.9929525222551929, + "eval_chrf": 95.31269349287258, + "eval_sacrebleu": 94.24185400796716, + "eval_word_accuracy": 0.9821428571428571, "num_input_tokens_seen": 255508480, - "perplexity": 1.0196330179337207, + "perplexity": 1.026444987588306, "step": 3900 }, { "epoch": 0.845954132410212, - "grad_norm": 1.15625, - "learning_rate": 0.00018272999999999998, - "loss": 0.0228, + "grad_norm": 0.625, + "learning_rate": 0.0003846947368421053, + "loss": 0.0377, "num_input_tokens_seen": 256163840, "step": 3910, - "train_runtime": 1764.4867, - "train_tokens_per_second": 145177.544 + "train_runtime": 1364.0191, + "train_tokens_per_second": 187800.768 }, { "epoch": 0.8481176979662484, - "grad_norm": 1.0546875, - "learning_rate": 0.00018242999999999997, - "loss": 0.0275, + "grad_norm": 0.765625, + "learning_rate": 0.0003840631578947368, + "loss": 0.0383, "num_input_tokens_seen": 256819200, "step": 3920, - "train_runtime": 1767.9829, - "train_tokens_per_second": 145261.132 + "train_runtime": 1367.8422, + "train_tokens_per_second": 187754.995 }, { "epoch": 0.8502812635222847, - "grad_norm": 1.6015625, - "learning_rate": 0.00018212999999999997, - "loss": 0.0263, + "grad_norm": 0.5390625, + "learning_rate": 0.00038343157894736834, + "loss": 0.0391, "num_input_tokens_seen": 257474560, "step": 3930, - "train_runtime": 1771.4539, - "train_tokens_per_second": 145346.463 + "train_runtime": 1371.6627, + "train_tokens_per_second": 187709.814 }, { "epoch": 0.8524448290783211, - "grad_norm": 1.1796875, - "learning_rate": 0.00018182999999999999, - "loss": 0.0255, + "grad_norm": 0.52734375, + "learning_rate": 0.0003828, + "loss": 0.0406, "num_input_tokens_seen": 258129920, "step": 3940, - "train_runtime": 1774.954, - "train_tokens_per_second": 145429.072 + "train_runtime": 1375.4817, + "train_tokens_per_second": 187665.108 }, { "epoch": 0.8546083946343574, - "grad_norm": 0.85546875, - "learning_rate": 0.00018152999999999998, - "loss": 0.0221, + "grad_norm": 0.61328125, + "learning_rate": 0.0003821684210526315, + "loss": 0.0362, "num_input_tokens_seen": 258785280, "step": 3950, - "train_runtime": 1778.5156, - "train_tokens_per_second": 145506.327 + "train_runtime": 1379.2745, + "train_tokens_per_second": 187624.208 }, { "epoch": 0.8567719601903938, - "grad_norm": 1.4609375, - "learning_rate": 0.00018122999999999997, - "loss": 0.028, + "grad_norm": 0.94140625, + "learning_rate": 0.00038153684210526315, + "loss": 0.0413, "num_input_tokens_seen": 259440640, "step": 3960, - "train_runtime": 1781.9723, - "train_tokens_per_second": 145591.848 + "train_runtime": 1383.0876, + "train_tokens_per_second": 187580.778 }, { "epoch": 0.8589355257464301, - "grad_norm": 1.0546875, - "learning_rate": 0.00018092999999999996, - "loss": 0.0208, + "grad_norm": 0.6640625, + "learning_rate": 0.0003809052631578947, + "loss": 0.0332, "num_input_tokens_seen": 260096000, "step": 3970, - "train_runtime": 1785.4217, - "train_tokens_per_second": 145677.625 + "train_runtime": 1386.8873, + "train_tokens_per_second": 187539.385 }, { "epoch": 0.8610990913024664, - "grad_norm": 0.94921875, - "learning_rate": 0.00018062999999999998, - "loss": 0.0227, + "grad_norm": 0.71484375, + "learning_rate": 0.0003802736842105263, + "loss": 0.0369, "num_input_tokens_seen": 260751360, "step": 3980, - "train_runtime": 1788.8941, - "train_tokens_per_second": 145761.204 + "train_runtime": 1390.6976, + "train_tokens_per_second": 187496.803 }, { "epoch": 0.8632626568585028, - "grad_norm": 0.75, - "learning_rate": 0.00018032999999999998, - "loss": 0.0236, + "grad_norm": 0.57421875, + "learning_rate": 0.00037964210526315786, + "loss": 0.0393, "num_input_tokens_seen": 261406720, "step": 3990, - "train_runtime": 1792.3669, - "train_tokens_per_second": 145844.421 + "train_runtime": 1394.514, + "train_tokens_per_second": 187453.636 }, { "epoch": 0.8654262224145391, - "grad_norm": 0.69921875, - "learning_rate": 0.00018002999999999997, - "loss": 0.0229, + "grad_norm": 0.8359375, + "learning_rate": 0.0003790105263157894, + "loss": 0.0374, "num_input_tokens_seen": 262062080, "step": 4000, - "train_runtime": 1795.8384, - "train_tokens_per_second": 145927.425 + "train_runtime": 1398.333, + "train_tokens_per_second": 187410.351 }, { "epoch": 0.8654262224145391, - "eval_loss": 0.02129935659468174, - "eval_runtime": 24.4541, - "eval_samples_per_second": 1.309, - "eval_steps_per_second": 0.041, + "eval_loss": 0.024643762037158012, + "eval_runtime": 1.7566, + "eval_samples_per_second": 18.217, + "eval_steps_per_second": 0.569, "num_input_tokens_seen": 262062080, "step": 4000 }, { "epoch": 0.8654262224145391, - "eval_byte_accuracy": 0.9970326409495549, - "eval_chrf": 37.837298473523404, - "eval_sacrebleu": 19.024673955277844, - "eval_word_accuracy": 0.9917582417582418, + "eval_byte_accuracy": 0.9933234421364985, + "eval_chrf": 94.19624263012378, + "eval_sacrebleu": 93.71978495314545, + "eval_word_accuracy": 0.9821428571428571, "num_input_tokens_seen": 262062080, - "perplexity": 1.0215278069559728, + "perplexity": 1.0249499294059543, "step": 4000 }, { "epoch": 0.8675897879705755, - "grad_norm": 0.80078125, - "learning_rate": 0.00017972999999999996, - "loss": 0.0234, + "grad_norm": 0.703125, + "learning_rate": 0.00037837894736842103, + "loss": 0.0389, "num_input_tokens_seen": 262717440, "step": 4010, - "train_runtime": 1823.8528, - "train_tokens_per_second": 144045.308 + "train_runtime": 1403.9596, + "train_tokens_per_second": 187126.066 }, { "epoch": 0.8697533535266119, - "grad_norm": 0.76171875, - "learning_rate": 0.00017942999999999998, - "loss": 0.0224, + "grad_norm": 0.94921875, + "learning_rate": 0.00037774736842105256, + "loss": 0.0375, "num_input_tokens_seen": 263372800, "step": 4020, - "train_runtime": 1827.3749, - "train_tokens_per_second": 144126.307 + "train_runtime": 1407.7746, + "train_tokens_per_second": 187084.494 }, { "epoch": 0.8719169190826482, - "grad_norm": 1.0234375, - "learning_rate": 0.00017912999999999997, - "loss": 0.0225, + "grad_norm": 0.76171875, + "learning_rate": 0.0003771157894736842, + "loss": 0.0361, "num_input_tokens_seen": 264028160, "step": 4030, - "train_runtime": 1830.8742, - "train_tokens_per_second": 144208.793 + "train_runtime": 1411.5751, + "train_tokens_per_second": 187045.062 }, { "epoch": 0.8740804846386846, - "grad_norm": 1.2734375, - "learning_rate": 0.00017882999999999997, - "loss": 0.0218, + "grad_norm": 0.70703125, + "learning_rate": 0.00037648421052631574, + "loss": 0.0344, "num_input_tokens_seen": 264683520, "step": 4040, - "train_runtime": 1834.372, - "train_tokens_per_second": 144291.079 + "train_runtime": 1415.3795, + "train_tokens_per_second": 187005.335 }, { "epoch": 0.8762440501947208, - "grad_norm": 0.6484375, - "learning_rate": 0.00017852999999999999, - "loss": 0.0224, + "grad_norm": 0.6953125, + "learning_rate": 0.0003758526315789474, + "loss": 0.0375, "num_input_tokens_seen": 265338880, "step": 4050, - "train_runtime": 1837.8445, - "train_tokens_per_second": 144375.043 + "train_runtime": 1419.1956, + "train_tokens_per_second": 186964.276 }, { "epoch": 0.8784076157507572, - "grad_norm": 0.86328125, - "learning_rate": 0.00017822999999999998, - "loss": 0.0227, + "grad_norm": 0.6796875, + "learning_rate": 0.0003752210526315789, + "loss": 0.037, "num_input_tokens_seen": 265994240, "step": 4060, - "train_runtime": 1841.3029, - "train_tokens_per_second": 144459.798 + "train_runtime": 1423.0053, + "train_tokens_per_second": 186924.278 }, { "epoch": 0.8805711813067936, - "grad_norm": 1.40625, - "learning_rate": 0.00017792999999999997, - "loss": 0.0309, + "grad_norm": 0.8203125, + "learning_rate": 0.00037458947368421044, + "loss": 0.0369, "num_input_tokens_seen": 266649600, "step": 4070, - "train_runtime": 1844.8087, - "train_tokens_per_second": 144540.517 + "train_runtime": 1426.8192, + "train_tokens_per_second": 186883.94 }, { "epoch": 0.8827347468628299, - "grad_norm": 0.859375, - "learning_rate": 0.00017762999999999996, - "loss": 0.0229, + "grad_norm": 0.73046875, + "learning_rate": 0.0003739578947368421, + "loss": 0.037, "num_input_tokens_seen": 267304960, "step": 4080, - "train_runtime": 1848.285, - "train_tokens_per_second": 144623.237 + "train_runtime": 1430.6353, + "train_tokens_per_second": 186843.537 }, { "epoch": 0.8848983124188663, - "grad_norm": 1.1953125, - "learning_rate": 0.00017732999999999998, - "loss": 0.0228, + "grad_norm": 0.82421875, + "learning_rate": 0.0003733263157894736, + "loss": 0.0365, "num_input_tokens_seen": 267960320, "step": 4090, - "train_runtime": 1851.7424, - "train_tokens_per_second": 144707.122 + "train_runtime": 1434.4492, + "train_tokens_per_second": 186803.634 }, { "epoch": 0.8870618779749027, - "grad_norm": 1.1796875, - "learning_rate": 0.00017702999999999998, - "loss": 0.0226, + "grad_norm": 0.57421875, + "learning_rate": 0.00037269473684210525, + "loss": 0.037, "num_input_tokens_seen": 268615680, "step": 4100, - "train_runtime": 1855.2161, - "train_tokens_per_second": 144789.43 + "train_runtime": 1438.2647, + "train_tokens_per_second": 186763.736 }, { "epoch": 0.8870618779749027, - "eval_loss": 0.013824949972331524, - "eval_runtime": 25.0045, - "eval_samples_per_second": 1.28, - "eval_steps_per_second": 0.04, + "eval_loss": 0.0261303149163723, + "eval_runtime": 1.6603, + "eval_samples_per_second": 19.273, + "eval_steps_per_second": 0.602, "num_input_tokens_seen": 268615680, "step": 4100 }, { "epoch": 0.8870618779749027, - "eval_byte_accuracy": 0.9981454005934718, - "eval_chrf": 37.8309332104278, - "eval_sacrebleu": 18.907949871600724, - "eval_word_accuracy": 0.9945054945054945, + "eval_byte_accuracy": 0.9933234421364985, + "eval_chrf": 94.41058320134658, + "eval_sacrebleu": 92.20217616355528, + "eval_word_accuracy": 0.9807692307692307, "num_input_tokens_seen": 268615680, - "perplexity": 1.0139209565115523, + "perplexity": 1.0264747047233398, "step": 4100 }, { "epoch": 0.889225443530939, - "grad_norm": 0.89453125, - "learning_rate": 0.00017672999999999997, - "loss": 0.0222, + "grad_norm": 0.63671875, + "learning_rate": 0.0003720631578947368, + "loss": 0.0336, "num_input_tokens_seen": 269271040, "step": 4110, - "train_runtime": 1883.7045, - "train_tokens_per_second": 142947.599 + "train_runtime": 1443.7537, + "train_tokens_per_second": 186507.598 }, { "epoch": 0.8913890090869754, - "grad_norm": 1.234375, - "learning_rate": 0.00017642999999999996, - "loss": 0.0255, + "grad_norm": 0.83203125, + "learning_rate": 0.00037143157894736843, + "loss": 0.038, "num_input_tokens_seen": 269926400, "step": 4120, - "train_runtime": 1887.1979, - "train_tokens_per_second": 143030.255 + "train_runtime": 1447.5712, + "train_tokens_per_second": 186468.482 }, { "epoch": 0.8935525746430116, - "grad_norm": 0.99609375, - "learning_rate": 0.00017612999999999998, - "loss": 0.0233, + "grad_norm": 1.359375, + "learning_rate": 0.00037079999999999996, + "loss": 0.0386, "num_input_tokens_seen": 270581760, "step": 4130, - "train_runtime": 1890.7345, - "train_tokens_per_second": 143109.339 + "train_runtime": 1451.3821, + "train_tokens_per_second": 186430.404 }, { "epoch": 0.895716140199048, - "grad_norm": 0.7890625, - "learning_rate": 0.00017582999999999998, - "loss": 0.0256, + "grad_norm": 0.74609375, + "learning_rate": 0.0003701684210526315, + "loss": 0.0405, "num_input_tokens_seen": 271237120, "step": 4140, - "train_runtime": 1894.2627, - "train_tokens_per_second": 143188.757 + "train_runtime": 1455.1991, + "train_tokens_per_second": 186391.754 }, { "epoch": 0.8978797057550844, - "grad_norm": 0.92578125, - "learning_rate": 0.00017552999999999997, - "loss": 0.0232, + "grad_norm": 0.890625, + "learning_rate": 0.00036953684210526313, + "loss": 0.0353, "num_input_tokens_seen": 271892480, "step": 4150, - "train_runtime": 1897.7272, - "train_tokens_per_second": 143272.692 + "train_runtime": 1459.023, + "train_tokens_per_second": 186352.426 }, { "epoch": 0.9000432713111207, - "grad_norm": 0.8984375, - "learning_rate": 0.00017522999999999996, - "loss": 0.0247, + "grad_norm": 0.87109375, + "learning_rate": 0.00036890526315789466, + "loss": 0.04, "num_input_tokens_seen": 272547840, "step": 4160, - "train_runtime": 1901.1936, - "train_tokens_per_second": 143356.173 + "train_runtime": 1462.8409, + "train_tokens_per_second": 186314.065 }, { "epoch": 0.9022068368671571, - "grad_norm": 1.0234375, - "learning_rate": 0.00017492999999999998, - "loss": 0.0208, + "grad_norm": 0.734375, + "learning_rate": 0.0003682736842105263, + "loss": 0.0354, "num_input_tokens_seen": 273203200, "step": 4170, - "train_runtime": 1904.6462, - "train_tokens_per_second": 143440.394 + "train_runtime": 1466.6408, + "train_tokens_per_second": 186278.193 }, { "epoch": 0.9043704024231934, - "grad_norm": 0.99609375, - "learning_rate": 0.00017462999999999997, - "loss": 0.0222, + "grad_norm": 0.6328125, + "learning_rate": 0.00036764210526315784, + "loss": 0.0353, "num_input_tokens_seen": 273858560, "step": 4180, - "train_runtime": 1908.0962, - "train_tokens_per_second": 143524.503 + "train_runtime": 1470.4587, + "train_tokens_per_second": 186240.221 }, { "epoch": 0.9065339679792298, - "grad_norm": 1.421875, - "learning_rate": 0.00017432999999999997, - "loss": 0.0285, + "grad_norm": 0.73828125, + "learning_rate": 0.0003670105263157895, + "loss": 0.0401, "num_input_tokens_seen": 274513920, "step": 4190, - "train_runtime": 1911.5793, - "train_tokens_per_second": 143605.823 + "train_runtime": 1474.2745, + "train_tokens_per_second": 186202.717 }, { "epoch": 0.9086975335352662, - "grad_norm": 1.0859375, - "learning_rate": 0.00017402999999999996, - "loss": 0.023, + "grad_norm": 0.9921875, + "learning_rate": 0.000366378947368421, + "loss": 0.0387, "num_input_tokens_seen": 275165184, "step": 4200, - "train_runtime": 1915.039, - "train_tokens_per_second": 143686.466 + "train_runtime": 1478.0643, + "train_tokens_per_second": 186165.908 }, { "epoch": 0.9086975335352662, - "eval_loss": 0.016472337767481804, - "eval_runtime": 24.8457, - "eval_samples_per_second": 1.288, - "eval_steps_per_second": 0.04, + "eval_loss": 0.025060011073946953, + "eval_runtime": 1.8349, + "eval_samples_per_second": 17.439, + "eval_steps_per_second": 0.545, "num_input_tokens_seen": 275165184, "step": 4200 }, { "epoch": 0.9086975335352662, - "eval_byte_accuracy": 0.9988872403560831, - "eval_chrf": 39.80598308099466, - "eval_sacrebleu": 22.725860440231653, - "eval_word_accuracy": 0.9958791208791209, + "eval_byte_accuracy": 0.9936943620178041, + "eval_chrf": 95.7358495183548, + "eval_sacrebleu": 94.40739967319129, + "eval_word_accuracy": 0.9862637362637363, "num_input_tokens_seen": 275165184, - "perplexity": 1.0166087547293445, + "perplexity": 1.025376652632231, "step": 4200 }, { "epoch": 0.9108610990913024, - "grad_norm": 1.1328125, - "learning_rate": 0.00017372999999999998, - "loss": 0.0293, + "grad_norm": 0.84765625, + "learning_rate": 0.00036574736842105254, + "loss": 0.0399, "num_input_tokens_seen": 275820544, "step": 4210, - "train_runtime": 1943.3669, - "train_tokens_per_second": 141929.218 + "train_runtime": 1483.7398, + "train_tokens_per_second": 185895.497 }, { "epoch": 0.9130246646473388, - "grad_norm": 1.1484375, - "learning_rate": 0.00017342999999999997, - "loss": 0.0224, + "grad_norm": 1.2265625, + "learning_rate": 0.0003651157894736842, + "loss": 0.0403, "num_input_tokens_seen": 276475904, "step": 4220, - "train_runtime": 1946.8323, - "train_tokens_per_second": 142013.211 + "train_runtime": 1487.5562, + "train_tokens_per_second": 185859.129 }, { "epoch": 0.9151882302033751, - "grad_norm": 1.1015625, - "learning_rate": 0.00017312999999999996, - "loss": 0.0475, + "grad_norm": 0.6171875, + "learning_rate": 0.0003644842105263157, + "loss": 0.0697, "num_input_tokens_seen": 277131264, "step": 4230, - "train_runtime": 1950.2708, - "train_tokens_per_second": 142098.865 + "train_runtime": 1491.3729, + "train_tokens_per_second": 185822.919 }, { "epoch": 0.9173517957594115, - "grad_norm": 1.0546875, - "learning_rate": 0.00017282999999999996, - "loss": 0.0263, + "grad_norm": 0.76953125, + "learning_rate": 0.00036385263157894736, + "loss": 0.0414, "num_input_tokens_seen": 277786624, "step": 4240, - "train_runtime": 1953.7154, - "train_tokens_per_second": 142183.769 + "train_runtime": 1495.1934, + "train_tokens_per_second": 185786.418 }, { "epoch": 0.9195153613154479, - "grad_norm": 1.1640625, - "learning_rate": 0.00017252999999999998, - "loss": 0.024, + "grad_norm": 0.484375, + "learning_rate": 0.0003632210526315789, + "loss": 0.0373, "num_input_tokens_seen": 278441984, "step": 4250, - "train_runtime": 1957.176, - "train_tokens_per_second": 142267.216 + "train_runtime": 1499.009, + "train_tokens_per_second": 185750.71 }, { "epoch": 0.9216789268714842, - "grad_norm": 0.96484375, - "learning_rate": 0.00017223, - "loss": 0.0206, + "grad_norm": 0.7421875, + "learning_rate": 0.00036258947368421053, + "loss": 0.0331, "num_input_tokens_seen": 279097344, "step": 4260, - "train_runtime": 1960.6391, - "train_tokens_per_second": 142350.186 + "train_runtime": 1502.8257, + "train_tokens_per_second": 185715.05 }, { "epoch": 0.9238424924275206, - "grad_norm": 0.9140625, - "learning_rate": 0.00017193, - "loss": 0.0216, + "grad_norm": 0.64453125, + "learning_rate": 0.00036195789473684206, + "loss": 0.0345, "num_input_tokens_seen": 279748608, "step": 4270, - "train_runtime": 1964.0975, - "train_tokens_per_second": 142431.124 + "train_runtime": 1506.619, + "train_tokens_per_second": 185679.735 }, { "epoch": 0.926006057983557, - "grad_norm": 1.0859375, - "learning_rate": 0.00017163, - "loss": 0.0229, + "grad_norm": 0.57421875, + "learning_rate": 0.0003613263157894736, + "loss": 0.0363, "num_input_tokens_seen": 280403968, "step": 4280, - "train_runtime": 1967.603, - "train_tokens_per_second": 142510.441 + "train_runtime": 1510.4341, + "train_tokens_per_second": 185644.628 }, { "epoch": 0.9281696235395932, - "grad_norm": 0.62109375, - "learning_rate": 0.00017133, - "loss": 0.0238, + "grad_norm": 0.58984375, + "learning_rate": 0.00036069473684210523, + "loss": 0.0367, "num_input_tokens_seen": 281059328, "step": 4290, - "train_runtime": 1971.0493, - "train_tokens_per_second": 142593.756 + "train_runtime": 1514.2516, + "train_tokens_per_second": 185609.403 }, { "epoch": 0.9303331890956296, - "grad_norm": 1.234375, - "learning_rate": 0.00017103, - "loss": 0.022, + "grad_norm": 0.50390625, + "learning_rate": 0.00036006315789473677, + "loss": 0.0371, "num_input_tokens_seen": 281714688, "step": 4300, - "train_runtime": 1974.4893, - "train_tokens_per_second": 142677.243 + "train_runtime": 1518.064, + "train_tokens_per_second": 185574.972 }, { "epoch": 0.9303331890956296, - "eval_loss": 0.01627390645444393, - "eval_runtime": 24.6722, - "eval_samples_per_second": 1.297, - "eval_steps_per_second": 0.041, + "eval_loss": 0.026061272248625755, + "eval_runtime": 1.6122, + "eval_samples_per_second": 19.849, + "eval_steps_per_second": 0.62, "num_input_tokens_seen": 281714688, "step": 4300 }, { "epoch": 0.9303331890956296, - "eval_byte_accuracy": 0.9985163204747775, - "eval_chrf": 41.374914604150135, - "eval_sacrebleu": 26.62445725572083, - "eval_word_accuracy": 0.9958791208791209, + "eval_byte_accuracy": 0.9936943620178041, + "eval_chrf": 94.55529374351134, + "eval_sacrebleu": 92.55941688947077, + "eval_word_accuracy": 0.9821428571428571, "num_input_tokens_seen": 281714688, - "perplexity": 1.0164070477324518, + "perplexity": 1.0264038366178412, "step": 4300 }, { "epoch": 0.9324967546516659, - "grad_norm": 0.75390625, - "learning_rate": 0.00017073, - "loss": 0.0217, + "grad_norm": 0.68359375, + "learning_rate": 0.0003594315789473684, + "loss": 0.0369, "num_input_tokens_seen": 282370048, "step": 4310, - "train_runtime": 2002.6045, - "train_tokens_per_second": 141001.405 + "train_runtime": 1523.4951, + "train_tokens_per_second": 185343.586 }, { "epoch": 0.9346603202077023, - "grad_norm": 1.34375, - "learning_rate": 0.00017043, - "loss": 0.0223, + "grad_norm": 0.875, + "learning_rate": 0.00035879999999999994, + "loss": 0.0359, "num_input_tokens_seen": 283025408, "step": 4320, - "train_runtime": 2006.1088, - "train_tokens_per_second": 141081.781 + "train_runtime": 1527.3009, + "train_tokens_per_second": 185310.831 }, { "epoch": 0.9368238857637387, - "grad_norm": 1.0390625, - "learning_rate": 0.00017013, - "loss": 0.0298, + "grad_norm": 0.83203125, + "learning_rate": 0.0003581684210526316, + "loss": 0.043, "num_input_tokens_seen": 283680768, "step": 4330, - "train_runtime": 2009.5915, - "train_tokens_per_second": 141163.401 + "train_runtime": 1531.1173, + "train_tokens_per_second": 185276.971 }, { "epoch": 0.938987451319775, - "grad_norm": 0.9609375, - "learning_rate": 0.00016983, - "loss": 0.0202, + "grad_norm": 0.66796875, + "learning_rate": 0.0003575368421052631, + "loss": 0.0338, "num_input_tokens_seen": 284336128, "step": 4340, - "train_runtime": 2013.1075, - "train_tokens_per_second": 141242.398 + "train_runtime": 1534.9394, + "train_tokens_per_second": 185242.579 }, { "epoch": 0.9411510168758114, - "grad_norm": 0.9765625, - "learning_rate": 0.00016953, - "loss": 0.0208, + "grad_norm": 0.56640625, + "learning_rate": 0.00035690526315789475, + "loss": 0.0357, "num_input_tokens_seen": 284991488, "step": 4350, - "train_runtime": 2016.5804, - "train_tokens_per_second": 141324.138 + "train_runtime": 1538.7526, + "train_tokens_per_second": 185209.432 }, { "epoch": 0.9433145824318477, - "grad_norm": 0.75390625, - "learning_rate": 0.00016923, - "loss": 0.0258, + "grad_norm": 0.93359375, + "learning_rate": 0.0003562736842105263, + "loss": 0.0425, "num_input_tokens_seen": 285646848, "step": 4360, - "train_runtime": 2020.0645, - "train_tokens_per_second": 141404.817 + "train_runtime": 1542.5657, + "train_tokens_per_second": 185176.458 }, { "epoch": 0.945478147987884, - "grad_norm": 0.76953125, - "learning_rate": 0.00016893, - "loss": 0.0205, + "grad_norm": 0.78515625, + "learning_rate": 0.0003556421052631578, + "loss": 0.0356, "num_input_tokens_seen": 286302208, "step": 4370, - "train_runtime": 2023.5811, - "train_tokens_per_second": 141482.942 + "train_runtime": 1546.3779, + "train_tokens_per_second": 185143.756 }, { "epoch": 0.9476417135439203, - "grad_norm": 0.91796875, - "learning_rate": 0.00016863, - "loss": 0.0228, + "grad_norm": 0.76953125, + "learning_rate": 0.00035501052631578946, + "loss": 0.0384, "num_input_tokens_seen": 286957568, "step": 4380, - "train_runtime": 2027.0784, - "train_tokens_per_second": 141562.147 + "train_runtime": 1550.1751, + "train_tokens_per_second": 185112.997 }, { "epoch": 0.9498052790999567, - "grad_norm": 0.82421875, - "learning_rate": 0.00016833, - "loss": 0.0223, + "grad_norm": 0.478515625, + "learning_rate": 0.000354378947368421, + "loss": 0.0358, "num_input_tokens_seen": 287612928, "step": 4390, - "train_runtime": 2030.5133, - "train_tokens_per_second": 141645.431 + "train_runtime": 1553.9759, + "train_tokens_per_second": 185081.972 }, { "epoch": 0.9519688446559931, - "grad_norm": 1.28125, - "learning_rate": 0.00016803, - "loss": 0.0242, + "grad_norm": 0.68359375, + "learning_rate": 0.00035374736842105263, + "loss": 0.0389, "num_input_tokens_seen": 288268288, "step": 4400, - "train_runtime": 2033.9631, - "train_tokens_per_second": 141727.394 + "train_runtime": 1557.8016, + "train_tokens_per_second": 185048.144 }, { "epoch": 0.9519688446559931, - "eval_loss": 0.016258614137768745, - "eval_runtime": 25.0587, - "eval_samples_per_second": 1.277, - "eval_steps_per_second": 0.04, + "eval_loss": 0.025347542017698288, + "eval_runtime": 1.5674, + "eval_samples_per_second": 20.416, + "eval_steps_per_second": 0.638, "num_input_tokens_seen": 288268288, "step": 4400 }, { "epoch": 0.9519688446559931, - "eval_byte_accuracy": 0.9981454005934718, - "eval_chrf": 40.45497738851531, - "eval_sacrebleu": 23.113535845653246, - "eval_word_accuracy": 0.9931318681318682, + "eval_byte_accuracy": 0.9933234421364985, + "eval_chrf": 94.8674310164017, + "eval_sacrebleu": 94.82084502261215, + "eval_word_accuracy": 0.9835164835164835, "num_input_tokens_seen": 288268288, - "perplexity": 1.0163915046328522, + "perplexity": 1.0256715225389428, "step": 4400 }, { "epoch": 0.9541324102120294, - "grad_norm": 0.921875, - "learning_rate": 0.00016773, - "loss": 0.0218, + "grad_norm": 0.9296875, + "learning_rate": 0.00035311578947368416, + "loss": 0.0384, "num_input_tokens_seen": 288923648, "step": 4410, - "train_runtime": 2062.5278, - "train_tokens_per_second": 140082.305 + "train_runtime": 1563.1975, + "train_tokens_per_second": 184828.623 }, { "epoch": 0.9562959757680658, - "grad_norm": 0.8125, - "learning_rate": 0.00016743, - "loss": 0.0214, + "grad_norm": 0.421875, + "learning_rate": 0.0003524842105263158, + "loss": 0.0373, "num_input_tokens_seen": 289579008, "step": 4420, - "train_runtime": 2066.0195, - "train_tokens_per_second": 140162.769 + "train_runtime": 1567.0228, + "train_tokens_per_second": 184795.663 }, { "epoch": 0.9584595413241022, - "grad_norm": 1.1484375, - "learning_rate": 0.00016713, - "loss": 0.0247, + "grad_norm": 0.98828125, + "learning_rate": 0.00035185263157894734, + "loss": 0.0397, "num_input_tokens_seen": 290234368, "step": 4430, - "train_runtime": 2069.5276, - "train_tokens_per_second": 140241.843 + "train_runtime": 1570.8514, + "train_tokens_per_second": 184762.465 }, { "epoch": 0.9606231068801385, - "grad_norm": 0.89453125, - "learning_rate": 0.00016683, - "loss": 0.0225, + "grad_norm": 0.8828125, + "learning_rate": 0.00035122105263157887, + "loss": 0.0374, "num_input_tokens_seen": 290889728, "step": 4440, - "train_runtime": 2072.9996, - "train_tokens_per_second": 140323.1 + "train_runtime": 1574.6696, + "train_tokens_per_second": 184730.646 }, { "epoch": 0.9627866724361748, - "grad_norm": 0.94921875, - "learning_rate": 0.00016653, - "loss": 0.0218, + "grad_norm": 1.0703125, + "learning_rate": 0.0003505894736842105, + "loss": 0.039, "num_input_tokens_seen": 291545088, "step": 4450, - "train_runtime": 2076.5048, - "train_tokens_per_second": 140401.84 + "train_runtime": 1578.4859, + "train_tokens_per_second": 184699.202 }, { "epoch": 0.9649502379922111, - "grad_norm": 0.8125, - "learning_rate": 0.00016622999999999999, - "loss": 0.0226, + "grad_norm": 0.8359375, + "learning_rate": 0.00034995789473684204, + "loss": 0.0353, "num_input_tokens_seen": 292200448, "step": 4460, - "train_runtime": 2079.9511, - "train_tokens_per_second": 140484.291 + "train_runtime": 1582.3064, + "train_tokens_per_second": 184667.428 }, { "epoch": 0.9671138035482475, - "grad_norm": 0.88671875, - "learning_rate": 0.00016593, - "loss": 0.022, + "grad_norm": 0.96484375, + "learning_rate": 0.0003493263157894737, + "loss": 0.0359, "num_input_tokens_seen": 292855808, "step": 4470, - "train_runtime": 2083.4598, - "train_tokens_per_second": 140562.254 + "train_runtime": 1586.1211, + "train_tokens_per_second": 184636.469 }, { "epoch": 0.9692773691042839, - "grad_norm": 0.921875, - "learning_rate": 0.00016563, - "loss": 0.0196, + "grad_norm": 0.7578125, + "learning_rate": 0.0003486947368421052, + "loss": 0.0332, "num_input_tokens_seen": 293511168, "step": 4480, - "train_runtime": 2086.9409, - "train_tokens_per_second": 140641.821 + "train_runtime": 1589.9408, + "train_tokens_per_second": 184605.094 }, { "epoch": 0.9714409346603202, - "grad_norm": 0.79296875, - "learning_rate": 0.00016533, - "loss": 0.0217, + "grad_norm": 0.7421875, + "learning_rate": 0.00034806315789473685, + "loss": 0.0365, "num_input_tokens_seen": 294166528, "step": 4490, - "train_runtime": 2090.448, - "train_tokens_per_second": 140719.37 + "train_runtime": 1593.7578, + "train_tokens_per_second": 184574.168 }, { "epoch": 0.9736045002163566, - "grad_norm": 0.8671875, - "learning_rate": 0.00016502999999999998, - "loss": 0.0242, + "grad_norm": 0.640625, + "learning_rate": 0.0003474315789473684, + "loss": 0.0392, "num_input_tokens_seen": 294821888, "step": 4500, - "train_runtime": 2093.9502, - "train_tokens_per_second": 140796.991 + "train_runtime": 1597.5794, + "train_tokens_per_second": 184542.868 }, { "epoch": 0.9736045002163566, - "eval_loss": 0.01589326187968254, - "eval_runtime": 25.1405, - "eval_samples_per_second": 1.273, - "eval_steps_per_second": 0.04, + "eval_loss": 0.024211181327700615, + "eval_runtime": 1.7479, + "eval_samples_per_second": 18.308, + "eval_steps_per_second": 0.572, "num_input_tokens_seen": 294821888, "step": 4500 }, { "epoch": 0.9736045002163566, - "eval_byte_accuracy": 0.9985163204747775, - "eval_chrf": 39.111786334079426, - "eval_sacrebleu": 21.51948581361402, - "eval_word_accuracy": 0.9945054945054945, + "eval_byte_accuracy": 0.9936943620178041, + "eval_chrf": 94.58267760158972, + "eval_sacrebleu": 93.44801868564782, + "eval_word_accuracy": 0.9835164835164835, + "num_input_tokens_seen": 294821888, + "perplexity": 1.0245066517219334, + "step": 4500 + }, + { + "epoch": 0.9736045002163566, + "eval_loss": 0.024180177599191666, + "eval_runtime": 2.8794, + "eval_samples_per_second": 11.113, + "eval_steps_per_second": 0.347, + "num_input_tokens_seen": 294821888, + "step": 4500 + }, + { + "epoch": 0.9736045002163566, + "eval_byte_accuracy": 0.9940652818991098, + "eval_chrf": 95.26030523847231, + "eval_sacrebleu": 93.68260424037459, + "eval_word_accuracy": 0.9848901098901099, "num_input_tokens_seen": 294821888, - "perplexity": 1.0160202315283984, + "perplexity": 1.0244748886882367, "step": 4500 }, { "epoch": 0.975768065772393, - "grad_norm": 0.59765625, - "learning_rate": 0.00016473, - "loss": 0.019, + "grad_norm": 0.439453125, + "learning_rate": 0.0003467999999999999, + "loss": 0.0303, "num_input_tokens_seen": 295477248, "step": 4510, - "train_runtime": 2122.7271, - "train_tokens_per_second": 139197.003 + "train_runtime": 8.1902, + "train_tokens_per_second": 36076993.874 }, { "epoch": 0.9779316313284292, - "grad_norm": 0.81640625, - "learning_rate": 0.00016443, - "loss": 0.0261, + "grad_norm": 0.4375, + "learning_rate": 0.00034616842105263156, + "loss": 0.0342, "num_input_tokens_seen": 296132608, "step": 4520, - "train_runtime": 2126.2194, - "train_tokens_per_second": 139276.6 + "train_runtime": 12.0015, + "train_tokens_per_second": 24674670.481 }, { "epoch": 0.9800951968844656, - "grad_norm": 0.97265625, - "learning_rate": 0.00016413, - "loss": 0.0226, + "grad_norm": 0.458984375, + "learning_rate": 0.0003455368421052631, + "loss": 0.0322, "num_input_tokens_seen": 296787968, "step": 4530, - "train_runtime": 2129.6691, - "train_tokens_per_second": 139358.724 + "train_runtime": 15.8385, + "train_tokens_per_second": 18738336.118 }, { "epoch": 0.9822587624405019, - "grad_norm": 0.859375, - "learning_rate": 0.00016382999999999998, - "loss": 0.0198, + "grad_norm": 0.4140625, + "learning_rate": 0.00034490526315789473, + "loss": 0.0279, "num_input_tokens_seen": 297443328, "step": 4540, - "train_runtime": 2133.1573, - "train_tokens_per_second": 139438.065 + "train_runtime": 19.6683, + "train_tokens_per_second": 15123011.172 }, { "epoch": 0.9844223279965383, - "grad_norm": 0.73828125, - "learning_rate": 0.00016353, - "loss": 0.0212, + "grad_norm": 0.50390625, + "learning_rate": 0.00034427368421052626, + "loss": 0.0302, "num_input_tokens_seen": 298098688, "step": 4550, - "train_runtime": 2136.6572, - "train_tokens_per_second": 139516.382 + "train_runtime": 23.5035, + "train_tokens_per_second": 12683170.2 }, { "epoch": 0.9865858935525746, - "grad_norm": 1.4609375, - "learning_rate": 0.00016323, - "loss": 0.026, + "grad_norm": 0.466796875, + "learning_rate": 0.0003436421052631579, + "loss": 0.0295, "num_input_tokens_seen": 298754048, "step": 4560, - "train_runtime": 2140.1724, - "train_tokens_per_second": 139593.447 + "train_runtime": 27.3289, + "train_tokens_per_second": 10931815.69 }, { "epoch": 0.988749459108611, - "grad_norm": 1.4296875, - "learning_rate": 0.00016293, - "loss": 0.0227, + "grad_norm": 0.640625, + "learning_rate": 0.00034301052631578944, + "loss": 0.0296, "num_input_tokens_seen": 299405312, "step": 4570, - "train_runtime": 2143.645, - "train_tokens_per_second": 139671.129 + "train_runtime": 31.1358, + "train_tokens_per_second": 9616105.982 }, { "epoch": 0.9909130246646474, - "grad_norm": 1.0546875, - "learning_rate": 0.00016263, - "loss": 0.0215, + "grad_norm": 0.578125, + "learning_rate": 0.000342378947368421, + "loss": 0.0277, "num_input_tokens_seen": 300056576, "step": 4580, - "train_runtime": 2147.1526, - "train_tokens_per_second": 139746.274 + "train_runtime": 34.9281, + "train_tokens_per_second": 8590693.677 }, { "epoch": 0.9930765902206837, - "grad_norm": 0.9765625, - "learning_rate": 0.00016233, - "loss": 0.0231, + "grad_norm": 0.451171875, + "learning_rate": 0.0003417473684210526, + "loss": 0.0323, "num_input_tokens_seen": 300711936, "step": 4590, - "train_runtime": 2150.6129, - "train_tokens_per_second": 139826.154 + "train_runtime": 38.7343, + "train_tokens_per_second": 7763459.853 }, { "epoch": 0.99524015577672, - "grad_norm": 1.1953125, - "learning_rate": 0.00016203, - "loss": 0.0224, + "grad_norm": 0.498046875, + "learning_rate": 0.00034111578947368414, + "loss": 0.0284, "num_input_tokens_seen": 301367296, "step": 4600, - "train_runtime": 2154.0484, - "train_tokens_per_second": 139907.393 + "train_runtime": 42.5467, + "train_tokens_per_second": 7083205.808 }, { "epoch": 0.99524015577672, - "eval_loss": 0.014580282382667065, - "eval_runtime": 28.4901, - "eval_samples_per_second": 1.123, - "eval_steps_per_second": 0.035, + "eval_loss": 0.020042533054947853, + "eval_runtime": 1.6332, + "eval_samples_per_second": 19.593, + "eval_steps_per_second": 0.612, "num_input_tokens_seen": 301367296, "step": 4600 }, { "epoch": 0.99524015577672, - "eval_byte_accuracy": 0.9977744807121661, - "eval_chrf": 38.4395522222846, - "eval_sacrebleu": 20.212855511128847, - "eval_word_accuracy": 0.9931318681318682, + "eval_byte_accuracy": 0.9951780415430267, + "eval_chrf": 96.63362661333757, + "eval_sacrebleu": 95.8953860263406, + "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 301367296, - "perplexity": 1.0146870931790266, + "perplexity": 1.0202447332292253, "step": 4600 }, { "epoch": 0.9974037213327563, - "grad_norm": 0.63671875, - "learning_rate": 0.00016172999999999998, - "loss": 0.0223, + "grad_norm": 0.515625, + "learning_rate": 0.0003404842105263158, + "loss": 0.0309, "num_input_tokens_seen": 302022656, "step": 4610, - "train_runtime": 2185.9922, - "train_tokens_per_second": 138162.733 + "train_runtime": 48.0092, + "train_tokens_per_second": 6290930.799 }, { "epoch": 0.9995672868887927, - "grad_norm": 1.0234375, - "learning_rate": 0.00016143, - "loss": 0.0208, + "grad_norm": 0.458984375, + "learning_rate": 0.0003398526315789473, + "loss": 0.0289, "num_input_tokens_seen": 302678016, "step": 4620, - "train_runtime": 2189.4567, - "train_tokens_per_second": 138243.435 + "train_runtime": 51.8316, + "train_tokens_per_second": 5839641.358 }, { "epoch": 1.0017308524448292, - "grad_norm": 1.078125, - "learning_rate": 0.00016113, - "loss": 0.0198, + "grad_norm": 0.48828125, + "learning_rate": 0.00033922105263157896, + "loss": 0.0265, "num_input_tokens_seen": 303321088, "step": 4630, - "train_runtime": 2193.0169, - "train_tokens_per_second": 138312.245 + "train_runtime": 55.6955, + "train_tokens_per_second": 5446064.044 }, { "epoch": 1.0038944180008653, - "grad_norm": 0.9453125, - "learning_rate": 0.00016083, - "loss": 0.0205, + "grad_norm": 0.455078125, + "learning_rate": 0.0003385894736842105, + "loss": 0.0274, "num_input_tokens_seen": 303976448, "step": 4640, - "train_runtime": 2196.4717, - "train_tokens_per_second": 138393.065 + "train_runtime": 59.5237, + "train_tokens_per_second": 5106813.968 }, { "epoch": 1.0060579835569017, - "grad_norm": 1.0546875, - "learning_rate": 0.00016052999999999998, - "loss": 0.0207, + "grad_norm": 0.52734375, + "learning_rate": 0.0003379578947368421, + "loss": 0.0263, "num_input_tokens_seen": 304631808, "step": 4650, - "train_runtime": 2199.9404, - "train_tokens_per_second": 138472.752 + "train_runtime": 63.3483, + "train_tokens_per_second": 4808840.309 }, { "epoch": 1.008221549112938, - "grad_norm": 1.0703125, - "learning_rate": 0.00016023, - "loss": 0.0198, + "grad_norm": 0.73046875, + "learning_rate": 0.00033732631578947366, + "loss": 0.0269, "num_input_tokens_seen": 305287168, "step": 4660, - "train_runtime": 2203.3914, - "train_tokens_per_second": 138553.306 + "train_runtime": 67.1647, + "train_tokens_per_second": 4545353.422 }, { "epoch": 1.0103851146689744, - "grad_norm": 0.78515625, - "learning_rate": 0.00015993, - "loss": 0.0194, + "grad_norm": 0.59765625, + "learning_rate": 0.0003366947368421052, + "loss": 0.0265, "num_input_tokens_seen": 305942528, "step": 4670, - "train_runtime": 2206.9019, - "train_tokens_per_second": 138629.875 + "train_runtime": 70.9861, + "train_tokens_per_second": 4309893.533 }, { "epoch": 1.0125486802250108, - "grad_norm": 0.85546875, - "learning_rate": 0.00015963, - "loss": 0.0197, + "grad_norm": 0.6796875, + "learning_rate": 0.00033606315789473683, + "loss": 0.0269, "num_input_tokens_seen": 306597888, "step": 4680, - "train_runtime": 2210.3781, - "train_tokens_per_second": 138708.345 + "train_runtime": 74.8102, + "train_tokens_per_second": 4098342.089 }, { "epoch": 1.0147122457810471, - "grad_norm": 0.9765625, - "learning_rate": 0.00015932999999999998, - "loss": 0.0206, + "grad_norm": 0.474609375, + "learning_rate": 0.00033543157894736837, + "loss": 0.0282, "num_input_tokens_seen": 307253248, "step": 4690, - "train_runtime": 2213.8276, - "train_tokens_per_second": 138788.244 + "train_runtime": 78.6348, + "train_tokens_per_second": 3907343.298 }, { "epoch": 1.0168758113370835, - "grad_norm": 0.80859375, - "learning_rate": 0.00015903, - "loss": 0.02, + "grad_norm": 0.404296875, + "learning_rate": 0.0003348, + "loss": 0.0272, "num_input_tokens_seen": 307908608, "step": 4700, - "train_runtime": 2217.3272, - "train_tokens_per_second": 138864.756 + "train_runtime": 82.46, + "train_tokens_per_second": 3734036.121 }, { "epoch": 1.0168758113370835, - "eval_loss": 0.013260102830827236, - "eval_runtime": 25.3957, - "eval_samples_per_second": 1.26, - "eval_steps_per_second": 0.039, + "eval_loss": 0.020144794136285782, + "eval_runtime": 1.6689, + "eval_samples_per_second": 19.175, + "eval_steps_per_second": 0.599, "num_input_tokens_seen": 307908608, "step": 4700 }, { "epoch": 1.0168758113370835, - "eval_byte_accuracy": 0.9992581602373887, - "eval_chrf": 39.14699097768269, - "eval_sacrebleu": 21.108709414233278, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9944362017804155, + "eval_chrf": 96.01941443962484, + "eval_sacrebleu": 94.43788582746568, + "eval_word_accuracy": 0.9862637362637363, "num_input_tokens_seen": 307908608, - "perplexity": 1.0133484078740087, + "perplexity": 1.0203490698935738, "step": 4700 }, { "epoch": 1.0190393768931199, - "grad_norm": 0.78515625, - "learning_rate": 0.00015873, - "loss": 0.0203, + "grad_norm": 0.44921875, + "learning_rate": 0.00033416842105263154, + "loss": 0.0281, "num_input_tokens_seen": 308563968, "step": 4710, - "train_runtime": 2246.181, - "train_tokens_per_second": 137372.707 + "train_runtime": 87.973, + "train_tokens_per_second": 3507485.0 }, { "epoch": 1.0212029424491562, - "grad_norm": 0.95703125, - "learning_rate": 0.00015842999999999999, - "loss": 0.0213, + "grad_norm": 0.453125, + "learning_rate": 0.0003335368421052631, + "loss": 0.0266, "num_input_tokens_seen": 309219328, "step": 4720, - "train_runtime": 2249.6575, - "train_tokens_per_second": 137451.737 + "train_runtime": 91.7926, + "train_tokens_per_second": 3368675.086 }, { "epoch": 1.0233665080051926, - "grad_norm": 0.59375, - "learning_rate": 0.00015812999999999998, - "loss": 0.0206, + "grad_norm": 0.55078125, + "learning_rate": 0.0003329052631578947, + "loss": 0.0279, "num_input_tokens_seen": 309874688, "step": 4730, - "train_runtime": 2253.1778, - "train_tokens_per_second": 137527.843 + "train_runtime": 95.6175, + "train_tokens_per_second": 3240774.088 }, { "epoch": 1.025530073561229, - "grad_norm": 0.7109375, - "learning_rate": 0.00015783, - "loss": 0.0193, + "grad_norm": 0.37109375, + "learning_rate": 0.00033227368421052624, + "loss": 0.0232, "num_input_tokens_seen": 310530048, "step": 4740, - "train_runtime": 2256.7102, - "train_tokens_per_second": 137602.98 + "train_runtime": 99.4401, + "train_tokens_per_second": 3122786.399 }, { "epoch": 1.0276936391172653, - "grad_norm": 1.2109375, - "learning_rate": 0.00015753, - "loss": 0.0214, + "grad_norm": 0.921875, + "learning_rate": 0.0003316421052631579, + "loss": 0.0279, "num_input_tokens_seen": 311185408, "step": 4750, - "train_runtime": 2260.2026, - "train_tokens_per_second": 137680.315 + "train_runtime": 103.265, + "train_tokens_per_second": 3013463.926 }, { "epoch": 1.0298572046733017, - "grad_norm": 0.7265625, - "learning_rate": 0.00015722999999999998, - "loss": 0.0203, + "grad_norm": 0.8125, + "learning_rate": 0.0003310105263157894, + "loss": 0.0267, "num_input_tokens_seen": 311840768, "step": 4760, - "train_runtime": 2263.6652, - "train_tokens_per_second": 137759.227 + "train_runtime": 107.0815, + "train_tokens_per_second": 2912180.92 }, { "epoch": 1.032020770229338, - "grad_norm": 0.76953125, - "learning_rate": 0.00015692999999999998, - "loss": 0.0186, + "grad_norm": 0.90234375, + "learning_rate": 0.00033037894736842106, + "loss": 0.0289, "num_input_tokens_seen": 312496128, "step": 4770, - "train_runtime": 2267.1138, - "train_tokens_per_second": 137838.748 + "train_runtime": 110.8903, + "train_tokens_per_second": 2818065.378 }, { "epoch": 1.0341843357853744, - "grad_norm": 1.015625, - "learning_rate": 0.00015663, - "loss": 0.0173, + "grad_norm": 0.53515625, + "learning_rate": 0.0003297473684210526, + "loss": 0.0247, "num_input_tokens_seen": 313151488, "step": 4780, - "train_runtime": 2270.5655, - "train_tokens_per_second": 137917.841 + "train_runtime": 114.7073, + "train_tokens_per_second": 2730004.96 }, { "epoch": 1.0363479013414107, - "grad_norm": 0.51171875, - "learning_rate": 0.00015633, - "loss": 0.0179, + "grad_norm": 0.703125, + "learning_rate": 0.0003291157894736842, + "loss": 0.0267, "num_input_tokens_seen": 313806848, "step": 4790, - "train_runtime": 2274.0937, - "train_tokens_per_second": 137992.049 + "train_runtime": 118.5372, + "train_tokens_per_second": 2647328.086 }, { "epoch": 1.0385114668974469, - "grad_norm": 0.7109375, - "learning_rate": 0.00015602999999999998, - "loss": 0.0182, + "grad_norm": 0.4375, + "learning_rate": 0.00032848421052631576, + "loss": 0.025, "num_input_tokens_seen": 314462208, "step": 4800, - "train_runtime": 2277.6233, - "train_tokens_per_second": 138065.943 + "train_runtime": 122.361, + "train_tokens_per_second": 2569953.792 }, { "epoch": 1.0385114668974469, - "eval_loss": 0.014184832572937012, - "eval_runtime": 23.9935, - "eval_samples_per_second": 1.334, - "eval_steps_per_second": 0.042, + "eval_loss": 0.018589144572615623, + "eval_runtime": 1.8825, + "eval_samples_per_second": 16.998, + "eval_steps_per_second": 0.531, "num_input_tokens_seen": 314462208, "step": 4800 }, { "epoch": 1.0385114668974469, - "eval_byte_accuracy": 0.9988872403560831, - "eval_chrf": 39.569473872061764, - "eval_sacrebleu": 20.2857332683276, - "eval_word_accuracy": 0.9958791208791209, + "eval_byte_accuracy": 0.9951780415430267, + "eval_chrf": 96.23348374614099, + "eval_sacrebleu": 94.52630209470658, + "eval_word_accuracy": 0.9876373626373627, "num_input_tokens_seen": 314462208, - "perplexity": 1.0142859146893002, + "perplexity": 1.0187629983138469, "step": 4800 }, { "epoch": 1.0406750324534833, - "grad_norm": 0.9765625, - "learning_rate": 0.00015573, - "loss": 0.0189, + "grad_norm": 0.55859375, + "learning_rate": 0.00032785263157894735, + "loss": 0.0261, "num_input_tokens_seen": 315117568, "step": 4810, - "train_runtime": 2305.0829, - "train_tokens_per_second": 136705.527 + "train_runtime": 128.0843, + "train_tokens_per_second": 2460235.566 }, { "epoch": 1.0428385980095196, - "grad_norm": 0.96484375, - "learning_rate": 0.00015543, - "loss": 0.0198, + "grad_norm": 0.458984375, + "learning_rate": 0.00032722105263157894, + "loss": 0.0267, "num_input_tokens_seen": 315772928, "step": 4820, - "train_runtime": 2308.5354, - "train_tokens_per_second": 136784.963 + "train_runtime": 131.9081, + "train_tokens_per_second": 2393886.109 }, { "epoch": 1.045002163565556, - "grad_norm": 1.140625, - "learning_rate": 0.00015513, - "loss": 0.019, + "grad_norm": 0.5859375, + "learning_rate": 0.00032658947368421047, + "loss": 0.0268, "num_input_tokens_seen": 316428288, "step": 4830, - "train_runtime": 2311.98, - "train_tokens_per_second": 136864.63 + "train_runtime": 135.7375, + "train_tokens_per_second": 2331178.327 }, { "epoch": 1.0471657291215923, - "grad_norm": 1.6640625, - "learning_rate": 0.00015482999999999998, - "loss": 0.0212, + "grad_norm": 0.55078125, + "learning_rate": 0.0003259578947368421, + "loss": 0.0285, "num_input_tokens_seen": 317083648, "step": 4840, - "train_runtime": 2315.5066, - "train_tokens_per_second": 136939.214 + "train_runtime": 139.5602, + "train_tokens_per_second": 2272020.269 }, { "epoch": 1.0493292946776287, - "grad_norm": 1.140625, - "learning_rate": 0.00015453, - "loss": 0.0185, + "grad_norm": 0.640625, + "learning_rate": 0.00032532631578947364, + "loss": 0.0257, "num_input_tokens_seen": 317739008, "step": 4850, - "train_runtime": 2318.9973, - "train_tokens_per_second": 137015.688 + "train_runtime": 143.3904, + "train_tokens_per_second": 2215901.171 }, { "epoch": 1.051492860233665, - "grad_norm": 0.70703125, - "learning_rate": 0.00015423, - "loss": 0.0191, + "grad_norm": 0.384765625, + "learning_rate": 0.0003246947368421052, + "loss": 0.0272, "num_input_tokens_seen": 318394368, "step": 4860, - "train_runtime": 2322.4751, - "train_tokens_per_second": 137092.694 + "train_runtime": 147.2228, + "train_tokens_per_second": 2162670.788 }, { "epoch": 1.0536564257897014, "grad_norm": 0.69921875, - "learning_rate": 0.00015392999999999998, - "loss": 0.0187, + "learning_rate": 0.0003240631578947368, + "loss": 0.0267, "num_input_tokens_seen": 319049728, "step": 4870, - "train_runtime": 2325.976, - "train_tokens_per_second": 137168.108 + "train_runtime": 151.0482, + "train_tokens_per_second": 2112237.293 }, { "epoch": 1.0558199913457378, - "grad_norm": 0.95703125, - "learning_rate": 0.00015362999999999998, - "loss": 0.0176, + "grad_norm": 0.578125, + "learning_rate": 0.0003234315789473684, + "loss": 0.0261, "num_input_tokens_seen": 319705088, "step": 4880, - "train_runtime": 2329.5053, - "train_tokens_per_second": 137241.62 + "train_runtime": 154.8725, + "train_tokens_per_second": 2064311.503 }, { "epoch": 1.0579835569017741, - "grad_norm": 0.7578125, - "learning_rate": 0.00015333, - "loss": 0.0178, + "grad_norm": 0.66015625, + "learning_rate": 0.0003228, + "loss": 0.0242, "num_input_tokens_seen": 320360448, "step": 4890, - "train_runtime": 2332.997, - "train_tokens_per_second": 137317.128 + "train_runtime": 158.6951, + "train_tokens_per_second": 2018717.011 }, { "epoch": 1.0601471224578105, - "grad_norm": 0.734375, - "learning_rate": 0.00015303, - "loss": 0.0212, + "grad_norm": 0.77734375, + "learning_rate": 0.0003221684210526315, + "loss": 0.0295, "num_input_tokens_seen": 321015808, "step": 4900, - "train_runtime": 2336.5093, - "train_tokens_per_second": 137391.194 + "train_runtime": 162.5123, + "train_tokens_per_second": 1975332.311 }, { "epoch": 1.0601471224578105, - "eval_loss": 0.013760020956397057, - "eval_runtime": 25.696, - "eval_samples_per_second": 1.245, - "eval_steps_per_second": 0.039, + "eval_loss": 0.020329073071479797, + "eval_runtime": 1.7307, + "eval_samples_per_second": 18.49, + "eval_steps_per_second": 0.578, "num_input_tokens_seen": 321015808, "step": 4900 }, { "epoch": 1.0601471224578105, - "eval_byte_accuracy": 0.9988872403560831, - "eval_chrf": 39.61415471750501, - "eval_sacrebleu": 21.234715325807436, - "eval_word_accuracy": 0.9958791208791209, + "eval_byte_accuracy": 0.9944362017804155, + "eval_chrf": 95.49067447945914, + "eval_sacrebleu": 93.90610177072331, + "eval_word_accuracy": 0.9848901098901099, "num_input_tokens_seen": 321015808, - "perplexity": 1.0138551257587967, + "perplexity": 1.0205371160596415, "step": 4900 }, { "epoch": 1.0623106880138469, - "grad_norm": 0.796875, - "learning_rate": 0.00015272999999999998, - "loss": 0.02, + "grad_norm": 0.46484375, + "learning_rate": 0.00032153684210526316, + "loss": 0.0281, "num_input_tokens_seen": 321671168, "step": 4910, - "train_runtime": 2365.6961, - "train_tokens_per_second": 135973.155 + "train_runtime": 168.09, + "train_tokens_per_second": 1913684.312 }, { "epoch": 1.0644742535698832, - "grad_norm": 0.80859375, - "learning_rate": 0.00015242999999999998, - "loss": 0.0243, + "grad_norm": 0.6328125, + "learning_rate": 0.0003209052631578947, + "loss": 0.0271, "num_input_tokens_seen": 322326528, "step": 4920, - "train_runtime": 2369.1935, - "train_tokens_per_second": 136049.05 + "train_runtime": 171.9119, + "train_tokens_per_second": 1874951.703 }, { "epoch": 1.0666378191259196, - "grad_norm": 0.51171875, - "learning_rate": 0.00015213, - "loss": 0.0184, + "grad_norm": 0.6484375, + "learning_rate": 0.0003202736842105263, + "loss": 0.0262, "num_input_tokens_seen": 322981888, "step": 4930, - "train_runtime": 2372.6912, - "train_tokens_per_second": 136124.707 + "train_runtime": 175.7226, + "train_tokens_per_second": 1838021.179 }, { "epoch": 1.068801384681956, - "grad_norm": 0.94140625, - "learning_rate": 0.00015183, - "loss": 0.0185, + "grad_norm": 0.486328125, + "learning_rate": 0.00031964210526315786, + "loss": 0.0256, "num_input_tokens_seen": 323637248, "step": 4940, - "train_runtime": 2376.1732, - "train_tokens_per_second": 136201.033 + "train_runtime": 179.5451, + "train_tokens_per_second": 1802539.801 }, { "epoch": 1.0709649502379923, - "grad_norm": 0.953125, - "learning_rate": 0.00015152999999999998, - "loss": 0.0179, + "grad_norm": 0.65234375, + "learning_rate": 0.00031901052631578945, + "loss": 0.025, "num_input_tokens_seen": 324292608, "step": 4950, - "train_runtime": 2379.6579, - "train_tokens_per_second": 136276.987 + "train_runtime": 183.36, + "train_tokens_per_second": 1768611.34 }, { "epoch": 1.0731285157940285, - "grad_norm": 0.70703125, - "learning_rate": 0.00015122999999999997, - "loss": 0.0207, + "grad_norm": 0.5546875, + "learning_rate": 0.00031837894736842104, + "loss": 0.0265, "num_input_tokens_seen": 324947968, "step": 4960, - "train_runtime": 2383.1184, - "train_tokens_per_second": 136354.099 + "train_runtime": 187.1847, + "train_tokens_per_second": 1735974.791 }, { "epoch": 1.0752920813500648, - "grad_norm": 0.53125, - "learning_rate": 0.00015093, - "loss": 0.0187, + "grad_norm": 0.76171875, + "learning_rate": 0.00031774736842105257, + "loss": 0.0275, "num_input_tokens_seen": 325603328, "step": 4970, - "train_runtime": 2386.5619, - "train_tokens_per_second": 136431.967 + "train_runtime": 190.9969, + "train_tokens_per_second": 1704757.413 }, { "epoch": 1.0774556469061012, - "grad_norm": 0.9453125, - "learning_rate": 0.00015062999999999999, - "loss": 0.024, + "grad_norm": 0.53515625, + "learning_rate": 0.0003171157894736842, + "loss": 0.0274, "num_input_tokens_seen": 326258688, "step": 4980, - "train_runtime": 2389.9897, - "train_tokens_per_second": 136510.497 + "train_runtime": 194.8145, + "train_tokens_per_second": 1674714.319 }, { "epoch": 1.0796192124621375, - "grad_norm": 0.7890625, - "learning_rate": 0.00015032999999999998, - "loss": 0.0187, + "grad_norm": 0.51953125, + "learning_rate": 0.00031648421052631574, + "loss": 0.0252, "num_input_tokens_seen": 326914048, "step": 4990, - "train_runtime": 2393.4624, - "train_tokens_per_second": 136586.249 + "train_runtime": 198.6353, + "train_tokens_per_second": 1645800.089 }, { "epoch": 1.081782778018174, - "grad_norm": 1.0, - "learning_rate": 0.00015002999999999997, - "loss": 0.0183, + "grad_norm": 0.59765625, + "learning_rate": 0.00031585263157894733, + "loss": 0.0239, "num_input_tokens_seen": 327569408, "step": 5000, - "train_runtime": 2396.974, - "train_tokens_per_second": 136659.559 + "train_runtime": 202.4467, + "train_tokens_per_second": 1618052.623 }, { "epoch": 1.081782778018174, - "eval_loss": 0.013022142462432384, - "eval_runtime": 23.9443, - "eval_samples_per_second": 1.336, - "eval_steps_per_second": 0.042, + "eval_loss": 0.01753578893840313, + "eval_runtime": 1.8451, + "eval_samples_per_second": 17.344, + "eval_steps_per_second": 0.542, "num_input_tokens_seen": 327569408, "step": 5000 }, { "epoch": 1.081782778018174, - "eval_byte_accuracy": 0.9992581602373887, - "eval_chrf": 39.9681810741888, - "eval_sacrebleu": 22.95724485507563, - "eval_word_accuracy": 0.9972527472527473, + "eval_byte_accuracy": 0.995919881305638, + "eval_chrf": 96.50768150331436, + "eval_sacrebleu": 95.31161095839255, + "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 327569408, - "perplexity": 1.0131072998017794, + "perplexity": 1.0176904435596141, "step": 5000 }, { "epoch": 1.0839463435742103, - "grad_norm": 0.74609375, - "learning_rate": 0.00014973, - "loss": 0.0173, + "grad_norm": 0.5, + "learning_rate": 0.0003152210526315789, + "loss": 0.0233, "num_input_tokens_seen": 328224768, "step": 5010, - "train_runtime": 2424.5305, - "train_tokens_per_second": 135376.632 + "train_runtime": 208.189, + "train_tokens_per_second": 1576571.175 }, { "epoch": 1.0861099091302466, - "grad_norm": 0.90625, - "learning_rate": 0.00014942999999999998, - "loss": 0.0229, + "grad_norm": 0.46875, + "learning_rate": 0.0003145894736842105, + "loss": 0.0251, "num_input_tokens_seen": 328880128, "step": 5020, - "train_runtime": 2428.0354, - "train_tokens_per_second": 135451.127 + "train_runtime": 212.0222, + "train_tokens_per_second": 1551158.906 }, { "epoch": 1.088273474686283, - "grad_norm": 1.015625, - "learning_rate": 0.00014912999999999998, - "loss": 0.0188, + "grad_norm": 0.640625, + "learning_rate": 0.0003139578947368421, + "loss": 0.026, "num_input_tokens_seen": 329535488, "step": 5030, - "train_runtime": 2431.5473, - "train_tokens_per_second": 135525.018 + "train_runtime": 215.8507, + "train_tokens_per_second": 1526682.433 }, { "epoch": 1.0904370402423194, - "grad_norm": 0.6953125, - "learning_rate": 0.00014882999999999997, - "loss": 0.0196, + "grad_norm": 0.490234375, + "learning_rate": 0.0003133263157894737, + "loss": 0.0272, "num_input_tokens_seen": 330186752, "step": 5040, - "train_runtime": 2434.9833, - "train_tokens_per_second": 135601.238 + "train_runtime": 219.6399, + "train_tokens_per_second": 1503309.199 }, { "epoch": 1.0926006057983557, - "grad_norm": 0.58203125, - "learning_rate": 0.00014853, - "loss": 0.0181, + "grad_norm": 0.478515625, + "learning_rate": 0.00031269473684210526, + "loss": 0.0256, "num_input_tokens_seen": 330842112, "step": 5050, - "train_runtime": 2438.4714, - "train_tokens_per_second": 135676.029 + "train_runtime": 223.4569, + "train_tokens_per_second": 1480563.289 }, { "epoch": 1.094764171354392, - "grad_norm": 0.91015625, - "learning_rate": 0.00014822999999999998, - "loss": 0.0186, + "grad_norm": 0.63671875, + "learning_rate": 0.0003120631578947368, + "loss": 0.0246, "num_input_tokens_seen": 331497472, "step": 5060, - "train_runtime": 2441.9742, - "train_tokens_per_second": 135749.782 + "train_runtime": 227.2758, + "train_tokens_per_second": 1458569.335 }, { "epoch": 1.0969277369104284, - "grad_norm": 0.75, - "learning_rate": 0.00014792999999999997, - "loss": 0.0173, + "grad_norm": 0.70703125, + "learning_rate": 0.0003114315789473684, + "loss": 0.024, "num_input_tokens_seen": 332152832, "step": 5070, - "train_runtime": 2445.4801, - "train_tokens_per_second": 135823.157 + "train_runtime": 231.1046, + "train_tokens_per_second": 1437240.386 }, { "epoch": 1.0990913024664648, - "grad_norm": 0.65234375, - "learning_rate": 0.00014763, - "loss": 0.0187, + "grad_norm": 0.443359375, + "learning_rate": 0.00031079999999999997, + "loss": 0.0261, "num_input_tokens_seen": 332808192, "step": 5080, - "train_runtime": 2448.9796, - "train_tokens_per_second": 135896.676 + "train_runtime": 234.9359, + "train_tokens_per_second": 1416591.304 }, { "epoch": 1.1012548680225012, - "grad_norm": 0.765625, - "learning_rate": 0.00014733, - "loss": 0.0173, + "grad_norm": 0.6015625, + "learning_rate": 0.00031016842105263155, + "loss": 0.0253, "num_input_tokens_seen": 333463552, "step": 5090, - "train_runtime": 2452.4836, - "train_tokens_per_second": 135969.74 + "train_runtime": 238.7688, + "train_tokens_per_second": 1396596.039 }, { "epoch": 1.1034184335785375, - "grad_norm": 0.53515625, - "learning_rate": 0.00014702999999999998, - "loss": 0.0171, + "grad_norm": 0.5078125, + "learning_rate": 0.00030953684210526314, + "loss": 0.0249, "num_input_tokens_seen": 334118912, "step": 5100, - "train_runtime": 2455.9827, - "train_tokens_per_second": 136042.863 + "train_runtime": 242.5974, + "train_tokens_per_second": 1377256.985 }, { "epoch": 1.1034184335785375, - "eval_loss": 0.011633491143584251, - "eval_runtime": 26.5648, - "eval_samples_per_second": 1.205, - "eval_steps_per_second": 0.038, + "eval_loss": 0.018276168033480644, + "eval_runtime": 1.8788, + "eval_samples_per_second": 17.032, + "eval_steps_per_second": 0.532, "num_input_tokens_seen": 334118912, "step": 5100 }, { "epoch": 1.1034184335785375, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 40.55907125598693, - "eval_sacrebleu": 23.625602022598223, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.995919881305638, + "eval_chrf": 96.9547828716358, + "eval_sacrebleu": 96.30918177844974, + "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 334118912, - "perplexity": 1.0117014233757702, + "perplexity": 1.018444199287346, "step": 5100 }, { "epoch": 1.1055819991345737, - "grad_norm": 0.65625, - "learning_rate": 0.00014672999999999997, - "loss": 0.0168, + "grad_norm": 0.72265625, + "learning_rate": 0.0003089052631578947, + "loss": 0.0233, "num_input_tokens_seen": 334774272, "step": 5110, - "train_runtime": 2485.9999, - "train_tokens_per_second": 134663.83 + "train_runtime": 248.3149, + "train_tokens_per_second": 1348184.603 }, { "epoch": 1.10774556469061, - "grad_norm": 0.984375, - "learning_rate": 0.00014643, - "loss": 0.0178, + "grad_norm": 0.470703125, + "learning_rate": 0.0003082736842105263, + "loss": 0.023, "num_input_tokens_seen": 335429632, "step": 5120, - "train_runtime": 2489.4697, - "train_tokens_per_second": 134739.391 + "train_runtime": 252.1428, + "train_tokens_per_second": 1330316.02 }, { "epoch": 1.1099091302466464, - "grad_norm": 0.68359375, - "learning_rate": 0.00014612999999999998, - "loss": 0.0206, + "grad_norm": 0.423828125, + "learning_rate": 0.00030764210526315784, + "loss": 0.0269, "num_input_tokens_seen": 336084992, "step": 5130, - "train_runtime": 2492.9711, - "train_tokens_per_second": 134813.03 + "train_runtime": 255.9686, + "train_tokens_per_second": 1312992.994 }, { "epoch": 1.1120726958026828, - "grad_norm": 0.6953125, - "learning_rate": 0.00014582999999999998, - "loss": 0.0161, + "grad_norm": 0.490234375, + "learning_rate": 0.00030701052631578943, + "loss": 0.0216, "num_input_tokens_seen": 336740352, "step": 5140, - "train_runtime": 2496.4628, - "train_tokens_per_second": 134886.992 + "train_runtime": 259.7909, + "train_tokens_per_second": 1296197.746 }, { "epoch": 1.1142362613587191, - "grad_norm": 0.828125, - "learning_rate": 0.00014552999999999997, - "loss": 0.018, + "grad_norm": 0.54296875, + "learning_rate": 0.000306378947368421, + "loss": 0.0246, "num_input_tokens_seen": 337395712, "step": 5150, - "train_runtime": 2499.9139, - "train_tokens_per_second": 134962.936 + "train_runtime": 263.6165, + "train_tokens_per_second": 1279873.366 }, { "epoch": 1.1163998269147555, - "grad_norm": 0.93359375, - "learning_rate": 0.00014523, - "loss": 0.0172, + "grad_norm": 0.470703125, + "learning_rate": 0.0003057473684210526, + "loss": 0.0238, "num_input_tokens_seen": 338051072, "step": 5160, - "train_runtime": 2503.4023, - "train_tokens_per_second": 135036.656 + "train_runtime": 267.4291, + "train_tokens_per_second": 1264077.394 }, { "epoch": 1.1185633924707918, - "grad_norm": 0.6796875, - "learning_rate": 0.00014492999999999998, - "loss": 0.0187, + "grad_norm": 0.51171875, + "learning_rate": 0.0003051157894736842, + "loss": 0.0237, "num_input_tokens_seen": 338706432, "step": 5170, - "train_runtime": 2506.9035, - "train_tokens_per_second": 135109.48 + "train_runtime": 271.2587, + "train_tokens_per_second": 1248647.458 }, { "epoch": 1.1207269580268282, - "grad_norm": 0.86328125, - "learning_rate": 0.00014462999999999998, - "loss": 0.0217, + "grad_norm": 0.71875, + "learning_rate": 0.0003044842105263158, + "loss": 0.024, "num_input_tokens_seen": 339361792, "step": 5180, - "train_runtime": 2510.3979, - "train_tokens_per_second": 135182.47 + "train_runtime": 275.0804, + "train_tokens_per_second": 1233682.374 }, { "epoch": 1.1228905235828646, - "grad_norm": 0.65625, - "learning_rate": 0.00014433, - "loss": 0.0179, + "grad_norm": 0.5625, + "learning_rate": 0.00030385263157894736, + "loss": 0.0238, "num_input_tokens_seen": 340017152, "step": 5190, - "train_runtime": 2513.8628, - "train_tokens_per_second": 135256.844 + "train_runtime": 278.9047, + "train_tokens_per_second": 1219115.936 }, { "epoch": 1.125054089138901, - "grad_norm": 0.59765625, - "learning_rate": 0.00014403, - "loss": 0.0187, + "grad_norm": 0.67578125, + "learning_rate": 0.0003032210526315789, + "loss": 0.0262, "num_input_tokens_seen": 340672512, "step": 5200, - "train_runtime": 2517.325, - "train_tokens_per_second": 135331.157 + "train_runtime": 282.7305, + "train_tokens_per_second": 1204937.424 }, { "epoch": 1.125054089138901, - "eval_loss": 0.011913910508155823, - "eval_runtime": 26.0777, - "eval_samples_per_second": 1.227, - "eval_steps_per_second": 0.038, + "eval_loss": 0.016979500651359558, + "eval_runtime": 1.6264, + "eval_samples_per_second": 19.675, + "eval_steps_per_second": 0.615, "num_input_tokens_seen": 340672512, "step": 5200 }, { "epoch": 1.125054089138901, - "eval_byte_accuracy": 0.9992581602373887, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9972527472527473, + "eval_byte_accuracy": 0.995919881305638, + "eval_chrf": 97.02684713927562, + "eval_sacrebleu": 96.68143920500805, + "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 340672512, - "perplexity": 1.0119851638273487, + "perplexity": 1.017124471722373, "step": 5200 }, { "epoch": 1.1272176546949373, - "grad_norm": 0.5625, - "learning_rate": 0.00014373, - "loss": 0.0166, + "grad_norm": 0.376953125, + "learning_rate": 0.0003025894736842105, + "loss": 0.0229, "num_input_tokens_seen": 341327872, "step": 5210, - "train_runtime": 2546.8986, - "train_tokens_per_second": 134017.062 + "train_runtime": 288.1928, + "train_tokens_per_second": 1184373.426 }, { "epoch": 1.1293812202509736, - "grad_norm": 0.65234375, - "learning_rate": 0.00014343, - "loss": 0.0169, + "grad_norm": 0.828125, + "learning_rate": 0.00030195789473684207, + "loss": 0.0241, "num_input_tokens_seen": 341983232, "step": 5220, - "train_runtime": 2550.3703, - "train_tokens_per_second": 134091.6 + "train_runtime": 292.0166, + "train_tokens_per_second": 1171108.686 }, { "epoch": 1.13154478580701, - "grad_norm": 0.79296875, - "learning_rate": 0.00014313, - "loss": 0.0204, + "grad_norm": 0.640625, + "learning_rate": 0.00030132631578947365, + "loss": 0.0273, "num_input_tokens_seen": 342638592, "step": 5230, - "train_runtime": 2553.8704, - "train_tokens_per_second": 134164.44 + "train_runtime": 295.8511, + "train_tokens_per_second": 1158145.403 }, { "epoch": 1.1337083513630464, - "grad_norm": 0.63671875, - "learning_rate": 0.00014282999999999999, - "loss": 0.0173, + "grad_norm": 0.6875, + "learning_rate": 0.00030069473684210524, + "loss": 0.0242, "num_input_tokens_seen": 343293952, "step": 5240, - "train_runtime": 2557.4007, - "train_tokens_per_second": 134235.497 + "train_runtime": 299.6779, + "train_tokens_per_second": 1145543.28 }, { "epoch": 1.1358719169190827, - "grad_norm": 0.71875, - "learning_rate": 0.00014253, - "loss": 0.0184, + "grad_norm": 0.478515625, + "learning_rate": 0.0003000631578947368, + "loss": 0.0261, "num_input_tokens_seen": 343949312, "step": 5250, - "train_runtime": 2560.9276, - "train_tokens_per_second": 134306.536 + "train_runtime": 303.4995, + "train_tokens_per_second": 1133278.165 }, { "epoch": 1.1380354824751189, - "grad_norm": 0.69140625, - "learning_rate": 0.00014223, - "loss": 0.0163, + "grad_norm": 0.546875, + "learning_rate": 0.0002994315789473684, + "loss": 0.0232, "num_input_tokens_seen": 344604672, "step": 5260, - "train_runtime": 2564.4258, - "train_tokens_per_second": 134378.883 + "train_runtime": 307.3262, + "train_tokens_per_second": 1121299.327 }, { "epoch": 1.1401990480311555, - "grad_norm": 0.6171875, - "learning_rate": 0.00014193, - "loss": 0.0182, + "grad_norm": 0.6015625, + "learning_rate": 0.0002988, + "loss": 0.0254, "num_input_tokens_seen": 345260032, "step": 5270, - "train_runtime": 2567.9172, - "train_tokens_per_second": 134451.388 + "train_runtime": 311.1516, + "train_tokens_per_second": 1109619.945 }, { "epoch": 1.1423626135871916, - "grad_norm": 0.6328125, - "learning_rate": 0.00014162999999999998, - "loss": 0.0176, + "grad_norm": 0.578125, + "learning_rate": 0.00029816842105263153, + "loss": 0.0228, "num_input_tokens_seen": 345915392, "step": 5280, - "train_runtime": 2571.42, - "train_tokens_per_second": 134523.1 + "train_runtime": 314.9762, + "train_tokens_per_second": 1098227.147 }, { "epoch": 1.144526179143228, - "grad_norm": 0.60546875, - "learning_rate": 0.00014133, - "loss": 0.0173, + "grad_norm": 0.51953125, + "learning_rate": 0.0002975368421052631, + "loss": 0.0241, "num_input_tokens_seen": 346570752, "step": 5290, - "train_runtime": 2574.8925, - "train_tokens_per_second": 134596.202 + "train_runtime": 318.8009, + "train_tokens_per_second": 1087107.321 }, { "epoch": 1.1466897446992643, - "grad_norm": 0.82421875, - "learning_rate": 0.00014103, - "loss": 0.0211, + "grad_norm": 0.400390625, + "learning_rate": 0.0002969052631578947, + "loss": 0.0254, "num_input_tokens_seen": 347226112, "step": 5300, - "train_runtime": 2578.3801, - "train_tokens_per_second": 134668.318 + "train_runtime": 322.6315, + "train_tokens_per_second": 1076231.314 }, { "epoch": 1.1466897446992643, - "eval_loss": 0.012271178886294365, - "eval_runtime": 26.2679, - "eval_samples_per_second": 1.218, - "eval_steps_per_second": 0.038, + "eval_loss": 0.017398551106452942, + "eval_runtime": 1.5555, + "eval_samples_per_second": 20.573, + "eval_steps_per_second": 0.643, "num_input_tokens_seen": 347226112, "step": 5300 }, { "epoch": 1.1466897446992643, - "eval_byte_accuracy": 0.9988872403560831, - "eval_chrf": 40.16732584448379, - "eval_sacrebleu": 21.50832274268183, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.994807121661721, + "eval_chrf": 96.3910110996503, + "eval_sacrebleu": 95.04714140916596, + "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 347226112, - "perplexity": 1.012346778718466, + "perplexity": 1.0175507875128094, "step": 5300 }, { "epoch": 1.1488533102553007, - "grad_norm": 0.6171875, - "learning_rate": 0.00014073, - "loss": 0.0181, + "grad_norm": 0.39453125, + "learning_rate": 0.0002962736842105263, + "loss": 0.0242, "num_input_tokens_seen": 347881472, "step": 5310, - "train_runtime": 2608.1519, - "train_tokens_per_second": 133382.368 + "train_runtime": 328.0278, + "train_tokens_per_second": 1060524.345 }, { "epoch": 1.151016875811337, - "grad_norm": 0.9453125, - "learning_rate": 0.00014042999999999998, - "loss": 0.0189, + "grad_norm": 0.66796875, + "learning_rate": 0.0002956421052631579, + "loss": 0.0252, "num_input_tokens_seen": 348536832, "step": 5320, - "train_runtime": 2611.6141, - "train_tokens_per_second": 133456.481 + "train_runtime": 331.855, + "train_tokens_per_second": 1050268.479 }, { "epoch": 1.1531804413673734, - "grad_norm": 0.5625, - "learning_rate": 0.00014013, - "loss": 0.0185, + "grad_norm": 0.57421875, + "learning_rate": 0.00029501052631578946, + "loss": 0.025, "num_input_tokens_seen": 349192192, "step": 5330, - "train_runtime": 2615.1129, - "train_tokens_per_second": 133528.536 + "train_runtime": 335.6748, + "train_tokens_per_second": 1040269.264 }, { "epoch": 1.1553440069234098, - "grad_norm": 0.66796875, - "learning_rate": 0.00013983, - "loss": 0.0191, + "grad_norm": 0.5859375, + "learning_rate": 0.00029437894736842105, + "loss": 0.0245, "num_input_tokens_seen": 349847552, "step": 5340, - "train_runtime": 2618.6186, - "train_tokens_per_second": 133600.039 + "train_runtime": 339.4914, + "train_tokens_per_second": 1030504.966 }, { "epoch": 1.1575075724794461, - "grad_norm": 0.8125, - "learning_rate": 0.00013953, - "loss": 0.018, + "grad_norm": 0.66015625, + "learning_rate": 0.00029374736842105264, + "loss": 0.0258, "num_input_tokens_seen": 350502912, "step": 5350, - "train_runtime": 2622.0728, - "train_tokens_per_second": 133673.981 + "train_runtime": 343.3195, + "train_tokens_per_second": 1020923.503 }, { "epoch": 1.1596711380354825, - "grad_norm": 0.95703125, - "learning_rate": 0.00013922999999999998, - "loss": 0.0188, + "grad_norm": 0.63671875, + "learning_rate": 0.00029311578947368417, + "loss": 0.0242, "num_input_tokens_seen": 351158272, "step": 5360, - "train_runtime": 2625.5518, - "train_tokens_per_second": 133746.463 + "train_runtime": 347.1488, + "train_tokens_per_second": 1011549.652 }, { "epoch": 1.1618347035915189, - "grad_norm": 0.84375, - "learning_rate": 0.00013893, - "loss": 0.0182, + "grad_norm": 0.59765625, + "learning_rate": 0.00029248421052631575, + "loss": 0.0243, "num_input_tokens_seen": 351813632, "step": 5370, - "train_runtime": 2629.0109, - "train_tokens_per_second": 133819.767 + "train_runtime": 350.956, + "train_tokens_per_second": 1002443.602 }, { "epoch": 1.1639982691475552, - "grad_norm": 0.76953125, - "learning_rate": 0.00013863, - "loss": 0.0188, + "grad_norm": 0.4921875, + "learning_rate": 0.00029185263157894734, + "loss": 0.0245, "num_input_tokens_seen": 352468992, "step": 5380, - "train_runtime": 2632.5057, - "train_tokens_per_second": 133891.064 + "train_runtime": 354.7755, + "train_tokens_per_second": 993498.788 }, { "epoch": 1.1661618347035916, - "grad_norm": 0.75, - "learning_rate": 0.00013832999999999999, - "loss": 0.018, + "grad_norm": 0.70703125, + "learning_rate": 0.00029122105263157893, + "loss": 0.0228, "num_input_tokens_seen": 353124352, "step": 5390, - "train_runtime": 2636.0126, - "train_tokens_per_second": 133961.558 + "train_runtime": 358.6124, + "train_tokens_per_second": 984696.408 }, { "epoch": 1.168325400259628, - "grad_norm": 0.88671875, - "learning_rate": 0.00013802999999999998, - "loss": 0.0163, + "grad_norm": 0.52734375, + "learning_rate": 0.0002905894736842105, + "loss": 0.0239, "num_input_tokens_seen": 353779712, "step": 5400, - "train_runtime": 2639.4856, - "train_tokens_per_second": 134033.585 + "train_runtime": 362.4359, + "train_tokens_per_second": 976116.534 }, { "epoch": 1.168325400259628, - "eval_loss": 0.012020495720207691, - "eval_runtime": 22.3255, - "eval_samples_per_second": 1.433, - "eval_steps_per_second": 0.045, + "eval_loss": 0.016983406618237495, + "eval_runtime": 1.6338, + "eval_samples_per_second": 19.586, + "eval_steps_per_second": 0.612, "num_input_tokens_seen": 353779712, "step": 5400 }, { "epoch": 1.168325400259628, - "eval_byte_accuracy": 0.9988872403560831, - "eval_chrf": 38.973454428794405, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9972527472527473, + "eval_byte_accuracy": 0.995919881305638, + "eval_chrf": 97.53769868351773, + "eval_sacrebleu": 97.30034967398244, + "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 353779712, - "perplexity": 1.0120930322291146, + "perplexity": 1.0171284445846291, "step": 5400 }, { "epoch": 1.1704889658156643, - "grad_norm": 0.609375, - "learning_rate": 0.00013773, - "loss": 0.0153, + "grad_norm": 0.5859375, + "learning_rate": 0.0002899578947368421, + "loss": 0.0215, "num_input_tokens_seen": 354435072, "step": 5410, - "train_runtime": 2665.2581, - "train_tokens_per_second": 132983.396 + "train_runtime": 367.8932, + "train_tokens_per_second": 963418.478 }, { "epoch": 1.1726525313717007, - "grad_norm": 0.5, - "learning_rate": 0.00013743, - "loss": 0.0159, + "grad_norm": 0.470703125, + "learning_rate": 0.0002893263157894737, + "loss": 0.0222, "num_input_tokens_seen": 355090432, "step": 5420, - "train_runtime": 2668.7454, - "train_tokens_per_second": 133055.191 + "train_runtime": 371.7262, + "train_tokens_per_second": 955247.186 }, { "epoch": 1.1748160969277368, - "grad_norm": 0.73828125, - "learning_rate": 0.00013712999999999998, - "loss": 0.0169, + "grad_norm": 0.5, + "learning_rate": 0.0002886947368421052, + "loss": 0.0237, "num_input_tokens_seen": 355745792, "step": 5430, - "train_runtime": 2672.2508, - "train_tokens_per_second": 133125.897 + "train_runtime": 375.5435, + "train_tokens_per_second": 947282.496 }, { "epoch": 1.1769796624837732, - "grad_norm": 0.59765625, - "learning_rate": 0.00013683, - "loss": 0.0153, + "grad_norm": 0.375, + "learning_rate": 0.0002880631578947368, + "loss": 0.0232, "num_input_tokens_seen": 356401152, "step": 5440, - "train_runtime": 2675.7241, - "train_tokens_per_second": 133198.021 + "train_runtime": 379.3575, + "train_tokens_per_second": 939486.215 }, { "epoch": 1.1791432280398095, - "grad_norm": 0.490234375, - "learning_rate": 0.00013653, - "loss": 0.016, + "grad_norm": 0.71484375, + "learning_rate": 0.0002874315789473684, + "loss": 0.0245, "num_input_tokens_seen": 357056512, "step": 5450, - "train_runtime": 2679.1701, - "train_tokens_per_second": 133271.313 + "train_runtime": 383.1768, + "train_tokens_per_second": 931832.247 }, { "epoch": 1.181306793595846, - "grad_norm": 0.60546875, - "learning_rate": 0.00013623, - "loss": 0.015, + "grad_norm": 0.4765625, + "learning_rate": 0.0002868, + "loss": 0.0223, "num_input_tokens_seen": 357711872, "step": 5460, - "train_runtime": 2682.5967, - "train_tokens_per_second": 133345.38 + "train_runtime": 386.9794, + "train_tokens_per_second": 924369.237 }, { "epoch": 1.1834703591518823, - "grad_norm": 0.80859375, - "learning_rate": 0.00013592999999999998, - "loss": 0.0154, + "grad_norm": 0.361328125, + "learning_rate": 0.00028616842105263156, + "loss": 0.0206, "num_input_tokens_seen": 358367232, "step": 5470, - "train_runtime": 2686.0566, - "train_tokens_per_second": 133417.604 + "train_runtime": 390.809, + "train_tokens_per_second": 916988.113 }, { "epoch": 1.1856339247079186, - "grad_norm": 0.765625, - "learning_rate": 0.00013563, - "loss": 0.0162, + "grad_norm": 0.50390625, + "learning_rate": 0.00028553684210526315, + "loss": 0.0233, "num_input_tokens_seen": 359022592, "step": 5480, - "train_runtime": 2689.5661, - "train_tokens_per_second": 133487.178 + "train_runtime": 394.6294, + "train_tokens_per_second": 909771.547 }, { "epoch": 1.187797490263955, - "grad_norm": 0.4765625, - "learning_rate": 0.00013533, - "loss": 0.0173, + "grad_norm": 0.48828125, + "learning_rate": 0.00028490526315789474, + "loss": 0.0232, "num_input_tokens_seen": 359673856, "step": 5490, - "train_runtime": 2693.0549, - "train_tokens_per_second": 133556.079 + "train_runtime": 398.4372, + "train_tokens_per_second": 902711.587 }, { "epoch": 1.1899610558199913, - "grad_norm": 0.65625, - "learning_rate": 0.00013502999999999999, - "loss": 0.0202, + "grad_norm": 0.5390625, + "learning_rate": 0.0002842736842105263, + "loss": 0.0268, "num_input_tokens_seen": 360329216, "step": 5500, - "train_runtime": 2696.5588, - "train_tokens_per_second": 133625.573 + "train_runtime": 402.2563, + "train_tokens_per_second": 895770.167 }, { "epoch": 1.1899610558199913, - "eval_loss": 0.012636326253414154, - "eval_runtime": 23.1265, - "eval_samples_per_second": 1.384, - "eval_steps_per_second": 0.043, + "eval_loss": 0.015771791338920593, + "eval_runtime": 1.6199, + "eval_samples_per_second": 19.754, + "eval_steps_per_second": 0.617, "num_input_tokens_seen": 360329216, "step": 5500 }, { "epoch": 1.1899610558199913, - "eval_byte_accuracy": 0.9988872403560831, - "eval_chrf": 38.973454428794405, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9972527472527473, + "eval_byte_accuracy": 0.9951780415430267, + "eval_chrf": 95.97882814418759, + "eval_sacrebleu": 95.13971801972568, + "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 360329216, - "perplexity": 1.0127165019769564, + "perplexity": 1.0158968224968823, "step": 5500 }, { "epoch": 1.1921246213760277, - "grad_norm": 0.890625, - "learning_rate": 0.00013472999999999998, - "loss": 0.0215, + "grad_norm": 0.53515625, + "learning_rate": 0.00028364210526315786, + "loss": 0.0223, "num_input_tokens_seen": 360984576, "step": 5510, - "train_runtime": 2723.3146, - "train_tokens_per_second": 132553.386 + "train_runtime": 407.7707, + "train_tokens_per_second": 885263.641 }, { "epoch": 1.194288186932064, - "grad_norm": 0.91015625, - "learning_rate": 0.00013443, - "loss": 0.0158, + "grad_norm": 0.462890625, + "learning_rate": 0.00028301052631578944, + "loss": 0.0215, "num_input_tokens_seen": 361639936, "step": 5520, - "train_runtime": 2726.7412, - "train_tokens_per_second": 132627.156 + "train_runtime": 411.5863, + "train_tokens_per_second": 878649.158 }, { "epoch": 1.1964517524881004, - "grad_norm": 0.71484375, - "learning_rate": 0.00013413, - "loss": 0.0166, + "grad_norm": 0.515625, + "learning_rate": 0.00028237894736842103, + "loss": 0.0217, "num_input_tokens_seen": 362295296, "step": 5530, - "train_runtime": 2730.1807, - "train_tokens_per_second": 132700.115 + "train_runtime": 415.4216, + "train_tokens_per_second": 872114.779 }, { "epoch": 1.1986153180441368, - "grad_norm": 0.93359375, - "learning_rate": 0.00013382999999999998, - "loss": 0.0156, + "grad_norm": 0.5078125, + "learning_rate": 0.0002817473684210526, + "loss": 0.0201, "num_input_tokens_seen": 362950656, "step": 5540, - "train_runtime": 2733.6784, - "train_tokens_per_second": 132770.062 + "train_runtime": 419.2418, + "train_tokens_per_second": 865731.155 }, { "epoch": 1.2007788836001732, - "grad_norm": 0.5078125, - "learning_rate": 0.00013352999999999998, - "loss": 0.0159, + "grad_norm": 0.5703125, + "learning_rate": 0.0002811157894736842, + "loss": 0.0218, "num_input_tokens_seen": 363606016, "step": 5550, - "train_runtime": 2737.2037, - "train_tokens_per_second": 132838.491 + "train_runtime": 423.0643, + "train_tokens_per_second": 859458.108 }, { "epoch": 1.2029424491562095, - "grad_norm": 0.70703125, - "learning_rate": 0.00013323, - "loss": 0.0171, + "grad_norm": 0.60546875, + "learning_rate": 0.0002804842105263158, + "loss": 0.0238, "num_input_tokens_seen": 364261376, "step": 5560, - "train_runtime": 2740.7217, - "train_tokens_per_second": 132907.099 + "train_runtime": 426.8852, + "train_tokens_per_second": 853300.583 }, { "epoch": 1.2051060147122459, - "grad_norm": 0.515625, - "learning_rate": 0.00013293, - "loss": 0.019, + "grad_norm": 0.5625, + "learning_rate": 0.0002798526315789474, + "loss": 0.0242, "num_input_tokens_seen": 364916736, "step": 5570, - "train_runtime": 2744.1753, - "train_tokens_per_second": 132978.656 + "train_runtime": 430.7066, + "train_tokens_per_second": 847251.408 }, { "epoch": 1.207269580268282, - "grad_norm": 0.86328125, - "learning_rate": 0.00013262999999999998, - "loss": 0.0161, + "grad_norm": 0.353515625, + "learning_rate": 0.00027922105263157896, + "loss": 0.022, "num_input_tokens_seen": 365572096, "step": 5580, - "train_runtime": 2747.6635, - "train_tokens_per_second": 133048.348 + "train_runtime": 434.5293, + "train_tokens_per_second": 841306.025 }, { "epoch": 1.2094331458243186, - "grad_norm": 0.443359375, - "learning_rate": 0.00013232999999999997, - "loss": 0.0149, + "grad_norm": 0.54296875, + "learning_rate": 0.0002785894736842105, + "loss": 0.0224, "num_input_tokens_seen": 366227456, "step": 5590, - "train_runtime": 2751.1918, - "train_tokens_per_second": 133115.93 + "train_runtime": 438.3537, + "train_tokens_per_second": 835461.001 }, { "epoch": 1.2115967113803547, - "grad_norm": 0.6796875, - "learning_rate": 0.00013203, - "loss": 0.0175, + "grad_norm": 0.58203125, + "learning_rate": 0.0002779578947368421, + "loss": 0.0225, "num_input_tokens_seen": 366878720, "step": 5600, - "train_runtime": 2754.6341, - "train_tokens_per_second": 133186.01 + "train_runtime": 442.1531, + "train_tokens_per_second": 829754.863 }, { "epoch": 1.2115967113803547, - "eval_loss": 0.012803957797586918, - "eval_runtime": 22.3704, - "eval_samples_per_second": 1.43, - "eval_steps_per_second": 0.045, + "eval_loss": 0.01541847176849842, + "eval_runtime": 1.7839, + "eval_samples_per_second": 17.938, + "eval_steps_per_second": 0.561, "num_input_tokens_seen": 366878720, "step": 5600 }, { "epoch": 1.2115967113803547, - "eval_byte_accuracy": 0.9992581602373887, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9972527472527473, + "eval_byte_accuracy": 0.9955489614243324, + "eval_chrf": 96.54446339411348, + "eval_sacrebleu": 95.8953860263406, + "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 366878720, - "perplexity": 1.0128862794376234, + "perplexity": 1.0155379496700943, "step": 5600 }, { "epoch": 1.213760276936391, - "grad_norm": 0.66015625, - "learning_rate": 0.00013173, - "loss": 0.0161, + "grad_norm": 0.50390625, + "learning_rate": 0.00027732631578947367, + "loss": 0.0221, "num_input_tokens_seen": 367534080, "step": 5610, - "train_runtime": 2780.4898, - "train_tokens_per_second": 132183.217 + "train_runtime": 447.7745, + "train_tokens_per_second": 820801.684 }, { "epoch": 1.2159238424924275, - "grad_norm": 0.6015625, - "learning_rate": 0.00013142999999999998, - "loss": 0.0184, + "grad_norm": 0.43359375, + "learning_rate": 0.00027669473684210525, + "loss": 0.0205, "num_input_tokens_seen": 368189440, "step": 5620, - "train_runtime": 2783.9369, - "train_tokens_per_second": 132254.951 + "train_runtime": 451.6045, + "train_tokens_per_second": 815291.722 }, { "epoch": 1.2180874080484638, - "grad_norm": 0.89453125, - "learning_rate": 0.00013112999999999997, - "loss": 0.0181, + "grad_norm": 0.53125, + "learning_rate": 0.00027606315789473684, + "loss": 0.0242, "num_input_tokens_seen": 368844800, "step": 5630, - "train_runtime": 2787.4017, - "train_tokens_per_second": 132325.669 + "train_runtime": 455.4277, + "train_tokens_per_second": 809886.68 }, { "epoch": 1.2202509736045002, - "grad_norm": 1.109375, - "learning_rate": 0.00013083, - "loss": 0.0172, + "grad_norm": 0.46484375, + "learning_rate": 0.0002754315789473684, + "loss": 0.0244, "num_input_tokens_seen": 369500160, "step": 5640, - "train_runtime": 2790.8607, - "train_tokens_per_second": 132396.49 + "train_runtime": 459.243, + "train_tokens_per_second": 804585.215 }, { "epoch": 1.2224145391605366, - "grad_norm": 0.7421875, - "learning_rate": 0.00013052999999999999, - "loss": 0.0161, + "grad_norm": 0.6328125, + "learning_rate": 0.0002748, + "loss": 0.0215, "num_input_tokens_seen": 370151424, "step": 5650, - "train_runtime": 2794.273, - "train_tokens_per_second": 132467.881 + "train_runtime": 463.0536, + "train_tokens_per_second": 799370.528 }, { "epoch": 1.224578104716573, - "grad_norm": 0.71875, - "learning_rate": 0.00013022999999999998, - "loss": 0.0172, + "grad_norm": 0.484375, + "learning_rate": 0.00027416842105263154, + "loss": 0.0247, "num_input_tokens_seen": 370806784, "step": 5660, - "train_runtime": 2797.7412, - "train_tokens_per_second": 132537.915 + "train_runtime": 466.8798, + "train_tokens_per_second": 794223.279 }, { "epoch": 1.2267416702726093, - "grad_norm": 0.953125, - "learning_rate": 0.00012992999999999997, - "loss": 0.0182, + "grad_norm": 0.43359375, + "learning_rate": 0.00027353684210526313, + "loss": 0.0263, "num_input_tokens_seen": 371462144, "step": 5670, - "train_runtime": 2801.2359, - "train_tokens_per_second": 132606.519 + "train_runtime": 470.6811, + "train_tokens_per_second": 789201.358 }, { "epoch": 1.2289052358286456, - "grad_norm": 0.734375, - "learning_rate": 0.00012963, - "loss": 0.0178, + "grad_norm": 0.546875, + "learning_rate": 0.0002729052631578947, + "loss": 0.0248, "num_input_tokens_seen": 372117504, "step": 5680, - "train_runtime": 2804.7373, - "train_tokens_per_second": 132674.636 + "train_runtime": 474.4964, + "train_tokens_per_second": 784236.796 }, { "epoch": 1.231068801384682, - "grad_norm": 0.74609375, - "learning_rate": 0.00012932999999999998, - "loss": 0.0171, + "grad_norm": 0.73828125, + "learning_rate": 0.0002722736842105263, + "loss": 0.0236, "num_input_tokens_seen": 372768768, "step": 5690, - "train_runtime": 2808.2237, - "train_tokens_per_second": 132741.835 + "train_runtime": 478.295, + "train_tokens_per_second": 779369.954 }, { "epoch": 1.2332323669407184, - "grad_norm": 0.93359375, - "learning_rate": 0.00012902999999999998, - "loss": 0.0171, + "grad_norm": 0.4375, + "learning_rate": 0.0002716421052631579, + "loss": 0.0213, "num_input_tokens_seen": 373424128, "step": 5700, - "train_runtime": 2811.7561, - "train_tokens_per_second": 132808.153 + "train_runtime": 482.1137, + "train_tokens_per_second": 774556.193 }, { "epoch": 1.2332323669407184, - "eval_loss": 0.012524046935141087, - "eval_runtime": 23.4476, - "eval_samples_per_second": 1.365, - "eval_steps_per_second": 0.043, + "eval_loss": 0.01559929084032774, + "eval_runtime": 1.636, + "eval_samples_per_second": 19.56, + "eval_steps_per_second": 0.611, "num_input_tokens_seen": 373424128, "step": 5700 }, { "epoch": 1.2332323669407184, - "eval_byte_accuracy": 0.9988872403560831, - "eval_chrf": 39.04011223573844, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9972527472527473, + "eval_byte_accuracy": 0.9962908011869436, + "eval_chrf": 96.9001440282776, + "eval_sacrebleu": 95.64541250374113, + "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 373424128, - "perplexity": 1.0126028012417505, + "perplexity": 1.0157215949023408, "step": 5700 }, { "epoch": 1.2353959324967547, - "grad_norm": 0.8984375, - "learning_rate": 0.00012873, - "loss": 0.017, + "grad_norm": 0.458984375, + "learning_rate": 0.0002710105263157895, + "loss": 0.0231, "num_input_tokens_seen": 374079488, "step": 5710, - "train_runtime": 2838.6865, - "train_tokens_per_second": 131779.079 + "train_runtime": 487.5803, + "train_tokens_per_second": 767216.19 }, { "epoch": 1.237559498052791, - "grad_norm": 0.8046875, - "learning_rate": 0.00012843, - "loss": 0.0183, + "grad_norm": 0.4140625, + "learning_rate": 0.00027037894736842106, + "loss": 0.0235, "num_input_tokens_seen": 374734848, "step": 5720, - "train_runtime": 2842.1648, - "train_tokens_per_second": 131848.389 + "train_runtime": 491.3926, + "train_tokens_per_second": 762597.695 }, { "epoch": 1.2397230636088272, - "grad_norm": 0.64453125, - "learning_rate": 0.00012812999999999998, - "loss": 0.0157, + "grad_norm": 0.57421875, + "learning_rate": 0.00026974736842105265, + "loss": 0.0225, "num_input_tokens_seen": 375386112, "step": 5730, - "train_runtime": 2845.6089, - "train_tokens_per_second": 131917.675 + "train_runtime": 495.1891, + "train_tokens_per_second": 758066.187 }, { "epoch": 1.2418866291648638, - "grad_norm": 0.640625, - "learning_rate": 0.00012782999999999997, - "loss": 0.0165, + "grad_norm": 0.48828125, + "learning_rate": 0.0002691157894736842, + "loss": 0.0213, "num_input_tokens_seen": 376041472, "step": 5740, - "train_runtime": 2849.1186, - "train_tokens_per_second": 131985.196 + "train_runtime": 499.0112, + "train_tokens_per_second": 753573.282 }, { "epoch": 1.2440501947209, - "grad_norm": 0.58203125, - "learning_rate": 0.00012753, - "loss": 0.0178, + "grad_norm": 0.44921875, + "learning_rate": 0.00026848421052631577, + "loss": 0.0248, "num_input_tokens_seen": 376696832, "step": 5750, - "train_runtime": 2852.6296, - "train_tokens_per_second": 132052.488 + "train_runtime": 502.8267, + "train_tokens_per_second": 749158.34 }, { "epoch": 1.2462137602769363, - "grad_norm": 0.447265625, - "learning_rate": 0.00012722999999999999, - "loss": 0.0149, + "grad_norm": 0.443359375, + "learning_rate": 0.00026785263157894735, + "loss": 0.0213, "num_input_tokens_seen": 377352192, "step": 5760, - "train_runtime": 2856.1461, - "train_tokens_per_second": 132119.358 + "train_runtime": 506.6473, + "train_tokens_per_second": 744802.561 }, { "epoch": 1.2483773258329727, - "grad_norm": 0.6015625, - "learning_rate": 0.00012692999999999998, - "loss": 0.0166, + "grad_norm": 0.48828125, + "learning_rate": 0.00026722105263157894, + "loss": 0.0253, "num_input_tokens_seen": 378007552, "step": 5770, - "train_runtime": 2859.6547, - "train_tokens_per_second": 132186.431 + "train_runtime": 510.4719, + "train_tokens_per_second": 740506.029 }, { "epoch": 1.250540891389009, - "grad_norm": 0.7890625, - "learning_rate": 0.00012662999999999997, - "loss": 0.0155, + "grad_norm": 0.484375, + "learning_rate": 0.0002665894736842105, + "loss": 0.0212, "num_input_tokens_seen": 378662912, "step": 5780, - "train_runtime": 2863.1641, - "train_tokens_per_second": 132253.306 + "train_runtime": 514.2971, + "train_tokens_per_second": 736272.655 }, { "epoch": 1.2527044569450454, - "grad_norm": 0.48828125, - "learning_rate": 0.00012633, - "loss": 0.0162, + "grad_norm": 0.46875, + "learning_rate": 0.0002659578947368421, + "loss": 0.0225, "num_input_tokens_seen": 379318272, "step": 5790, - "train_runtime": 2866.6685, - "train_tokens_per_second": 132320.245 + "train_runtime": 518.1139, + "train_tokens_per_second": 732113.734 }, { "epoch": 1.2548680225010818, - "grad_norm": 0.765625, - "learning_rate": 0.00012602999999999998, - "loss": 0.0156, + "grad_norm": 0.4296875, + "learning_rate": 0.0002653263157894737, + "loss": 0.0227, "num_input_tokens_seen": 379973632, "step": 5800, - "train_runtime": 2870.1575, - "train_tokens_per_second": 132387.728 + "train_runtime": 521.9145, + "train_tokens_per_second": 728038.134 }, { "epoch": 1.2548680225010818, - "eval_loss": 0.011442948132753372, - "eval_runtime": 23.1059, - "eval_samples_per_second": 1.385, - "eval_steps_per_second": 0.043, + "eval_loss": 0.015982676297426224, + "eval_runtime": 1.702, + "eval_samples_per_second": 18.802, + "eval_steps_per_second": 0.588, "num_input_tokens_seen": 379973632, "step": 5800 }, { "epoch": 1.2548680225010818, - "eval_byte_accuracy": 0.9992581602373887, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9972527472527473, + "eval_byte_accuracy": 0.9962908011869436, + "eval_chrf": 97.25120016329974, + "eval_sacrebleu": 95.26949349674726, + "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 379973632, - "perplexity": 1.011508669105071, + "perplexity": 1.0161110824474469, "step": 5800 }, { "epoch": 1.2570315880571181, - "grad_norm": 0.83203125, - "learning_rate": 0.00012572999999999998, - "loss": 0.0165, + "grad_norm": 0.6796875, + "learning_rate": 0.00026469473684210523, + "loss": 0.0226, "num_input_tokens_seen": 380628992, "step": 5810, - "train_runtime": 2896.7849, - "train_tokens_per_second": 131397.05 + "train_runtime": 527.4555, + "train_tokens_per_second": 721632.439 }, { "epoch": 1.2591951536131545, - "grad_norm": 0.68359375, - "learning_rate": 0.00012543, - "loss": 0.0181, + "grad_norm": 0.361328125, + "learning_rate": 0.0002640631578947368, + "loss": 0.0235, "num_input_tokens_seen": 381284352, "step": 5820, - "train_runtime": 2900.2753, - "train_tokens_per_second": 131464.884 + "train_runtime": 531.2775, + "train_tokens_per_second": 717674.632 }, { "epoch": 1.2613587191691908, - "grad_norm": 0.486328125, - "learning_rate": 0.00012513, - "loss": 0.0146, + "grad_norm": 0.66015625, + "learning_rate": 0.0002634315789473684, + "loss": 0.0213, "num_input_tokens_seen": 381939712, "step": 5830, - "train_runtime": 2903.782, - "train_tokens_per_second": 131531.813 + "train_runtime": 535.1066, + "train_tokens_per_second": 713763.812 }, { "epoch": 1.2635222847252272, - "grad_norm": 0.44921875, - "learning_rate": 0.00012483, - "loss": 0.015, + "grad_norm": 0.337890625, + "learning_rate": 0.0002628, + "loss": 0.0217, "num_input_tokens_seen": 382595072, "step": 5840, - "train_runtime": 2907.2445, - "train_tokens_per_second": 131600.582 + "train_runtime": 538.9122, + "train_tokens_per_second": 709939.579 }, { "epoch": 1.2656858502812636, - "grad_norm": 0.80078125, - "learning_rate": 0.00012453, - "loss": 0.0153, + "grad_norm": 0.458984375, + "learning_rate": 0.0002621684210526316, + "loss": 0.0209, "num_input_tokens_seen": 383250432, "step": 5850, - "train_runtime": 2910.7544, - "train_tokens_per_second": 131667.046 + "train_runtime": 542.7301, + "train_tokens_per_second": 706152.935 }, { "epoch": 1.2678494158373, - "grad_norm": 0.74609375, - "learning_rate": 0.00012423, - "loss": 0.0161, + "grad_norm": 0.671875, + "learning_rate": 0.00026153684210526316, + "loss": 0.0237, "num_input_tokens_seen": 383905792, "step": 5860, - "train_runtime": 2914.2467, - "train_tokens_per_second": 131734.141 + "train_runtime": 546.548, + "train_tokens_per_second": 702419.116 }, { "epoch": 1.2700129813933363, - "grad_norm": 1.203125, - "learning_rate": 0.00012393, - "loss": 0.0169, + "grad_norm": 0.54296875, + "learning_rate": 0.00026090526315789475, + "loss": 0.0226, "num_input_tokens_seen": 384561152, "step": 5870, - "train_runtime": 2917.7683, - "train_tokens_per_second": 131799.756 + "train_runtime": 550.3699, + "train_tokens_per_second": 698732.126 }, { "epoch": 1.2721765469493724, - "grad_norm": 0.6875, - "learning_rate": 0.00012363, - "loss": 0.0209, + "grad_norm": 0.3515625, + "learning_rate": 0.0002602736842105263, + "loss": 0.0238, "num_input_tokens_seen": 385216512, "step": 5880, - "train_runtime": 2921.2766, - "train_tokens_per_second": 131865.811 + "train_runtime": 554.185, + "train_tokens_per_second": 695104.493 }, { "epoch": 1.274340112505409, - "grad_norm": 0.80859375, - "learning_rate": 0.00012333, - "loss": 0.0157, + "grad_norm": 0.416015625, + "learning_rate": 0.00025964210526315787, + "loss": 0.0209, "num_input_tokens_seen": 385871872, "step": 5890, - "train_runtime": 2924.7459, - "train_tokens_per_second": 131933.467 + "train_runtime": 558.0069, + "train_tokens_per_second": 691518.079 }, { "epoch": 1.2765036780614452, - "grad_norm": 0.640625, - "learning_rate": 0.00012303, - "loss": 0.0179, + "grad_norm": 0.49609375, + "learning_rate": 0.00025901052631578946, + "loss": 0.0261, "num_input_tokens_seen": 386527232, "step": 5900, - "train_runtime": 2928.1915, - "train_tokens_per_second": 132002.035 + "train_runtime": 561.8248, + "train_tokens_per_second": 687985.319 }, { "epoch": 1.2765036780614452, - "eval_loss": 0.011508772149682045, - "eval_runtime": 24.8359, - "eval_samples_per_second": 1.288, - "eval_steps_per_second": 0.04, + "eval_loss": 0.016280530020594597, + "eval_runtime": 1.7769, + "eval_samples_per_second": 18.009, + "eval_steps_per_second": 0.563, "num_input_tokens_seen": 386527232, "step": 5900 }, { "epoch": 1.2765036780614452, - "eval_byte_accuracy": 0.9992581602373887, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9972527472527473, + "eval_byte_accuracy": 0.9970326409495549, + "eval_chrf": 97.0096297173289, + "eval_sacrebleu": 96.23024322319195, + "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 386527232, - "perplexity": 1.0115752528602109, + "perplexity": 1.0164137799940642, "step": 5900 }, { "epoch": 1.2786672436174817, - "grad_norm": 1.0703125, - "learning_rate": 0.00012272999999999999, - "loss": 0.0202, + "grad_norm": 0.60546875, + "learning_rate": 0.00025837894736842104, + "loss": 0.0243, "num_input_tokens_seen": 387182592, "step": 5910, - "train_runtime": 2956.4854, - "train_tokens_per_second": 130960.427 + "train_runtime": 567.4347, + "train_tokens_per_second": 682338.577 }, { "epoch": 1.2808308091735179, - "grad_norm": 0.8203125, - "learning_rate": 0.00012243, - "loss": 0.017, + "grad_norm": 0.439453125, + "learning_rate": 0.00025774736842105263, + "loss": 0.0219, "num_input_tokens_seen": 387837952, "step": 5920, - "train_runtime": 2959.9962, - "train_tokens_per_second": 131026.504 + "train_runtime": 571.2533, + "train_tokens_per_second": 678924.606 }, { "epoch": 1.2829943747295542, - "grad_norm": 0.7734375, - "learning_rate": 0.00012213, - "loss": 0.0174, + "grad_norm": 0.310546875, + "learning_rate": 0.0002571157894736842, + "loss": 0.0237, "num_input_tokens_seen": 388493312, "step": 5930, - "train_runtime": 2963.5076, - "train_tokens_per_second": 131092.395 + "train_runtime": 575.0688, + "train_tokens_per_second": 675559.711 }, { "epoch": 1.2851579402855906, - "grad_norm": 0.6875, - "learning_rate": 0.00012182999999999999, - "loss": 0.0157, + "grad_norm": 0.453125, + "learning_rate": 0.0002564842105263158, + "loss": 0.0215, "num_input_tokens_seen": 389148672, "step": 5940, - "train_runtime": 2966.9941, - "train_tokens_per_second": 131159.233 + "train_runtime": 578.8832, + "train_tokens_per_second": 672240.439 }, { "epoch": 1.287321505841627, - "grad_norm": 0.90625, - "learning_rate": 0.00012153, - "loss": 0.0168, + "grad_norm": 0.55078125, + "learning_rate": 0.00025585263157894733, + "loss": 0.0223, "num_input_tokens_seen": 389804032, "step": 5950, - "train_runtime": 2970.4985, - "train_tokens_per_second": 131225.124 + "train_runtime": 582.6975, + "train_tokens_per_second": 668964.678 }, { "epoch": 1.2894850713976633, - "grad_norm": 0.43359375, - "learning_rate": 0.00012122999999999999, - "loss": 0.016, + "grad_norm": 0.55078125, + "learning_rate": 0.0002552210526315789, + "loss": 0.0233, "num_input_tokens_seen": 390459392, "step": 5960, - "train_runtime": 2974.0044, - "train_tokens_per_second": 131290.792 + "train_runtime": 586.5197, + "train_tokens_per_second": 665722.568 }, { "epoch": 1.2916486369536997, - "grad_norm": 0.57421875, - "learning_rate": 0.00012093, - "loss": 0.0161, + "grad_norm": 0.412109375, + "learning_rate": 0.0002545894736842105, + "loss": 0.0213, "num_input_tokens_seen": 391114752, "step": 5970, - "train_runtime": 2977.5246, - "train_tokens_per_second": 131355.676 + "train_runtime": 590.3469, + "train_tokens_per_second": 662516.839 }, { "epoch": 1.293812202509736, - "grad_norm": 0.5859375, - "learning_rate": 0.00012062999999999999, - "loss": 0.016, + "grad_norm": 0.419921875, + "learning_rate": 0.0002539578947368421, + "loss": 0.023, "num_input_tokens_seen": 391770112, "step": 5980, - "train_runtime": 2981.0411, - "train_tokens_per_second": 131420.565 + "train_runtime": 594.1612, + "train_tokens_per_second": 659366.757 }, { "epoch": 1.2959757680657724, - "grad_norm": 0.60546875, - "learning_rate": 0.00012033, - "loss": 0.016, + "grad_norm": 0.5625, + "learning_rate": 0.0002533263157894737, + "loss": 0.023, "num_input_tokens_seen": 392425472, "step": 5990, - "train_runtime": 2984.483, - "train_tokens_per_second": 131488.594 + "train_runtime": 597.9746, + "train_tokens_per_second": 656257.765 }, { "epoch": 1.2981393336218088, - "grad_norm": 0.7265625, - "learning_rate": 0.00012002999999999999, - "loss": 0.0167, + "grad_norm": 0.59375, + "learning_rate": 0.00025269473684210527, + "loss": 0.0231, "num_input_tokens_seen": 393080832, "step": 6000, - "train_runtime": 2987.9968, - "train_tokens_per_second": 131553.299 + "train_runtime": 601.8047, + "train_tokens_per_second": 653170.044 }, { "epoch": 1.2981393336218088, - "eval_loss": 0.011910198256373405, - "eval_runtime": 24.2316, - "eval_samples_per_second": 1.321, - "eval_steps_per_second": 0.041, + "eval_loss": 0.015439599752426147, + "eval_runtime": 1.6914, + "eval_samples_per_second": 18.919, + "eval_steps_per_second": 0.591, "num_input_tokens_seen": 393080832, "step": 6000 }, { "epoch": 1.2981393336218088, - "eval_byte_accuracy": 0.9985163204747775, - "eval_chrf": 39.04011223573844, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9958791208791209, + "eval_byte_accuracy": 0.9955489614243324, + "eval_chrf": 96.31324033462046, + "eval_sacrebleu": 95.31161095839255, + "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 393080832, - "perplexity": 1.0119814070905935, + "perplexity": 1.0155594061662383, "step": 6000 }, { "epoch": 1.3003028991778451, - "grad_norm": 0.88671875, - "learning_rate": 0.00011973, - "loss": 0.0162, + "grad_norm": 0.498046875, + "learning_rate": 0.00025206315789473685, + "loss": 0.0227, "num_input_tokens_seen": 393736192, "step": 6010, - "train_runtime": 3015.8473, - "train_tokens_per_second": 130555.744 + "train_runtime": 607.3786, + "train_tokens_per_second": 648255.009 }, { "epoch": 1.3024664647338815, - "grad_norm": 0.94921875, - "learning_rate": 0.00011942999999999999, - "loss": 0.0162, + "grad_norm": 0.36328125, + "learning_rate": 0.0002514315789473684, + "loss": 0.0212, "num_input_tokens_seen": 394391552, "step": 6020, - "train_runtime": 3019.2929, - "train_tokens_per_second": 130623.814 + "train_runtime": 611.1994, + "train_tokens_per_second": 645274.738 }, { "epoch": 1.3046300302899179, - "grad_norm": 0.71875, - "learning_rate": 0.00011912999999999999, - "loss": 0.0162, + "grad_norm": 0.326171875, + "learning_rate": 0.00025079999999999997, + "loss": 0.0225, "num_input_tokens_seen": 395046912, "step": 6030, - "train_runtime": 3022.7401, - "train_tokens_per_second": 130691.658 + "train_runtime": 615.0088, + "train_tokens_per_second": 642343.486 }, { "epoch": 1.3067935958459542, - "grad_norm": 1.0, - "learning_rate": 0.00011882999999999999, - "loss": 0.0195, + "grad_norm": 0.478515625, + "learning_rate": 0.00025016842105263156, + "loss": 0.0255, "num_input_tokens_seen": 395702272, "step": 6040, - "train_runtime": 3026.2704, - "train_tokens_per_second": 130755.757 + "train_runtime": 618.8251, + "train_tokens_per_second": 639441.233 }, { "epoch": 1.3089571614019904, - "grad_norm": 0.71875, - "learning_rate": 0.00011852999999999999, - "loss": 0.0166, + "grad_norm": 0.42578125, + "learning_rate": 0.00024953684210526314, + "loss": 0.0221, "num_input_tokens_seen": 396357632, "step": 6050, - "train_runtime": 3029.7839, - "train_tokens_per_second": 130820.43 + "train_runtime": 622.6396, + "train_tokens_per_second": 636576.348 }, { "epoch": 1.311120726958027, - "grad_norm": 0.88671875, - "learning_rate": 0.00011823, - "loss": 0.0161, + "grad_norm": 0.416015625, + "learning_rate": 0.00024890526315789473, + "loss": 0.0212, "num_input_tokens_seen": 397012992, "step": 6060, - "train_runtime": 3033.2605, - "train_tokens_per_second": 130886.546 + "train_runtime": 626.4648, + "train_tokens_per_second": 633735.535 }, { "epoch": 1.313284292514063, - "grad_norm": 0.53125, - "learning_rate": 0.00011792999999999999, - "loss": 0.0176, + "grad_norm": 0.50390625, + "learning_rate": 0.0002482736842105263, + "loss": 0.0236, "num_input_tokens_seen": 397668352, "step": 6070, - "train_runtime": 3036.6935, - "train_tokens_per_second": 130954.39 + "train_runtime": 630.2994, + "train_tokens_per_second": 630919.772 }, { "epoch": 1.3154478580700995, - "grad_norm": 0.58984375, - "learning_rate": 0.00011763, - "loss": 0.0151, + "grad_norm": 0.6875, + "learning_rate": 0.0002476421052631579, + "loss": 0.0202, "num_input_tokens_seen": 398323712, "step": 6080, - "train_runtime": 3040.1267, - "train_tokens_per_second": 131022.075 + "train_runtime": 634.1117, + "train_tokens_per_second": 628160.174 }, { "epoch": 1.3176114236261358, - "grad_norm": 0.70703125, - "learning_rate": 0.00011732999999999999, - "loss": 0.0169, + "grad_norm": 0.49609375, + "learning_rate": 0.00024701052631578943, + "loss": 0.0232, "num_input_tokens_seen": 398974976, "step": 6090, - "train_runtime": 3043.5607, - "train_tokens_per_second": 131088.227 + "train_runtime": 637.9159, + "train_tokens_per_second": 625435.07 }, { "epoch": 1.3197749891821722, - "grad_norm": 0.66015625, - "learning_rate": 0.00011703, - "loss": 0.0166, + "grad_norm": 0.6953125, + "learning_rate": 0.000246378947368421, + "loss": 0.0223, "num_input_tokens_seen": 399626240, "step": 6100, - "train_runtime": 3046.9968, - "train_tokens_per_second": 131154.138 + "train_runtime": 641.7163, + "train_tokens_per_second": 622745.948 }, { "epoch": 1.3197749891821722, - "eval_loss": 0.011824293993413448, - "eval_runtime": 24.8363, - "eval_samples_per_second": 1.288, - "eval_steps_per_second": 0.04, + "eval_loss": 0.014141172170639038, + "eval_runtime": 1.7688, + "eval_samples_per_second": 18.092, + "eval_steps_per_second": 0.565, "num_input_tokens_seen": 399626240, "step": 6100 }, { "epoch": 1.3197749891821722, - "eval_byte_accuracy": 0.9992581602373887, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9972527472527473, + "eval_byte_accuracy": 0.9977744807121661, + "eval_chrf": 98.58235971529302, + "eval_sacrebleu": 97.87900330273517, + "eval_word_accuracy": 0.9945054945054945, "num_input_tokens_seen": 399626240, - "perplexity": 1.011894477307561, + "perplexity": 1.014241631524937, "step": 6100 }, { "epoch": 1.3219385547382085, - "grad_norm": 0.703125, - "learning_rate": 0.00011672999999999999, - "loss": 0.0149, + "grad_norm": 0.515625, + "learning_rate": 0.0002457473684210526, + "loss": 0.021, "num_input_tokens_seen": 400281600, "step": 6110, - "train_runtime": 3075.3675, - "train_tokens_per_second": 130157.323 + "train_runtime": 647.3233, + "train_tokens_per_second": 618364.309 }, { "epoch": 1.324102120294245, - "grad_norm": 0.53515625, - "learning_rate": 0.00011643, - "loss": 0.0145, + "grad_norm": 0.546875, + "learning_rate": 0.0002451157894736842, + "loss": 0.0203, "num_input_tokens_seen": 400936960, "step": 6120, - "train_runtime": 3078.8412, - "train_tokens_per_second": 130223.334 + "train_runtime": 651.1178, + "train_tokens_per_second": 615767.119 }, { "epoch": 1.3262656858502813, - "grad_norm": 0.71875, - "learning_rate": 0.00011612999999999999, - "loss": 0.0166, + "grad_norm": 0.42578125, + "learning_rate": 0.0002444842105263158, + "loss": 0.0224, "num_input_tokens_seen": 401592320, "step": 6130, - "train_runtime": 3082.2854, - "train_tokens_per_second": 130290.438 + "train_runtime": 654.9362, + "train_tokens_per_second": 613177.74 }, { "epoch": 1.3284292514063176, - "grad_norm": 0.474609375, - "learning_rate": 0.00011583, - "loss": 0.015, + "grad_norm": 0.51171875, + "learning_rate": 0.00024385263157894737, + "loss": 0.0204, "num_input_tokens_seen": 402247680, "step": 6140, - "train_runtime": 3085.7343, - "train_tokens_per_second": 130357.197 + "train_runtime": 658.7591, + "train_tokens_per_second": 610614.248 }, { "epoch": 1.330592816962354, - "grad_norm": 0.3984375, - "learning_rate": 0.00011552999999999999, - "loss": 0.0149, + "grad_norm": 0.6171875, + "learning_rate": 0.00024322105263157893, + "loss": 0.0209, "num_input_tokens_seen": 402903040, "step": 6150, - "train_runtime": 3089.2917, - "train_tokens_per_second": 130419.226 + "train_runtime": 662.5781, + "train_tokens_per_second": 608083.892 }, { "epoch": 1.3327563825183903, - "grad_norm": 0.59765625, - "learning_rate": 0.00011522999999999999, - "loss": 0.016, + "grad_norm": 0.58984375, + "learning_rate": 0.00024258947368421049, + "loss": 0.0214, "num_input_tokens_seen": 403558400, "step": 6160, - "train_runtime": 3092.738, - "train_tokens_per_second": 130485.804 + "train_runtime": 666.4075, + "train_tokens_per_second": 605573.016 }, { "epoch": 1.3349199480744267, - "grad_norm": 0.5625, - "learning_rate": 0.00011492999999999999, - "loss": 0.0165, + "grad_norm": 0.6015625, + "learning_rate": 0.00024195789473684207, + "loss": 0.0226, "num_input_tokens_seen": 404213760, "step": 6170, - "train_runtime": 3096.198, - "train_tokens_per_second": 130551.651 + "train_runtime": 670.2283, + "train_tokens_per_second": 603098.629 }, { "epoch": 1.337083513630463, - "grad_norm": 0.7578125, - "learning_rate": 0.00011462999999999999, - "loss": 0.016, + "grad_norm": 0.65234375, + "learning_rate": 0.00024132631578947366, + "loss": 0.0214, "num_input_tokens_seen": 404869120, "step": 6180, - "train_runtime": 3099.6759, - "train_tokens_per_second": 130616.598 + "train_runtime": 674.0552, + "train_tokens_per_second": 600646.812 }, { "epoch": 1.3392470791864994, - "grad_norm": 0.69921875, - "learning_rate": 0.00011432999999999998, - "loss": 0.0163, + "grad_norm": 0.44140625, + "learning_rate": 0.00024069473684210524, + "loss": 0.0223, "num_input_tokens_seen": 405524480, "step": 6190, - "train_runtime": 3103.1748, - "train_tokens_per_second": 130680.513 + "train_runtime": 677.8865, + "train_tokens_per_second": 598218.858 }, { "epoch": 1.3414106447425356, - "grad_norm": 0.51953125, - "learning_rate": 0.00011402999999999999, - "loss": 0.0141, + "grad_norm": 0.345703125, + "learning_rate": 0.00024006315789473683, + "loss": 0.0199, "num_input_tokens_seen": 406179840, "step": 6200, - "train_runtime": 3106.6486, - "train_tokens_per_second": 130745.342 + "train_runtime": 681.6967, + "train_tokens_per_second": 595836.593 }, { "epoch": 1.3414106447425356, - "eval_loss": 0.011933584697544575, - "eval_runtime": 24.5823, - "eval_samples_per_second": 1.302, - "eval_steps_per_second": 0.041, + "eval_loss": 0.015185019001364708, + "eval_runtime": 1.6993, + "eval_samples_per_second": 18.831, + "eval_steps_per_second": 0.588, "num_input_tokens_seen": 406179840, "step": 6200 }, { "epoch": 1.3414106447425356, - "eval_byte_accuracy": 0.9992581602373887, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9972527472527473, + "eval_byte_accuracy": 0.9962908011869436, + "eval_chrf": 97.8403707869436, + "eval_sacrebleu": 96.97610863887425, + "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 406179840, - "perplexity": 1.012005074010978, + "perplexity": 1.015300897196969, "step": 6200 }, { "epoch": 1.3435742102985722, - "grad_norm": 0.72265625, - "learning_rate": 0.00011372999999999998, - "loss": 0.0158, + "grad_norm": 0.57421875, + "learning_rate": 0.00023943157894736842, + "loss": 0.0232, "num_input_tokens_seen": 406835200, "step": 6210, - "train_runtime": 3134.6851, - "train_tokens_per_second": 129785.031 + "train_runtime": 687.2335, + "train_tokens_per_second": 591989.788 }, { "epoch": 1.3457377758546083, - "grad_norm": 0.84375, - "learning_rate": 0.00011342999999999999, - "loss": 0.0175, + "grad_norm": 0.609375, + "learning_rate": 0.0002388, + "loss": 0.0208, "num_input_tokens_seen": 407490560, "step": 6220, - "train_runtime": 3138.1398, - "train_tokens_per_second": 129850.989 + "train_runtime": 691.056, + "train_tokens_per_second": 589663.599 }, { "epoch": 1.3479013414106447, - "grad_norm": 0.5625, - "learning_rate": 0.00011312999999999998, - "loss": 0.0147, + "grad_norm": 0.484375, + "learning_rate": 0.00023816842105263154, + "loss": 0.0204, "num_input_tokens_seen": 408145920, "step": 6230, - "train_runtime": 3141.6605, - "train_tokens_per_second": 129914.076 + "train_runtime": 694.8687, + "train_tokens_per_second": 587371.244 }, { "epoch": 1.350064906966681, - "grad_norm": 0.65625, - "learning_rate": 0.00011282999999999999, - "loss": 0.0143, + "grad_norm": 0.357421875, + "learning_rate": 0.00023753684210526312, + "loss": 0.0195, "num_input_tokens_seen": 408801280, "step": 6240, - "train_runtime": 3145.1694, - "train_tokens_per_second": 129977.509 + "train_runtime": 698.6775, + "train_tokens_per_second": 585107.283 }, { "epoch": 1.3522284725227174, - "grad_norm": 0.68359375, - "learning_rate": 0.00011252999999999998, - "loss": 0.0158, + "grad_norm": 0.60546875, + "learning_rate": 0.0002369052631578947, + "loss": 0.0223, "num_input_tokens_seen": 409456640, "step": 6250, - "train_runtime": 3148.6247, - "train_tokens_per_second": 130043.012 + "train_runtime": 702.4988, + "train_tokens_per_second": 582857.396 }, { "epoch": 1.3543920380787537, - "grad_norm": 0.87109375, - "learning_rate": 0.00011222999999999999, - "loss": 0.0159, + "grad_norm": 0.59375, + "learning_rate": 0.0002362736842105263, + "loss": 0.0215, "num_input_tokens_seen": 410112000, "step": 6260, - "train_runtime": 3152.1194, - "train_tokens_per_second": 130106.746 + "train_runtime": 706.3213, + "train_tokens_per_second": 580630.921 }, { "epoch": 1.35655560363479, - "grad_norm": 1.125, - "learning_rate": 0.00011192999999999998, - "loss": 0.0162, + "grad_norm": 0.51171875, + "learning_rate": 0.00023564210526315788, + "loss": 0.0216, "num_input_tokens_seen": 410767360, "step": 6270, - "train_runtime": 3155.5948, - "train_tokens_per_second": 130171.138 + "train_runtime": 710.1478, + "train_tokens_per_second": 578425.136 }, { "epoch": 1.3587191691908265, - "grad_norm": 0.333984375, - "learning_rate": 0.00011162999999999999, - "loss": 0.0153, + "grad_norm": 0.359375, + "learning_rate": 0.00023501052631578947, + "loss": 0.02, "num_input_tokens_seen": 411422720, "step": 6280, - "train_runtime": 3159.0366, - "train_tokens_per_second": 130236.769 + "train_runtime": 713.9708, + "train_tokens_per_second": 576245.865 }, { "epoch": 1.3608827347468628, - "grad_norm": 0.58984375, - "learning_rate": 0.00011132999999999998, - "loss": 0.0171, + "grad_norm": 0.578125, + "learning_rate": 0.00023437894736842105, + "loss": 0.0226, "num_input_tokens_seen": 412078080, "step": 6290, - "train_runtime": 3162.4885, - "train_tokens_per_second": 130301.844 + "train_runtime": 717.7901, + "train_tokens_per_second": 574092.768 }, { "epoch": 1.3630463003028992, - "grad_norm": 0.59375, - "learning_rate": 0.00011102999999999999, - "loss": 0.0153, + "grad_norm": 0.357421875, + "learning_rate": 0.00023374736842105264, + "loss": 0.0209, "num_input_tokens_seen": 412733440, "step": 6300, - "train_runtime": 3165.9675, - "train_tokens_per_second": 130365.658 + "train_runtime": 721.6105, + "train_tokens_per_second": 571961.491 }, { "epoch": 1.3630463003028992, - "eval_loss": 0.011224345304071903, - "eval_runtime": 24.7005, - "eval_samples_per_second": 1.296, - "eval_steps_per_second": 0.04, + "eval_loss": 0.01683359406888485, + "eval_runtime": 1.7461, + "eval_samples_per_second": 18.326, + "eval_steps_per_second": 0.573, "num_input_tokens_seen": 412733440, "step": 6300 }, { "epoch": 1.3630463003028992, - "eval_byte_accuracy": 0.9988872403560831, - "eval_chrf": 38.77284168441046, - "eval_sacrebleu": 19.337263244966266, - "eval_word_accuracy": 0.9958791208791209, + "eval_byte_accuracy": 0.9970326409495549, + "eval_chrf": 97.50495693707096, + "eval_sacrebleu": 96.96578008549908, + "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 412733440, - "perplexity": 1.0112875746155898, + "perplexity": 1.0169760773928707, "step": 6300 }, { "epoch": 1.3652098658589356, - "grad_norm": 0.8828125, - "learning_rate": 0.00011072999999999999, - "loss": 0.0169, + "grad_norm": 0.41015625, + "learning_rate": 0.00023311578947368417, + "loss": 0.0212, "num_input_tokens_seen": 413388800, "step": 6310, - "train_runtime": 3194.121, - "train_tokens_per_second": 129421.771 + "train_runtime": 727.2029, + "train_tokens_per_second": 568464.163 }, { "epoch": 1.367373431414972, - "grad_norm": 0.578125, - "learning_rate": 0.00011042999999999998, - "loss": 0.0147, + "grad_norm": 0.380859375, + "learning_rate": 0.00023248421052631576, + "loss": 0.0194, "num_input_tokens_seen": 414044160, "step": 6320, - "train_runtime": 3197.5805, - "train_tokens_per_second": 129486.705 + "train_runtime": 731.0202, + "train_tokens_per_second": 566392.203 }, { "epoch": 1.3695369969710083, - "grad_norm": 0.498046875, - "learning_rate": 0.00011012999999999999, - "loss": 0.018, + "grad_norm": 0.5546875, + "learning_rate": 0.00023185263157894735, + "loss": 0.0223, "num_input_tokens_seen": 414699520, "step": 6330, - "train_runtime": 3201.0978, - "train_tokens_per_second": 129549.156 + "train_runtime": 734.8416, + "train_tokens_per_second": 564338.689 }, { "epoch": 1.3717005625270446, - "grad_norm": 0.7109375, - "learning_rate": 0.00010982999999999998, - "loss": 0.0147, + "grad_norm": 0.578125, + "learning_rate": 0.00023122105263157893, + "loss": 0.0198, "num_input_tokens_seen": 415354880, "step": 6340, - "train_runtime": 3204.5824, - "train_tokens_per_second": 129612.793 + "train_runtime": 738.6717, + "train_tokens_per_second": 562299.732 }, { "epoch": 1.3738641280830808, - "grad_norm": 0.609375, - "learning_rate": 0.00010952999999999999, - "loss": 0.0159, + "grad_norm": 0.447265625, + "learning_rate": 0.00023058947368421052, + "loss": 0.0219, "num_input_tokens_seen": 416010240, "step": 6350, - "train_runtime": 3208.0947, - "train_tokens_per_second": 129675.174 + "train_runtime": 742.4996, + "train_tokens_per_second": 560283.436 }, { "epoch": 1.3760276936391174, - "grad_norm": 0.6953125, - "learning_rate": 0.00010922999999999998, - "loss": 0.0159, + "grad_norm": 0.5, + "learning_rate": 0.0002299578947368421, + "loss": 0.0206, "num_input_tokens_seen": 416665600, "step": 6360, - "train_runtime": 3211.6079, - "train_tokens_per_second": 129737.383 + "train_runtime": 746.3237, + "train_tokens_per_second": 558290.723 }, { "epoch": 1.3781912591951535, - "grad_norm": 0.60546875, - "learning_rate": 0.00010892999999999999, - "loss": 0.0154, + "grad_norm": 0.53515625, + "learning_rate": 0.0002293263157894737, + "loss": 0.0223, "num_input_tokens_seen": 417320960, "step": 6370, - "train_runtime": 3215.0799, - "train_tokens_per_second": 129801.117 + "train_runtime": 750.1461, + "train_tokens_per_second": 556319.563 }, { "epoch": 1.38035482475119, - "grad_norm": 0.640625, - "learning_rate": 0.00010862999999999998, - "loss": 0.0155, + "grad_norm": 0.4453125, + "learning_rate": 0.00022869473684210522, + "loss": 0.0214, "num_input_tokens_seen": 417976320, "step": 6380, - "train_runtime": 3218.5892, - "train_tokens_per_second": 129863.208 + "train_runtime": 753.968, + "train_tokens_per_second": 554368.783 }, { "epoch": 1.3825183903072262, - "grad_norm": 0.53515625, - "learning_rate": 0.00010832999999999999, - "loss": 0.014, + "grad_norm": 0.4296875, + "learning_rate": 0.0002280631578947368, + "loss": 0.0188, "num_input_tokens_seen": 418631680, "step": 6390, - "train_runtime": 3222.1004, - "train_tokens_per_second": 129925.089 + "train_runtime": 757.7935, + "train_tokens_per_second": 552435.046 }, { "epoch": 1.3846819558632626, - "grad_norm": 0.609375, - "learning_rate": 0.00010802999999999998, - "loss": 0.0146, + "grad_norm": 0.34765625, + "learning_rate": 0.0002274315789473684, + "loss": 0.0197, "num_input_tokens_seen": 419287040, "step": 6400, - "train_runtime": 3225.6089, - "train_tokens_per_second": 129986.944 + "train_runtime": 761.6173, + "train_tokens_per_second": 550521.964 }, { "epoch": 1.3846819558632626, - "eval_loss": 0.010300984606146812, - "eval_runtime": 23.78, - "eval_samples_per_second": 1.346, - "eval_steps_per_second": 0.042, + "eval_loss": 0.015446390956640244, + "eval_runtime": 1.7867, + "eval_samples_per_second": 17.91, + "eval_steps_per_second": 0.56, "num_input_tokens_seen": 419287040, "step": 6400 }, { "epoch": 1.3846819558632626, - "eval_byte_accuracy": 0.9992581602373887, - "eval_chrf": 39.04011223573844, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.995919881305638, + "eval_chrf": 97.04447388777369, + "eval_sacrebleu": 94.67835385995274, + "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 419287040, - "perplexity": 1.0103542223915845, + "perplexity": 1.0155663030609763, "step": 6400 }, { "epoch": 1.386845521419299, - "grad_norm": 0.341796875, - "learning_rate": 0.00010772999999999999, - "loss": 0.0156, + "grad_norm": 0.66015625, + "learning_rate": 0.00022679999999999998, + "loss": 0.022, "num_input_tokens_seen": 419942400, "step": 6410, - "train_runtime": 3252.8501, - "train_tokens_per_second": 129099.833 + "train_runtime": 767.2444, + "train_tokens_per_second": 547338.49 }, { "epoch": 1.3890090869753353, - "grad_norm": 0.546875, - "learning_rate": 0.00010742999999999998, - "loss": 0.0143, + "grad_norm": 0.470703125, + "learning_rate": 0.00022616842105263157, + "loss": 0.0199, "num_input_tokens_seen": 420597760, "step": 6420, - "train_runtime": 3256.3385, - "train_tokens_per_second": 129162.789 + "train_runtime": 771.0717, + "train_tokens_per_second": 545471.684 }, { "epoch": 1.3911726525313717, - "grad_norm": 0.953125, - "learning_rate": 0.00010712999999999999, - "loss": 0.0186, + "grad_norm": 0.455078125, + "learning_rate": 0.00022553684210526316, + "loss": 0.0213, "num_input_tokens_seen": 421253120, "step": 6430, - "train_runtime": 3259.7833, - "train_tokens_per_second": 129227.337 + "train_runtime": 774.9024, + "train_tokens_per_second": 543620.847 }, { "epoch": 1.393336218087408, - "grad_norm": 0.65234375, - "learning_rate": 0.00010683, - "loss": 0.0144, + "grad_norm": 0.51171875, + "learning_rate": 0.00022490526315789474, + "loss": 0.0205, "num_input_tokens_seen": 421908480, "step": 6440, - "train_runtime": 3263.2017, - "train_tokens_per_second": 129292.799 + "train_runtime": 778.7188, + "train_tokens_per_second": 541798.28 }, { "epoch": 1.3954997836434444, - "grad_norm": 0.734375, - "learning_rate": 0.00010653, - "loss": 0.0157, + "grad_norm": 0.5, + "learning_rate": 0.00022427368421052627, + "loss": 0.0223, "num_input_tokens_seen": 422563840, "step": 6450, - "train_runtime": 3266.6742, - "train_tokens_per_second": 129355.98 + "train_runtime": 782.5419, + "train_tokens_per_second": 539988.798 }, { "epoch": 1.3976633491994808, - "grad_norm": 0.78125, - "learning_rate": 0.00010623, - "loss": 0.0158, + "grad_norm": 0.62109375, + "learning_rate": 0.00022364210526315786, + "loss": 0.0205, "num_input_tokens_seen": 423219200, "step": 6460, - "train_runtime": 3270.1411, - "train_tokens_per_second": 129419.246 + "train_runtime": 786.3586, + "train_tokens_per_second": 538201.273 }, { "epoch": 1.3998269147555171, - "grad_norm": 0.44921875, - "learning_rate": 0.00010593, - "loss": 0.0163, + "grad_norm": 0.458984375, + "learning_rate": 0.00022301052631578945, + "loss": 0.02, "num_input_tokens_seen": 423874560, "step": 6470, - "train_runtime": 3273.6423, - "train_tokens_per_second": 129481.025 + "train_runtime": 790.1794, + "train_tokens_per_second": 536428.278 }, { "epoch": 1.4019904803115535, - "grad_norm": 0.69140625, - "learning_rate": 0.00010563, - "loss": 0.0146, + "grad_norm": 0.359375, + "learning_rate": 0.00022237894736842103, + "loss": 0.0193, "num_input_tokens_seen": 424529920, "step": 6480, - "train_runtime": 3277.1533, - "train_tokens_per_second": 129542.285 + "train_runtime": 794.0007, + "train_tokens_per_second": 534671.99 }, { "epoch": 1.4041540458675899, - "grad_norm": 0.82421875, - "learning_rate": 0.00010533, - "loss": 0.015, + "grad_norm": 0.50390625, + "learning_rate": 0.00022174736842105262, + "loss": 0.0225, "num_input_tokens_seen": 425185280, "step": 6490, - "train_runtime": 3280.6687, - "train_tokens_per_second": 129603.237 + "train_runtime": 797.8185, + "train_tokens_per_second": 532934.816 }, { "epoch": 1.4063176114236262, - "grad_norm": 0.609375, - "learning_rate": 0.00010503, - "loss": 0.0154, + "grad_norm": 0.53125, + "learning_rate": 0.0002211157894736842, + "loss": 0.0226, "num_input_tokens_seen": 425840640, "step": 6500, - "train_runtime": 3284.1504, - "train_tokens_per_second": 129665.388 + "train_runtime": 801.6401, + "train_tokens_per_second": 531211.776 }, { "epoch": 1.4063176114236262, - "eval_loss": 0.011522230692207813, - "eval_runtime": 24.5083, - "eval_samples_per_second": 1.306, - "eval_steps_per_second": 0.041, + "eval_loss": 0.014038060791790485, + "eval_runtime": 1.82, + "eval_samples_per_second": 17.582, + "eval_steps_per_second": 0.549, "num_input_tokens_seen": 425840640, "step": 6500 }, { "epoch": 1.4063176114236262, - "eval_byte_accuracy": 0.9988872403560831, - "eval_chrf": 38.93726014211083, - "eval_sacrebleu": 19.56600861766589, - "eval_word_accuracy": 0.9958791208791209, + "eval_byte_accuracy": 0.9970326409495549, + "eval_chrf": 98.33382876029357, + "eval_sacrebleu": 97.89877731533007, + "eval_word_accuracy": 0.9931318681318682, "num_input_tokens_seen": 425840640, - "perplexity": 1.0115888672803843, + "perplexity": 1.014137057063326, "step": 6500 }, { "epoch": 1.4084811769796626, - "grad_norm": 0.7890625, - "learning_rate": 0.00010473, - "loss": 0.0155, + "grad_norm": 0.5625, + "learning_rate": 0.0002204842105263158, + "loss": 0.0206, "num_input_tokens_seen": 426496000, "step": 6510, - "train_runtime": 3312.2675, - "train_tokens_per_second": 128762.545 + "train_runtime": 807.3224, + "train_tokens_per_second": 528284.6 }, { "epoch": 1.4106447425356987, - "grad_norm": 0.69921875, - "learning_rate": 0.00010443, - "loss": 0.0154, + "grad_norm": 0.52734375, + "learning_rate": 0.00021985263157894733, + "loss": 0.0209, "num_input_tokens_seen": 427151360, "step": 6520, - "train_runtime": 3315.7861, - "train_tokens_per_second": 128823.556 + "train_runtime": 811.1474, + "train_tokens_per_second": 526601.42 }, { "epoch": 1.4128083080917353, - "grad_norm": 0.734375, - "learning_rate": 0.00010413, - "loss": 0.0145, + "grad_norm": 0.55078125, + "learning_rate": 0.0002192210526315789, + "loss": 0.0204, "num_input_tokens_seen": 427806720, "step": 6530, - "train_runtime": 3319.2581, - "train_tokens_per_second": 128886.247 + "train_runtime": 814.968, + "train_tokens_per_second": 524936.804 }, { "epoch": 1.4149718736477714, - "grad_norm": 0.53125, - "learning_rate": 0.00010383, - "loss": 0.0147, + "grad_norm": 0.4921875, + "learning_rate": 0.0002185894736842105, + "loss": 0.0182, "num_input_tokens_seen": 428462080, "step": 6540, - "train_runtime": 3322.7224, - "train_tokens_per_second": 128949.105 + "train_runtime": 818.7958, + "train_tokens_per_second": 523283.211 }, { "epoch": 1.4171354392038078, - "grad_norm": 0.48046875, - "learning_rate": 0.00010352999999999999, - "loss": 0.0142, + "grad_norm": 0.380859375, + "learning_rate": 0.00021795789473684208, + "loss": 0.0181, "num_input_tokens_seen": 429117440, "step": 6550, - "train_runtime": 3326.2161, - "train_tokens_per_second": 129010.691 + "train_runtime": 822.6201, + "train_tokens_per_second": 521647.178 }, { "epoch": 1.4192990047598442, - "grad_norm": 0.625, - "learning_rate": 0.00010323, - "loss": 0.0148, + "grad_norm": 0.55859375, + "learning_rate": 0.00021732631578947367, + "loss": 0.019, "num_input_tokens_seen": 429772800, "step": 6560, - "train_runtime": 3329.6826, - "train_tokens_per_second": 129073.202 + "train_runtime": 826.4515, + "train_tokens_per_second": 520021.829 }, { "epoch": 1.4214625703158805, - "grad_norm": 0.66015625, - "learning_rate": 0.00010292999999999999, - "loss": 0.0155, + "grad_norm": 0.490234375, + "learning_rate": 0.00021669473684210526, + "loss": 0.021, "num_input_tokens_seen": 430428160, "step": 6570, - "train_runtime": 3333.1394, - "train_tokens_per_second": 129135.963 + "train_runtime": 830.2566, + "train_tokens_per_second": 518427.851 }, { "epoch": 1.4236261358719169, - "grad_norm": 0.53515625, - "learning_rate": 0.00010263, - "loss": 0.0155, + "grad_norm": 0.56640625, + "learning_rate": 0.00021606315789473684, + "loss": 0.0201, "num_input_tokens_seen": 431083520, "step": 6580, - "train_runtime": 3336.619, - "train_tokens_per_second": 129197.705 + "train_runtime": 834.0781, + "train_tokens_per_second": 516838.294 }, { "epoch": 1.4257897014279532, - "grad_norm": 0.5078125, - "learning_rate": 0.00010232999999999999, - "loss": 0.0158, + "grad_norm": 0.5390625, + "learning_rate": 0.00021543157894736838, + "loss": 0.0207, "num_input_tokens_seen": 431738880, "step": 6590, - "train_runtime": 3340.0708, - "train_tokens_per_second": 129260.399 + "train_runtime": 837.9037, + "train_tokens_per_second": 515260.748 }, { "epoch": 1.4279532669839896, - "grad_norm": 0.51953125, - "learning_rate": 0.00010203, - "loss": 0.0154, + "grad_norm": 0.5234375, + "learning_rate": 0.00021479999999999996, + "loss": 0.02, "num_input_tokens_seen": 432394240, "step": 6600, - "train_runtime": 3343.516, - "train_tokens_per_second": 129323.215 + "train_runtime": 841.7307, + "train_tokens_per_second": 513696.636 }, { "epoch": 1.4279532669839896, - "eval_loss": 0.010480456054210663, - "eval_runtime": 24.2793, - "eval_samples_per_second": 1.318, - "eval_steps_per_second": 0.041, + "eval_loss": 0.014414757490158081, + "eval_runtime": 2.1559, + "eval_samples_per_second": 14.843, + "eval_steps_per_second": 0.464, "num_input_tokens_seen": 432394240, "step": 6600 }, { "epoch": 1.4279532669839896, - "eval_byte_accuracy": 0.9992581602373887, - "eval_chrf": 39.64404437521967, - "eval_sacrebleu": 20.21065548382088, - "eval_word_accuracy": 0.9972527472527473, + "eval_byte_accuracy": 0.9955489614243324, + "eval_chrf": 96.67764573411229, + "eval_sacrebleu": 95.86027221972736, + "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 432394240, - "perplexity": 1.010535568399663, + "perplexity": 1.014519151106678, "step": 6600 }, { "epoch": 1.430116832540026, - "grad_norm": 0.416015625, - "learning_rate": 0.00010172999999999999, - "loss": 0.0147, + "grad_norm": 0.3984375, + "learning_rate": 0.00021416842105263155, + "loss": 0.0199, "num_input_tokens_seen": 433049600, "step": 6610, - "train_runtime": 3371.2632, - "train_tokens_per_second": 128453.216 + "train_runtime": 847.7278, + "train_tokens_per_second": 510835.646 }, { "epoch": 1.4322803980960623, - "grad_norm": 0.439453125, - "learning_rate": 0.00010143, - "loss": 0.0139, + "grad_norm": 0.404296875, + "learning_rate": 0.00021353684210526314, + "loss": 0.0194, "num_input_tokens_seen": 433704960, "step": 6620, - "train_runtime": 3374.7177, - "train_tokens_per_second": 128515.923 + "train_runtime": 851.5621, + "train_tokens_per_second": 509305.165 }, { "epoch": 1.4344439636520987, - "grad_norm": 0.7421875, - "learning_rate": 0.00010112999999999999, - "loss": 0.0201, + "grad_norm": 0.60546875, + "learning_rate": 0.00021290526315789472, + "loss": 0.0258, "num_input_tokens_seen": 434360320, "step": 6630, - "train_runtime": 3378.2167, - "train_tokens_per_second": 128576.807 + "train_runtime": 855.3921, + "train_tokens_per_second": 507790.911 }, { "epoch": 1.436607529208135, - "grad_norm": 0.46875, - "learning_rate": 0.00010083, - "loss": 0.015, + "grad_norm": 0.41796875, + "learning_rate": 0.0002122736842105263, + "loss": 0.0196, "num_input_tokens_seen": 435015680, "step": 6640, - "train_runtime": 3381.7186, - "train_tokens_per_second": 128637.455 + "train_runtime": 859.2162, + "train_tokens_per_second": 506293.62 }, { "epoch": 1.4387710947641714, - "grad_norm": 0.8203125, - "learning_rate": 0.00010052999999999999, - "loss": 0.0149, + "grad_norm": 0.32421875, + "learning_rate": 0.0002116421052631579, + "loss": 0.0196, "num_input_tokens_seen": 435671040, "step": 6650, - "train_runtime": 3385.2081, - "train_tokens_per_second": 128698.452 + "train_runtime": 863.0321, + "train_tokens_per_second": 504814.436 }, { "epoch": 1.4409346603202078, - "grad_norm": 1.03125, - "learning_rate": 0.00010023, - "loss": 0.0142, + "grad_norm": 0.427734375, + "learning_rate": 0.00021101052631578945, + "loss": 0.0186, "num_input_tokens_seen": 436326400, "step": 6660, - "train_runtime": 3388.6591, - "train_tokens_per_second": 128760.785 + "train_runtime": 866.8539, + "train_tokens_per_second": 503344.785 }, { "epoch": 1.443098225876244, - "grad_norm": 0.5703125, - "learning_rate": 9.992999999999999e-05, - "loss": 0.0147, + "grad_norm": 0.4453125, + "learning_rate": 0.000210378947368421, + "loss": 0.0205, "num_input_tokens_seen": 436981760, "step": 6670, - "train_runtime": 3392.1617, - "train_tokens_per_second": 128821.03 + "train_runtime": 870.6753, + "train_tokens_per_second": 501888.287 }, { "epoch": 1.4452617914322805, - "grad_norm": 0.71484375, - "learning_rate": 9.962999999999999e-05, - "loss": 0.0155, + "grad_norm": 0.515625, + "learning_rate": 0.0002097473684210526, + "loss": 0.0208, "num_input_tokens_seen": 437637120, "step": 6680, - "train_runtime": 3395.6707, - "train_tokens_per_second": 128880.907 + "train_runtime": 874.4982, + "train_tokens_per_second": 500443.7 }, { "epoch": 1.4474253569883166, - "grad_norm": 0.443359375, - "learning_rate": 9.933e-05, - "loss": 0.0154, + "grad_norm": 0.62109375, + "learning_rate": 0.00020911578947368419, + "loss": 0.0201, "num_input_tokens_seen": 438292480, "step": 6690, - "train_runtime": 3399.1663, - "train_tokens_per_second": 128941.17 + "train_runtime": 878.3125, + "train_tokens_per_second": 499016.574 }, { "epoch": 1.4495889225443532, - "grad_norm": 0.7421875, - "learning_rate": 9.902999999999999e-05, - "loss": 0.0163, + "grad_norm": 0.73828125, + "learning_rate": 0.00020848421052631577, + "loss": 0.0211, "num_input_tokens_seen": 438947840, "step": 6700, - "train_runtime": 3402.6349, - "train_tokens_per_second": 129002.332 + "train_runtime": 882.1283, + "train_tokens_per_second": 497600.901 }, { "epoch": 1.4495889225443532, - "eval_loss": 0.009744660928845406, - "eval_runtime": 26.3116, - "eval_samples_per_second": 1.216, - "eval_steps_per_second": 0.038, + "eval_loss": 0.015520436689257622, + "eval_runtime": 1.7455, + "eval_samples_per_second": 18.333, + "eval_steps_per_second": 0.573, "num_input_tokens_seen": 438947840, "step": 6700 }, { "epoch": 1.4495889225443532, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.64404437521967, - "eval_sacrebleu": 20.21065548382088, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9955489614243324, + "eval_chrf": 96.30359773954326, + "eval_sacrebleu": 94.84810346446147, + "eval_word_accuracy": 0.9876373626373627, "num_input_tokens_seen": 438947840, - "perplexity": 1.0097922947365292, + "perplexity": 1.0156415041960352, "step": 6700 }, { "epoch": 1.4517524881003894, - "grad_norm": 0.490234375, - "learning_rate": 9.873e-05, - "loss": 0.0151, + "grad_norm": 0.41796875, + "learning_rate": 0.00020785263157894736, + "loss": 0.0203, "num_input_tokens_seen": 439603200, "step": 6710, - "train_runtime": 3432.4123, - "train_tokens_per_second": 128074.124 + "train_runtime": 887.7094, + "train_tokens_per_second": 495210.697 }, { "epoch": 1.4539160536564257, - "grad_norm": 0.55859375, - "learning_rate": 9.842999999999999e-05, - "loss": 0.0139, + "grad_norm": 0.357421875, + "learning_rate": 0.00020722105263157895, + "loss": 0.019, "num_input_tokens_seen": 440258560, "step": 6720, - "train_runtime": 3435.8849, - "train_tokens_per_second": 128135.421 + "train_runtime": 891.5276, + "train_tokens_per_second": 493824.937 }, { "epoch": 1.456079619212462, - "grad_norm": 0.515625, - "learning_rate": 9.813e-05, - "loss": 0.0148, + "grad_norm": 0.33203125, + "learning_rate": 0.0002065894736842105, + "loss": 0.0206, "num_input_tokens_seen": 440913920, "step": 6730, - "train_runtime": 3439.3842, - "train_tokens_per_second": 128195.601 + "train_runtime": 895.3511, + "train_tokens_per_second": 492448.083 }, { "epoch": 1.4582431847684985, - "grad_norm": 0.66796875, - "learning_rate": 9.782999999999999e-05, - "loss": 0.015, + "grad_norm": 0.5390625, + "learning_rate": 0.00020595789473684206, + "loss": 0.02, "num_input_tokens_seen": 441569280, "step": 6740, - "train_runtime": 3442.9053, - "train_tokens_per_second": 128254.843 + "train_runtime": 899.1684, + "train_tokens_per_second": 491086.316 }, { "epoch": 1.4604067503245348, - "grad_norm": 0.515625, - "learning_rate": 9.753e-05, - "loss": 0.0156, + "grad_norm": 0.5546875, + "learning_rate": 0.00020532631578947365, + "loss": 0.022, "num_input_tokens_seen": 442224640, "step": 6750, - "train_runtime": 3446.38, - "train_tokens_per_second": 128315.693 + "train_runtime": 902.9825, + "train_tokens_per_second": 489737.8 }, { "epoch": 1.4625703158805712, - "grad_norm": 0.80859375, - "learning_rate": 9.722999999999999e-05, - "loss": 0.0165, + "grad_norm": 0.41015625, + "learning_rate": 0.00020469473684210524, + "loss": 0.019, "num_input_tokens_seen": 442880000, "step": 6760, - "train_runtime": 3449.8318, - "train_tokens_per_second": 128377.272 + "train_runtime": 906.7891, + "train_tokens_per_second": 488404.645 }, { "epoch": 1.4647338814366075, - "grad_norm": 0.34375, - "learning_rate": 9.693e-05, - "loss": 0.0177, + "grad_norm": 0.306640625, + "learning_rate": 0.00020406315789473682, + "loss": 0.0206, "num_input_tokens_seen": 443535360, "step": 6770, - "train_runtime": 3453.3158, - "train_tokens_per_second": 128437.533 + "train_runtime": 910.6215, + "train_tokens_per_second": 487068.867 }, { "epoch": 1.466897446992644, - "grad_norm": 0.54296875, - "learning_rate": 9.662999999999999e-05, - "loss": 0.0139, + "grad_norm": 0.349609375, + "learning_rate": 0.0002034315789473684, + "loss": 0.0196, "num_input_tokens_seen": 444190720, "step": 6780, - "train_runtime": 3456.7855, - "train_tokens_per_second": 128498.2 + "train_runtime": 914.4324, + "train_tokens_per_second": 485755.666 }, { "epoch": 1.4690610125486803, - "grad_norm": 0.466796875, - "learning_rate": 9.633e-05, - "loss": 0.0158, + "grad_norm": 0.310546875, + "learning_rate": 0.0002028, + "loss": 0.0202, "num_input_tokens_seen": 444846080, "step": 6790, - "train_runtime": 3460.3051, - "train_tokens_per_second": 128556.896 + "train_runtime": 918.2489, + "train_tokens_per_second": 484450.42 }, { "epoch": 1.4712245781047166, - "grad_norm": 0.76171875, - "learning_rate": 9.602999999999999e-05, - "loss": 0.014, + "grad_norm": 0.39453125, + "learning_rate": 0.00020216842105263156, + "loss": 0.0184, "num_input_tokens_seen": 445497344, "step": 6800, - "train_runtime": 3463.788, - "train_tokens_per_second": 128615.65 + "train_runtime": 922.0496, + "train_tokens_per_second": 483159.857 }, { "epoch": 1.4712245781047166, - "eval_loss": 0.011742905713617802, - "eval_runtime": 25.8244, - "eval_samples_per_second": 1.239, - "eval_steps_per_second": 0.039, + "eval_loss": 0.015351799316704273, + "eval_runtime": 1.7437, + "eval_samples_per_second": 18.352, + "eval_steps_per_second": 0.573, "num_input_tokens_seen": 445497344, "step": 6800 }, { "epoch": 1.4712245781047166, - "eval_byte_accuracy": 0.9988872403560831, - "eval_chrf": 38.92098788871075, - "eval_sacrebleu": 19.337263244966266, - "eval_word_accuracy": 0.9958791208791209, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.3917095936316, + "eval_sacrebleu": 96.37930136022618, + "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 445497344, - "perplexity": 1.0118121243080482, + "perplexity": 1.0154702435221923, "step": 6800 }, { "epoch": 1.473388143660753, - "grad_norm": 0.44140625, - "learning_rate": 9.572999999999999e-05, - "loss": 0.0143, + "grad_norm": 0.4375, + "learning_rate": 0.00020153684210526314, + "loss": 0.0192, "num_input_tokens_seen": 446152704, "step": 6810, - "train_runtime": 3493.1401, - "train_tokens_per_second": 127722.533 + "train_runtime": 927.6261, + "train_tokens_per_second": 480961.793 }, { "epoch": 1.4755517092167891, - "grad_norm": 0.6875, - "learning_rate": 9.542999999999999e-05, - "loss": 0.0141, + "grad_norm": 0.56640625, + "learning_rate": 0.0002009052631578947, + "loss": 0.0194, "num_input_tokens_seen": 446808064, "step": 6820, - "train_runtime": 3496.6823, - "train_tokens_per_second": 127780.57 + "train_runtime": 931.455, + "train_tokens_per_second": 479688.322 }, { "epoch": 1.4777152747728257, - "grad_norm": 0.490234375, - "learning_rate": 9.512999999999999e-05, - "loss": 0.0153, + "grad_norm": 0.400390625, + "learning_rate": 0.0002002736842105263, + "loss": 0.0205, "num_input_tokens_seen": 447463424, "step": 6830, - "train_runtime": 3500.1943, - "train_tokens_per_second": 127839.595 + "train_runtime": 935.2803, + "train_tokens_per_second": 478427.09 }, { "epoch": 1.4798788403288619, - "grad_norm": 0.458984375, - "learning_rate": 9.482999999999998e-05, - "loss": 0.014, + "grad_norm": 0.4609375, + "learning_rate": 0.00019964210526315787, + "loss": 0.0195, "num_input_tokens_seen": 448118784, "step": 6840, - "train_runtime": 3503.6563, - "train_tokens_per_second": 127900.325 + "train_runtime": 939.0955, + "train_tokens_per_second": 477181.257 }, { "epoch": 1.4820424058848984, - "grad_norm": 0.57421875, - "learning_rate": 9.452999999999999e-05, - "loss": 0.0147, + "grad_norm": 0.369140625, + "learning_rate": 0.00019901052631578946, + "loss": 0.0206, "num_input_tokens_seen": 448774144, "step": 6850, - "train_runtime": 3507.1592, - "train_tokens_per_second": 127959.445 + "train_runtime": 942.9144, + "train_tokens_per_second": 475943.686 }, { "epoch": 1.4842059714409346, - "grad_norm": 0.43359375, - "learning_rate": 9.422999999999998e-05, - "loss": 0.0156, + "grad_norm": 0.46484375, + "learning_rate": 0.00019837894736842105, + "loss": 0.0212, "num_input_tokens_seen": 449429504, "step": 6860, - "train_runtime": 3510.7008, - "train_tokens_per_second": 128017.034 + "train_runtime": 946.7363, + "train_tokens_per_second": 474714.567 }, { "epoch": 1.486369536996971, - "grad_norm": 0.6484375, - "learning_rate": 9.392999999999999e-05, - "loss": 0.0371, + "grad_norm": 0.408203125, + "learning_rate": 0.0001977473684210526, + "loss": 0.0506, "num_input_tokens_seen": 450084864, "step": 6870, - "train_runtime": 3514.2388, - "train_tokens_per_second": 128074.638 + "train_runtime": 950.5571, + "train_tokens_per_second": 473495.858 }, { "epoch": 1.4885331025530073, - "grad_norm": 0.43359375, - "learning_rate": 9.362999999999998e-05, - "loss": 0.015, + "grad_norm": 0.365234375, + "learning_rate": 0.0001971157894736842, + "loss": 0.0199, "num_input_tokens_seen": 450740224, "step": 6880, - "train_runtime": 3517.7737, - "train_tokens_per_second": 128132.241 + "train_runtime": 954.368, + "train_tokens_per_second": 472291.835 }, { "epoch": 1.4906966681090437, - "grad_norm": 0.6953125, - "learning_rate": 9.332999999999999e-05, - "loss": 0.0157, + "grad_norm": 0.5390625, + "learning_rate": 0.00019648421052631578, + "loss": 0.0207, "num_input_tokens_seen": 451395584, "step": 6890, - "train_runtime": 3521.42, - "train_tokens_per_second": 128185.672 + "train_runtime": 958.1838, + "train_tokens_per_second": 471094.968 }, { "epoch": 1.49286023366508, - "grad_norm": 0.68359375, - "learning_rate": 9.302999999999998e-05, - "loss": 0.0144, + "grad_norm": 0.6015625, + "learning_rate": 0.00019585263157894734, + "loss": 0.0202, "num_input_tokens_seen": 452050944, "step": 6900, - "train_runtime": 3525.071, - "train_tokens_per_second": 128238.82 + "train_runtime": 962.0035, + "train_tokens_per_second": 469905.704 }, { "epoch": 1.49286023366508, - "eval_loss": 0.010436570271849632, - "eval_runtime": 28.3744, - "eval_samples_per_second": 1.128, - "eval_steps_per_second": 0.035, + "eval_loss": 0.014207720756530762, + "eval_runtime": 1.7403, + "eval_samples_per_second": 18.388, + "eval_steps_per_second": 0.575, "num_input_tokens_seen": 452050944, "step": 6900 }, { "epoch": 1.49286023366508, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.995919881305638, + "eval_chrf": 97.24944108474526, + "eval_sacrebleu": 96.68143920500805, + "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 452050944, - "perplexity": 1.0104912212287525, + "perplexity": 1.0143091301172105, "step": 6900 }, { "epoch": 1.4950237992211164, - "grad_norm": 0.3828125, - "learning_rate": 9.272999999999999e-05, - "loss": 0.0137, + "grad_norm": 0.421875, + "learning_rate": 0.00019522105263157892, + "loss": 0.0201, "num_input_tokens_seen": 452706304, "step": 6910, - "train_runtime": 3556.9859, - "train_tokens_per_second": 127272.45 + "train_runtime": 967.588, + "train_tokens_per_second": 467870.924 }, { "epoch": 1.4971873647771528, - "grad_norm": 0.44921875, - "learning_rate": 9.242999999999998e-05, - "loss": 0.0152, + "grad_norm": 0.43359375, + "learning_rate": 0.0001945894736842105, + "loss": 0.0226, "num_input_tokens_seen": 453361664, "step": 6920, - "train_runtime": 3560.5377, - "train_tokens_per_second": 127329.55 + "train_runtime": 971.3999, + "train_tokens_per_second": 466709.598 }, { "epoch": 1.4993509303331891, - "grad_norm": 0.7109375, - "learning_rate": 9.212999999999999e-05, - "loss": 0.0156, + "grad_norm": 0.5546875, + "learning_rate": 0.0001939578947368421, + "loss": 0.0208, "num_input_tokens_seen": 454017024, "step": 6930, - "train_runtime": 3564.3767, - "train_tokens_per_second": 127376.273 + "train_runtime": 975.2197, + "train_tokens_per_second": 465553.591 }, { "epoch": 1.5015144958892255, - "grad_norm": 0.4375, - "learning_rate": 9.183e-05, - "loss": 0.0144, + "grad_norm": 0.29296875, + "learning_rate": 0.00019332631578947366, + "loss": 0.0182, "num_input_tokens_seen": 454672384, "step": 6940, - "train_runtime": 3567.9302, - "train_tokens_per_second": 127433.095 + "train_runtime": 979.0437, + "train_tokens_per_second": 464404.602 }, { "epoch": 1.5036780614452618, - "grad_norm": 0.5859375, - "learning_rate": 9.152999999999999e-05, - "loss": 0.0153, + "grad_norm": 0.455078125, + "learning_rate": 0.00019269473684210524, + "loss": 0.0198, "num_input_tokens_seen": 455327744, "step": 6950, - "train_runtime": 3571.6269, - "train_tokens_per_second": 127484.688 + "train_runtime": 982.8674, + "train_tokens_per_second": 463264.695 }, { "epoch": 1.5058416270012982, - "grad_norm": 0.4765625, - "learning_rate": 9.122999999999999e-05, - "loss": 0.017, + "grad_norm": 0.392578125, + "learning_rate": 0.00019206315789473683, + "loss": 0.0181, "num_input_tokens_seen": 455983104, "step": 6960, - "train_runtime": 3575.1853, - "train_tokens_per_second": 127541.112 + "train_runtime": 986.6982, + "train_tokens_per_second": 462130.276 }, { "epoch": 1.5080051925573343, - "grad_norm": 0.62890625, - "learning_rate": 9.092999999999998e-05, - "loss": 0.0147, + "grad_norm": 0.431640625, + "learning_rate": 0.0001914315789473684, + "loss": 0.0183, "num_input_tokens_seen": 456638464, "step": 6970, - "train_runtime": 3578.8115, - "train_tokens_per_second": 127595.004 + "train_runtime": 990.5199, + "train_tokens_per_second": 461008.87 }, { "epoch": 1.510168758113371, - "grad_norm": 0.44140625, - "learning_rate": 9.062999999999999e-05, - "loss": 0.0132, + "grad_norm": 0.4140625, + "learning_rate": 0.00019079999999999998, + "loss": 0.0184, "num_input_tokens_seen": 457293824, "step": 6980, - "train_runtime": 3582.4525, - "train_tokens_per_second": 127648.259 + "train_runtime": 994.3509, + "train_tokens_per_second": 459891.818 }, { "epoch": 1.512332323669407, - "grad_norm": 0.64453125, - "learning_rate": 9.032999999999998e-05, - "loss": 0.0143, + "grad_norm": 0.458984375, + "learning_rate": 0.00019016842105263156, + "loss": 0.0196, "num_input_tokens_seen": 457949184, "step": 6990, - "train_runtime": 3586.0613, - "train_tokens_per_second": 127702.553 + "train_runtime": 998.1779, + "train_tokens_per_second": 458785.114 }, { "epoch": 1.5144958892254436, - "grad_norm": 0.6875, - "learning_rate": 9.002999999999999e-05, - "loss": 0.0167, + "grad_norm": 0.51953125, + "learning_rate": 0.00018953684210526315, + "loss": 0.0213, "num_input_tokens_seen": 458604544, "step": 7000, - "train_runtime": 3589.6107, - "train_tokens_per_second": 127758.852 + "train_runtime": 1002.0001, + "train_tokens_per_second": 457689.118 }, { "epoch": 1.5144958892254436, - "eval_loss": 0.010686405003070831, - "eval_runtime": 28.3912, - "eval_samples_per_second": 1.127, - "eval_steps_per_second": 0.035, + "eval_loss": 0.014768999069929123, + "eval_runtime": 1.7379, + "eval_samples_per_second": 18.413, + "eval_steps_per_second": 0.575, "num_input_tokens_seen": 458604544, "step": 7000 }, { "epoch": 1.5144958892254436, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9962908011869436, + "eval_chrf": 96.77834078968087, + "eval_sacrebleu": 95.79032669908624, + "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 458604544, - "perplexity": 1.0107437085701498, + "perplexity": 1.0148785996355225, "step": 7000 }, { "epoch": 1.5166594547814798, - "grad_norm": 0.421875, - "learning_rate": 8.972999999999998e-05, - "loss": 0.0144, + "grad_norm": 0.53125, + "learning_rate": 0.0001889052631578947, + "loss": 0.02, "num_input_tokens_seen": 459259904, "step": 7010, - "train_runtime": 3621.6303, - "train_tokens_per_second": 126810.266 + "train_runtime": 1007.6295, + "train_tokens_per_second": 455782.526 }, { "epoch": 1.5188230203375164, - "grad_norm": 0.546875, - "learning_rate": 8.942999999999999e-05, - "loss": 0.0146, + "grad_norm": 0.41015625, + "learning_rate": 0.0001882736842105263, + "loss": 0.0199, "num_input_tokens_seen": 459915264, "step": 7020, - "train_runtime": 3625.3673, - "train_tokens_per_second": 126860.323 + "train_runtime": 1011.4394, + "train_tokens_per_second": 454713.63 }, { "epoch": 1.5209865858935525, - "grad_norm": 0.50390625, - "learning_rate": 8.912999999999998e-05, - "loss": 0.0143, + "grad_norm": 0.6015625, + "learning_rate": 0.00018764210526315788, + "loss": 0.0187, "num_input_tokens_seen": 460570624, "step": 7030, - "train_runtime": 3628.8204, - "train_tokens_per_second": 126920.202 + "train_runtime": 1015.2675, + "train_tokens_per_second": 453644.627 }, { "epoch": 1.5231501514495889, - "grad_norm": 0.72265625, - "learning_rate": 8.882999999999999e-05, - "loss": 0.0149, + "grad_norm": 0.3515625, + "learning_rate": 0.00018701052631578947, + "loss": 0.0192, "num_input_tokens_seen": 461225984, "step": 7040, - "train_runtime": 3632.4552, - "train_tokens_per_second": 126973.62 + "train_runtime": 1019.094, + "train_tokens_per_second": 452584.352 }, { "epoch": 1.5253137170056252, "grad_norm": 0.5, - "learning_rate": 8.852999999999998e-05, - "loss": 0.0145, + "learning_rate": 0.00018637894736842103, + "loss": 0.0198, "num_input_tokens_seen": 461881344, "step": 7050, - "train_runtime": 3635.9688, - "train_tokens_per_second": 127031.161 + "train_runtime": 1022.903, + "train_tokens_per_second": 451539.732 }, { "epoch": 1.5274772825616616, - "grad_norm": 0.75, - "learning_rate": 8.822999999999999e-05, - "loss": 0.0142, + "grad_norm": 0.5, + "learning_rate": 0.0001857473684210526, + "loss": 0.0192, "num_input_tokens_seen": 462536704, "step": 7060, - "train_runtime": 3639.5016, - "train_tokens_per_second": 127087.923 + "train_runtime": 1026.7344, + "train_tokens_per_second": 450493.05 }, { "epoch": 1.529640848117698, - "grad_norm": 0.67578125, - "learning_rate": 8.793000000000001e-05, - "loss": 0.0143, + "grad_norm": 0.50390625, + "learning_rate": 0.0001851157894736842, + "loss": 0.0196, "num_input_tokens_seen": 463192064, "step": 7070, - "train_runtime": 3643.1118, - "train_tokens_per_second": 127141.873 + "train_runtime": 1030.5622, + "train_tokens_per_second": 449455.697 }, { "epoch": 1.5318044136737343, - "grad_norm": 0.6796875, - "learning_rate": 8.763e-05, - "loss": 0.0135, + "grad_norm": 0.333984375, + "learning_rate": 0.00018448421052631579, + "loss": 0.0197, "num_input_tokens_seen": 463847424, "step": 7080, - "train_runtime": 3646.8469, - "train_tokens_per_second": 127191.361 + "train_runtime": 1034.3948, + "train_tokens_per_second": 448423.959 }, { "epoch": 1.5339679792297707, - "grad_norm": 0.85546875, - "learning_rate": 8.733e-05, - "loss": 0.014, + "grad_norm": 0.404296875, + "learning_rate": 0.00018385263157894735, + "loss": 0.0191, "num_input_tokens_seen": 464502784, "step": 7090, - "train_runtime": 3650.3817, - "train_tokens_per_second": 127247.73 + "train_runtime": 1038.2274, + "train_tokens_per_second": 447399.872 }, { "epoch": 1.536131544785807, - "grad_norm": 0.64453125, - "learning_rate": 8.703e-05, - "loss": 0.0128, + "grad_norm": 0.53515625, + "learning_rate": 0.00018322105263157893, + "loss": 0.0177, "num_input_tokens_seen": 465158144, "step": 7100, - "train_runtime": 3653.9839, - "train_tokens_per_second": 127301.641 + "train_runtime": 1042.0582, + "train_tokens_per_second": 446384.022 }, { "epoch": 1.536131544785807, - "eval_loss": 0.009883665479719639, - "eval_runtime": 40.1477, - "eval_samples_per_second": 0.797, - "eval_steps_per_second": 0.025, + "eval_loss": 0.01327989250421524, + "eval_runtime": 1.7389, + "eval_samples_per_second": 18.403, + "eval_steps_per_second": 0.575, "num_input_tokens_seen": 465158144, "step": 7100 }, { "epoch": 1.536131544785807, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.50210841922704, + "eval_sacrebleu": 96.68143920500805, + "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 465158144, - "perplexity": 1.0099326702171245, + "perplexity": 1.0133684619064596, "step": 7100 }, { "epoch": 1.5382951103418434, - "grad_norm": 0.69921875, - "learning_rate": 8.673e-05, - "loss": 0.0136, + "grad_norm": 0.443359375, + "learning_rate": 0.00018258947368421052, + "loss": 0.0189, "num_input_tokens_seen": 465813504, "step": 7110, - "train_runtime": 3698.0355, - "train_tokens_per_second": 125962.42 + "train_runtime": 1047.6475, + "train_tokens_per_second": 444628.075 }, { "epoch": 1.5404586758978795, - "grad_norm": 0.60546875, - "learning_rate": 8.643e-05, - "loss": 0.0164, + "grad_norm": 0.318359375, + "learning_rate": 0.0001819578947368421, + "loss": 0.0203, "num_input_tokens_seen": 466468864, "step": 7120, - "train_runtime": 3701.6421, - "train_tokens_per_second": 126016.74 + "train_runtime": 1051.4769, + "train_tokens_per_second": 443632.077 }, { "epoch": 1.5426222414539161, - "grad_norm": 0.6796875, - "learning_rate": 8.613e-05, - "loss": 0.014, + "grad_norm": 0.3125, + "learning_rate": 0.00018132631578947366, + "loss": 0.0204, "num_input_tokens_seen": 467124224, "step": 7130, - "train_runtime": 3705.1595, - "train_tokens_per_second": 126073.984 + "train_runtime": 1055.3019, + "train_tokens_per_second": 442645.107 }, { "epoch": 1.5447858070099523, - "grad_norm": 0.490234375, - "learning_rate": 8.583e-05, - "loss": 0.0161, + "grad_norm": 0.36328125, + "learning_rate": 0.00018069473684210525, + "loss": 0.0195, "num_input_tokens_seen": 467779584, "step": 7140, - "train_runtime": 3708.755, - "train_tokens_per_second": 126128.468 + "train_runtime": 1059.1102, + "train_tokens_per_second": 441672.241 }, { "epoch": 1.5469493725659889, - "grad_norm": 0.53515625, - "learning_rate": 8.553e-05, - "loss": 0.0141, + "grad_norm": 0.294921875, + "learning_rate": 0.00018006315789473684, + "loss": 0.0185, "num_input_tokens_seen": 468434944, "step": 7150, - "train_runtime": 3712.3077, - "train_tokens_per_second": 126184.3 + "train_runtime": 1062.9377, + "train_tokens_per_second": 440698.395 }, { "epoch": 1.549112938122025, - "grad_norm": 0.64453125, - "learning_rate": 8.523e-05, - "loss": 0.0142, + "grad_norm": 0.5078125, + "learning_rate": 0.0001794315789473684, + "loss": 0.0186, "num_input_tokens_seen": 469090304, "step": 7160, - "train_runtime": 3715.9165, - "train_tokens_per_second": 126238.119 + "train_runtime": 1066.764, + "train_tokens_per_second": 439732.024 }, { "epoch": 1.5512765036780616, - "grad_norm": 0.52734375, - "learning_rate": 8.493e-05, - "loss": 0.0165, + "grad_norm": 0.3671875, + "learning_rate": 0.00017879999999999998, + "loss": 0.0218, "num_input_tokens_seen": 469745664, "step": 7170, - "train_runtime": 3719.5062, - "train_tokens_per_second": 126292.479 + "train_runtime": 1070.5891, + "train_tokens_per_second": 438773.072 }, { "epoch": 1.5534400692340977, - "grad_norm": 0.474609375, - "learning_rate": 8.463e-05, - "loss": 0.015, + "grad_norm": 0.4296875, + "learning_rate": 0.00017816842105263157, + "loss": 0.0185, "num_input_tokens_seen": 470401024, "step": 7180, - "train_runtime": 3723.0188, - "train_tokens_per_second": 126349.357 + "train_runtime": 1074.4161, + "train_tokens_per_second": 437820.156 }, { "epoch": 1.5556036347901343, - "grad_norm": 0.5078125, - "learning_rate": 8.433e-05, - "loss": 0.0147, + "grad_norm": 0.5234375, + "learning_rate": 0.00017753684210526316, + "loss": 0.019, "num_input_tokens_seen": 471056384, "step": 7190, - "train_runtime": 3726.5645, - "train_tokens_per_second": 126404.999 + "train_runtime": 1078.245, + "train_tokens_per_second": 436873.226 }, { "epoch": 1.5577672003461704, - "grad_norm": 0.578125, - "learning_rate": 8.403e-05, - "loss": 0.0142, + "grad_norm": 0.423828125, + "learning_rate": 0.00017690526315789471, + "loss": 0.0193, "num_input_tokens_seen": 471711744, "step": 7200, - "train_runtime": 3730.3873, - "train_tokens_per_second": 126451.145 + "train_runtime": 1082.0752, + "train_tokens_per_second": 435932.508 }, { "epoch": 1.5577672003461704, - "eval_loss": 0.009905115701258183, - "eval_runtime": 31.1689, - "eval_samples_per_second": 1.027, - "eval_steps_per_second": 0.032, + "eval_loss": 0.01322339940816164, + "eval_runtime": 1.724, + "eval_samples_per_second": 18.562, + "eval_steps_per_second": 0.58, "num_input_tokens_seen": 471711744, "step": 7200 }, { "epoch": 1.5577672003461704, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9962908011869436, + "eval_chrf": 97.51595116562792, + "eval_sacrebleu": 96.68143920500805, + "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 471711744, - "perplexity": 1.0099543337289825, + "perplexity": 1.0133112152016404, "step": 7200 }, { "epoch": 1.5599307659022068, - "grad_norm": 0.65234375, - "learning_rate": 8.373e-05, - "loss": 0.0193, + "grad_norm": 0.78125, + "learning_rate": 0.0001762736842105263, + "loss": 0.0231, "num_input_tokens_seen": 472367104, "step": 7210, - "train_runtime": 3765.244, - "train_tokens_per_second": 125454.579 + "train_runtime": 1087.647, + "train_tokens_per_second": 434301.852 }, { "epoch": 1.5620943314582432, - "grad_norm": 0.447265625, - "learning_rate": 8.342999999999999e-05, - "loss": 0.0159, + "grad_norm": 0.46875, + "learning_rate": 0.0001756421052631579, + "loss": 0.0183, "num_input_tokens_seen": 473022464, "step": 7220, - "train_runtime": 3768.7663, - "train_tokens_per_second": 125511.222 + "train_runtime": 1091.4784, + "train_tokens_per_second": 433377.759 }, { "epoch": 1.5642578970142795, - "grad_norm": 0.453125, - "learning_rate": 8.313e-05, - "loss": 0.0127, + "grad_norm": 0.298828125, + "learning_rate": 0.00017501052631578945, + "loss": 0.0162, "num_input_tokens_seen": 473673728, "step": 7230, - "train_runtime": 3772.3403, - "train_tokens_per_second": 125564.953 + "train_runtime": 1095.2826, + "train_tokens_per_second": 432467.15 }, { "epoch": 1.566421462570316, - "grad_norm": 0.56640625, - "learning_rate": 8.282999999999999e-05, - "loss": 0.0141, + "grad_norm": 0.455078125, + "learning_rate": 0.00017437894736842103, + "loss": 0.0193, "num_input_tokens_seen": 474329088, "step": 7240, - "train_runtime": 3776.0658, - "train_tokens_per_second": 125614.625 + "train_runtime": 1099.1053, + "train_tokens_per_second": 431559.287 }, { "epoch": 1.5685850281263523, - "grad_norm": 0.54296875, - "learning_rate": 8.253e-05, - "loss": 0.0129, + "grad_norm": 0.400390625, + "learning_rate": 0.00017374736842105262, + "loss": 0.0168, "num_input_tokens_seen": 474984448, "step": 7250, - "train_runtime": 3779.6051, - "train_tokens_per_second": 125670.389 + "train_runtime": 1102.9285, + "train_tokens_per_second": 430657.516 }, { "epoch": 1.5707485936823886, - "grad_norm": 0.443359375, - "learning_rate": 8.222999999999999e-05, - "loss": 0.0139, + "grad_norm": 0.34765625, + "learning_rate": 0.0001731157894736842, + "loss": 0.0191, "num_input_tokens_seen": 475639808, "step": 7260, - "train_runtime": 3783.1738, - "train_tokens_per_second": 125725.074 + "train_runtime": 1106.7545, + "train_tokens_per_second": 429760.903 }, { "epoch": 1.572912159238425, - "grad_norm": 0.50390625, - "learning_rate": 8.193e-05, - "loss": 0.0156, + "grad_norm": 0.5390625, + "learning_rate": 0.0001724842105263158, + "loss": 0.0215, "num_input_tokens_seen": 476295168, "step": 7270, - "train_runtime": 3786.8095, - "train_tokens_per_second": 125777.43 + "train_runtime": 1110.579, + "train_tokens_per_second": 428871.048 }, { "epoch": 1.5750757247944613, - "grad_norm": 0.65234375, - "learning_rate": 8.162999999999999e-05, - "loss": 0.0147, + "grad_norm": 0.50390625, + "learning_rate": 0.00017185263157894735, + "loss": 0.021, "num_input_tokens_seen": 476950528, "step": 7280, - "train_runtime": 3790.3382, - "train_tokens_per_second": 125833.239 + "train_runtime": 1114.4053, + "train_tokens_per_second": 427986.6 }, { "epoch": 1.5772392903504975, - "grad_norm": 0.48046875, - "learning_rate": 8.133e-05, - "loss": 0.0141, + "grad_norm": 0.5859375, + "learning_rate": 0.00017122105263157894, + "loss": 0.0199, "num_input_tokens_seen": 477605888, "step": 7290, - "train_runtime": 3794.0888, - "train_tokens_per_second": 125881.577 + "train_runtime": 1118.2285, + "train_tokens_per_second": 427109.371 }, { "epoch": 1.579402855906534, - "grad_norm": 0.66796875, - "learning_rate": 8.102999999999999e-05, - "loss": 0.0142, + "grad_norm": 0.4765625, + "learning_rate": 0.0001705894736842105, + "loss": 0.019, "num_input_tokens_seen": 478261248, "step": 7300, - "train_runtime": 3797.6137, - "train_tokens_per_second": 125937.31 + "train_runtime": 1122.0519, + "train_tokens_per_second": 426238.09 }, { "epoch": 1.579402855906534, - "eval_loss": 0.01061211433261633, - "eval_runtime": 29.1574, - "eval_samples_per_second": 1.097, - "eval_steps_per_second": 0.034, + "eval_loss": 0.013833579607307911, + "eval_runtime": 1.7302, + "eval_samples_per_second": 18.495, + "eval_steps_per_second": 0.578, "num_input_tokens_seen": 478261248, "step": 7300 }, { "epoch": 1.579402855906534, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9962908011869436, + "eval_chrf": 97.06431064307826, + "eval_sacrebleu": 95.76190341678922, + "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 478261248, - "perplexity": 1.0106686225315131, + "perplexity": 1.0139297063170556, "step": 7300 }, { "epoch": 1.5815664214625702, - "grad_norm": 0.50390625, - "learning_rate": 8.073e-05, - "loss": 0.0133, + "grad_norm": 0.5859375, + "learning_rate": 0.00016995789473684208, + "loss": 0.0191, "num_input_tokens_seen": 478916608, "step": 7310, - "train_runtime": 3830.3537, - "train_tokens_per_second": 125031.953 + "train_runtime": 1127.6289, + "train_tokens_per_second": 424711.19 }, { "epoch": 1.5837299870186068, - "grad_norm": 0.62890625, - "learning_rate": 8.043e-05, - "loss": 0.0143, + "grad_norm": 0.455078125, + "learning_rate": 0.00016932631578947367, + "loss": 0.0191, "num_input_tokens_seen": 479567872, "step": 7320, - "train_runtime": 3833.8106, - "train_tokens_per_second": 125089.088 + "train_runtime": 1131.433, + "train_tokens_per_second": 423858.826 }, { "epoch": 1.585893552574643, - "grad_norm": 0.4375, - "learning_rate": 8.013e-05, - "loss": 0.0135, + "grad_norm": 0.41796875, + "learning_rate": 0.00016869473684210526, + "loss": 0.0187, "num_input_tokens_seen": 480223232, "step": 7330, - "train_runtime": 3837.2741, - "train_tokens_per_second": 125146.972 + "train_runtime": 1135.2422, + "train_tokens_per_second": 423013.894 }, { "epoch": 1.5880571181306795, - "grad_norm": 0.9765625, - "learning_rate": 7.983e-05, - "loss": 0.0178, + "grad_norm": 0.462890625, + "learning_rate": 0.00016806315789473684, + "loss": 0.0225, "num_input_tokens_seen": 480878592, "step": 7340, - "train_runtime": 3840.7639, - "train_tokens_per_second": 125203.894 + "train_runtime": 1139.066, + "train_tokens_per_second": 422169.227 }, { "epoch": 1.5902206836867157, - "grad_norm": 0.6171875, - "learning_rate": 7.952999999999999e-05, - "loss": 0.0134, + "grad_norm": 0.4140625, + "learning_rate": 0.00016743157894736843, + "loss": 0.0172, "num_input_tokens_seen": 481533952, "step": 7350, - "train_runtime": 3844.248, - "train_tokens_per_second": 125260.897 + "train_runtime": 1142.8924, + "train_tokens_per_second": 421329.2 }, { "epoch": 1.592384249242752, - "grad_norm": 0.5859375, - "learning_rate": 7.923e-05, - "loss": 0.0144, + "grad_norm": 0.443359375, + "learning_rate": 0.0001668, + "loss": 0.019, "num_input_tokens_seen": 482189312, "step": 7360, - "train_runtime": 3847.738, - "train_tokens_per_second": 125317.606 + "train_runtime": 1146.724, + "train_tokens_per_second": 420492.911 }, { "epoch": 1.5945478147987884, - "grad_norm": 0.60546875, - "learning_rate": 7.892999999999999e-05, - "loss": 0.0154, + "grad_norm": 0.54296875, + "learning_rate": 0.00016616842105263155, + "loss": 0.0197, "num_input_tokens_seen": 482844672, "step": 7370, - "train_runtime": 3851.2407, - "train_tokens_per_second": 125373.798 + "train_runtime": 1150.5458, + "train_tokens_per_second": 419665.757 }, { "epoch": 1.5967113803548247, - "grad_norm": 0.4609375, - "learning_rate": 7.863e-05, - "loss": 0.0149, + "grad_norm": 0.314453125, + "learning_rate": 0.00016553684210526313, + "loss": 0.0194, "num_input_tokens_seen": 483500032, "step": 7380, - "train_runtime": 3854.7347, - "train_tokens_per_second": 125430.172 + "train_runtime": 1154.374, + "train_tokens_per_second": 418841.767 }, { "epoch": 1.598874945910861, - "grad_norm": 0.8984375, - "learning_rate": 7.832999999999999e-05, - "loss": 0.0149, + "grad_norm": 0.609375, + "learning_rate": 0.00016490526315789472, + "loss": 0.0206, "num_input_tokens_seen": 484155392, "step": 7390, - "train_runtime": 3858.1705, - "train_tokens_per_second": 125488.336 + "train_runtime": 1158.1709, + "train_tokens_per_second": 418034.488 }, { "epoch": 1.6010385114668975, - "grad_norm": 0.69140625, - "learning_rate": 7.803e-05, - "loss": 0.0141, + "grad_norm": 0.369140625, + "learning_rate": 0.0001642736842105263, + "loss": 0.0193, "num_input_tokens_seen": 484810752, "step": 7400, - "train_runtime": 3861.6281, - "train_tokens_per_second": 125545.687 + "train_runtime": 1161.9921, + "train_tokens_per_second": 417223.789 }, { "epoch": 1.6010385114668975, - "eval_loss": 0.00974221620708704, - "eval_runtime": 25.4227, - "eval_samples_per_second": 1.259, - "eval_steps_per_second": 0.039, + "eval_loss": 0.013829471543431282, + "eval_runtime": 1.7413, + "eval_samples_per_second": 18.377, + "eval_steps_per_second": 0.574, "num_input_tokens_seen": 484810752, "step": 7400 }, { "epoch": 1.6010385114668975, - "eval_byte_accuracy": 0.9992581602373887, - "eval_chrf": 39.35539991367501, - "eval_sacrebleu": 20.53372838118806, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9970326409495549, + "eval_chrf": 97.63397091399936, + "eval_sacrebleu": 97.01614214490985, + "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 484810752, - "perplexity": 1.0097898260783524, + "perplexity": 1.0139255410376113, "step": 7400 }, { "epoch": 1.6032020770229338, - "grad_norm": 0.51171875, - "learning_rate": 7.772999999999999e-05, - "loss": 0.0148, + "grad_norm": 0.380859375, + "learning_rate": 0.0001636421052631579, + "loss": 0.0199, "num_input_tokens_seen": 485466112, "step": 7410, - "train_runtime": 3890.5103, - "train_tokens_per_second": 124782.117 + "train_runtime": 1167.5747, + "train_tokens_per_second": 415790.198 }, { "epoch": 1.6053656425789702, - "grad_norm": 0.53125, - "learning_rate": 7.743e-05, - "loss": 0.0166, + "grad_norm": 0.390625, + "learning_rate": 0.00016301052631578948, + "loss": 0.0205, "num_input_tokens_seen": 486121472, "step": 7420, - "train_runtime": 3893.9903, - "train_tokens_per_second": 124838.9 + "train_runtime": 1171.3949, + "train_tokens_per_second": 414993.662 }, { "epoch": 1.6075292081350065, - "grad_norm": 0.734375, - "learning_rate": 7.712999999999999e-05, - "loss": 0.0148, + "grad_norm": 0.443359375, + "learning_rate": 0.00016237894736842104, + "loss": 0.0197, "num_input_tokens_seen": 486776832, "step": 7430, - "train_runtime": 3897.4645, - "train_tokens_per_second": 124895.771 + "train_runtime": 1175.211, + "train_tokens_per_second": 414203.761 }, { "epoch": 1.6096927736910427, - "grad_norm": 0.50390625, - "learning_rate": 7.683e-05, - "loss": 0.0135, + "grad_norm": 0.578125, + "learning_rate": 0.0001617473684210526, + "loss": 0.0186, "num_input_tokens_seen": 487432192, "step": 7440, - "train_runtime": 3900.9148, - "train_tokens_per_second": 124953.303 + "train_runtime": 1179.0344, + "train_tokens_per_second": 413416.438 }, { "epoch": 1.6118563392470793, - "grad_norm": 0.68359375, - "learning_rate": 7.652999999999999e-05, - "loss": 0.0136, + "grad_norm": 0.41015625, + "learning_rate": 0.00016111578947368419, + "loss": 0.0183, "num_input_tokens_seen": 488087552, "step": 7450, - "train_runtime": 3904.4438, - "train_tokens_per_second": 125008.214 + "train_runtime": 1182.8525, + "train_tokens_per_second": 412636.036 }, { "epoch": 1.6140199048031154, - "grad_norm": 0.470703125, - "learning_rate": 7.623e-05, - "loss": 0.0132, + "grad_norm": 0.373046875, + "learning_rate": 0.00016048421052631577, + "loss": 0.0173, "num_input_tokens_seen": 488742912, "step": 7460, - "train_runtime": 3907.967, - "train_tokens_per_second": 125063.215 + "train_runtime": 1186.6729, + "train_tokens_per_second": 411859.836 }, { "epoch": 1.616183470359152, - "grad_norm": 0.470703125, - "learning_rate": 7.592999999999999e-05, - "loss": 0.0141, + "grad_norm": 0.703125, + "learning_rate": 0.00015985263157894736, + "loss": 0.0197, "num_input_tokens_seen": 489398272, "step": 7470, - "train_runtime": 3911.4664, - "train_tokens_per_second": 125118.875 + "train_runtime": 1190.4916, + "train_tokens_per_second": 411089.213 }, { "epoch": 1.6183470359151881, - "grad_norm": 0.5078125, - "learning_rate": 7.562999999999999e-05, - "loss": 0.013, + "grad_norm": 0.458984375, + "learning_rate": 0.00015922105263157894, + "loss": 0.0169, "num_input_tokens_seen": 490053632, "step": 7480, - "train_runtime": 3914.9764, - "train_tokens_per_second": 125174.095 + "train_runtime": 1194.3154, + "train_tokens_per_second": 410321.798 }, { "epoch": 1.6205106014712247, - "grad_norm": 0.58203125, - "learning_rate": 7.532999999999999e-05, - "loss": 0.0146, + "grad_norm": 0.41015625, + "learning_rate": 0.00015858947368421053, + "loss": 0.0181, "num_input_tokens_seen": 490708992, "step": 7490, - "train_runtime": 3918.4931, - "train_tokens_per_second": 125229.004 + "train_runtime": 1198.1421, + "train_tokens_per_second": 409558.257 }, { "epoch": 1.6226741670272609, - "grad_norm": 0.66796875, - "learning_rate": 7.502999999999999e-05, - "loss": 0.0138, + "grad_norm": 0.5078125, + "learning_rate": 0.00015795789473684212, + "loss": 0.0188, "num_input_tokens_seen": 491364352, "step": 7500, - "train_runtime": 3921.9557, - "train_tokens_per_second": 125285.544 + "train_runtime": 1201.9634, + "train_tokens_per_second": 408801.423 }, { "epoch": 1.6226741670272609, - "eval_loss": 0.010030672885477543, - "eval_runtime": 25.1182, - "eval_samples_per_second": 1.274, - "eval_steps_per_second": 0.04, + "eval_loss": 0.013616210781037807, + "eval_runtime": 1.7416, + "eval_samples_per_second": 18.374, + "eval_steps_per_second": 0.574, "num_input_tokens_seen": 491364352, "step": 7500 }, { "epoch": 1.6226741670272609, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9962908011869436, + "eval_chrf": 97.09237409482614, + "eval_sacrebleu": 96.08208473900969, + "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 491364352, - "perplexity": 1.010081148712415, + "perplexity": 1.0137093335588248, "step": 7500 }, { "epoch": 1.6248377325832972, - "grad_norm": 0.4921875, - "learning_rate": 7.472999999999998e-05, - "loss": 0.0122, + "grad_norm": 0.375, + "learning_rate": 0.00015732631578947365, + "loss": 0.0175, "num_input_tokens_seen": 492019712, "step": 7510, - "train_runtime": 3950.6878, - "train_tokens_per_second": 124540.267 + "train_runtime": 1207.5937, + "train_tokens_per_second": 407438.132 }, { "epoch": 1.6270012981393336, - "grad_norm": 0.56640625, - "learning_rate": 7.442999999999999e-05, - "loss": 0.0155, + "grad_norm": 0.37109375, + "learning_rate": 0.00015669473684210524, + "loss": 0.0194, "num_input_tokens_seen": 492675072, "step": 7520, - "train_runtime": 3954.1883, - "train_tokens_per_second": 124595.753 + "train_runtime": 1211.4091, + "train_tokens_per_second": 406695.854 }, { "epoch": 1.62916486369537, - "grad_norm": 0.546875, - "learning_rate": 7.412999999999998e-05, - "loss": 0.0142, + "grad_norm": 0.26171875, + "learning_rate": 0.00015606315789473682, + "loss": 0.0185, "num_input_tokens_seen": 493330432, "step": 7530, - "train_runtime": 3957.6698, - "train_tokens_per_second": 124651.742 + "train_runtime": 1215.2321, + "train_tokens_per_second": 405955.72 }, { "epoch": 1.6313284292514063, - "grad_norm": 0.416015625, - "learning_rate": 7.383e-05, - "loss": 0.0138, + "grad_norm": 0.296875, + "learning_rate": 0.0001554315789473684, + "loss": 0.0182, "num_input_tokens_seen": 493985792, "step": 7540, - "train_runtime": 3961.1285, - "train_tokens_per_second": 124708.349 + "train_runtime": 1219.0542, + "train_tokens_per_second": 405220.534 }, { "epoch": 1.6334919948074427, - "grad_norm": 0.63671875, - "learning_rate": 7.353e-05, - "loss": 0.0144, + "grad_norm": 0.4609375, + "learning_rate": 0.0001548, + "loss": 0.0194, "num_input_tokens_seen": 494641152, "step": 7550, - "train_runtime": 3964.6212, - "train_tokens_per_second": 124763.786 + "train_runtime": 1222.8649, + "train_tokens_per_second": 404493.711 }, { "epoch": 1.635655560363479, - "grad_norm": 0.65234375, - "learning_rate": 7.323e-05, - "loss": 0.0157, + "grad_norm": 0.443359375, + "learning_rate": 0.00015416842105263158, + "loss": 0.0208, "num_input_tokens_seen": 495296512, "step": 7560, - "train_runtime": 3968.1269, - "train_tokens_per_second": 124818.718 + "train_runtime": 1226.6937, + "train_tokens_per_second": 403765.416 }, { "epoch": 1.6378191259195154, - "grad_norm": 0.77734375, - "learning_rate": 7.293e-05, - "loss": 0.0244, + "grad_norm": 0.5546875, + "learning_rate": 0.00015353684210526317, + "loss": 0.0281, "num_input_tokens_seen": 495951872, "step": 7570, - "train_runtime": 3971.6198, - "train_tokens_per_second": 124873.956 + "train_runtime": 1230.5146, + "train_tokens_per_second": 403044.283 }, { "epoch": 1.6399826914755518, - "grad_norm": 0.75, - "learning_rate": 7.263e-05, - "loss": 0.0135, + "grad_norm": 0.41015625, + "learning_rate": 0.0001529052631578947, + "loss": 0.018, "num_input_tokens_seen": 496603136, "step": 7580, - "train_runtime": 3975.1022, - "train_tokens_per_second": 124928.394 + "train_runtime": 1234.3122, + "train_tokens_per_second": 402331.882 }, { "epoch": 1.642146257031588, - "grad_norm": 0.4140625, - "learning_rate": 7.233e-05, - "loss": 0.0363, + "grad_norm": 0.376953125, + "learning_rate": 0.0001522736842105263, + "loss": 0.0421, "num_input_tokens_seen": 497258496, "step": 7590, - "train_runtime": 3978.5596, - "train_tokens_per_second": 124984.553 + "train_runtime": 1238.1343, + "train_tokens_per_second": 401619.179 }, { "epoch": 1.6443098225876245, - "grad_norm": 1.1953125, - "learning_rate": 7.203e-05, - "loss": 0.0181, + "grad_norm": 0.68359375, + "learning_rate": 0.00015164210526315787, + "loss": 0.0217, "num_input_tokens_seen": 497913856, "step": 7600, - "train_runtime": 3982.0121, - "train_tokens_per_second": 125040.769 + "train_runtime": 1241.953, + "train_tokens_per_second": 400912.003 }, { "epoch": 1.6443098225876245, - "eval_loss": 0.010073471814393997, - "eval_runtime": 27.7911, - "eval_samples_per_second": 1.151, - "eval_steps_per_second": 0.036, + "eval_loss": 0.014020687900483608, + "eval_runtime": 1.7048, + "eval_samples_per_second": 18.771, + "eval_steps_per_second": 0.587, "num_input_tokens_seen": 497913856, "step": 7600 }, { "epoch": 1.6443098225876245, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.66165640680113, + "eval_sacrebleu": 97.01614214490985, + "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 497913856, - "perplexity": 1.010124380028819, + "perplexity": 1.0141194387235046, "step": 7600 }, { "epoch": 1.6464733881436606, - "grad_norm": 0.376953125, - "learning_rate": 7.172999999999999e-05, - "loss": 0.013, + "grad_norm": 0.546875, + "learning_rate": 0.00015101052631578946, + "loss": 0.0174, "num_input_tokens_seen": 498569216, "step": 7610, - "train_runtime": 4013.2527, - "train_tokens_per_second": 124230.707 + "train_runtime": 1247.4884, + "train_tokens_per_second": 399658.409 }, { "epoch": 1.6486369536996972, - "grad_norm": 0.55859375, - "learning_rate": 7.143e-05, - "loss": 0.0118, + "grad_norm": 0.453125, + "learning_rate": 0.00015037894736842105, + "loss": 0.0165, "num_input_tokens_seen": 499224576, "step": 7620, - "train_runtime": 4016.678, - "train_tokens_per_second": 124287.925 + "train_runtime": 1251.297, + "train_tokens_per_second": 398965.687 }, { "epoch": 1.6508005192557333, - "grad_norm": 0.546875, - "learning_rate": 7.112999999999999e-05, - "loss": 0.0151, + "grad_norm": 0.333984375, + "learning_rate": 0.0001497473684210526, + "loss": 0.0201, "num_input_tokens_seen": 499879936, "step": 7630, - "train_runtime": 4020.1785, - "train_tokens_per_second": 124342.723 + "train_runtime": 1255.1196, + "train_tokens_per_second": 398272.738 }, { "epoch": 1.65296408481177, - "grad_norm": 1.03125, - "learning_rate": 7.083e-05, - "loss": 0.0157, + "grad_norm": 0.361328125, + "learning_rate": 0.0001491157894736842, + "loss": 0.021, "num_input_tokens_seen": 500535296, "step": 7640, - "train_runtime": 4023.7024, - "train_tokens_per_second": 124396.698 + "train_runtime": 1258.9407, + "train_tokens_per_second": 397584.502 }, { "epoch": 1.655127650367806, - "grad_norm": 0.671875, - "learning_rate": 7.052999999999999e-05, - "loss": 0.0156, + "grad_norm": 0.4453125, + "learning_rate": 0.00014848421052631578, + "loss": 0.0217, "num_input_tokens_seen": 501190656, "step": 7650, - "train_runtime": 4027.1984, - "train_tokens_per_second": 124451.444 + "train_runtime": 1262.7516, + "train_tokens_per_second": 396903.614 }, { "epoch": 1.6572912159238427, - "grad_norm": 0.53125, - "learning_rate": 7.023e-05, - "loss": 0.0131, + "grad_norm": 0.431640625, + "learning_rate": 0.00014785263157894736, + "loss": 0.018, "num_input_tokens_seen": 501846016, "step": 7660, - "train_runtime": 4030.6495, - "train_tokens_per_second": 124507.48 + "train_runtime": 1266.5554, + "train_tokens_per_second": 396229.029 }, { "epoch": 1.6594547814798788, - "grad_norm": 0.38671875, - "learning_rate": 6.992999999999999e-05, - "loss": 0.0141, + "grad_norm": 0.388671875, + "learning_rate": 0.00014722105263157892, + "loss": 0.0187, "num_input_tokens_seen": 502501376, "step": 7670, - "train_runtime": 4034.1532, - "train_tokens_per_second": 124561.797 + "train_runtime": 1270.3615, + "train_tokens_per_second": 395557.781 }, { "epoch": 1.6616183470359152, - "grad_norm": 0.494140625, - "learning_rate": 6.963e-05, - "loss": 0.0142, + "grad_norm": 0.33203125, + "learning_rate": 0.0001465894736842105, + "loss": 0.0194, "num_input_tokens_seen": 503156736, "step": 7680, - "train_runtime": 4037.6123, - "train_tokens_per_second": 124617.397 + "train_runtime": 1274.1782, + "train_tokens_per_second": 394887.254 }, { "epoch": 1.6637819125919515, - "grad_norm": 0.474609375, - "learning_rate": 6.932999999999999e-05, - "loss": 0.0192, + "grad_norm": 0.427734375, + "learning_rate": 0.0001459578947368421, + "loss": 0.0251, "num_input_tokens_seen": 503812096, "step": 7690, - "train_runtime": 4041.1376, - "train_tokens_per_second": 124670.859 + "train_runtime": 1277.9972, + "train_tokens_per_second": 394220.025 }, { "epoch": 1.6659454781479879, - "grad_norm": 0.5078125, - "learning_rate": 6.903e-05, - "loss": 0.014, + "grad_norm": 0.57421875, + "learning_rate": 0.00014532631578947368, + "loss": 0.0198, "num_input_tokens_seen": 504467456, "step": 7700, - "train_runtime": 4044.6403, - "train_tokens_per_second": 124724.926 + "train_runtime": 1281.8119, + "train_tokens_per_second": 393558.1 }, { "epoch": 1.6659454781479879, - "eval_loss": 0.009338016621768475, - "eval_runtime": 24.9753, - "eval_samples_per_second": 1.281, - "eval_steps_per_second": 0.04, + "eval_loss": 0.013091851025819778, + "eval_runtime": 1.6783, + "eval_samples_per_second": 19.067, + "eval_steps_per_second": 0.596, "num_input_tokens_seen": 504467456, "step": 7700 }, { "epoch": 1.6659454781479879, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 98.33791810109832, + "eval_sacrebleu": 97.89877731533007, + "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 504467456, - "perplexity": 1.00938175192665, + "perplexity": 1.013177924517751, "step": 7700 }, { "epoch": 1.6681090437040242, - "grad_norm": 0.53515625, - "learning_rate": 6.872999999999999e-05, - "loss": 0.0181, + "grad_norm": 0.498046875, + "learning_rate": 0.00014469473684210524, + "loss": 0.0208, "num_input_tokens_seen": 505122816, "step": 7710, - "train_runtime": 4073.0781, - "train_tokens_per_second": 124015.009 + "train_runtime": 1287.3267, + "train_tokens_per_second": 392381.207 }, { "epoch": 1.6702726092600606, - "grad_norm": 0.494140625, - "learning_rate": 6.843e-05, - "loss": 0.0128, + "grad_norm": 0.26171875, + "learning_rate": 0.00014406315789473683, + "loss": 0.0169, "num_input_tokens_seen": 505778176, "step": 7720, - "train_runtime": 4076.5916, - "train_tokens_per_second": 124068.886 + "train_runtime": 1291.1392, + "train_tokens_per_second": 391730.18 }, { "epoch": 1.672436174816097, - "grad_norm": 0.5, - "learning_rate": 6.812999999999999e-05, - "loss": 0.0138, + "grad_norm": 0.375, + "learning_rate": 0.00014343157894736842, + "loss": 0.0189, "num_input_tokens_seen": 506433536, "step": 7730, - "train_runtime": 4080.0968, - "train_tokens_per_second": 124122.921 + "train_runtime": 1294.9437, + "train_tokens_per_second": 391085.381 }, { "epoch": 1.6745997403721333, - "grad_norm": 0.453125, - "learning_rate": 6.782999999999999e-05, - "loss": 0.0133, + "grad_norm": 0.47265625, + "learning_rate": 0.00014279999999999997, + "loss": 0.0178, "num_input_tokens_seen": 507088896, "step": 7740, - "train_runtime": 4083.6159, - "train_tokens_per_second": 124176.443 + "train_runtime": 1298.7576, + "train_tokens_per_second": 390441.534 }, { "epoch": 1.6767633059281697, - "grad_norm": 0.55078125, - "learning_rate": 6.753e-05, - "loss": 0.013, + "grad_norm": 0.55859375, + "learning_rate": 0.00014216842105263156, + "loss": 0.0176, "num_input_tokens_seen": 507744256, "step": 7750, - "train_runtime": 4087.1471, - "train_tokens_per_second": 124229.504 + "train_runtime": 1302.5808, + "train_tokens_per_second": 389798.651 }, { "epoch": 1.6789268714842058, - "grad_norm": 0.453125, - "learning_rate": 6.722999999999999e-05, - "loss": 0.0129, + "grad_norm": 0.4609375, + "learning_rate": 0.00014153684210526315, + "loss": 0.0187, "num_input_tokens_seen": 508399616, "step": 7760, - "train_runtime": 4090.6683, - "train_tokens_per_second": 124282.776 + "train_runtime": 1306.3997, + "train_tokens_per_second": 389160.854 }, { "epoch": 1.6810904370402424, - "grad_norm": 0.51953125, - "learning_rate": 6.693e-05, - "loss": 0.013, + "grad_norm": 0.412109375, + "learning_rate": 0.00014090526315789473, + "loss": 0.0174, "num_input_tokens_seen": 509054976, "step": 7770, - "train_runtime": 4094.1678, - "train_tokens_per_second": 124336.617 + "train_runtime": 1310.226, + "train_tokens_per_second": 388524.563 }, { "epoch": 1.6832540025962786, - "grad_norm": 0.70703125, - "learning_rate": 6.662999999999999e-05, - "loss": 0.0137, + "grad_norm": 0.435546875, + "learning_rate": 0.0001402736842105263, + "loss": 0.0186, "num_input_tokens_seen": 509710336, "step": 7780, - "train_runtime": 4097.7005, - "train_tokens_per_second": 124389.357 + "train_runtime": 1314.0427, + "train_tokens_per_second": 387894.794 }, { "epoch": 1.6854175681523151, - "grad_norm": 0.51171875, - "learning_rate": 6.633e-05, - "loss": 0.0148, + "grad_norm": 0.279296875, + "learning_rate": 0.00013964210526315788, + "loss": 0.0201, "num_input_tokens_seen": 510365696, "step": 7790, - "train_runtime": 4101.1955, - "train_tokens_per_second": 124443.152 + "train_runtime": 1317.8586, + "train_tokens_per_second": 387268.921 }, { "epoch": 1.6875811337083513, - "grad_norm": 0.625, - "learning_rate": 6.602999999999999e-05, - "loss": 0.0148, + "grad_norm": 0.55078125, + "learning_rate": 0.00013901052631578947, + "loss": 0.0205, "num_input_tokens_seen": 511021056, "step": 7800, - "train_runtime": 4104.6532, - "train_tokens_per_second": 124497.986 + "train_runtime": 1321.683, + "train_tokens_per_second": 386644.18 }, { "epoch": 1.6875811337083513, - "eval_loss": 0.01054280437529087, - "eval_runtime": 26.0549, - "eval_samples_per_second": 1.228, - "eval_steps_per_second": 0.038, + "eval_loss": 0.013206811621785164, + "eval_runtime": 1.6732, + "eval_samples_per_second": 19.125, + "eval_steps_per_second": 0.598, "num_input_tokens_seen": 511021056, "step": 7800 }, { "epoch": 1.6875811337083513, - "eval_byte_accuracy": 0.9992581602373887, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9972527472527473, + "eval_byte_accuracy": 0.9962908011869436, + "eval_chrf": 97.72750672761462, + "eval_sacrebleu": 96.96578008549908, + "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 511021056, - "perplexity": 1.0105985755599196, + "perplexity": 1.0132944067510776, "step": 7800 }, { "epoch": 1.6897446992643879, - "grad_norm": 0.435546875, - "learning_rate": 6.573e-05, - "loss": 0.0176, + "grad_norm": 0.369140625, + "learning_rate": 0.00013837894736842103, + "loss": 0.0202, "num_input_tokens_seen": 511676416, "step": 7810, - "train_runtime": 4134.1552, - "train_tokens_per_second": 123768.071 + "train_runtime": 1327.1865, + "train_tokens_per_second": 385534.678 }, { "epoch": 1.691908264820424, - "grad_norm": 0.80859375, - "learning_rate": 6.542999999999999e-05, - "loss": 0.014, + "grad_norm": 0.470703125, + "learning_rate": 0.0001377473684210526, + "loss": 0.0194, "num_input_tokens_seen": 512331776, "step": 7820, - "train_runtime": 4137.6567, - "train_tokens_per_second": 123821.723 + "train_runtime": 1331.0028, + "train_tokens_per_second": 384921.632 }, { "epoch": 1.6940718303764604, - "grad_norm": 0.87109375, - "learning_rate": 6.513e-05, - "loss": 0.014, + "grad_norm": 0.287109375, + "learning_rate": 0.0001371157894736842, + "loss": 0.0187, "num_input_tokens_seen": 512987136, "step": 7830, - "train_runtime": 4141.1889, - "train_tokens_per_second": 123874.362 + "train_runtime": 1334.8205, + "train_tokens_per_second": 384311.689 }, { "epoch": 1.6962353959324967, - "grad_norm": 0.5390625, - "learning_rate": 6.482999999999999e-05, - "loss": 0.0141, + "grad_norm": 0.29296875, + "learning_rate": 0.00013648421052631578, + "loss": 0.0179, "num_input_tokens_seen": 513642496, "step": 7840, - "train_runtime": 4144.6928, - "train_tokens_per_second": 123927.759 + "train_runtime": 1338.649, + "train_tokens_per_second": 383702.138 }, { "epoch": 1.698398961488533, - "grad_norm": 0.392578125, - "learning_rate": 6.453e-05, - "loss": 0.0141, + "grad_norm": 0.30859375, + "learning_rate": 0.00013585263157894734, + "loss": 0.0172, "num_input_tokens_seen": 514297856, "step": 7850, - "train_runtime": 4148.1763, - "train_tokens_per_second": 123981.676 + "train_runtime": 1342.4692, + "train_tokens_per_second": 383098.448 }, { "epoch": 1.7005625270445694, - "grad_norm": 0.54296875, - "learning_rate": 6.423e-05, - "loss": 0.0133, + "grad_norm": 0.48828125, + "learning_rate": 0.00013522105263157893, + "loss": 0.0179, "num_input_tokens_seen": 514953216, "step": 7860, - "train_runtime": 4151.6309, - "train_tokens_per_second": 124036.368 + "train_runtime": 1346.2887, + "train_tokens_per_second": 382498.344 }, { "epoch": 1.7027260926006058, - "grad_norm": 0.48828125, - "learning_rate": 6.392999999999999e-05, - "loss": 0.0143, + "grad_norm": 0.5078125, + "learning_rate": 0.00013458947368421052, + "loss": 0.0171, "num_input_tokens_seen": 515608576, "step": 7870, - "train_runtime": 4155.0843, - "train_tokens_per_second": 124091.003 + "train_runtime": 1350.1111, + "train_tokens_per_second": 381900.851 }, { "epoch": 1.7048896581566422, - "grad_norm": 0.625, - "learning_rate": 6.363e-05, - "loss": 0.0131, + "grad_norm": 0.314453125, + "learning_rate": 0.00013395789473684208, + "loss": 0.0169, "num_input_tokens_seen": 516263936, "step": 7880, - "train_runtime": 4158.5318, - "train_tokens_per_second": 124145.723 + "train_runtime": 1353.9361, + "train_tokens_per_second": 381305.991 }, { "epoch": 1.7070532237126785, - "grad_norm": 0.5234375, - "learning_rate": 6.332999999999999e-05, - "loss": 0.0142, + "grad_norm": 0.3828125, + "learning_rate": 0.00013332631578947366, + "loss": 0.019, "num_input_tokens_seen": 516919296, "step": 7890, - "train_runtime": 4161.9774, - "train_tokens_per_second": 124200.408 + "train_runtime": 1357.7602, + "train_tokens_per_second": 380714.716 }, { "epoch": 1.709216789268715, "grad_norm": 0.40234375, - "learning_rate": 6.303e-05, - "loss": 0.0128, + "learning_rate": 0.00013269473684210525, + "loss": 0.0178, "num_input_tokens_seen": 517574656, "step": 7900, - "train_runtime": 4165.4197, - "train_tokens_per_second": 124255.104 + "train_runtime": 1361.5822, + "train_tokens_per_second": 380127.358 }, { "epoch": 1.709216789268715, - "eval_loss": 0.010329792276024818, - "eval_runtime": 24.4336, - "eval_samples_per_second": 1.31, - "eval_steps_per_second": 0.041, + "eval_loss": 0.013531056232750416, + "eval_runtime": 1.689, + "eval_samples_per_second": 18.946, + "eval_steps_per_second": 0.592, "num_input_tokens_seen": 517574656, "step": 7900 }, { "epoch": 1.709216789268715, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.81668588759814, + "eval_sacrebleu": 97.30034967398244, + "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 517574656, - "perplexity": 1.0103833287617243, + "perplexity": 1.0136230152736803, "step": 7900 }, { "epoch": 1.711380354824751, - "grad_norm": 0.6796875, - "learning_rate": 6.272999999999999e-05, - "loss": 0.0139, + "grad_norm": 0.47265625, + "learning_rate": 0.00013206315789473684, + "loss": 0.0179, "num_input_tokens_seen": 518230016, "step": 7910, - "train_runtime": 4193.3547, - "train_tokens_per_second": 123583.635 + "train_runtime": 1367.1045, + "train_tokens_per_second": 379071.261 }, { "epoch": 1.7135439203807876, - "grad_norm": 0.419921875, - "learning_rate": 6.243e-05, - "loss": 0.0136, + "grad_norm": 0.482421875, + "learning_rate": 0.0001314315789473684, + "loss": 0.0184, "num_input_tokens_seen": 518885376, "step": 7920, - "train_runtime": 4196.8735, - "train_tokens_per_second": 123636.172 + "train_runtime": 1370.9241, + "train_tokens_per_second": 378493.134 }, { "epoch": 1.7157074859368238, - "grad_norm": 0.4609375, - "learning_rate": 6.213e-05, - "loss": 0.0138, + "grad_norm": 0.5, + "learning_rate": 0.00013079999999999998, + "loss": 0.0167, "num_input_tokens_seen": 519540736, "step": 7930, - "train_runtime": 4200.3925, - "train_tokens_per_second": 123688.617 + "train_runtime": 1374.742, + "train_tokens_per_second": 377918.727 }, { "epoch": 1.7178710514928603, - "grad_norm": 0.431640625, - "learning_rate": 6.183e-05, - "loss": 0.0138, + "grad_norm": 0.345703125, + "learning_rate": 0.00013016842105263157, + "loss": 0.0185, "num_input_tokens_seen": 520196096, "step": 7940, - "train_runtime": 4203.9088, - "train_tokens_per_second": 123741.05 + "train_runtime": 1378.5609, + "train_tokens_per_second": 377347.193 }, { "epoch": 1.7200346170488965, - "grad_norm": 0.484375, - "learning_rate": 6.153e-05, - "loss": 0.015, + "grad_norm": 0.427734375, + "learning_rate": 0.00012953684210526313, + "loss": 0.0199, "num_input_tokens_seen": 520851456, "step": 7950, - "train_runtime": 4207.3853, - "train_tokens_per_second": 123794.571 + "train_runtime": 1382.3761, + "train_tokens_per_second": 376779.845 }, { "epoch": 1.722198182604933, - "grad_norm": 0.4921875, - "learning_rate": 6.123e-05, - "loss": 0.0144, + "grad_norm": 0.310546875, + "learning_rate": 0.0001289052631578947, + "loss": 0.0196, "num_input_tokens_seen": 521506816, "step": 7960, - "train_runtime": 4210.8321, - "train_tokens_per_second": 123848.873 + "train_runtime": 1386.191, + "train_tokens_per_second": 376215.708 }, { "epoch": 1.7243617481609692, - "grad_norm": 0.47265625, - "learning_rate": 6.0929999999999994e-05, - "loss": 0.0121, + "grad_norm": 0.3125, + "learning_rate": 0.0001282736842105263, + "loss": 0.0172, "num_input_tokens_seen": 522162176, "step": 7970, - "train_runtime": 4214.2919, - "train_tokens_per_second": 123902.706 + "train_runtime": 1390.0057, + "train_tokens_per_second": 375654.689 }, { "epoch": 1.7265253137170056, - "grad_norm": 0.60546875, - "learning_rate": 6.0629999999999994e-05, - "loss": 0.0136, + "grad_norm": 0.384765625, + "learning_rate": 0.00012764210526315789, + "loss": 0.019, "num_input_tokens_seen": 522817536, "step": 7980, - "train_runtime": 4217.7676, - "train_tokens_per_second": 123955.986 + "train_runtime": 1393.8333, + "train_tokens_per_second": 375093.309 }, { "epoch": 1.728688879273042, - "grad_norm": 0.4609375, - "learning_rate": 6.032999999999999e-05, - "loss": 0.0146, + "grad_norm": 0.443359375, + "learning_rate": 0.00012701052631578945, + "loss": 0.0202, "num_input_tokens_seen": 523472896, "step": 7990, - "train_runtime": 4221.2812, - "train_tokens_per_second": 124008.062 + "train_runtime": 1397.6593, + "train_tokens_per_second": 374535.397 }, { "epoch": 1.7308524448290783, - "grad_norm": 0.5625, - "learning_rate": 6.002999999999999e-05, - "loss": 0.0133, + "grad_norm": 0.33203125, + "learning_rate": 0.00012637894736842103, + "loss": 0.0185, "num_input_tokens_seen": 524128256, "step": 8000, - "train_runtime": 4224.8142, - "train_tokens_per_second": 124059.48 + "train_runtime": 1401.4815, + "train_tokens_per_second": 373981.569 }, { "epoch": 1.7308524448290783, - "eval_loss": 0.010492024943232536, - "eval_runtime": 25.6029, - "eval_samples_per_second": 1.25, - "eval_steps_per_second": 0.039, + "eval_loss": 0.014470809139311314, + "eval_runtime": 1.7021, + "eval_samples_per_second": 18.801, + "eval_steps_per_second": 0.588, "num_input_tokens_seen": 524128256, "step": 8000 }, { "epoch": 1.7308524448290783, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.995919881305638, + "eval_chrf": 97.36176305597374, + "eval_sacrebleu": 96.96578008549908, + "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 524128256, - "perplexity": 1.0105472592411315, + "perplexity": 1.0145760181719266, "step": 8000 }, { "epoch": 1.7330160103851147, - "grad_norm": 0.7890625, - "learning_rate": 5.972999999999999e-05, - "loss": 0.0125, + "grad_norm": 0.48046875, + "learning_rate": 0.00012574736842105262, + "loss": 0.018, "num_input_tokens_seen": 524783616, "step": 8010, - "train_runtime": 4254.0446, - "train_tokens_per_second": 123361.097 + "train_runtime": 1407.0677, + "train_tokens_per_second": 372962.592 }, { "epoch": 1.735179575941151, - "grad_norm": 0.439453125, - "learning_rate": 5.942999999999999e-05, - "loss": 0.0146, + "grad_norm": 0.29296875, + "learning_rate": 0.0001251157894736842, + "loss": 0.0191, "num_input_tokens_seen": 525438976, "step": 8020, - "train_runtime": 4257.5588, - "train_tokens_per_second": 123413.205 + "train_runtime": 1410.8879, + "train_tokens_per_second": 372417.238 }, { "epoch": 1.7373431414971874, - "grad_norm": 0.470703125, - "learning_rate": 5.912999999999999e-05, - "loss": 0.0142, + "grad_norm": 0.40625, + "learning_rate": 0.00012448421052631576, + "loss": 0.0192, "num_input_tokens_seen": 526094336, "step": 8030, - "train_runtime": 4261.0721, - "train_tokens_per_second": 123465.251 + "train_runtime": 1414.7074, + "train_tokens_per_second": 371875.006 }, { "epoch": 1.7395067070532237, - "grad_norm": 0.447265625, - "learning_rate": 5.882999999999999e-05, - "loss": 0.0134, + "grad_norm": 0.44140625, + "learning_rate": 0.00012385263157894735, + "loss": 0.0186, "num_input_tokens_seen": 526749696, "step": 8040, - "train_runtime": 4264.5988, - "train_tokens_per_second": 123516.823 + "train_runtime": 1418.533, + "train_tokens_per_second": 371334.117 }, { "epoch": 1.74167027260926, - "grad_norm": 0.349609375, - "learning_rate": 5.852999999999999e-05, - "loss": 0.0134, + "grad_norm": 0.328125, + "learning_rate": 0.00012322105263157894, + "loss": 0.0181, "num_input_tokens_seen": 527405056, "step": 8050, - "train_runtime": 4268.1207, - "train_tokens_per_second": 123568.45 + "train_runtime": 1422.3527, + "train_tokens_per_second": 370797.665 }, { "epoch": 1.7438338381652962, - "grad_norm": 0.56640625, - "learning_rate": 5.8229999999999996e-05, - "loss": 0.0136, + "grad_norm": 0.43359375, + "learning_rate": 0.00012258947368421052, + "loss": 0.0188, "num_input_tokens_seen": 528060416, "step": 8060, - "train_runtime": 4271.6512, - "train_tokens_per_second": 123619.742 + "train_runtime": 1426.1678, + "train_tokens_per_second": 370265.285 }, { "epoch": 1.7459974037213328, - "grad_norm": 0.4296875, - "learning_rate": 5.7929999999999996e-05, - "loss": 0.0122, + "grad_norm": 0.349609375, + "learning_rate": 0.0001219578947368421, + "loss": 0.0158, "num_input_tokens_seen": 528715776, "step": 8070, - "train_runtime": 4275.1816, - "train_tokens_per_second": 123670.951 + "train_runtime": 1429.9887, + "train_tokens_per_second": 369734.225 }, { "epoch": 1.748160969277369, - "grad_norm": 0.294921875, - "learning_rate": 5.7629999999999995e-05, - "loss": 0.0126, + "grad_norm": 0.37109375, + "learning_rate": 0.00012132631578947368, + "loss": 0.0175, "num_input_tokens_seen": 529371136, "step": 8080, - "train_runtime": 4278.7018, - "train_tokens_per_second": 123722.373 + "train_runtime": 1433.8098, + "train_tokens_per_second": 369205.976 }, { "epoch": 1.7503245348334056, - "grad_norm": 0.490234375, - "learning_rate": 5.7329999999999995e-05, - "loss": 0.0131, + "grad_norm": 0.39453125, + "learning_rate": 0.00012069473684210526, + "loss": 0.018, "num_input_tokens_seen": 530026496, "step": 8090, - "train_runtime": 4282.2153, - "train_tokens_per_second": 123773.901 + "train_runtime": 1437.6297, + "train_tokens_per_second": 368680.818 }, { "epoch": 1.7524881003894417, "grad_norm": 0.62109375, - "learning_rate": 5.7029999999999994e-05, - "loss": 0.0155, + "learning_rate": 0.00012006315789473683, + "loss": 0.0203, "num_input_tokens_seen": 530681856, "step": 8100, - "train_runtime": 4285.7093, - "train_tokens_per_second": 123825.91 + "train_runtime": 1441.4439, + "train_tokens_per_second": 368159.91 }, { "epoch": 1.7524881003894417, - "eval_loss": 0.009867150336503983, - "eval_runtime": 25.6916, - "eval_samples_per_second": 1.246, - "eval_steps_per_second": 0.039, + "eval_loss": 0.013024972751736641, + "eval_runtime": 1.7245, + "eval_samples_per_second": 18.556, + "eval_steps_per_second": 0.58, "num_input_tokens_seen": 530681856, "step": 8100 }, { "epoch": 1.7524881003894417, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9962908011869436, + "eval_chrf": 97.81668588759814, + "eval_sacrebleu": 97.30034967398244, + "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 530681856, - "perplexity": 1.0099159911721665, + "perplexity": 1.0131101671925917, "step": 8100 }, { "epoch": 1.7546516659454783, - "grad_norm": 0.58203125, - "learning_rate": 5.6729999999999994e-05, - "loss": 0.0135, + "grad_norm": 0.33984375, + "learning_rate": 0.00011943157894736841, + "loss": 0.0191, "num_input_tokens_seen": 531337216, "step": 8110, - "train_runtime": 4314.8838, - "train_tokens_per_second": 123140.563 + "train_runtime": 1447.0008, + "train_tokens_per_second": 367198.984 }, { "epoch": 1.7568152315015144, - "grad_norm": 0.57421875, - "learning_rate": 5.642999999999999e-05, - "loss": 0.0161, + "grad_norm": 0.40625, + "learning_rate": 0.0001188, + "loss": 0.0187, "num_input_tokens_seen": 531992576, "step": 8120, - "train_runtime": 4318.3418, - "train_tokens_per_second": 123193.716 + "train_runtime": 1450.8158, + "train_tokens_per_second": 366685.131 }, { "epoch": 1.758978797057551, - "grad_norm": 0.408203125, - "learning_rate": 5.612999999999999e-05, - "loss": 0.0138, + "grad_norm": 0.294921875, + "learning_rate": 0.00011816842105263156, + "loss": 0.0184, "num_input_tokens_seen": 532647936, "step": 8130, - "train_runtime": 4321.8439, - "train_tokens_per_second": 123245.528 + "train_runtime": 1454.6211, + "train_tokens_per_second": 366176.407 }, { "epoch": 1.7611423626135871, - "grad_norm": 0.71875, - "learning_rate": 5.582999999999999e-05, - "loss": 0.0139, + "grad_norm": 0.453125, + "learning_rate": 0.00011753684210526315, + "loss": 0.0182, "num_input_tokens_seen": 533303296, "step": 8140, - "train_runtime": 4325.3489, - "train_tokens_per_second": 123297.174 + "train_runtime": 1458.4408, + "train_tokens_per_second": 365666.738 }, { "epoch": 1.7633059281696235, - "grad_norm": 0.90625, - "learning_rate": 5.552999999999999e-05, - "loss": 0.0142, + "grad_norm": 0.578125, + "learning_rate": 0.00011690526315789473, + "loss": 0.0208, "num_input_tokens_seen": 533958656, "step": 8150, - "train_runtime": 4328.8692, - "train_tokens_per_second": 123348.3 + "train_runtime": 1462.2518, + "train_tokens_per_second": 365161.91 }, { "epoch": 1.7654694937256599, - "grad_norm": 0.61328125, - "learning_rate": 5.523e-05, - "loss": 0.023, + "grad_norm": 0.37109375, + "learning_rate": 0.00011627368421052632, + "loss": 0.0248, "num_input_tokens_seen": 534614016, "step": 8160, - "train_runtime": 4332.3863, - "train_tokens_per_second": 123399.434 + "train_runtime": 1466.0724, + "train_tokens_per_second": 364657.313 }, { "epoch": 1.7676330592816962, - "grad_norm": 0.474609375, - "learning_rate": 5.493e-05, - "loss": 0.0138, + "grad_norm": 0.36328125, + "learning_rate": 0.00011564210526315788, + "loss": 0.0193, "num_input_tokens_seen": 535269376, "step": 8170, - "train_runtime": 4335.904, - "train_tokens_per_second": 123450.469 + "train_runtime": 1469.8985, + "train_tokens_per_second": 364153.975 }, { "epoch": 1.7697966248377326, - "grad_norm": 0.6953125, - "learning_rate": 5.463e-05, - "loss": 0.0179, - "num_input_tokens_seen": 535924736, + "grad_norm": 0.466796875, + "learning_rate": 0.00011501052631578947, + "loss": 0.0216, + "num_input_tokens_seen": 535924736, "step": 8180, - "train_runtime": 4339.3731, - "train_tokens_per_second": 123502.802 + "train_runtime": 1473.719, + "train_tokens_per_second": 363654.622 }, { "epoch": 1.771960190393769, - "grad_norm": 0.5078125, - "learning_rate": 5.4329999999999997e-05, - "loss": 0.0127, + "grad_norm": 0.328125, + "learning_rate": 0.00011437894736842105, + "loss": 0.0169, "num_input_tokens_seen": 536580096, "step": 8190, - "train_runtime": 4342.8717, - "train_tokens_per_second": 123554.212 + "train_runtime": 1477.5411, + "train_tokens_per_second": 363157.486 }, { "epoch": 1.7741237559498053, - "grad_norm": 0.46484375, - "learning_rate": 5.4029999999999996e-05, - "loss": 0.0147, + "grad_norm": 0.298828125, + "learning_rate": 0.00011374736842105261, + "loss": 0.0163, "num_input_tokens_seen": 537235456, "step": 8200, - "train_runtime": 4346.3638, - "train_tokens_per_second": 123605.727 + "train_runtime": 1481.3719, + "train_tokens_per_second": 362660.748 }, { "epoch": 1.7741237559498053, - "eval_loss": 0.009776638820767403, - "eval_runtime": 26.0205, - "eval_samples_per_second": 1.23, - "eval_steps_per_second": 0.038, + "eval_loss": 0.013625239953398705, + "eval_runtime": 1.727, + "eval_samples_per_second": 18.529, + "eval_steps_per_second": 0.579, "num_input_tokens_seen": 537235456, "step": 8200 }, { "epoch": 1.7741237559498053, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.80284314119724, + "eval_sacrebleu": 97.30034967398244, + "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 537235456, - "perplexity": 1.009824586281699, + "perplexity": 1.0137184865564433, "step": 8200 }, { "epoch": 1.7762873215058417, - "grad_norm": 0.7109375, - "learning_rate": 5.3729999999999995e-05, - "loss": 0.0143, + "grad_norm": 0.380859375, + "learning_rate": 0.0001131157894736842, + "loss": 0.0202, "num_input_tokens_seen": 537890816, "step": 8210, - "train_runtime": 4375.8953, - "train_tokens_per_second": 122921.316 + "train_runtime": 1486.9277, + "train_tokens_per_second": 361746.45 }, { "epoch": 1.778450887061878, - "grad_norm": 0.64453125, - "learning_rate": 5.3429999999999995e-05, - "loss": 0.0126, + "grad_norm": 0.349609375, + "learning_rate": 0.00011248421052631578, + "loss": 0.017, "num_input_tokens_seen": 538546176, "step": 8220, - "train_runtime": 4379.3911, - "train_tokens_per_second": 122972.844 + "train_runtime": 1490.7499, + "train_tokens_per_second": 361258.562 }, { "epoch": 1.7806144526179142, - "grad_norm": 0.494140625, - "learning_rate": 5.3129999999999994e-05, - "loss": 0.0149, + "grad_norm": 0.296875, + "learning_rate": 0.00011185263157894737, + "loss": 0.0203, "num_input_tokens_seen": 539201536, "step": 8230, - "train_runtime": 4382.9147, - "train_tokens_per_second": 123023.506 + "train_runtime": 1494.5707, + "train_tokens_per_second": 360773.533 }, { "epoch": 1.7827780181739508, - "grad_norm": 0.404296875, - "learning_rate": 5.283e-05, - "loss": 0.0124, + "grad_norm": 0.453125, + "learning_rate": 0.00011122105263157893, + "loss": 0.0159, "num_input_tokens_seen": 539856896, "step": 8240, - "train_runtime": 4386.3839, - "train_tokens_per_second": 123075.616 + "train_runtime": 1498.3854, + "train_tokens_per_second": 360292.412 }, { "epoch": 1.784941583729987, - "grad_norm": 0.6328125, - "learning_rate": 5.253e-05, - "loss": 0.0153, + "grad_norm": 0.341796875, + "learning_rate": 0.00011058947368421052, + "loss": 0.0207, "num_input_tokens_seen": 540512256, "step": 8250, - "train_runtime": 4389.8742, - "train_tokens_per_second": 123127.048 + "train_runtime": 1502.2091, + "train_tokens_per_second": 359811.589 }, { "epoch": 1.7871051492860235, - "grad_norm": 0.396484375, - "learning_rate": 5.223e-05, - "loss": 0.0135, + "grad_norm": 0.3671875, + "learning_rate": 0.0001099578947368421, + "loss": 0.0173, "num_input_tokens_seen": 541167616, "step": 8260, - "train_runtime": 4393.4171, - "train_tokens_per_second": 123176.926 + "train_runtime": 1506.0281, + "train_tokens_per_second": 359334.348 }, { "epoch": 1.7892687148420596, - "grad_norm": 0.59375, - "learning_rate": 5.193e-05, - "loss": 0.0154, + "grad_norm": 0.52734375, + "learning_rate": 0.00010932631578947366, + "loss": 0.0185, "num_input_tokens_seen": 541822976, "step": 8270, - "train_runtime": 4396.909, - "train_tokens_per_second": 123228.154 + "train_runtime": 1509.8436, + "train_tokens_per_second": 358860.325 }, { "epoch": 1.7914322803980962, - "grad_norm": 0.416015625, - "learning_rate": 5.163e-05, - "loss": 0.0125, + "grad_norm": 0.30859375, + "learning_rate": 0.00010869473684210525, + "loss": 0.0167, "num_input_tokens_seen": 542478336, "step": 8280, - "train_runtime": 4400.4243, - "train_tokens_per_second": 123278.642 + "train_runtime": 1513.6693, + "train_tokens_per_second": 358386.294 }, { "epoch": 1.7935958459541324, - "grad_norm": 0.609375, - "learning_rate": 5.133e-05, - "loss": 0.0157, + "grad_norm": 0.3359375, + "learning_rate": 0.00010806315789473683, + "loss": 0.0203, "num_input_tokens_seen": 543133696, "step": 8290, - "train_runtime": 4403.9557, - "train_tokens_per_second": 123328.601 + "train_runtime": 1517.488, + "train_tokens_per_second": 357916.313 }, { "epoch": 1.7957594115101687, - "grad_norm": 0.44140625, - "learning_rate": 5.103e-05, - "loss": 0.0136, + "grad_norm": 0.4765625, + "learning_rate": 0.00010743157894736842, + "loss": 0.0189, "num_input_tokens_seen": 543789056, "step": 8300, - "train_runtime": 4407.4548, - "train_tokens_per_second": 123379.383 + "train_runtime": 1521.3068, + "train_tokens_per_second": 357448.64 }, { "epoch": 1.7957594115101687, - "eval_loss": 0.009596272371709347, - "eval_runtime": 26.993, - "eval_samples_per_second": 1.185, - "eval_steps_per_second": 0.037, + "eval_loss": 0.013864190317690372, + "eval_runtime": 1.7263, + "eval_samples_per_second": 18.537, + "eval_steps_per_second": 0.579, "num_input_tokens_seen": 543789056, "step": 8300 }, { "epoch": 1.7957594115101687, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9974035608308606, + "eval_chrf": 98.29638986189566, + "eval_sacrebleu": 97.89877731533007, + "eval_word_accuracy": 0.9931318681318682, "num_input_tokens_seen": 543789056, - "perplexity": 1.0096424642317472, + "perplexity": 1.0139607439006826, "step": 8300 }, { "epoch": 1.797922977066205, - "grad_norm": 0.455078125, - "learning_rate": 5.073e-05, - "loss": 0.0148, + "grad_norm": 0.37109375, + "learning_rate": 0.00010679999999999998, + "loss": 0.0209, "num_input_tokens_seen": 544444416, "step": 8310, - "train_runtime": 4437.9953, - "train_tokens_per_second": 122678.006 + "train_runtime": 1526.8669, + "train_tokens_per_second": 356576.208 }, { "epoch": 1.8000865426222414, - "grad_norm": 0.38671875, - "learning_rate": 5.0429999999999997e-05, - "loss": 0.0137, + "grad_norm": 0.443359375, + "learning_rate": 0.00010616842105263157, + "loss": 0.0185, "num_input_tokens_seen": 545099776, "step": 8320, - "train_runtime": 4441.5315, - "train_tokens_per_second": 122727.888 + "train_runtime": 1530.6897, + "train_tokens_per_second": 356113.831 }, { "epoch": 1.8022501081782778, - "grad_norm": 0.416015625, - "learning_rate": 5.0129999999999996e-05, - "loss": 0.013, + "grad_norm": 0.357421875, + "learning_rate": 0.00010553684210526315, + "loss": 0.0179, "num_input_tokens_seen": 545755136, "step": 8330, - "train_runtime": 4445.1617, - "train_tokens_per_second": 122775.093 + "train_runtime": 1534.5113, + "train_tokens_per_second": 355654.039 }, { "epoch": 1.8044136737343142, - "grad_norm": 0.54296875, - "learning_rate": 4.9829999999999996e-05, - "loss": 0.0174, + "grad_norm": 0.369140625, + "learning_rate": 0.00010490526315789473, + "loss": 0.0188, "num_input_tokens_seen": 546410496, "step": 8340, - "train_runtime": 4448.6443, - "train_tokens_per_second": 122826.295 + "train_runtime": 1538.3352, + "train_tokens_per_second": 355195.994 }, { "epoch": 1.8065772392903505, - "grad_norm": 0.455078125, - "learning_rate": 4.9529999999999995e-05, - "loss": 0.0152, + "grad_norm": 0.322265625, + "learning_rate": 0.0001042736842105263, + "loss": 0.0169, "num_input_tokens_seen": 547065856, "step": 8350, - "train_runtime": 4452.2792, - "train_tokens_per_second": 122873.213 + "train_runtime": 1542.1463, + "train_tokens_per_second": 354743.156 }, { "epoch": 1.8087408048463869, - "grad_norm": 0.68359375, - "learning_rate": 4.9229999999999995e-05, - "loss": 0.0145, + "grad_norm": 0.427734375, + "learning_rate": 0.00010364210526315789, + "loss": 0.0183, "num_input_tokens_seen": 547721216, "step": 8360, - "train_runtime": 4455.8551, - "train_tokens_per_second": 122921.683 + "train_runtime": 1545.9682, + "train_tokens_per_second": 354290.09 }, { "epoch": 1.8109043704024232, - "grad_norm": 0.478515625, - "learning_rate": 4.8929999999999994e-05, - "loss": 0.015, + "grad_norm": 0.392578125, + "learning_rate": 0.00010301052631578947, + "loss": 0.019, "num_input_tokens_seen": 548376576, "step": 8370, - "train_runtime": 4459.4306, - "train_tokens_per_second": 122970.087 + "train_runtime": 1549.7896, + "train_tokens_per_second": 353839.38 }, { "epoch": 1.8130679359584594, - "grad_norm": 0.435546875, - "learning_rate": 4.8629999999999993e-05, - "loss": 0.0141, + "grad_norm": 0.33984375, + "learning_rate": 0.00010237894736842104, + "loss": 0.0178, "num_input_tokens_seen": 549031936, "step": 8380, - "train_runtime": 4463.1144, - "train_tokens_per_second": 123015.428 + "train_runtime": 1553.6089, + "train_tokens_per_second": 353391.347 }, { "epoch": 1.815231501514496, - "grad_norm": 0.3671875, - "learning_rate": 4.832999999999999e-05, - "loss": 0.0141, + "grad_norm": 0.357421875, + "learning_rate": 0.00010174736842105262, + "loss": 0.0192, "num_input_tokens_seen": 549687296, "step": 8390, - "train_runtime": 4466.8936, - "train_tokens_per_second": 123058.068 + "train_runtime": 1557.4331, + "train_tokens_per_second": 352944.411 }, { "epoch": 1.817395067070532, - "grad_norm": 0.400390625, - "learning_rate": 4.802999999999999e-05, - "loss": 0.0149, + "grad_norm": 0.4140625, + "learning_rate": 0.0001011157894736842, + "loss": 0.0198, "num_input_tokens_seen": 550342656, "step": 8400, - "train_runtime": 4470.3843, - "train_tokens_per_second": 123108.578 + "train_runtime": 1561.2661, + "train_tokens_per_second": 352497.674 }, { "epoch": 1.817395067070532, - "eval_loss": 0.009731123223900795, - "eval_runtime": 30.744, - "eval_samples_per_second": 1.041, - "eval_steps_per_second": 0.033, + "eval_loss": 0.013265056535601616, + "eval_runtime": 1.7475, + "eval_samples_per_second": 18.312, + "eval_steps_per_second": 0.572, "num_input_tokens_seen": 550342656, "step": 8400 }, { "epoch": 1.817395067070532, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.995919881305638, + "eval_chrf": 97.15717684332516, + "eval_sacrebleu": 96.77994309150014, + "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 550342656, - "perplexity": 1.0097786245589193, + "perplexity": 1.0133534277152885, "step": 8400 }, { "epoch": 1.8195586326265687, - "grad_norm": 0.84375, - "learning_rate": 4.772999999999999e-05, - "loss": 0.0134, + "grad_norm": 0.51953125, + "learning_rate": 0.00010048421052631578, + "loss": 0.018, "num_input_tokens_seen": 550993920, "step": 8410, - "train_runtime": 4504.7975, - "train_tokens_per_second": 122312.694 + "train_runtime": 1566.8283, + "train_tokens_per_second": 351661.961 }, { "epoch": 1.8217221981826048, - "grad_norm": 0.42578125, - "learning_rate": 4.742999999999999e-05, - "loss": 0.0123, + "grad_norm": 0.318359375, + "learning_rate": 9.985263157894735e-05, + "loss": 0.016, "num_input_tokens_seen": 551649280, "step": 8420, - "train_runtime": 4508.4535, - "train_tokens_per_second": 122358.87 + "train_runtime": 1570.653, + "train_tokens_per_second": 351222.883 }, { "epoch": 1.8238857637386414, - "grad_norm": 0.5390625, - "learning_rate": 4.712999999999999e-05, - "loss": 0.0146, + "grad_norm": 0.349609375, + "learning_rate": 9.922105263157894e-05, + "loss": 0.0202, "num_input_tokens_seen": 552304640, "step": 8430, - "train_runtime": 4512.1093, - "train_tokens_per_second": 122404.979 + "train_runtime": 1574.475, + "train_tokens_per_second": 350786.55 }, { "epoch": 1.8260493292946776, - "grad_norm": 0.59765625, - "learning_rate": 4.682999999999999e-05, - "loss": 0.014, + "grad_norm": 0.478515625, + "learning_rate": 9.858947368421052e-05, + "loss": 0.0191, "num_input_tokens_seen": 552960000, "step": 8440, - "train_runtime": 4515.7078, - "train_tokens_per_second": 122452.564 + "train_runtime": 1578.2914, + "train_tokens_per_second": 350353.564 }, { "epoch": 1.8282128948507141, - "grad_norm": 0.53515625, - "learning_rate": 4.652999999999999e-05, - "loss": 0.0137, + "grad_norm": 0.341796875, + "learning_rate": 9.79578947368421e-05, + "loss": 0.0185, "num_input_tokens_seen": 553615360, "step": 8450, - "train_runtime": 4519.3484, - "train_tokens_per_second": 122498.934 + "train_runtime": 1582.1154, + "train_tokens_per_second": 349920.975 }, { "epoch": 1.8303764604067503, - "grad_norm": 0.412109375, - "learning_rate": 4.622999999999999e-05, - "loss": 0.0128, + "grad_norm": 0.384765625, + "learning_rate": 9.732631578947367e-05, + "loss": 0.017, "num_input_tokens_seen": 554270720, "step": 8460, - "train_runtime": 4522.8902, - "train_tokens_per_second": 122547.907 + "train_runtime": 1585.9303, + "train_tokens_per_second": 349492.475 }, { "epoch": 1.8325400259627866, - "grad_norm": 0.44140625, - "learning_rate": 4.593e-05, - "loss": 0.0121, + "grad_norm": 0.357421875, + "learning_rate": 9.669473684210525e-05, + "loss": 0.016, "num_input_tokens_seen": 554926080, "step": 8470, - "train_runtime": 4526.4451, - "train_tokens_per_second": 122596.446 + "train_runtime": 1589.7507, + "train_tokens_per_second": 349064.849 }, { "epoch": 1.834703591518823, - "grad_norm": 0.5625, - "learning_rate": 4.563e-05, - "loss": 0.013, + "grad_norm": 0.359375, + "learning_rate": 9.606315789473684e-05, + "loss": 0.0166, "num_input_tokens_seen": 555581440, "step": 8480, - "train_runtime": 4529.9309, - "train_tokens_per_second": 122646.779 + "train_runtime": 1593.5726, + "train_tokens_per_second": 348638.922 }, { "epoch": 1.8368671570748594, - "grad_norm": 0.419921875, - "learning_rate": 4.533e-05, - "loss": 0.0142, + "grad_norm": 0.30859375, + "learning_rate": 9.543157894736841e-05, + "loss": 0.0194, "num_input_tokens_seen": 556236800, "step": 8490, - "train_runtime": 4533.5438, - "train_tokens_per_second": 122693.599 + "train_runtime": 1597.3881, + "train_tokens_per_second": 348216.432 }, { "epoch": 1.8390307226308957, - "grad_norm": 0.5859375, - "learning_rate": 4.503e-05, - "loss": 0.0138, + "grad_norm": 0.380859375, + "learning_rate": 9.479999999999999e-05, + "loss": 0.0194, "num_input_tokens_seen": 556892160, "step": 8500, - "train_runtime": 4537.4136, - "train_tokens_per_second": 122733.393 + "train_runtime": 1601.2204, + "train_tokens_per_second": 347792.313 }, { "epoch": 1.8390307226308957, - "eval_loss": 0.009925661608576775, - "eval_runtime": 86.7259, - "eval_samples_per_second": 0.369, - "eval_steps_per_second": 0.012, + "eval_loss": 0.013057196512818336, + "eval_runtime": 1.7679, + "eval_samples_per_second": 18.101, + "eval_steps_per_second": 0.566, "num_input_tokens_seen": 556892160, "step": 8500 }, { "epoch": 1.8390307226308957, - "eval_byte_accuracy": 0.9996290801186943, - "eval_chrf": 39.16253337118813, - "eval_sacrebleu": 19.88569831211158, - "eval_word_accuracy": 0.9986263736263736, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 98.31023260829656, + "eval_sacrebleu": 97.89877731533007, + "eval_word_accuracy": 0.9931318681318682, "num_input_tokens_seen": 556892160, - "perplexity": 1.0099750843702888, + "perplexity": 1.0131428139385665, "step": 8500 + }, + { + "epoch": 1.841194288186932, + "grad_norm": 0.423828125, + "learning_rate": 9.416842105263157e-05, + "loss": 0.0176, + "num_input_tokens_seen": 557547520, + "step": 8510, + "train_runtime": 1606.8812, + "train_tokens_per_second": 346974.938 + }, + { + "epoch": 1.8433578537429685, + "grad_norm": 0.37109375, + "learning_rate": 9.353684210526315e-05, + "loss": 0.0168, + "num_input_tokens_seen": 558202880, + "step": 8520, + "train_runtime": 1610.6957, + "train_tokens_per_second": 346560.113 + }, + { + "epoch": 1.8455214192990046, + "grad_norm": 0.333984375, + "learning_rate": 9.290526315789473e-05, + "loss": 0.0196, + "num_input_tokens_seen": 558858240, + "step": 8530, + "train_runtime": 1614.5112, + "train_tokens_per_second": 346147.017 + }, + { + "epoch": 1.8476849848550412, + "grad_norm": 0.38671875, + "learning_rate": 9.22736842105263e-05, + "loss": 0.0173, + "num_input_tokens_seen": 559513600, + "step": 8540, + "train_runtime": 1618.3296, + "train_tokens_per_second": 345735.256 + }, + { + "epoch": 1.8498485504110773, + "grad_norm": 0.306640625, + "learning_rate": 9.164210526315789e-05, + "loss": 0.018, + "num_input_tokens_seen": 560168960, + "step": 8550, + "train_runtime": 1622.1481, + "train_tokens_per_second": 345325.416 + }, + { + "epoch": 1.852012115967114, + "grad_norm": 0.267578125, + "learning_rate": 9.101052631578946e-05, + "loss": 0.0176, + "num_input_tokens_seen": 560824320, + "step": 8560, + "train_runtime": 1625.9619, + "train_tokens_per_second": 344918.496 + }, + { + "epoch": 1.85417568152315, + "grad_norm": 0.248046875, + "learning_rate": 9.037894736842105e-05, + "loss": 0.0173, + "num_input_tokens_seen": 561479680, + "step": 8570, + "train_runtime": 1629.7856, + "train_tokens_per_second": 344511.37 + }, + { + "epoch": 1.8563392470791866, + "grad_norm": 0.37890625, + "learning_rate": 8.974736842105262e-05, + "loss": 0.0166, + "num_input_tokens_seen": 562135040, + "step": 8580, + "train_runtime": 1633.6031, + "train_tokens_per_second": 344107.479 + }, + { + "epoch": 1.8585028126352228, + "grad_norm": 0.39453125, + "learning_rate": 8.91157894736842e-05, + "loss": 0.0186, + "num_input_tokens_seen": 562790400, + "step": 8590, + "train_runtime": 1637.4214, + "train_tokens_per_second": 343705.288 + }, + { + "epoch": 1.8606663781912594, + "grad_norm": 0.4140625, + "learning_rate": 8.848421052631578e-05, + "loss": 0.0196, + "num_input_tokens_seen": 563445760, + "step": 8600, + "train_runtime": 1641.2424, + "train_tokens_per_second": 343304.424 + }, + { + "epoch": 1.8606663781912594, + "eval_loss": 0.012696781195700169, + "eval_runtime": 1.759, + "eval_samples_per_second": 18.192, + "eval_steps_per_second": 0.568, + "num_input_tokens_seen": 563445760, + "step": 8600 + }, + { + "epoch": 1.8606663781912594, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.70457548553854, + "eval_sacrebleu": 97.11458584551553, + "eval_word_accuracy": 0.9917582417582418, + "num_input_tokens_seen": 563445760, + "perplexity": 1.012777727545311, + "step": 8600 + }, + { + "epoch": 1.8628299437472955, + "grad_norm": 0.337890625, + "learning_rate": 8.785263157894737e-05, + "loss": 0.0167, + "num_input_tokens_seen": 564101120, + "step": 8610, + "train_runtime": 1646.8299, + "train_tokens_per_second": 342537.572 + }, + { + "epoch": 1.8649935093033319, + "grad_norm": 0.322265625, + "learning_rate": 8.722105263157894e-05, + "loss": 0.0172, + "num_input_tokens_seen": 564756480, + "step": 8620, + "train_runtime": 1650.639, + "train_tokens_per_second": 342144.146 + }, + { + "epoch": 1.8671570748593682, + "grad_norm": 0.328125, + "learning_rate": 8.658947368421052e-05, + "loss": 0.0163, + "num_input_tokens_seen": 565411840, + "step": 8630, + "train_runtime": 1654.4626, + "train_tokens_per_second": 341749.55 + }, + { + "epoch": 1.8693206404154046, + "grad_norm": 0.38671875, + "learning_rate": 8.59578947368421e-05, + "loss": 0.0186, + "num_input_tokens_seen": 566063104, + "step": 8640, + "train_runtime": 1658.2639, + "train_tokens_per_second": 341358.878 + }, + { + "epoch": 1.871484205971441, + "grad_norm": 0.3828125, + "learning_rate": 8.532631578947369e-05, + "loss": 0.018, + "num_input_tokens_seen": 566718464, + "step": 8650, + "train_runtime": 1662.0809, + "train_tokens_per_second": 340969.238 + }, + { + "epoch": 1.8736477715274773, + "grad_norm": 0.390625, + "learning_rate": 8.469473684210525e-05, + "loss": 0.0189, + "num_input_tokens_seen": 567373824, + "step": 8660, + "train_runtime": 1665.898, + "train_tokens_per_second": 340581.365 + }, + { + "epoch": 1.8758113370835137, + "grad_norm": 0.45703125, + "learning_rate": 8.406315789473683e-05, + "loss": 0.0372, + "num_input_tokens_seen": 568029184, + "step": 8670, + "train_runtime": 1669.7217, + "train_tokens_per_second": 340193.919 + }, + { + "epoch": 1.87797490263955, + "grad_norm": 0.41796875, + "learning_rate": 8.343157894736842e-05, + "loss": 0.0174, + "num_input_tokens_seen": 568684544, + "step": 8680, + "train_runtime": 1673.5403, + "train_tokens_per_second": 339809.298 + }, + { + "epoch": 1.8801384681955864, + "grad_norm": 0.400390625, + "learning_rate": 8.28e-05, + "loss": 0.0188, + "num_input_tokens_seen": 569339904, + "step": 8690, + "train_runtime": 1677.3658, + "train_tokens_per_second": 339425.007 + }, + { + "epoch": 1.8823020337516225, + "grad_norm": 0.3828125, + "learning_rate": 8.216842105263157e-05, + "loss": 0.0155, + "num_input_tokens_seen": 569995264, + "step": 8700, + "train_runtime": 1681.1869, + "train_tokens_per_second": 339043.374 + }, + { + "epoch": 1.8823020337516225, + "eval_loss": 0.012751961126923561, + "eval_runtime": 1.7169, + "eval_samples_per_second": 18.638, + "eval_steps_per_second": 0.582, + "num_input_tokens_seen": 569995264, + "step": 8700 + }, + { + "epoch": 1.8823020337516225, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.93277340374337, + "eval_sacrebleu": 97.30034967398244, + "eval_word_accuracy": 0.9917582417582418, + "num_input_tokens_seen": 569995264, + "perplexity": 1.0128336140925551, + "step": 8700 + }, + { + "epoch": 1.8844655993076591, + "grad_norm": 0.357421875, + "learning_rate": 8.153684210526315e-05, + "loss": 0.0182, + "num_input_tokens_seen": 570650624, + "step": 8710, + "train_runtime": 1686.7192, + "train_tokens_per_second": 338319.878 + }, + { + "epoch": 1.8866291648636953, + "grad_norm": 0.38671875, + "learning_rate": 8.090526315789474e-05, + "loss": 0.017, + "num_input_tokens_seen": 571305984, + "step": 8720, + "train_runtime": 1690.52, + "train_tokens_per_second": 337946.886 + }, + { + "epoch": 1.8887927304197318, + "grad_norm": 0.26953125, + "learning_rate": 8.02736842105263e-05, + "loss": 0.0167, + "num_input_tokens_seen": 571961344, + "step": 8730, + "train_runtime": 1694.3384, + "train_tokens_per_second": 337572.087 + }, + { + "epoch": 1.890956295975768, + "grad_norm": 0.365234375, + "learning_rate": 7.964210526315788e-05, + "loss": 0.0187, + "num_input_tokens_seen": 572616704, + "step": 8740, + "train_runtime": 1698.1568, + "train_tokens_per_second": 337198.951 + }, + { + "epoch": 1.8931198615318046, + "grad_norm": 0.26171875, + "learning_rate": 7.901052631578947e-05, + "loss": 0.0187, + "num_input_tokens_seen": 573272064, + "step": 8750, + "train_runtime": 1701.9803, + "train_tokens_per_second": 336826.499 + }, + { + "epoch": 1.8952834270878407, + "grad_norm": 0.302734375, + "learning_rate": 7.837894736842106e-05, + "loss": 0.0168, + "num_input_tokens_seen": 573927424, + "step": 8760, + "train_runtime": 1705.81, + "train_tokens_per_second": 336454.491 + }, + { + "epoch": 1.897446992643877, + "grad_norm": 0.29296875, + "learning_rate": 7.774736842105262e-05, + "loss": 0.0194, + "num_input_tokens_seen": 574582784, + "step": 8770, + "train_runtime": 1709.6347, + "train_tokens_per_second": 336085.119 + }, + { + "epoch": 1.8996105581999134, + "grad_norm": 0.30078125, + "learning_rate": 7.71157894736842e-05, + "loss": 0.0167, + "num_input_tokens_seen": 575238144, + "step": 8780, + "train_runtime": 1713.4531, + "train_tokens_per_second": 335718.631 + }, + { + "epoch": 1.9017741237559498, + "grad_norm": 0.439453125, + "learning_rate": 7.648421052631579e-05, + "loss": 0.0195, + "num_input_tokens_seen": 575893504, + "step": 8790, + "train_runtime": 1717.2793, + "train_tokens_per_second": 335352.256 + }, + { + "epoch": 1.9039376893119861, + "grad_norm": 0.341796875, + "learning_rate": 7.585263157894735e-05, + "loss": 0.0195, + "num_input_tokens_seen": 576548864, + "step": 8800, + "train_runtime": 1721.0838, + "train_tokens_per_second": 334991.742 + }, + { + "epoch": 1.9039376893119861, + "eval_loss": 0.01311762910336256, + "eval_runtime": 1.7389, + "eval_samples_per_second": 18.402, + "eval_steps_per_second": 0.575, + "num_input_tokens_seen": 576548864, + "step": 8800 + }, + { + "epoch": 1.9039376893119861, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.87702445919373, + "eval_sacrebleu": 97.30034967398244, + "eval_word_accuracy": 0.9917582417582418, + "num_input_tokens_seen": 576548864, + "perplexity": 1.0132040426334898, + "step": 8800 + }, + { + "epoch": 1.9061012548680225, + "grad_norm": 0.310546875, + "learning_rate": 7.522105263157894e-05, + "loss": 0.0172, + "num_input_tokens_seen": 577204224, + "step": 8810, + "train_runtime": 1726.6618, + "train_tokens_per_second": 334289.099 + }, + { + "epoch": 1.9082648204240589, + "grad_norm": 0.5859375, + "learning_rate": 7.458947368421052e-05, + "loss": 0.0173, + "num_input_tokens_seen": 577859584, + "step": 8820, + "train_runtime": 1730.4841, + "train_tokens_per_second": 333929.433 + }, + { + "epoch": 1.9104283859800952, + "grad_norm": 0.314453125, + "learning_rate": 7.39578947368421e-05, + "loss": 0.0176, + "num_input_tokens_seen": 578514944, + "step": 8830, + "train_runtime": 1734.3033, + "train_tokens_per_second": 333571.965 + }, + { + "epoch": 1.9125919515361316, + "grad_norm": 0.4296875, + "learning_rate": 7.332631578947368e-05, + "loss": 0.0178, + "num_input_tokens_seen": 579170304, + "step": 8840, + "train_runtime": 1738.1235, + "train_tokens_per_second": 333215.847 + }, + { + "epoch": 1.9147555170921677, + "grad_norm": 0.34375, + "learning_rate": 7.269473684210525e-05, + "loss": 0.0177, + "num_input_tokens_seen": 579821568, + "step": 8850, + "train_runtime": 1741.9307, + "train_tokens_per_second": 332861.452 + }, + { + "epoch": 1.9169190826482043, + "grad_norm": 0.263671875, + "learning_rate": 7.206315789473684e-05, + "loss": 0.0173, + "num_input_tokens_seen": 580476928, + "step": 8860, + "train_runtime": 1745.7567, + "train_tokens_per_second": 332507.342 + }, + { + "epoch": 1.9190826482042405, + "grad_norm": 0.283203125, + "learning_rate": 7.143157894736841e-05, + "loss": 0.0396, + "num_input_tokens_seen": 581132288, + "step": 8870, + "train_runtime": 1749.5799, + "train_tokens_per_second": 332155.333 + }, + { + "epoch": 1.921246213760277, + "grad_norm": 0.357421875, + "learning_rate": 7.079999999999999e-05, + "loss": 0.0205, + "num_input_tokens_seen": 581787648, + "step": 8880, + "train_runtime": 1753.4027, + "train_tokens_per_second": 331804.921 + }, + { + "epoch": 1.9234097793163132, + "grad_norm": 0.38671875, + "learning_rate": 7.016842105263157e-05, + "loss": 0.0184, + "num_input_tokens_seen": 582443008, + "step": 8890, + "train_runtime": 1757.2196, + "train_tokens_per_second": 331457.158 + }, + { + "epoch": 1.9255733448723498, + "grad_norm": 0.416015625, + "learning_rate": 6.953684210526315e-05, + "loss": 0.0197, + "num_input_tokens_seen": 583098368, + "step": 8900, + "train_runtime": 1761.037, + "train_tokens_per_second": 331110.803 + }, + { + "epoch": 1.9255733448723498, + "eval_loss": 0.013076567091047764, + "eval_runtime": 1.7602, + "eval_samples_per_second": 18.18, + "eval_steps_per_second": 0.568, + "num_input_tokens_seen": 583098368, + "step": 8900 + }, + { + "epoch": 1.9255733448723498, + "eval_byte_accuracy": 0.9970326409495549, + "eval_chrf": 98.33791810109832, + "eval_sacrebleu": 97.89877731533007, + "eval_word_accuracy": 0.9931318681318682, + "num_input_tokens_seen": 583098368, + "perplexity": 1.013162439290778, + "step": 8900 + }, + { + "epoch": 1.927736910428386, + "grad_norm": 0.419921875, + "learning_rate": 6.890526315789473e-05, + "loss": 0.0175, + "num_input_tokens_seen": 583753728, + "step": 8910, + "train_runtime": 1766.6312, + "train_tokens_per_second": 330433.265 + }, + { + "epoch": 1.9299004759844225, + "grad_norm": 0.2734375, + "learning_rate": 6.82736842105263e-05, + "loss": 0.0169, + "num_input_tokens_seen": 584409088, + "step": 8920, + "train_runtime": 1770.4611, + "train_tokens_per_second": 330088.638 + }, + { + "epoch": 1.9320640415404586, + "grad_norm": 0.384765625, + "learning_rate": 6.764210526315789e-05, + "loss": 0.0171, + "num_input_tokens_seen": 585064448, + "step": 8930, + "train_runtime": 1774.2831, + "train_tokens_per_second": 329746.95 + }, + { + "epoch": 1.934227607096495, + "grad_norm": 0.48046875, + "learning_rate": 6.701052631578946e-05, + "loss": 0.0171, + "num_input_tokens_seen": 585719808, + "step": 8940, + "train_runtime": 1778.1038, + "train_tokens_per_second": 329406.981 + }, + { + "epoch": 1.9363911726525314, + "grad_norm": 0.328125, + "learning_rate": 6.637894736842104e-05, + "loss": 0.0159, + "num_input_tokens_seen": 586375168, + "step": 8950, + "train_runtime": 1781.9085, + "train_tokens_per_second": 329071.433 + }, + { + "epoch": 1.9385547382085677, + "grad_norm": 0.333984375, + "learning_rate": 6.574736842105262e-05, + "loss": 0.019, + "num_input_tokens_seen": 587030528, + "step": 8960, + "train_runtime": 1785.7304, + "train_tokens_per_second": 328734.137 + }, + { + "epoch": 1.940718303764604, + "grad_norm": 0.337890625, + "learning_rate": 6.51157894736842e-05, + "loss": 0.0182, + "num_input_tokens_seen": 587685888, + "step": 8970, + "train_runtime": 1789.5563, + "train_tokens_per_second": 328397.543 + }, + { + "epoch": 1.9428818693206404, + "grad_norm": 0.55859375, + "learning_rate": 6.448421052631578e-05, + "loss": 0.0203, + "num_input_tokens_seen": 588341248, + "step": 8980, + "train_runtime": 1793.385, + "train_tokens_per_second": 328061.874 + }, + { + "epoch": 1.9450454348766768, + "grad_norm": 0.4609375, + "learning_rate": 6.385263157894736e-05, + "loss": 0.0177, + "num_input_tokens_seen": 588996608, + "step": 8990, + "train_runtime": 1797.2072, + "train_tokens_per_second": 327728.829 + }, + { + "epoch": 1.9472090004327132, + "grad_norm": 0.36328125, + "learning_rate": 6.322105263157894e-05, + "loss": 0.0186, + "num_input_tokens_seen": 589651968, + "step": 9000, + "train_runtime": 1801.0244, + "train_tokens_per_second": 327398.099 + }, + { + "epoch": 1.9472090004327132, + "eval_loss": 0.013011611066758633, + "eval_runtime": 1.7305, + "eval_samples_per_second": 18.492, + "eval_steps_per_second": 0.578, + "num_input_tokens_seen": 589651968, + "step": 9000 + }, + { + "epoch": 1.9472090004327132, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.8443713803999, + "eval_sacrebleu": 97.30034967398244, + "eval_word_accuracy": 0.9917582417582418, + "num_input_tokens_seen": 589651968, + "perplexity": 1.013096630424127, + "step": 9000 + }, + { + "epoch": 1.9493725659887495, + "grad_norm": 0.41796875, + "learning_rate": 6.258947368421051e-05, + "loss": 0.0176, + "num_input_tokens_seen": 590307328, + "step": 9010, + "train_runtime": 1806.6431, + "train_tokens_per_second": 326742.641 + }, + { + "epoch": 1.9515361315447857, + "grad_norm": 0.361328125, + "learning_rate": 6.19578947368421e-05, + "loss": 0.0168, + "num_input_tokens_seen": 590962688, + "step": 9020, + "train_runtime": 1810.4672, + "train_tokens_per_second": 326414.463 + }, + { + "epoch": 1.9536996971008223, + "grad_norm": 0.35546875, + "learning_rate": 6.132631578947367e-05, + "loss": 0.0178, + "num_input_tokens_seen": 591618048, + "step": 9030, + "train_runtime": 1814.284, + "train_tokens_per_second": 326088.998 + }, + { + "epoch": 1.9558632626568584, + "grad_norm": 0.375, + "learning_rate": 6.0694736842105254e-05, + "loss": 0.0142, + "num_input_tokens_seen": 592273408, + "step": 9040, + "train_runtime": 1818.0992, + "train_tokens_per_second": 325765.181 + }, + { + "epoch": 1.958026828212895, + "grad_norm": 0.3671875, + "learning_rate": 6.006315789473684e-05, + "loss": 0.0167, + "num_input_tokens_seen": 592928768, + "step": 9050, + "train_runtime": 1821.9234, + "train_tokens_per_second": 325441.099 + }, + { + "epoch": 1.9601903937689311, + "grad_norm": 0.328125, + "learning_rate": 5.943157894736841e-05, + "loss": 0.0179, + "num_input_tokens_seen": 593580032, + "step": 9060, + "train_runtime": 1825.7215, + "train_tokens_per_second": 325120.802 + }, + { + "epoch": 1.9623539593249677, + "grad_norm": 0.51171875, + "learning_rate": 5.88e-05, + "loss": 0.0164, + "num_input_tokens_seen": 594235392, + "step": 9070, + "train_runtime": 1829.5267, + "train_tokens_per_second": 324802.796 + }, + { + "epoch": 1.9645175248810038, + "grad_norm": 0.294921875, + "learning_rate": 5.816842105263157e-05, + "loss": 0.0171, + "num_input_tokens_seen": 594890752, + "step": 9080, + "train_runtime": 1833.3509, + "train_tokens_per_second": 324482.76 + }, + { + "epoch": 1.9666810904370402, + "grad_norm": 0.3203125, + "learning_rate": 5.753684210526316e-05, + "loss": 0.0158, + "num_input_tokens_seen": 595546112, + "step": 9090, + "train_runtime": 1837.1771, + "train_tokens_per_second": 324163.685 + }, + { + "epoch": 1.9688446559930766, + "grad_norm": 0.384765625, + "learning_rate": 5.690526315789473e-05, + "loss": 0.0175, + "num_input_tokens_seen": 596201472, + "step": 9100, + "train_runtime": 1841.0114, + "train_tokens_per_second": 323844.539 + }, + { + "epoch": 1.9688446559930766, + "eval_loss": 0.012902812100946903, + "eval_runtime": 1.721, + "eval_samples_per_second": 18.594, + "eval_steps_per_second": 0.581, + "num_input_tokens_seen": 596201472, + "step": 9100 + }, + { + "epoch": 1.9688446559930766, + "eval_byte_accuracy": 0.9970326409495549, + "eval_chrf": 98.31023260829656, + "eval_sacrebleu": 97.89877731533007, + "eval_word_accuracy": 0.9931318681318682, + "num_input_tokens_seen": 596201472, + "perplexity": 1.0129864125543733, + "step": 9100 + }, + { + "epoch": 1.971008221549113, + "grad_norm": 0.375, + "learning_rate": 5.6273684210526304e-05, + "loss": 0.021, + "num_input_tokens_seen": 596856832, + "step": 9110, + "train_runtime": 1846.5729, + "train_tokens_per_second": 323224.081 + }, + { + "epoch": 1.9731717871051493, + "grad_norm": 0.408203125, + "learning_rate": 5.564210526315789e-05, + "loss": 0.0165, + "num_input_tokens_seen": 597512192, + "step": 9120, + "train_runtime": 1850.3877, + "train_tokens_per_second": 322911.89 + }, + { + "epoch": 1.9753353526611857, + "grad_norm": 0.275390625, + "learning_rate": 5.5010526315789464e-05, + "loss": 0.0182, + "num_input_tokens_seen": 598167552, + "step": 9130, + "train_runtime": 1854.2112, + "train_tokens_per_second": 322599.478 + }, + { + "epoch": 1.977498918217222, + "grad_norm": 0.2431640625, + "learning_rate": 5.437894736842105e-05, + "loss": 0.0161, + "num_input_tokens_seen": 598822912, + "step": 9140, + "train_runtime": 1858.0463, + "train_tokens_per_second": 322286.325 + }, + { + "epoch": 1.9796624837732584, + "grad_norm": 0.404296875, + "learning_rate": 5.374736842105262e-05, + "loss": 0.0169, + "num_input_tokens_seen": 599478272, + "step": 9150, + "train_runtime": 1861.8665, + "train_tokens_per_second": 321977.04 + }, + { + "epoch": 1.9818260493292947, + "grad_norm": 0.462890625, + "learning_rate": 5.311578947368421e-05, + "loss": 0.017, + "num_input_tokens_seen": 600133632, + "step": 9160, + "train_runtime": 1865.6883, + "train_tokens_per_second": 321668.749 + }, + { + "epoch": 1.9839896148853309, + "grad_norm": 0.416015625, + "learning_rate": 5.248421052631578e-05, + "loss": 0.0189, + "num_input_tokens_seen": 600788992, + "step": 9170, + "train_runtime": 1869.5058, + "train_tokens_per_second": 321362.465 + }, + { + "epoch": 1.9861531804413675, + "grad_norm": 0.384765625, + "learning_rate": 5.185263157894737e-05, + "loss": 0.0188, + "num_input_tokens_seen": 601444352, + "step": 9180, + "train_runtime": 1873.3305, + "train_tokens_per_second": 321056.196 + }, + { + "epoch": 1.9883167459974036, + "grad_norm": 0.384765625, + "learning_rate": 5.122105263157894e-05, + "loss": 0.0179, + "num_input_tokens_seen": 602099712, + "step": 9190, + "train_runtime": 1877.1578, + "train_tokens_per_second": 320750.71 + }, + { + "epoch": 1.9904803115534402, + "grad_norm": 0.345703125, + "learning_rate": 5.058947368421052e-05, + "loss": 0.0154, + "num_input_tokens_seen": 602755072, + "step": 9200, + "train_runtime": 1880.981, + "train_tokens_per_second": 320447.182 + }, + { + "epoch": 1.9904803115534402, + "eval_loss": 0.012645396403968334, + "eval_runtime": 1.7339, + "eval_samples_per_second": 18.456, + "eval_steps_per_second": 0.577, + "num_input_tokens_seen": 602755072, + "step": 9200 + }, + { + "epoch": 1.9904803115534402, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 98.31023260829656, + "eval_sacrebleu": 97.89877731533007, + "eval_word_accuracy": 0.9917582417582418, + "num_input_tokens_seen": 602755072, + "perplexity": 1.012725687509755, + "step": 9200 + }, + { + "epoch": 1.9926438771094763, + "grad_norm": 0.361328125, + "learning_rate": 4.99578947368421e-05, + "loss": 0.0175, + "num_input_tokens_seen": 603410432, + "step": 9210, + "train_runtime": 1886.5591, + "train_tokens_per_second": 319847.091 + }, + { + "epoch": 1.994807442665513, + "grad_norm": 0.4765625, + "learning_rate": 4.932631578947368e-05, + "loss": 0.0177, + "num_input_tokens_seen": 604065792, + "step": 9220, + "train_runtime": 1890.3607, + "train_tokens_per_second": 319550.55 + }, + { + "epoch": 1.996971008221549, + "grad_norm": 0.4296875, + "learning_rate": 4.869473684210526e-05, + "loss": 0.0184, + "num_input_tokens_seen": 604721152, + "step": 9230, + "train_runtime": 1894.1637, + "train_tokens_per_second": 319254.96 + }, + { + "epoch": 1.9991345737775854, + "grad_norm": 0.25, + "learning_rate": 4.806315789473684e-05, + "loss": 0.0185, + "num_input_tokens_seen": 605372416, + "step": 9240, + "train_runtime": 1897.9386, + "train_tokens_per_second": 318963.116 + }, + { + "epoch": 2.0012981393336218, + "grad_norm": 0.421875, + "learning_rate": 4.743157894736842e-05, + "loss": 0.0183, + "num_input_tokens_seen": 606015488, + "step": 9250, + "train_runtime": 1901.7544, + "train_tokens_per_second": 318661.277 + }, + { + "epoch": 2.0034617048896584, + "grad_norm": 0.322265625, + "learning_rate": 4.68e-05, + "loss": 0.0151, + "num_input_tokens_seen": 606670848, + "step": 9260, + "train_runtime": 1905.5735, + "train_tokens_per_second": 318366.535 + }, + { + "epoch": 2.0056252704456945, + "grad_norm": 0.365234375, + "learning_rate": 4.616842105263157e-05, + "loss": 0.019, + "num_input_tokens_seen": 607326208, + "step": 9270, + "train_runtime": 1909.3949, + "train_tokens_per_second": 318072.6 + }, + { + "epoch": 2.0077888360017306, + "grad_norm": 0.3203125, + "learning_rate": 4.553684210526315e-05, + "loss": 0.0176, + "num_input_tokens_seen": 607981568, + "step": 9280, + "train_runtime": 1913.2182, + "train_tokens_per_second": 317779.529 + }, + { + "epoch": 2.009952401557767, + "grad_norm": 0.322265625, + "learning_rate": 4.490526315789473e-05, + "loss": 0.0163, + "num_input_tokens_seen": 608636928, + "step": 9290, + "train_runtime": 1917.0468, + "train_tokens_per_second": 317486.738 + }, + { + "epoch": 2.0121159671138034, + "grad_norm": 0.27734375, + "learning_rate": 4.427368421052631e-05, + "loss": 0.0164, + "num_input_tokens_seen": 609292288, + "step": 9300, + "train_runtime": 1920.8707, + "train_tokens_per_second": 317195.894 + }, + { + "epoch": 2.0121159671138034, + "eval_loss": 0.012494292110204697, + "eval_runtime": 1.7403, + "eval_samples_per_second": 18.387, + "eval_steps_per_second": 0.575, + "num_input_tokens_seen": 609292288, + "step": 9300 + }, + { + "epoch": 2.0121159671138034, + "eval_byte_accuracy": 0.9974035608308606, + "eval_chrf": 98.31023260829656, + "eval_sacrebleu": 97.89877731533007, + "eval_word_accuracy": 0.9931318681318682, + "num_input_tokens_seen": 609292288, + "perplexity": 1.0125726718709187, + "step": 9300 + }, + { + "epoch": 2.01427953266984, + "grad_norm": 0.357421875, + "learning_rate": 4.364210526315789e-05, + "loss": 0.0166, + "num_input_tokens_seen": 609947648, + "step": 9310, + "train_runtime": 1926.4536, + "train_tokens_per_second": 316616.841 + }, + { + "epoch": 2.016443098225876, + "grad_norm": 0.353515625, + "learning_rate": 4.301052631578947e-05, + "loss": 0.0153, + "num_input_tokens_seen": 610603008, + "step": 9320, + "train_runtime": 1930.2731, + "train_tokens_per_second": 316329.85 + }, + { + "epoch": 2.0186066637819127, + "grad_norm": 0.271484375, + "learning_rate": 4.237894736842105e-05, + "loss": 0.0165, + "num_input_tokens_seen": 611258368, + "step": 9330, + "train_runtime": 1934.0809, + "train_tokens_per_second": 316045.916 + }, + { + "epoch": 2.020770229337949, + "grad_norm": 0.36328125, + "learning_rate": 4.174736842105262e-05, + "loss": 0.0158, + "num_input_tokens_seen": 611913728, + "step": 9340, + "train_runtime": 1937.9052, + "train_tokens_per_second": 315760.403 + }, + { + "epoch": 2.0229337948939854, + "grad_norm": 0.328125, + "learning_rate": 4.111578947368421e-05, + "loss": 0.0171, + "num_input_tokens_seen": 612569088, + "step": 9350, + "train_runtime": 1941.7238, + "train_tokens_per_second": 315476.947 + }, + { + "epoch": 2.0250973604500215, + "grad_norm": 0.345703125, + "learning_rate": 4.048421052631578e-05, + "loss": 0.0167, + "num_input_tokens_seen": 613224448, + "step": 9360, + "train_runtime": 1945.5442, + "train_tokens_per_second": 315194.305 + }, + { + "epoch": 2.027260926006058, + "grad_norm": 0.32421875, + "learning_rate": 3.985263157894737e-05, + "loss": 0.0162, + "num_input_tokens_seen": 613879808, + "step": 9370, + "train_runtime": 1949.3723, + "train_tokens_per_second": 314911.526 + }, + { + "epoch": 2.0294244915620943, + "grad_norm": 0.306640625, + "learning_rate": 3.922105263157894e-05, + "loss": 0.0176, + "num_input_tokens_seen": 614535168, + "step": 9380, + "train_runtime": 1953.1986, + "train_tokens_per_second": 314630.146 + }, + { + "epoch": 2.031588057118131, + "grad_norm": 0.3203125, + "learning_rate": 3.858947368421053e-05, + "loss": 0.0199, + "num_input_tokens_seen": 615190528, + "step": 9390, + "train_runtime": 1957.0185, + "train_tokens_per_second": 314350.899 + }, + { + "epoch": 2.033751622674167, + "grad_norm": 0.333984375, + "learning_rate": 3.79578947368421e-05, + "loss": 0.0182, + "num_input_tokens_seen": 615845888, + "step": 9400, + "train_runtime": 1960.8436, + "train_tokens_per_second": 314071.907 + }, + { + "epoch": 2.033751622674167, + "eval_loss": 0.01256804633885622, + "eval_runtime": 1.7572, + "eval_samples_per_second": 18.211, + "eval_steps_per_second": 0.569, + "num_input_tokens_seen": 615845888, + "step": 9400 + }, + { + "epoch": 2.033751622674167, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.81668588759814, + "eval_sacrebleu": 97.30034967398244, + "eval_word_accuracy": 0.9917582417582418, + "num_input_tokens_seen": 615845888, + "perplexity": 1.0126473561413927, + "step": 9400 + }, + { + "epoch": 2.0359151882302036, + "grad_norm": 0.443359375, + "learning_rate": 3.732631578947368e-05, + "loss": 0.0167, + "num_input_tokens_seen": 616501248, + "step": 9410, + "train_runtime": 1966.4282, + "train_tokens_per_second": 313513.226 + }, + { + "epoch": 2.0380787537862397, + "grad_norm": 0.384765625, + "learning_rate": 3.669473684210526e-05, + "loss": 0.0167, + "num_input_tokens_seen": 617156608, + "step": 9420, + "train_runtime": 1970.2537, + "train_tokens_per_second": 313237.124 + }, + { + "epoch": 2.0402423193422763, + "grad_norm": 0.32421875, + "learning_rate": 3.606315789473684e-05, + "loss": 0.0187, + "num_input_tokens_seen": 617811968, + "step": 9430, + "train_runtime": 1974.0803, + "train_tokens_per_second": 312961.927 + }, + { + "epoch": 2.0424058848983124, + "grad_norm": 0.34765625, + "learning_rate": 3.543157894736842e-05, + "loss": 0.0164, + "num_input_tokens_seen": 618467328, + "step": 9440, + "train_runtime": 1977.9041, + "train_tokens_per_second": 312688.224 + }, + { + "epoch": 2.0445694504543486, + "grad_norm": 0.33984375, + "learning_rate": 3.48e-05, + "loss": 0.0156, + "num_input_tokens_seen": 619122688, + "step": 9450, + "train_runtime": 1981.7289, + "train_tokens_per_second": 312415.428 + }, + { + "epoch": 2.046733016010385, + "grad_norm": 0.3046875, + "learning_rate": 3.416842105263157e-05, + "loss": 0.0161, + "num_input_tokens_seen": 619778048, + "step": 9460, + "train_runtime": 1985.5506, + "train_tokens_per_second": 312144.165 + }, + { + "epoch": 2.0488965815664213, + "grad_norm": 0.30078125, + "learning_rate": 3.353684210526315e-05, + "loss": 0.0154, + "num_input_tokens_seen": 620433408, + "step": 9470, + "train_runtime": 1989.3722, + "train_tokens_per_second": 311873.977 + }, + { + "epoch": 2.051060147122458, + "grad_norm": 0.2890625, + "learning_rate": 3.290526315789473e-05, + "loss": 0.0151, + "num_input_tokens_seen": 621088768, + "step": 9480, + "train_runtime": 1993.1753, + "train_tokens_per_second": 311607.691 + }, + { + "epoch": 2.053223712678494, + "grad_norm": 0.375, + "learning_rate": 3.227368421052631e-05, + "loss": 0.015, + "num_input_tokens_seen": 621744128, + "step": 9490, + "train_runtime": 1996.9919, + "train_tokens_per_second": 311340.336 + }, + { + "epoch": 2.0553872782345306, + "grad_norm": 0.345703125, + "learning_rate": 3.164210526315789e-05, + "loss": 0.0138, + "num_input_tokens_seen": 622399488, + "step": 9500, + "train_runtime": 2000.8102, + "train_tokens_per_second": 311073.728 + }, + { + "epoch": 2.0553872782345306, + "eval_loss": 0.01297488622367382, + "eval_runtime": 1.7539, + "eval_samples_per_second": 18.245, + "eval_steps_per_second": 0.57, + "num_input_tokens_seen": 622399488, + "step": 9500 + }, + { + "epoch": 2.0553872782345306, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.81668588759814, + "eval_sacrebleu": 97.30034967398244, + "eval_word_accuracy": 0.9917582417582418, + "num_input_tokens_seen": 622399488, + "perplexity": 1.0130594252925254, + "step": 9500 + }, + { + "epoch": 2.0575508437905667, + "grad_norm": 0.3515625, + "learning_rate": 3.101052631578947e-05, + "loss": 0.0176, + "num_input_tokens_seen": 623054848, + "step": 9510, + "train_runtime": 2006.4558, + "train_tokens_per_second": 310525.073 + }, + { + "epoch": 2.0597144093466033, + "grad_norm": 0.287109375, + "learning_rate": 3.0378947368421053e-05, + "loss": 0.0153, + "num_input_tokens_seen": 623710208, + "step": 9520, + "train_runtime": 2010.2876, + "train_tokens_per_second": 310259.192 + }, + { + "epoch": 2.0618779749026395, + "grad_norm": 0.30859375, + "learning_rate": 2.974736842105263e-05, + "loss": 0.0151, + "num_input_tokens_seen": 624365568, + "step": 9530, + "train_runtime": 2014.1246, + "train_tokens_per_second": 309993.522 + }, + { + "epoch": 2.064041540458676, + "grad_norm": 0.453125, + "learning_rate": 2.9115789473684205e-05, + "loss": 0.0161, + "num_input_tokens_seen": 625020928, + "step": 9540, + "train_runtime": 2017.9547, + "train_tokens_per_second": 309729.918 + }, + { + "epoch": 2.066205106014712, + "grad_norm": 0.369140625, + "learning_rate": 2.8484210526315785e-05, + "loss": 0.0155, + "num_input_tokens_seen": 625676288, + "step": 9550, + "train_runtime": 2021.7816, + "train_tokens_per_second": 309467.785 + }, + { + "epoch": 2.0683686715707488, + "grad_norm": 0.294921875, + "learning_rate": 2.7852631578947365e-05, + "loss": 0.0147, + "num_input_tokens_seen": 626331648, + "step": 9560, + "train_runtime": 2025.6074, + "train_tokens_per_second": 309206.828 + }, + { + "epoch": 2.070532237126785, + "grad_norm": 0.326171875, + "learning_rate": 2.7221052631578944e-05, + "loss": 0.0148, + "num_input_tokens_seen": 626987008, + "step": 9570, + "train_runtime": 2029.4297, + "train_tokens_per_second": 308947.395 + }, + { + "epoch": 2.0726958026828215, + "grad_norm": 0.37890625, + "learning_rate": 2.6589473684210524e-05, + "loss": 0.0173, + "num_input_tokens_seen": 627642368, + "step": 9580, + "train_runtime": 2033.2601, + "train_tokens_per_second": 308687.686 + }, + { + "epoch": 2.0748593682388576, + "grad_norm": 0.326171875, + "learning_rate": 2.5957894736842104e-05, + "loss": 0.0169, + "num_input_tokens_seen": 628297728, + "step": 9590, + "train_runtime": 2037.0979, + "train_tokens_per_second": 308427.848 + }, + { + "epoch": 2.0770229337948938, + "grad_norm": 0.3828125, + "learning_rate": 2.5326315789473683e-05, + "loss": 0.017, + "num_input_tokens_seen": 628953088, + "step": 9600, + "train_runtime": 2040.9189, + "train_tokens_per_second": 308171.527 + }, + { + "epoch": 2.0770229337948938, + "eval_loss": 0.01264923345297575, + "eval_runtime": 1.7454, + "eval_samples_per_second": 18.334, + "eval_steps_per_second": 0.573, + "num_input_tokens_seen": 628953088, + "step": 9600 + }, + { + "epoch": 2.0770229337948938, + "eval_byte_accuracy": 0.9970326409495549, + "eval_chrf": 98.31023260829656, + "eval_sacrebleu": 97.89877731533007, + "eval_word_accuracy": 0.9931318681318682, + "num_input_tokens_seen": 628953088, + "perplexity": 1.0127295733953043, + "step": 9600 + }, + { + "epoch": 2.0791864993509304, + "grad_norm": 0.310546875, + "learning_rate": 2.4694736842105263e-05, + "loss": 0.015, + "num_input_tokens_seen": 629608448, + "step": 9610, + "train_runtime": 2046.5064, + "train_tokens_per_second": 307650.374 + }, + { + "epoch": 2.0813500649069665, + "grad_norm": 0.259765625, + "learning_rate": 2.406315789473684e-05, + "loss": 0.0162, + "num_input_tokens_seen": 630263808, + "step": 9620, + "train_runtime": 2050.3255, + "train_tokens_per_second": 307396.946 + }, + { + "epoch": 2.083513630463003, + "grad_norm": 0.298828125, + "learning_rate": 2.343157894736842e-05, + "loss": 0.0151, + "num_input_tokens_seen": 630919168, + "step": 9630, + "train_runtime": 2054.1477, + "train_tokens_per_second": 307144.006 + }, + { + "epoch": 2.0856771960190392, + "grad_norm": 0.3125, + "learning_rate": 2.28e-05, + "loss": 0.0173, + "num_input_tokens_seen": 631574528, + "step": 9640, + "train_runtime": 2057.9652, + "train_tokens_per_second": 306892.717 + }, + { + "epoch": 2.087840761575076, + "grad_norm": 0.3515625, + "learning_rate": 2.2168421052631578e-05, + "loss": 0.0173, + "num_input_tokens_seen": 632229888, + "step": 9650, + "train_runtime": 2061.7866, + "train_tokens_per_second": 306641.771 + }, + { + "epoch": 2.090004327131112, + "grad_norm": 0.33203125, + "learning_rate": 2.1536842105263158e-05, + "loss": 0.0155, + "num_input_tokens_seen": 632885248, + "step": 9660, + "train_runtime": 2065.6095, + "train_tokens_per_second": 306391.525 + }, + { + "epoch": 2.0921678926871485, + "grad_norm": 0.390625, + "learning_rate": 2.0905263157894737e-05, + "loss": 0.0155, + "num_input_tokens_seen": 633540608, + "step": 9670, + "train_runtime": 2069.4392, + "train_tokens_per_second": 306141.211 + }, + { + "epoch": 2.0943314582431847, + "grad_norm": 0.330078125, + "learning_rate": 2.0273684210526317e-05, + "loss": 0.0186, + "num_input_tokens_seen": 634195968, + "step": 9680, + "train_runtime": 2073.2575, + "train_tokens_per_second": 305893.487 + }, + { + "epoch": 2.0964950237992213, + "grad_norm": 0.3828125, + "learning_rate": 1.964210526315789e-05, + "loss": 0.017, + "num_input_tokens_seen": 634851328, + "step": 9690, + "train_runtime": 2077.0824, + "train_tokens_per_second": 305645.707 + }, + { + "epoch": 2.0986585893552574, + "grad_norm": 0.388671875, + "learning_rate": 1.901052631578947e-05, + "loss": 0.0176, + "num_input_tokens_seen": 635506688, + "step": 9700, + "train_runtime": 2080.9071, + "train_tokens_per_second": 305398.871 + }, + { + "epoch": 2.0986585893552574, + "eval_loss": 0.01279271300882101, + "eval_runtime": 1.7482, + "eval_samples_per_second": 18.305, + "eval_steps_per_second": 0.572, + "num_input_tokens_seen": 635506688, + "step": 9700 + }, + { + "epoch": 2.0986585893552574, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.81668588759814, + "eval_sacrebleu": 97.30034967398244, + "eval_word_accuracy": 0.9917582417582418, + "num_input_tokens_seen": 635506688, + "perplexity": 1.0128748898094044, + "step": 9700 + }, + { + "epoch": 2.100822154911294, + "grad_norm": 0.318359375, + "learning_rate": 1.837894736842105e-05, + "loss": 0.0184, + "num_input_tokens_seen": 636162048, + "step": 9710, + "train_runtime": 2086.5013, + "train_tokens_per_second": 304894.156 + }, + { + "epoch": 2.10298572046733, + "grad_norm": 0.33984375, + "learning_rate": 1.774736842105263e-05, + "loss": 0.0165, + "num_input_tokens_seen": 636817408, + "step": 9720, + "train_runtime": 2090.2846, + "train_tokens_per_second": 304655.832 + }, + { + "epoch": 2.1051492860233667, + "grad_norm": 0.3359375, + "learning_rate": 1.711578947368421e-05, + "loss": 0.017, + "num_input_tokens_seen": 637472768, + "step": 9730, + "train_runtime": 2094.1096, + "train_tokens_per_second": 304412.325 + }, + { + "epoch": 2.107312851579403, + "grad_norm": 0.375, + "learning_rate": 1.6484210526315788e-05, + "loss": 0.0166, + "num_input_tokens_seen": 638128128, + "step": 9740, + "train_runtime": 2097.9304, + "train_tokens_per_second": 304170.301 + }, + { + "epoch": 2.109476417135439, + "grad_norm": 0.287109375, + "learning_rate": 1.5852631578947364e-05, + "loss": 0.0147, + "num_input_tokens_seen": 638783488, + "step": 9750, + "train_runtime": 2101.7387, + "train_tokens_per_second": 303930.978 + }, + { + "epoch": 2.1116399826914756, + "grad_norm": 0.31640625, + "learning_rate": 1.5221052631578946e-05, + "loss": 0.0157, + "num_input_tokens_seen": 639438848, + "step": 9760, + "train_runtime": 2105.5587, + "train_tokens_per_second": 303690.82 + }, + { + "epoch": 2.1138035482475117, + "grad_norm": 0.34765625, + "learning_rate": 1.4589473684210525e-05, + "loss": 0.0156, + "num_input_tokens_seen": 640094208, + "step": 9770, + "train_runtime": 2109.3849, + "train_tokens_per_second": 303450.638 + }, + { + "epoch": 2.1159671138035483, + "grad_norm": 0.341796875, + "learning_rate": 1.3957894736842105e-05, + "loss": 0.0164, + "num_input_tokens_seen": 640749568, + "step": 9780, + "train_runtime": 2113.2091, + "train_tokens_per_second": 303211.626 + }, + { + "epoch": 2.1181306793595844, + "grad_norm": 0.298828125, + "learning_rate": 1.3326315789473681e-05, + "loss": 0.016, + "num_input_tokens_seen": 641404928, + "step": 9790, + "train_runtime": 2117.0346, + "train_tokens_per_second": 302973.28 + }, + { + "epoch": 2.120294244915621, + "grad_norm": 0.283203125, + "learning_rate": 1.2694736842105261e-05, + "loss": 0.0178, + "num_input_tokens_seen": 642060288, + "step": 9800, + "train_runtime": 2120.8617, + "train_tokens_per_second": 302735.575 + }, + { + "epoch": 2.120294244915621, + "eval_loss": 0.012742544524371624, + "eval_runtime": 1.7509, + "eval_samples_per_second": 18.276, + "eval_steps_per_second": 0.571, + "num_input_tokens_seen": 642060288, + "step": 9800 + }, + { + "epoch": 2.120294244915621, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.81668588759814, + "eval_sacrebleu": 97.30034967398244, + "eval_word_accuracy": 0.9917582417582418, + "num_input_tokens_seen": 642060288, + "perplexity": 1.0128240766858652, + "step": 9800 + }, + { + "epoch": 2.122457810471657, + "grad_norm": 0.361328125, + "learning_rate": 1.206315789473684e-05, + "loss": 0.0161, + "num_input_tokens_seen": 642715648, + "step": 9810, + "train_runtime": 2126.4483, + "train_tokens_per_second": 302248.418 + }, + { + "epoch": 2.1246213760276937, + "grad_norm": 0.34765625, + "learning_rate": 1.143157894736842e-05, + "loss": 0.0138, + "num_input_tokens_seen": 643371008, + "step": 9820, + "train_runtime": 2130.2665, + "train_tokens_per_second": 302014.328 + }, + { + "epoch": 2.12678494158373, + "grad_norm": 0.35546875, + "learning_rate": 1.0799999999999998e-05, + "loss": 0.0149, + "num_input_tokens_seen": 644026368, + "step": 9830, + "train_runtime": 2134.0896, + "train_tokens_per_second": 301780.376 + }, + { + "epoch": 2.1289485071397665, + "grad_norm": 0.361328125, + "learning_rate": 1.0168421052631578e-05, + "loss": 0.0162, + "num_input_tokens_seen": 644681728, + "step": 9840, + "train_runtime": 2137.916, + "train_tokens_per_second": 301546.8 + }, + { + "epoch": 2.1311120726958026, + "grad_norm": 0.37890625, + "learning_rate": 9.536842105263158e-06, + "loss": 0.0171, + "num_input_tokens_seen": 645337088, + "step": 9850, + "train_runtime": 2141.7343, + "train_tokens_per_second": 301315.201 + }, + { + "epoch": 2.133275638251839, + "grad_norm": 0.29296875, + "learning_rate": 8.905263157894735e-06, + "loss": 0.0177, + "num_input_tokens_seen": 645992448, + "step": 9860, + "train_runtime": 2145.5672, + "train_tokens_per_second": 301082.361 + }, + { + "epoch": 2.1354392038078753, + "grad_norm": 0.3828125, + "learning_rate": 8.273684210526315e-06, + "loss": 0.0144, + "num_input_tokens_seen": 646643712, + "step": 9870, + "train_runtime": 2149.3524, + "train_tokens_per_second": 300855.133 + }, + { + "epoch": 2.137602769363912, + "grad_norm": 0.25390625, + "learning_rate": 7.642105263157893e-06, + "loss": 0.0152, + "num_input_tokens_seen": 647299072, + "step": 9880, + "train_runtime": 2153.1607, + "train_tokens_per_second": 300627.389 + }, + { + "epoch": 2.139766334919948, + "grad_norm": 0.369140625, + "learning_rate": 7.0105263157894736e-06, + "loss": 0.0176, + "num_input_tokens_seen": 647954432, + "step": 9890, + "train_runtime": 2156.9971, + "train_tokens_per_second": 300396.521 + }, + { + "epoch": 2.1419299004759846, + "grad_norm": 0.263671875, + "learning_rate": 6.3789473684210515e-06, + "loss": 0.0167, + "num_input_tokens_seen": 648605696, + "step": 9900, + "train_runtime": 2160.8044, + "train_tokens_per_second": 300168.632 + }, + { + "epoch": 2.1419299004759846, + "eval_loss": 0.012824327684938908, + "eval_runtime": 1.755, + "eval_samples_per_second": 18.233, + "eval_steps_per_second": 0.57, + "num_input_tokens_seen": 648605696, + "step": 9900 + }, + { + "epoch": 2.1419299004759846, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 98.31023260829656, + "eval_sacrebleu": 97.89877731533007, + "eval_word_accuracy": 0.9917582417582418, + "num_input_tokens_seen": 648605696, + "perplexity": 1.012906912027177, + "step": 9900 + }, + { + "epoch": 2.144093466032021, + "grad_norm": 0.36328125, + "learning_rate": 5.747368421052631e-06, + "loss": 0.0179, + "num_input_tokens_seen": 649261056, + "step": 9910, + "train_runtime": 2166.3917, + "train_tokens_per_second": 299696.981 + }, + { + "epoch": 2.146257031588057, + "grad_norm": 0.369140625, + "learning_rate": 5.11578947368421e-06, + "loss": 0.0172, + "num_input_tokens_seen": 649916416, + "step": 9920, + "train_runtime": 2170.2133, + "train_tokens_per_second": 299471.215 + }, + { + "epoch": 2.1484205971440935, + "grad_norm": 0.36328125, + "learning_rate": 4.484210526315789e-06, + "loss": 0.0157, + "num_input_tokens_seen": 650571776, + "step": 9930, + "train_runtime": 2174.0376, + "train_tokens_per_second": 299245.87 + }, + { + "epoch": 2.1505841627001296, + "grad_norm": 0.30078125, + "learning_rate": 3.8526315789473676e-06, + "loss": 0.0155, + "num_input_tokens_seen": 651227136, + "step": 9940, + "train_runtime": 2177.8684, + "train_tokens_per_second": 299020.427 + }, + { + "epoch": 2.1527477282561662, + "grad_norm": 0.47265625, + "learning_rate": 3.2210526315789468e-06, + "loss": 0.0382, + "num_input_tokens_seen": 651882496, + "step": 9950, + "train_runtime": 2181.6807, + "train_tokens_per_second": 298798.299 + }, + { + "epoch": 2.1549112938122024, + "grad_norm": 0.341796875, + "learning_rate": 2.589473684210526e-06, + "loss": 0.0165, + "num_input_tokens_seen": 652537856, + "step": 9960, + "train_runtime": 2185.4931, + "train_tokens_per_second": 298576.95 + }, + { + "epoch": 2.157074859368239, + "grad_norm": 0.365234375, + "learning_rate": 1.957894736842105e-06, + "loss": 0.0161, + "num_input_tokens_seen": 653193216, + "step": 9970, + "train_runtime": 2189.3203, + "train_tokens_per_second": 298354.34 + }, + { + "epoch": 2.159238424924275, + "grad_norm": 0.32421875, + "learning_rate": 1.326315789473684e-06, + "loss": 0.0164, + "num_input_tokens_seen": 653848576, + "step": 9980, + "train_runtime": 2193.1448, + "train_tokens_per_second": 298132.884 + }, + { + "epoch": 2.1614019904803117, + "grad_norm": 0.3515625, + "learning_rate": 6.947368421052631e-07, + "loss": 0.0159, + "num_input_tokens_seen": 654503936, + "step": 9990, + "train_runtime": 2196.9649, + "train_tokens_per_second": 297912.793 + }, + { + "epoch": 2.163565556036348, + "grad_norm": 0.345703125, + "learning_rate": 6.31578947368421e-08, + "loss": 0.0166, + "num_input_tokens_seen": 655159296, + "step": 10000, + "train_runtime": 2200.7835, + "train_tokens_per_second": 297693.656 + }, + { + "epoch": 2.163565556036348, + "eval_loss": 0.012828610837459564, + "eval_runtime": 1.7704, + "eval_samples_per_second": 18.075, + "eval_steps_per_second": 0.565, + "num_input_tokens_seen": 655159296, + "step": 10000 + }, + { + "epoch": 2.163565556036348, + "eval_byte_accuracy": 0.9966617210682492, + "eval_chrf": 97.81668588759814, + "eval_sacrebleu": 97.30034967398244, + "eval_word_accuracy": 0.9917582417582418, + "num_input_tokens_seen": 655159296, + "perplexity": 1.0129112504712616, + "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, - "num_input_tokens_seen": 556892160, + "num_input_tokens_seen": 655159296, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { @@ -10175,12 +11979,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 2.237519634628608e+16, + "total_flos": 2.632344094428365e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null