{ "best_global_step": 5000, "best_metric": 98.56639547519084, "best_model_checkpoint": "./output/string-repetition-tiny/checkpoint-5000", "epoch": 2.163565556036348, "eval_steps": 100, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 4.9941487312316895, "eval_runtime": 2.0943, "eval_samples_per_second": 15.279, "eval_steps_per_second": 0.477, "num_input_tokens_seen": 0, "step": 0 }, { "epoch": 0, "eval_byte_accuracy": 0.29154302670623145, "eval_chrf": 1.9898598915486052, "eval_sacrebleu": 0.027097937214200497, "eval_word_accuracy": 0.0, "num_input_tokens_seen": 0, "perplexity": 147.5472895081926, "step": 0 }, { "epoch": 0.0021635655560363477, "grad_norm": 18.625, "learning_rate": 1.0799999999999998e-05, "loss": 4.9829, "num_input_tokens_seen": 655360, "step": 10, "train_runtime": 7.5789, "train_tokens_per_second": 86472.05 }, { "epoch": 0.004327131112072695, "grad_norm": 16.625, "learning_rate": 2.28e-05, "loss": 4.7329, "num_input_tokens_seen": 1310720, "step": 20, "train_runtime": 11.379, "train_tokens_per_second": 115187.192 }, { "epoch": 0.006490696668109044, "grad_norm": 16.25, "learning_rate": 3.48e-05, "loss": 4.0165, "num_input_tokens_seen": 1966080, "step": 30, "train_runtime": 15.1734, "train_tokens_per_second": 129573.818 }, { "epoch": 0.00865426222414539, "grad_norm": 20.375, "learning_rate": 4.68e-05, "loss": 2.5654, "num_input_tokens_seen": 2621440, "step": 40, "train_runtime": 18.9678, "train_tokens_per_second": 138204.687 }, { "epoch": 0.010817827780181739, "grad_norm": 3.15625, "learning_rate": 5.88e-05, "loss": 1.5888, "num_input_tokens_seen": 3276800, "step": 50, "train_runtime": 22.7693, "train_tokens_per_second": 143912.922 }, { "epoch": 0.012981393336218087, "grad_norm": 3.671875, "learning_rate": 7.079999999999999e-05, "loss": 1.2862, "num_input_tokens_seen": 3932160, "step": 60, "train_runtime": 28.5663, "train_tokens_per_second": 137650.179 }, { "epoch": 0.015144958892254435, "grad_norm": 3.6875, "learning_rate": 8.28e-05, "loss": 1.1568, "num_input_tokens_seen": 4587520, "step": 70, "train_runtime": 32.3629, "train_tokens_per_second": 141752.538 }, { "epoch": 0.01730852444829078, "grad_norm": 2.375, "learning_rate": 9.479999999999999e-05, "loss": 1.0683, "num_input_tokens_seen": 5242880, "step": 80, "train_runtime": 36.1566, "train_tokens_per_second": 145004.585 }, { "epoch": 0.01947209000432713, "grad_norm": 8.875, "learning_rate": 0.00010679999999999998, "loss": 1.0143, "num_input_tokens_seen": 5898240, "step": 90, "train_runtime": 39.9526, "train_tokens_per_second": 147630.985 }, { "epoch": 0.021635655560363478, "grad_norm": 3.484375, "learning_rate": 0.0001188, "loss": 0.9351, "num_input_tokens_seen": 6553600, "step": 100, "train_runtime": 43.7483, "train_tokens_per_second": 149802.332 }, { "epoch": 0.021635655560363478, "eval_loss": 0.9468593597412109, "eval_runtime": 1.2791, "eval_samples_per_second": 25.018, "eval_steps_per_second": 0.782, "num_input_tokens_seen": 6553600, "step": 100 }, { "epoch": 0.021635655560363478, "eval_byte_accuracy": 0.7288575667655787, "eval_chrf": 4.12565998742775, "eval_sacrebleu": 0.0736970000889057, "eval_word_accuracy": 0.6057692307692307, "num_input_tokens_seen": 6553600, "perplexity": 2.5776016143530303, "step": 100 }, { "epoch": 0.023799221116399826, "grad_norm": 3.046875, "learning_rate": 0.00013079999999999998, "loss": 0.8818, "num_input_tokens_seen": 7204864, "step": 110, "train_runtime": 50.6655, "train_tokens_per_second": 142204.571 }, { "epoch": 0.025962786672436174, "grad_norm": 1.1015625, "learning_rate": 0.00014279999999999997, "loss": 0.8464, "num_input_tokens_seen": 7860224, "step": 120, "train_runtime": 54.4574, "train_tokens_per_second": 144337.182 }, { "epoch": 0.028126352228472522, "grad_norm": 3.890625, "learning_rate": 0.0001548, "loss": 0.883, "num_input_tokens_seen": 8515584, "step": 130, "train_runtime": 58.2578, "train_tokens_per_second": 146170.797 }, { "epoch": 0.03028991778450887, "grad_norm": 2.015625, "learning_rate": 0.0001668, "loss": 0.8342, "num_input_tokens_seen": 9170944, "step": 140, "train_runtime": 62.0415, "train_tokens_per_second": 147819.414 }, { "epoch": 0.032453483340545215, "grad_norm": 3.171875, "learning_rate": 0.00017879999999999998, "loss": 0.8202, "num_input_tokens_seen": 9826304, "step": 150, "train_runtime": 65.8305, "train_tokens_per_second": 149266.694 }, { "epoch": 0.03461704889658156, "grad_norm": 2.390625, "learning_rate": 0.00019079999999999998, "loss": 0.8595, "num_input_tokens_seen": 10481664, "step": 160, "train_runtime": 71.4958, "train_tokens_per_second": 146605.34 }, { "epoch": 0.03678061445261791, "grad_norm": 2.25, "learning_rate": 0.0002028, "loss": 0.8288, "num_input_tokens_seen": 11137024, "step": 170, "train_runtime": 75.288, "train_tokens_per_second": 147925.655 }, { "epoch": 0.03894418000865426, "grad_norm": 5.78125, "learning_rate": 0.00021479999999999996, "loss": 0.8148, "num_input_tokens_seen": 11792384, "step": 180, "train_runtime": 79.0884, "train_tokens_per_second": 149103.819 }, { "epoch": 0.04110774556469061, "grad_norm": 0.97265625, "learning_rate": 0.00022679999999999998, "loss": 0.7932, "num_input_tokens_seen": 12447744, "step": 190, "train_runtime": 82.8997, "train_tokens_per_second": 150154.262 }, { "epoch": 0.043271311120726956, "grad_norm": 1.40625, "learning_rate": 0.0002388, "loss": 0.7827, "num_input_tokens_seen": 13103104, "step": 200, "train_runtime": 86.6986, "train_tokens_per_second": 151133.933 }, { "epoch": 0.043271311120726956, "eval_loss": 0.8652347326278687, "eval_runtime": 1.5369, "eval_samples_per_second": 20.821, "eval_steps_per_second": 0.651, "num_input_tokens_seen": 13103104, "step": 200 }, { "epoch": 0.043271311120726956, "eval_byte_accuracy": 0.7399851632047477, "eval_chrf": 5.853085649666837, "eval_sacrebleu": 0.21629231639858632, "eval_word_accuracy": 0.6195054945054945, "num_input_tokens_seen": 13103104, "perplexity": 2.3755636428327103, "step": 200 }, { "epoch": 0.045434876676763304, "grad_norm": 1.8046875, "learning_rate": 0.00025079999999999997, "loss": 0.7815, "num_input_tokens_seen": 13758464, "step": 210, "train_runtime": 93.9602, "train_tokens_per_second": 146428.646 }, { "epoch": 0.04759844223279965, "grad_norm": 0.984375, "learning_rate": 0.0002628, "loss": 0.7407, "num_input_tokens_seen": 14413824, "step": 220, "train_runtime": 97.7619, "train_tokens_per_second": 147438.046 }, { "epoch": 0.049762007788836, "grad_norm": 1.703125, "learning_rate": 0.0002748, "loss": 0.7636, "num_input_tokens_seen": 15069184, "step": 230, "train_runtime": 101.5612, "train_tokens_per_second": 148375.388 }, { "epoch": 0.05192557334487235, "grad_norm": 0.73828125, "learning_rate": 0.0002868, "loss": 0.773, "num_input_tokens_seen": 15724544, "step": 240, "train_runtime": 105.3581, "train_tokens_per_second": 149248.582 }, { "epoch": 0.0540891389009087, "grad_norm": 1.4453125, "learning_rate": 0.0002988, "loss": 0.7638, "num_input_tokens_seen": 16379904, "step": 250, "train_runtime": 109.1647, "train_tokens_per_second": 150047.624 }, { "epoch": 0.056252704456945045, "grad_norm": 0.89453125, "learning_rate": 0.00031079999999999997, "loss": 0.7616, "num_input_tokens_seen": 17035264, "step": 260, "train_runtime": 114.9034, "train_tokens_per_second": 148257.262 }, { "epoch": 0.05841627001298139, "grad_norm": 0.71484375, "learning_rate": 0.0003228, "loss": 0.761, "num_input_tokens_seen": 17690624, "step": 270, "train_runtime": 118.6967, "train_tokens_per_second": 149040.514 }, { "epoch": 0.06057983556901774, "grad_norm": 1.3828125, "learning_rate": 0.0003348, "loss": 0.7694, "num_input_tokens_seen": 18345984, "step": 280, "train_runtime": 122.4954, "train_tokens_per_second": 149768.786 }, { "epoch": 0.06274340112505408, "grad_norm": 0.8828125, "learning_rate": 0.0003467999999999999, "loss": 0.754, "num_input_tokens_seen": 19001344, "step": 290, "train_runtime": 126.2997, "train_tokens_per_second": 150446.434 }, { "epoch": 0.06490696668109043, "grad_norm": 0.625, "learning_rate": 0.00035879999999999994, "loss": 0.7533, "num_input_tokens_seen": 19656704, "step": 300, "train_runtime": 130.1045, "train_tokens_per_second": 151083.958 }, { "epoch": 0.06490696668109043, "eval_loss": 0.8242582082748413, "eval_runtime": 1.7792, "eval_samples_per_second": 17.985, "eval_steps_per_second": 0.562, "num_input_tokens_seen": 19656704, "step": 300 }, { "epoch": 0.06490696668109043, "eval_byte_accuracy": 0.7436943620178041, "eval_chrf": 7.359687391784456, "eval_sacrebleu": 0.18818523987470104, "eval_word_accuracy": 0.6195054945054945, "num_input_tokens_seen": 19656704, "perplexity": 2.280188712712687, "step": 300 }, { "epoch": 0.06707053223712678, "grad_norm": 0.875, "learning_rate": 0.00037079999999999996, "loss": 0.743, "num_input_tokens_seen": 20312064, "step": 310, "train_runtime": 137.6374, "train_tokens_per_second": 147576.607 }, { "epoch": 0.06923409779316313, "grad_norm": 0.73828125, "learning_rate": 0.0003828, "loss": 0.7551, "num_input_tokens_seen": 20967424, "step": 320, "train_runtime": 141.444, "train_tokens_per_second": 148238.373 }, { "epoch": 0.07139766334919947, "grad_norm": 1.203125, "learning_rate": 0.0003948, "loss": 0.7512, "num_input_tokens_seen": 21622784, "step": 330, "train_runtime": 145.2419, "train_tokens_per_second": 148874.327 }, { "epoch": 0.07356122890523582, "grad_norm": 1.2109375, "learning_rate": 0.00040679999999999997, "loss": 0.7349, "num_input_tokens_seen": 22278144, "step": 340, "train_runtime": 149.0492, "train_tokens_per_second": 149468.345 }, { "epoch": 0.07572479446127217, "grad_norm": 0.62109375, "learning_rate": 0.00041879999999999993, "loss": 0.7356, "num_input_tokens_seen": 22933504, "step": 350, "train_runtime": 152.8623, "train_tokens_per_second": 150027.169 }, { "epoch": 0.07788836001730852, "grad_norm": 0.96484375, "learning_rate": 0.00043079999999999995, "loss": 0.7216, "num_input_tokens_seen": 23588864, "step": 360, "train_runtime": 158.5961, "train_tokens_per_second": 148735.445 }, { "epoch": 0.08005192557334487, "grad_norm": 0.9453125, "learning_rate": 0.0004428, "loss": 0.7263, "num_input_tokens_seen": 24244224, "step": 370, "train_runtime": 162.4005, "train_tokens_per_second": 149286.589 }, { "epoch": 0.08221549112938122, "grad_norm": 0.78125, "learning_rate": 0.00045479999999999994, "loss": 0.7355, "num_input_tokens_seen": 24899584, "step": 380, "train_runtime": 166.2066, "train_tokens_per_second": 149811.031 }, { "epoch": 0.08437905668541756, "grad_norm": 0.6328125, "learning_rate": 0.00046679999999999996, "loss": 0.7117, "num_input_tokens_seen": 25554944, "step": 390, "train_runtime": 170.012, "train_tokens_per_second": 150312.596 }, { "epoch": 0.08654262224145391, "grad_norm": 0.625, "learning_rate": 0.0004788, "loss": 0.7047, "num_input_tokens_seen": 26210304, "step": 400, "train_runtime": 173.8217, "train_tokens_per_second": 150788.449 }, { "epoch": 0.08654262224145391, "eval_loss": 0.7835195064544678, "eval_runtime": 1.666, "eval_samples_per_second": 19.208, "eval_steps_per_second": 0.6, "num_input_tokens_seen": 26210304, "step": 400 }, { "epoch": 0.08654262224145391, "eval_byte_accuracy": 0.766320474777448, "eval_chrf": 7.609548718464071, "eval_sacrebleu": 0.4190094244340075, "eval_word_accuracy": 0.6456043956043956, "num_input_tokens_seen": 26210304, "perplexity": 2.1891634979487655, "step": 400 }, { "epoch": 0.08870618779749026, "grad_norm": 0.6640625, "learning_rate": 0.0004907999999999999, "loss": 0.6961, "num_input_tokens_seen": 26865664, "step": 410, "train_runtime": 181.276, "train_tokens_per_second": 148203.132 }, { "epoch": 0.09086975335352661, "grad_norm": 0.80859375, "learning_rate": 0.0005028, "loss": 0.6971, "num_input_tokens_seen": 27521024, "step": 420, "train_runtime": 185.0858, "train_tokens_per_second": 148693.323 }, { "epoch": 0.09303331890956296, "grad_norm": 1.546875, "learning_rate": 0.0005147999999999999, "loss": 0.668, "num_input_tokens_seen": 28176384, "step": 430, "train_runtime": 188.8745, "train_tokens_per_second": 149180.456 }, { "epoch": 0.0951968844655993, "grad_norm": 0.859375, "learning_rate": 0.0005267999999999999, "loss": 0.6698, "num_input_tokens_seen": 28831744, "step": 440, "train_runtime": 192.6748, "train_tokens_per_second": 149639.425 }, { "epoch": 0.09736045002163565, "grad_norm": 1.828125, "learning_rate": 0.0005388, "loss": 0.6434, "num_input_tokens_seen": 29487104, "step": 450, "train_runtime": 196.4814, "train_tokens_per_second": 150075.792 }, { "epoch": 0.099524015577672, "grad_norm": 2.3125, "learning_rate": 0.0005507999999999999, "loss": 0.593, "num_input_tokens_seen": 30142464, "step": 460, "train_runtime": 202.2445, "train_tokens_per_second": 149039.706 }, { "epoch": 0.10168758113370835, "grad_norm": 2.703125, "learning_rate": 0.0005627999999999999, "loss": 0.5579, "num_input_tokens_seen": 30797824, "step": 470, "train_runtime": 206.0426, "train_tokens_per_second": 149473.092 }, { "epoch": 0.1038511466897447, "grad_norm": 1.5390625, "learning_rate": 0.0005747999999999999, "loss": 0.5097, "num_input_tokens_seen": 31453184, "step": 480, "train_runtime": 209.8528, "train_tokens_per_second": 149882.114 }, { "epoch": 0.10601471224578105, "grad_norm": 2.25, "learning_rate": 0.0005868, "loss": 0.4667, "num_input_tokens_seen": 32108544, "step": 490, "train_runtime": 213.6418, "train_tokens_per_second": 150291.498 }, { "epoch": 0.1081782778018174, "grad_norm": 2.25, "learning_rate": 0.0005987999999999999, "loss": 0.4021, "num_input_tokens_seen": 32763904, "step": 500, "train_runtime": 217.4483, "train_tokens_per_second": 150674.436 }, { "epoch": 0.1081782778018174, "eval_loss": 0.3849869966506958, "eval_runtime": 2.1359, "eval_samples_per_second": 14.982, "eval_steps_per_second": 0.468, "num_input_tokens_seen": 32763904, "step": 500 }, { "epoch": 0.1081782778018174, "eval_byte_accuracy": 0.8775964391691394, "eval_chrf": 33.58088554992905, "eval_sacrebleu": 13.160711204980268, "eval_word_accuracy": 0.7884615384615384, "num_input_tokens_seen": 32763904, "perplexity": 1.4695952116570261, "step": 500 }, { "epoch": 0.11034184335785374, "grad_norm": 1.65625, "learning_rate": 0.0005994315789473684, "loss": 0.3626, "num_input_tokens_seen": 33419264, "step": 510, "train_runtime": 225.4187, "train_tokens_per_second": 148254.162 }, { "epoch": 0.11250540891389009, "grad_norm": 2.640625, "learning_rate": 0.0005987999999999999, "loss": 0.336, "num_input_tokens_seen": 34074624, "step": 520, "train_runtime": 229.2259, "train_tokens_per_second": 148650.834 }, { "epoch": 0.11466897446992644, "grad_norm": 1.7890625, "learning_rate": 0.0005981684210526315, "loss": 0.3142, "num_input_tokens_seen": 34729984, "step": 530, "train_runtime": 233.0268, "train_tokens_per_second": 149038.583 }, { "epoch": 0.11683254002596279, "grad_norm": 1.9296875, "learning_rate": 0.0005975368421052631, "loss": 0.2976, "num_input_tokens_seen": 35385344, "step": 540, "train_runtime": 236.8458, "train_tokens_per_second": 149402.428 }, { "epoch": 0.11899610558199913, "grad_norm": 2.71875, "learning_rate": 0.0005969052631578947, "loss": 0.2723, "num_input_tokens_seen": 36040704, "step": 550, "train_runtime": 240.644, "train_tokens_per_second": 149767.721 }, { "epoch": 0.12115967113803548, "grad_norm": 2.015625, "learning_rate": 0.0005962736842105263, "loss": 0.2423, "num_input_tokens_seen": 36696064, "step": 560, "train_runtime": 246.4279, "train_tokens_per_second": 148911.956 }, { "epoch": 0.12332323669407183, "grad_norm": 2.0, "learning_rate": 0.0005956421052631579, "loss": 0.2458, "num_input_tokens_seen": 37351424, "step": 570, "train_runtime": 250.2418, "train_tokens_per_second": 149261.331 }, { "epoch": 0.12548680225010816, "grad_norm": 1.7734375, "learning_rate": 0.0005950105263157894, "loss": 0.2317, "num_input_tokens_seen": 38006784, "step": 580, "train_runtime": 254.0523, "train_tokens_per_second": 149602.203 }, { "epoch": 0.12765036780614453, "grad_norm": 2.671875, "learning_rate": 0.000594378947368421, "loss": 0.217, "num_input_tokens_seen": 38662144, "step": 590, "train_runtime": 257.8636, "train_tokens_per_second": 149932.561 }, { "epoch": 0.12981393336218086, "grad_norm": 2.53125, "learning_rate": 0.0005937473684210525, "loss": 0.207, "num_input_tokens_seen": 39317504, "step": 600, "train_runtime": 261.6775, "train_tokens_per_second": 150251.784 }, { "epoch": 0.12981393336218086, "eval_loss": 0.18635742366313934, "eval_runtime": 2.3524, "eval_samples_per_second": 13.603, "eval_steps_per_second": 0.425, "num_input_tokens_seen": 39317504, "step": 600 }, { "epoch": 0.12981393336218086, "eval_byte_accuracy": 0.9425074183976261, "eval_chrf": 61.6495042526747, "eval_sacrebleu": 48.627627138112864, "eval_word_accuracy": 0.885989010989011, "num_input_tokens_seen": 39317504, "perplexity": 1.2048528263365477, "step": 600 }, { "epoch": 0.13197749891821722, "grad_norm": 1.0078125, "learning_rate": 0.0005931157894736842, "loss": 0.1849, "num_input_tokens_seen": 39972864, "step": 610, "train_runtime": 269.8376, "train_tokens_per_second": 148136.726 }, { "epoch": 0.13414106447425356, "grad_norm": 2.015625, "learning_rate": 0.0005924842105263157, "loss": 0.1942, "num_input_tokens_seen": 40628224, "step": 620, "train_runtime": 273.6548, "train_tokens_per_second": 148465.26 }, { "epoch": 0.13630463003028992, "grad_norm": 2.1875, "learning_rate": 0.0005918526315789474, "loss": 0.1857, "num_input_tokens_seen": 41283584, "step": 630, "train_runtime": 277.4573, "train_tokens_per_second": 148792.569 }, { "epoch": 0.13846819558632625, "grad_norm": 1.6875, "learning_rate": 0.0005912210526315788, "loss": 0.1754, "num_input_tokens_seen": 41938944, "step": 640, "train_runtime": 281.2729, "train_tokens_per_second": 149104.106 }, { "epoch": 0.14063176114236262, "grad_norm": 1.4375, "learning_rate": 0.0005905894736842105, "loss": 0.1831, "num_input_tokens_seen": 42594304, "step": 650, "train_runtime": 285.0868, "train_tokens_per_second": 149408.204 }, { "epoch": 0.14279532669839895, "grad_norm": 2.046875, "learning_rate": 0.000589957894736842, "loss": 0.1667, "num_input_tokens_seen": 43249664, "step": 660, "train_runtime": 290.9113, "train_tokens_per_second": 148669.581 }, { "epoch": 0.1449588922544353, "grad_norm": 1.765625, "learning_rate": 0.0005893263157894736, "loss": 0.1622, "num_input_tokens_seen": 43905024, "step": 670, "train_runtime": 294.7222, "train_tokens_per_second": 148970.863 }, { "epoch": 0.14712245781047165, "grad_norm": 1.5390625, "learning_rate": 0.0005886947368421052, "loss": 0.1577, "num_input_tokens_seen": 44560384, "step": 680, "train_runtime": 298.5377, "train_tokens_per_second": 149262.168 }, { "epoch": 0.149286023366508, "grad_norm": 2.3125, "learning_rate": 0.0005880631578947368, "loss": 0.1535, "num_input_tokens_seen": 45215744, "step": 690, "train_runtime": 302.3486, "train_tokens_per_second": 149548.37 }, { "epoch": 0.15144958892254434, "grad_norm": 1.4609375, "learning_rate": 0.0005874315789473684, "loss": 0.1465, "num_input_tokens_seen": 45871104, "step": 700, "train_runtime": 306.1617, "train_tokens_per_second": 149826.412 }, { "epoch": 0.15144958892254434, "eval_loss": 0.14650069177150726, "eval_runtime": 2.2585, "eval_samples_per_second": 14.169, "eval_steps_per_second": 0.443, "num_input_tokens_seen": 45871104, "step": 700 }, { "epoch": 0.15144958892254434, "eval_byte_accuracy": 0.9584569732937686, "eval_chrf": 73.74830346733725, "eval_sacrebleu": 63.990915184214565, "eval_word_accuracy": 0.9217032967032966, "num_input_tokens_seen": 45871104, "perplexity": 1.1577757317344424, "step": 700 }, { "epoch": 0.1536131544785807, "grad_norm": 2.3125, "learning_rate": 0.0005868, "loss": 0.1534, "num_input_tokens_seen": 46526464, "step": 710, "train_runtime": 314.2434, "train_tokens_per_second": 148058.7 }, { "epoch": 0.15577672003461704, "grad_norm": 2.671875, "learning_rate": 0.0005861684210526315, "loss": 0.1441, "num_input_tokens_seen": 47181824, "step": 720, "train_runtime": 318.0598, "train_tokens_per_second": 148342.621 }, { "epoch": 0.1579402855906534, "grad_norm": 1.8671875, "learning_rate": 0.0005855368421052631, "loss": 0.1346, "num_input_tokens_seen": 47837184, "step": 730, "train_runtime": 321.8647, "train_tokens_per_second": 148625.111 }, { "epoch": 0.16010385114668974, "grad_norm": 1.8046875, "learning_rate": 0.0005849052631578946, "loss": 0.1403, "num_input_tokens_seen": 48492544, "step": 740, "train_runtime": 325.6813, "train_tokens_per_second": 148895.694 }, { "epoch": 0.1622674167027261, "grad_norm": 1.6328125, "learning_rate": 0.0005842736842105263, "loss": 0.1426, "num_input_tokens_seen": 49147904, "step": 750, "train_runtime": 329.4803, "train_tokens_per_second": 149167.94 }, { "epoch": 0.16443098225876243, "grad_norm": 1.1171875, "learning_rate": 0.0005836421052631578, "loss": 0.1282, "num_input_tokens_seen": 49803264, "step": 760, "train_runtime": 335.3012, "train_tokens_per_second": 148532.903 }, { "epoch": 0.1665945478147988, "grad_norm": 2.265625, "learning_rate": 0.0005830105263157895, "loss": 0.132, "num_input_tokens_seen": 50458624, "step": 770, "train_runtime": 339.1103, "train_tokens_per_second": 148797.094 }, { "epoch": 0.16875811337083513, "grad_norm": 1.234375, "learning_rate": 0.000582378947368421, "loss": 0.1239, "num_input_tokens_seen": 51113984, "step": 780, "train_runtime": 342.9157, "train_tokens_per_second": 149057.003 }, { "epoch": 0.1709216789268715, "grad_norm": 1.4375, "learning_rate": 0.0005817473684210526, "loss": 0.1318, "num_input_tokens_seen": 51769344, "step": 790, "train_runtime": 346.716, "train_tokens_per_second": 149313.417 }, { "epoch": 0.17308524448290782, "grad_norm": 1.7734375, "learning_rate": 0.0005811157894736841, "loss": 0.1268, "num_input_tokens_seen": 52424704, "step": 800, "train_runtime": 350.5228, "train_tokens_per_second": 149561.474 }, { "epoch": 0.17308524448290782, "eval_loss": 0.11049608141183853, "eval_runtime": 1.8345, "eval_samples_per_second": 17.443, "eval_steps_per_second": 0.545, "num_input_tokens_seen": 52424704, "step": 800 }, { "epoch": 0.17308524448290782, "eval_byte_accuracy": 0.9669881305637982, "eval_chrf": 77.17903139546479, "eval_sacrebleu": 67.74599411512361, "eval_word_accuracy": 0.9313186813186813, "num_input_tokens_seen": 52424704, "perplexity": 1.1168319726389973, "step": 800 }, { "epoch": 0.17524881003894419, "grad_norm": 1.109375, "learning_rate": 0.0005804842105263157, "loss": 0.122, "num_input_tokens_seen": 53080064, "step": 810, "train_runtime": 358.1954, "train_tokens_per_second": 148187.448 }, { "epoch": 0.17741237559498052, "grad_norm": 1.3515625, "learning_rate": 0.0005798526315789473, "loss": 0.1235, "num_input_tokens_seen": 53735424, "step": 820, "train_runtime": 362.0007, "train_tokens_per_second": 148440.091 }, { "epoch": 0.17957594115101688, "grad_norm": 1.5703125, "learning_rate": 0.0005792210526315789, "loss": 0.1217, "num_input_tokens_seen": 54390784, "step": 830, "train_runtime": 365.806, "train_tokens_per_second": 148687.519 }, { "epoch": 0.18173950670705322, "grad_norm": 1.1171875, "learning_rate": 0.0005785894736842105, "loss": 0.1169, "num_input_tokens_seen": 55046144, "step": 840, "train_runtime": 369.611, "train_tokens_per_second": 148929.926 }, { "epoch": 0.18390307226308958, "grad_norm": 1.3515625, "learning_rate": 0.000577957894736842, "loss": 0.1193, "num_input_tokens_seen": 55701504, "step": 850, "train_runtime": 373.4083, "train_tokens_per_second": 149170.488 }, { "epoch": 0.1860666378191259, "grad_norm": 1.15625, "learning_rate": 0.0005773263157894736, "loss": 0.1186, "num_input_tokens_seen": 56356864, "step": 860, "train_runtime": 379.2379, "train_tokens_per_second": 148605.558 }, { "epoch": 0.18823020337516227, "grad_norm": 1.21875, "learning_rate": 0.0005766947368421052, "loss": 0.1143, "num_input_tokens_seen": 57012224, "step": 870, "train_runtime": 383.0534, "train_tokens_per_second": 148836.24 }, { "epoch": 0.1903937689311986, "grad_norm": 1.6796875, "learning_rate": 0.0005760631578947368, "loss": 0.1117, "num_input_tokens_seen": 57667584, "step": 880, "train_runtime": 386.8673, "train_tokens_per_second": 149062.955 }, { "epoch": 0.19255733448723497, "grad_norm": 1.78125, "learning_rate": 0.0005754315789473684, "loss": 0.1103, "num_input_tokens_seen": 58322944, "step": 890, "train_runtime": 390.6796, "train_tokens_per_second": 149285.862 }, { "epoch": 0.1947209000432713, "grad_norm": 1.390625, "learning_rate": 0.0005747999999999999, "loss": 0.1326, "num_input_tokens_seen": 58978304, "step": 900, "train_runtime": 394.4919, "train_tokens_per_second": 149504.475 }, { "epoch": 0.1947209000432713, "eval_loss": 0.10183807462453842, "eval_runtime": 2.1572, "eval_samples_per_second": 14.834, "eval_steps_per_second": 0.464, "num_input_tokens_seen": 58978304, "step": 900 }, { "epoch": 0.1947209000432713, "eval_byte_accuracy": 0.9725519287833828, "eval_chrf": 79.43792789971359, "eval_sacrebleu": 70.42945591585108, "eval_word_accuracy": 0.9354395604395604, "num_input_tokens_seen": 58978304, "perplexity": 1.107204172760412, "step": 900 }, { "epoch": 0.19688446559930767, "grad_norm": 1.8984375, "learning_rate": 0.0005741684210526316, "loss": 0.1167, "num_input_tokens_seen": 59633664, "step": 910, "train_runtime": 402.4861, "train_tokens_per_second": 148163.277 }, { "epoch": 0.199048031155344, "grad_norm": 1.0234375, "learning_rate": 0.000573536842105263, "loss": 0.1048, "num_input_tokens_seen": 60289024, "step": 920, "train_runtime": 406.3049, "train_tokens_per_second": 148383.715 }, { "epoch": 0.20121159671138036, "grad_norm": 1.6328125, "learning_rate": 0.0005729052631578947, "loss": 0.1103, "num_input_tokens_seen": 60944384, "step": 930, "train_runtime": 410.1125, "train_tokens_per_second": 148604.052 }, { "epoch": 0.2033751622674167, "grad_norm": 1.484375, "learning_rate": 0.0005722736842105262, "loss": 0.1031, "num_input_tokens_seen": 61599744, "step": 940, "train_runtime": 413.9199, "train_tokens_per_second": 148820.456 }, { "epoch": 0.20553872782345306, "grad_norm": 0.90234375, "learning_rate": 0.0005716421052631578, "loss": 0.1013, "num_input_tokens_seen": 62255104, "step": 950, "train_runtime": 417.7339, "train_tokens_per_second": 149030.524 }, { "epoch": 0.2077022933794894, "grad_norm": 0.69140625, "learning_rate": 0.0005710105263157894, "loss": 0.1018, "num_input_tokens_seen": 62910464, "step": 960, "train_runtime": 423.5646, "train_tokens_per_second": 148526.263 }, { "epoch": 0.20986585893552576, "grad_norm": 1.109375, "learning_rate": 0.000570378947368421, "loss": 0.0874, "num_input_tokens_seen": 63565824, "step": 970, "train_runtime": 427.3766, "train_tokens_per_second": 148734.931 }, { "epoch": 0.2120294244915621, "grad_norm": 1.9609375, "learning_rate": 0.0005697473684210526, "loss": 0.1025, "num_input_tokens_seen": 64221184, "step": 980, "train_runtime": 431.1598, "train_tokens_per_second": 148949.857 }, { "epoch": 0.21419299004759845, "grad_norm": 1.828125, "learning_rate": 0.0005691157894736842, "loss": 0.0988, "num_input_tokens_seen": 64876544, "step": 990, "train_runtime": 434.9749, "train_tokens_per_second": 149150.081 }, { "epoch": 0.2163565556036348, "grad_norm": 1.875, "learning_rate": 0.0005684842105263157, "loss": 0.1019, "num_input_tokens_seen": 65531904, "step": 1000, "train_runtime": 438.7886, "train_tokens_per_second": 149347.327 }, { "epoch": 0.2163565556036348, "eval_loss": 0.08433977514505386, "eval_runtime": 2.1587, "eval_samples_per_second": 14.824, "eval_steps_per_second": 0.463, "num_input_tokens_seen": 65531904, "step": 1000 }, { "epoch": 0.2163565556036348, "eval_byte_accuracy": 0.9747774480712166, "eval_chrf": 82.22196329181519, "eval_sacrebleu": 74.53680220343439, "eval_word_accuracy": 0.9464285714285714, "num_input_tokens_seen": 65531904, "perplexity": 1.0879985058629236, "step": 1000 }, { "epoch": 0.21852012115967115, "grad_norm": 1.90625, "learning_rate": 0.0005678526315789473, "loss": 0.0908, "num_input_tokens_seen": 66187264, "step": 1010, "train_runtime": 446.6399, "train_tokens_per_second": 148189.311 }, { "epoch": 0.22068368671570748, "grad_norm": 1.1796875, "learning_rate": 0.0005672210526315789, "loss": 0.0935, "num_input_tokens_seen": 66842624, "step": 1020, "train_runtime": 450.4535, "train_tokens_per_second": 148389.609 }, { "epoch": 0.22284725227174385, "grad_norm": 1.0859375, "learning_rate": 0.0005665894736842105, "loss": 0.0938, "num_input_tokens_seen": 67497984, "step": 1030, "train_runtime": 454.2611, "train_tokens_per_second": 148588.505 }, { "epoch": 0.22501081782778018, "grad_norm": 1.1484375, "learning_rate": 0.000565957894736842, "loss": 0.1033, "num_input_tokens_seen": 68153344, "step": 1040, "train_runtime": 458.0751, "train_tokens_per_second": 148782.018 }, { "epoch": 0.22717438338381654, "grad_norm": 1.6328125, "learning_rate": 0.0005653263157894737, "loss": 0.1008, "num_input_tokens_seen": 68808704, "step": 1050, "train_runtime": 461.8811, "train_tokens_per_second": 148974.931 }, { "epoch": 0.22933794893985288, "grad_norm": 1.515625, "learning_rate": 0.0005646947368421052, "loss": 0.0898, "num_input_tokens_seen": 69464064, "step": 1060, "train_runtime": 467.8393, "train_tokens_per_second": 148478.48 }, { "epoch": 0.23150151449588924, "grad_norm": 1.640625, "learning_rate": 0.0005640631578947368, "loss": 0.0884, "num_input_tokens_seen": 70119424, "step": 1070, "train_runtime": 471.6499, "train_tokens_per_second": 148668.361 }, { "epoch": 0.23366508005192557, "grad_norm": 1.453125, "learning_rate": 0.0005634315789473683, "loss": 0.0984, "num_input_tokens_seen": 70774784, "step": 1080, "train_runtime": 475.461, "train_tokens_per_second": 148855.087 }, { "epoch": 0.23582864560796193, "grad_norm": 1.3046875, "learning_rate": 0.0005627999999999999, "loss": 0.0928, "num_input_tokens_seen": 71430144, "step": 1090, "train_runtime": 479.2668, "train_tokens_per_second": 149040.46 }, { "epoch": 0.23799221116399827, "grad_norm": 1.6875, "learning_rate": 0.0005621684210526315, "loss": 0.094, "num_input_tokens_seen": 72085504, "step": 1100, "train_runtime": 483.0765, "train_tokens_per_second": 149221.712 }, { "epoch": 0.23799221116399827, "eval_loss": 0.0829336941242218, "eval_runtime": 1.6255, "eval_samples_per_second": 19.686, "eval_steps_per_second": 0.615, "num_input_tokens_seen": 72085504, "step": 1100 }, { "epoch": 0.23799221116399827, "eval_byte_accuracy": 0.9762611275964391, "eval_chrf": 82.85300346706357, "eval_sacrebleu": 77.14647221270954, "eval_word_accuracy": 0.9478021978021978, "num_input_tokens_seen": 72085504, "perplexity": 1.0864697668304737, "step": 1100 }, { "epoch": 0.24015577672003463, "grad_norm": 1.3046875, "learning_rate": 0.0005615368421052631, "loss": 0.0911, "num_input_tokens_seen": 72740864, "step": 1110, "train_runtime": 490.5196, "train_tokens_per_second": 148293.491 }, { "epoch": 0.24231934227607096, "grad_norm": 1.078125, "learning_rate": 0.0005609052631578947, "loss": 0.087, "num_input_tokens_seen": 73396224, "step": 1120, "train_runtime": 494.333, "train_tokens_per_second": 148475.267 }, { "epoch": 0.2444829078321073, "grad_norm": 0.98046875, "learning_rate": 0.0005602736842105263, "loss": 0.0872, "num_input_tokens_seen": 74051584, "step": 1130, "train_runtime": 498.1388, "train_tokens_per_second": 148656.534 }, { "epoch": 0.24664647338814366, "grad_norm": 1.1640625, "learning_rate": 0.0005596421052631578, "loss": 0.0822, "num_input_tokens_seen": 74706944, "step": 1140, "train_runtime": 501.9551, "train_tokens_per_second": 148831.924 }, { "epoch": 0.24881003894418, "grad_norm": 1.1484375, "learning_rate": 0.0005590105263157894, "loss": 0.0837, "num_input_tokens_seen": 75362304, "step": 1150, "train_runtime": 505.7692, "train_tokens_per_second": 149005.319 }, { "epoch": 0.25097360450021633, "grad_norm": 1.3359375, "learning_rate": 0.000558378947368421, "loss": 0.0818, "num_input_tokens_seen": 76017664, "step": 1160, "train_runtime": 511.7654, "train_tokens_per_second": 148540.053 }, { "epoch": 0.2531371700562527, "grad_norm": 1.1796875, "learning_rate": 0.0005577473684210526, "loss": 0.0843, "num_input_tokens_seen": 76673024, "step": 1170, "train_runtime": 515.5749, "train_tokens_per_second": 148713.644 }, { "epoch": 0.25530073561228905, "grad_norm": 1.2421875, "learning_rate": 0.0005571157894736842, "loss": 0.084, "num_input_tokens_seen": 77328384, "step": 1180, "train_runtime": 519.3838, "train_tokens_per_second": 148884.869 }, { "epoch": 0.2574643011683254, "grad_norm": 1.4453125, "learning_rate": 0.0005564842105263158, "loss": 0.0813, "num_input_tokens_seen": 77979648, "step": 1190, "train_runtime": 523.1784, "train_tokens_per_second": 149049.825 }, { "epoch": 0.2596278667243617, "grad_norm": 0.984375, "learning_rate": 0.0005558526315789473, "loss": 0.0832, "num_input_tokens_seen": 78635008, "step": 1200, "train_runtime": 526.9952, "train_tokens_per_second": 149213.905 }, { "epoch": 0.2596278667243617, "eval_loss": 0.0685884952545166, "eval_runtime": 1.1892, "eval_samples_per_second": 26.91, "eval_steps_per_second": 0.841, "num_input_tokens_seen": 78635008, "step": 1200 }, { "epoch": 0.2596278667243617, "eval_byte_accuracy": 0.9803412462908012, "eval_chrf": 86.18467528671245, "eval_sacrebleu": 80.29154426872704, "eval_word_accuracy": 0.9519230769230769, "num_input_tokens_seen": 78635008, "perplexity": 1.0709953987678011, "step": 1200 }, { "epoch": 0.2617914322803981, "grad_norm": 1.515625, "learning_rate": 0.000555221052631579, "loss": 0.0736, "num_input_tokens_seen": 79290368, "step": 1210, "train_runtime": 534.0458, "train_tokens_per_second": 148471.108 }, { "epoch": 0.26395499783643445, "grad_norm": 1.0, "learning_rate": 0.0005545894736842104, "loss": 0.0792, "num_input_tokens_seen": 79945728, "step": 1220, "train_runtime": 537.8548, "train_tokens_per_second": 148638.118 }, { "epoch": 0.2661185633924708, "grad_norm": 0.9453125, "learning_rate": 0.0005539578947368421, "loss": 0.0785, "num_input_tokens_seen": 80601088, "step": 1230, "train_runtime": 541.6708, "train_tokens_per_second": 148800.87 }, { "epoch": 0.2682821289485071, "grad_norm": 1.0859375, "learning_rate": 0.0005533263157894736, "loss": 0.0775, "num_input_tokens_seen": 81256448, "step": 1240, "train_runtime": 545.4848, "train_tokens_per_second": 148961.887 }, { "epoch": 0.2704456945045435, "grad_norm": 1.109375, "learning_rate": 0.0005526947368421052, "loss": 0.0781, "num_input_tokens_seen": 81911808, "step": 1250, "train_runtime": 549.2987, "train_tokens_per_second": 149120.689 }, { "epoch": 0.27260926006057984, "grad_norm": 1.1484375, "learning_rate": 0.0005520631578947368, "loss": 0.077, "num_input_tokens_seen": 82563072, "step": 1260, "train_runtime": 554.9402, "train_tokens_per_second": 148778.327 }, { "epoch": 0.2747728256166162, "grad_norm": 1.3984375, "learning_rate": 0.0005514315789473684, "loss": 0.0818, "num_input_tokens_seen": 83218432, "step": 1270, "train_runtime": 558.7404, "train_tokens_per_second": 148939.344 }, { "epoch": 0.2769363911726525, "grad_norm": 1.4765625, "learning_rate": 0.0005507999999999999, "loss": 0.0764, "num_input_tokens_seen": 83873792, "step": 1280, "train_runtime": 562.5456, "train_tokens_per_second": 149096.867 }, { "epoch": 0.27909995672868887, "grad_norm": 1.2421875, "learning_rate": 0.0005501684210526315, "loss": 0.0804, "num_input_tokens_seen": 84529152, "step": 1290, "train_runtime": 566.3426, "train_tokens_per_second": 149254.436 }, { "epoch": 0.28126352228472523, "grad_norm": 1.1796875, "learning_rate": 0.0005495368421052631, "loss": 0.0795, "num_input_tokens_seen": 85180416, "step": 1300, "train_runtime": 570.132, "train_tokens_per_second": 149404.72 }, { "epoch": 0.28126352228472523, "eval_loss": 0.060103051364421844, "eval_runtime": 2.1621, "eval_samples_per_second": 14.8, "eval_steps_per_second": 0.463, "num_input_tokens_seen": 85180416, "step": 1300 }, { "epoch": 0.28126352228472523, "eval_byte_accuracy": 0.9818249258160238, "eval_chrf": 88.10671185800685, "eval_sacrebleu": 84.31771246247678, "eval_word_accuracy": 0.9629120879120879, "num_input_tokens_seen": 85180416, "perplexity": 1.0619459758885987, "step": 1300 }, { "epoch": 0.2834270878407616, "grad_norm": 1.09375, "learning_rate": 0.0005489052631578947, "loss": 0.0728, "num_input_tokens_seen": 85835776, "step": 1310, "train_runtime": 578.2913, "train_tokens_per_second": 148430.004 }, { "epoch": 0.2855906533967979, "grad_norm": 1.4140625, "learning_rate": 0.0005482736842105263, "loss": 0.0748, "num_input_tokens_seen": 86491136, "step": 1320, "train_runtime": 582.1077, "train_tokens_per_second": 148582.709 }, { "epoch": 0.28775421895283426, "grad_norm": 0.8828125, "learning_rate": 0.0005476421052631579, "loss": 0.0738, "num_input_tokens_seen": 87146496, "step": 1330, "train_runtime": 585.9007, "train_tokens_per_second": 148739.354 }, { "epoch": 0.2899177845088706, "grad_norm": 0.75, "learning_rate": 0.0005470105263157895, "loss": 0.0673, "num_input_tokens_seen": 87801856, "step": 1340, "train_runtime": 589.6992, "train_tokens_per_second": 148892.611 }, { "epoch": 0.292081350064907, "grad_norm": 1.25, "learning_rate": 0.000546378947368421, "loss": 0.0691, "num_input_tokens_seen": 88457216, "step": 1350, "train_runtime": 593.5107, "train_tokens_per_second": 149040.639 }, { "epoch": 0.2942449156209433, "grad_norm": 1.5546875, "learning_rate": 0.0005457473684210525, "loss": 0.0731, "num_input_tokens_seen": 89112576, "step": 1360, "train_runtime": 599.3419, "train_tokens_per_second": 148684.049 }, { "epoch": 0.29640848117697965, "grad_norm": 1.2421875, "learning_rate": 0.0005451157894736842, "loss": 0.0676, "num_input_tokens_seen": 89767936, "step": 1370, "train_runtime": 603.1499, "train_tokens_per_second": 148831.876 }, { "epoch": 0.298572046733016, "grad_norm": 1.2421875, "learning_rate": 0.0005444842105263157, "loss": 0.0776, "num_input_tokens_seen": 90423296, "step": 1380, "train_runtime": 606.9619, "train_tokens_per_second": 148976.882 }, { "epoch": 0.3007356122890524, "grad_norm": 1.25, "learning_rate": 0.0005438526315789473, "loss": 0.0724, "num_input_tokens_seen": 91078656, "step": 1390, "train_runtime": 610.7783, "train_tokens_per_second": 149119.01 }, { "epoch": 0.3028991778450887, "grad_norm": 1.1640625, "learning_rate": 0.0005432210526315789, "loss": 0.0721, "num_input_tokens_seen": 91734016, "step": 1400, "train_runtime": 614.5862, "train_tokens_per_second": 149261.44 }, { "epoch": 0.3028991778450887, "eval_loss": 0.062408819794654846, "eval_runtime": 2.3491, "eval_samples_per_second": 13.622, "eval_steps_per_second": 0.426, "num_input_tokens_seen": 91734016, "step": 1400 }, { "epoch": 0.3028991778450887, "eval_byte_accuracy": 0.9803412462908012, "eval_chrf": 86.63365299116445, "eval_sacrebleu": 82.79436667187097, "eval_word_accuracy": 0.9601648351648352, "num_input_tokens_seen": 91734016, "perplexity": 1.064397402519385, "step": 1400 }, { "epoch": 0.30506274340112505, "grad_norm": 1.453125, "learning_rate": 0.0005425894736842105, "loss": 0.0812, "num_input_tokens_seen": 92389376, "step": 1410, "train_runtime": 622.8076, "train_tokens_per_second": 148343.36 }, { "epoch": 0.3072263089571614, "grad_norm": 1.4140625, "learning_rate": 0.000541957894736842, "loss": 0.0775, "num_input_tokens_seen": 93044736, "step": 1420, "train_runtime": 626.6216, "train_tokens_per_second": 148486.309 }, { "epoch": 0.30938987451319777, "grad_norm": 1.015625, "learning_rate": 0.0005413263157894736, "loss": 0.0714, "num_input_tokens_seen": 93700096, "step": 1430, "train_runtime": 630.429, "train_tokens_per_second": 148629.105 }, { "epoch": 0.3115534400692341, "grad_norm": 1.3125, "learning_rate": 0.0005406947368421052, "loss": 0.07, "num_input_tokens_seen": 94355456, "step": 1440, "train_runtime": 634.235, "train_tokens_per_second": 148770.494 }, { "epoch": 0.31371700562527044, "grad_norm": 0.9921875, "learning_rate": 0.0005400631578947368, "loss": 0.0673, "num_input_tokens_seen": 95010816, "step": 1450, "train_runtime": 638.0388, "train_tokens_per_second": 148910.708 }, { "epoch": 0.3158805711813068, "grad_norm": 0.859375, "learning_rate": 0.0005394315789473684, "loss": 0.0797, "num_input_tokens_seen": 95666176, "step": 1460, "train_runtime": 644.0522, "train_tokens_per_second": 148537.927 }, { "epoch": 0.31804413673734316, "grad_norm": 1.1171875, "learning_rate": 0.0005388, "loss": 0.0678, "num_input_tokens_seen": 96321536, "step": 1470, "train_runtime": 647.862, "train_tokens_per_second": 148676.001 }, { "epoch": 0.32020770229337947, "grad_norm": 1.15625, "learning_rate": 0.0005381684210526316, "loss": 0.0623, "num_input_tokens_seen": 96976896, "step": 1480, "train_runtime": 651.6702, "train_tokens_per_second": 148812.847 }, { "epoch": 0.32237126784941583, "grad_norm": 1.484375, "learning_rate": 0.0005375368421052632, "loss": 0.075, "num_input_tokens_seen": 97632256, "step": 1490, "train_runtime": 655.4755, "train_tokens_per_second": 148948.748 }, { "epoch": 0.3245348334054522, "grad_norm": 1.2109375, "learning_rate": 0.0005369052631578947, "loss": 0.069, "num_input_tokens_seen": 98287616, "step": 1500, "train_runtime": 659.2779, "train_tokens_per_second": 149083.749 }, { "epoch": 0.3245348334054522, "eval_loss": 0.05183723196387291, "eval_runtime": 1.9744, "eval_samples_per_second": 16.207, "eval_steps_per_second": 0.506, "num_input_tokens_seen": 98287616, "step": 1500 }, { "epoch": 0.3245348334054522, "eval_byte_accuracy": 0.9847922848664689, "eval_chrf": 88.68279915044877, "eval_sacrebleu": 85.65513337936741, "eval_word_accuracy": 0.967032967032967, "num_input_tokens_seen": 98287616, "perplexity": 1.053204300565465, "step": 1500 }, { "epoch": 0.32669839896148856, "grad_norm": 1.2578125, "learning_rate": 0.0005362736842105263, "loss": 0.0688, "num_input_tokens_seen": 98942976, "step": 1510, "train_runtime": 667.1596, "train_tokens_per_second": 148304.8 }, { "epoch": 0.32886196451752486, "grad_norm": 1.1328125, "learning_rate": 0.0005356421052631578, "loss": 0.0631, "num_input_tokens_seen": 99598336, "step": 1520, "train_runtime": 670.9705, "train_tokens_per_second": 148439.219 }, { "epoch": 0.3310255300735612, "grad_norm": 1.421875, "learning_rate": 0.0005350105263157894, "loss": 0.0648, "num_input_tokens_seen": 100253696, "step": 1530, "train_runtime": 674.7874, "train_tokens_per_second": 148570.779 }, { "epoch": 0.3331890956295976, "grad_norm": 1.1015625, "learning_rate": 0.000534378947368421, "loss": 0.0697, "num_input_tokens_seen": 100909056, "step": 1540, "train_runtime": 678.5971, "train_tokens_per_second": 148702.463 }, { "epoch": 0.33535266118563395, "grad_norm": 1.453125, "learning_rate": 0.0005337473684210526, "loss": 0.0649, "num_input_tokens_seen": 101564416, "step": 1550, "train_runtime": 682.4042, "train_tokens_per_second": 148833.213 }, { "epoch": 0.33751622674167026, "grad_norm": 1.1796875, "learning_rate": 0.0005331157894736841, "loss": 0.0753, "num_input_tokens_seen": 102219776, "step": 1560, "train_runtime": 688.2761, "train_tokens_per_second": 148515.665 }, { "epoch": 0.3396797922977066, "grad_norm": 1.6015625, "learning_rate": 0.0005324842105263157, "loss": 0.0689, "num_input_tokens_seen": 102875136, "step": 1570, "train_runtime": 692.0898, "train_tokens_per_second": 148644.199 }, { "epoch": 0.341843357853743, "grad_norm": 1.3125, "learning_rate": 0.0005318526315789473, "loss": 0.0647, "num_input_tokens_seen": 103530496, "step": 1580, "train_runtime": 695.9042, "train_tokens_per_second": 148771.187 }, { "epoch": 0.34400692340977934, "grad_norm": 0.98046875, "learning_rate": 0.0005312210526315789, "loss": 0.0651, "num_input_tokens_seen": 104185856, "step": 1590, "train_runtime": 699.6945, "train_tokens_per_second": 148901.914 }, { "epoch": 0.34617048896581565, "grad_norm": 1.421875, "learning_rate": 0.0005305894736842105, "loss": 0.0619, "num_input_tokens_seen": 104841216, "step": 1600, "train_runtime": 703.5081, "train_tokens_per_second": 149026.308 }, { "epoch": 0.34617048896581565, "eval_loss": 0.04635748267173767, "eval_runtime": 1.9682, "eval_samples_per_second": 16.259, "eval_steps_per_second": 0.508, "num_input_tokens_seen": 104841216, "step": 1600 }, { "epoch": 0.34617048896581565, "eval_byte_accuracy": 0.9885014836795252, "eval_chrf": 91.37848332537078, "eval_sacrebleu": 88.58898373597441, "eval_word_accuracy": 0.9739010989010989, "num_input_tokens_seen": 104841216, "perplexity": 1.047448788827688, "step": 1600 }, { "epoch": 0.348334054521852, "grad_norm": 1.421875, "learning_rate": 0.0005299578947368421, "loss": 0.0631, "num_input_tokens_seen": 105492480, "step": 1610, "train_runtime": 711.5187, "train_tokens_per_second": 148263.814 }, { "epoch": 0.35049762007788837, "grad_norm": 1.03125, "learning_rate": 0.0005293263157894737, "loss": 0.0639, "num_input_tokens_seen": 106147840, "step": 1620, "train_runtime": 715.3139, "train_tokens_per_second": 148393.364 }, { "epoch": 0.35266118563392473, "grad_norm": 0.63671875, "learning_rate": 0.0005286947368421053, "loss": 0.0642, "num_input_tokens_seen": 106803200, "step": 1630, "train_runtime": 719.1211, "train_tokens_per_second": 148519.067 }, { "epoch": 0.35482475118996104, "grad_norm": 0.93359375, "learning_rate": 0.0005280631578947368, "loss": 0.0652, "num_input_tokens_seen": 107458560, "step": 1640, "train_runtime": 722.9348, "train_tokens_per_second": 148642.112 }, { "epoch": 0.3569883167459974, "grad_norm": 0.97265625, "learning_rate": 0.0005274315789473684, "loss": 0.0678, "num_input_tokens_seen": 108113920, "step": 1650, "train_runtime": 726.7458, "train_tokens_per_second": 148764.421 }, { "epoch": 0.35915188230203376, "grad_norm": 0.95703125, "learning_rate": 0.0005267999999999999, "loss": 0.0592, "num_input_tokens_seen": 108765184, "step": 1660, "train_runtime": 732.6359, "train_tokens_per_second": 148457.343 }, { "epoch": 0.3613154478580701, "grad_norm": 1.2421875, "learning_rate": 0.0005261684210526315, "loss": 0.0594, "num_input_tokens_seen": 109420544, "step": 1670, "train_runtime": 736.4517, "train_tokens_per_second": 148578.034 }, { "epoch": 0.36347901341410643, "grad_norm": 1.1640625, "learning_rate": 0.0005255368421052631, "loss": 0.061, "num_input_tokens_seen": 110071808, "step": 1680, "train_runtime": 740.2501, "train_tokens_per_second": 148695.435 }, { "epoch": 0.3656425789701428, "grad_norm": 1.2265625, "learning_rate": 0.0005249052631578947, "loss": 0.0657, "num_input_tokens_seen": 110727168, "step": 1690, "train_runtime": 744.0635, "train_tokens_per_second": 148814.144 }, { "epoch": 0.36780614452617916, "grad_norm": 0.7578125, "learning_rate": 0.0005242736842105262, "loss": 0.064, "num_input_tokens_seen": 111382528, "step": 1700, "train_runtime": 747.8708, "train_tokens_per_second": 148932.838 }, { "epoch": 0.36780614452617916, "eval_loss": 0.048420779407024384, "eval_runtime": 2.1981, "eval_samples_per_second": 14.558, "eval_steps_per_second": 0.455, "num_input_tokens_seen": 111382528, "step": 1700 }, { "epoch": 0.36780614452617916, "eval_byte_accuracy": 0.9866468842729971, "eval_chrf": 89.9671333112741, "eval_sacrebleu": 87.01567892940223, "eval_word_accuracy": 0.9711538461538461, "num_input_tokens_seen": 111382528, "perplexity": 1.0496122176243317, "step": 1700 }, { "epoch": 0.36996971008221546, "grad_norm": 0.859375, "learning_rate": 0.0005236421052631578, "loss": 0.0638, "num_input_tokens_seen": 112037888, "step": 1710, "train_runtime": 755.9586, "train_tokens_per_second": 148206.383 }, { "epoch": 0.3721332756382518, "grad_norm": 0.84765625, "learning_rate": 0.0005230105263157894, "loss": 0.0679, "num_input_tokens_seen": 112693248, "step": 1720, "train_runtime": 759.7618, "train_tokens_per_second": 148327.078 }, { "epoch": 0.3742968411942882, "grad_norm": 1.4765625, "learning_rate": 0.000522378947368421, "loss": 0.0642, "num_input_tokens_seen": 113348608, "step": 1730, "train_runtime": 763.5726, "train_tokens_per_second": 148445.092 }, { "epoch": 0.37646040675032455, "grad_norm": 2.640625, "learning_rate": 0.0005217473684210526, "loss": 0.0628, "num_input_tokens_seen": 114003968, "step": 1740, "train_runtime": 767.3829, "train_tokens_per_second": 148562.035 }, { "epoch": 0.37862397230636086, "grad_norm": 0.73828125, "learning_rate": 0.0005211157894736842, "loss": 0.0647, "num_input_tokens_seen": 114659328, "step": 1750, "train_runtime": 771.2005, "train_tokens_per_second": 148676.419 }, { "epoch": 0.3807875378623972, "grad_norm": 0.953125, "learning_rate": 0.0005204842105263158, "loss": 0.0594, "num_input_tokens_seen": 115314688, "step": 1760, "train_runtime": 777.1082, "train_tokens_per_second": 148389.491 }, { "epoch": 0.3829511034184336, "grad_norm": 0.63671875, "learning_rate": 0.0005198526315789474, "loss": 0.0599, "num_input_tokens_seen": 115970048, "step": 1770, "train_runtime": 780.9251, "train_tokens_per_second": 148503.426 }, { "epoch": 0.38511466897446994, "grad_norm": 0.9140625, "learning_rate": 0.0005192210526315789, "loss": 0.0584, "num_input_tokens_seen": 116625408, "step": 1780, "train_runtime": 784.7363, "train_tokens_per_second": 148617.319 }, { "epoch": 0.38727823453050625, "grad_norm": 0.81640625, "learning_rate": 0.0005185894736842105, "loss": 0.0607, "num_input_tokens_seen": 117280768, "step": 1790, "train_runtime": 788.5519, "train_tokens_per_second": 148729.304 }, { "epoch": 0.3894418000865426, "grad_norm": 1.09375, "learning_rate": 0.0005179578947368421, "loss": 0.0586, "num_input_tokens_seen": 117936128, "step": 1800, "train_runtime": 792.3423, "train_tokens_per_second": 148844.921 }, { "epoch": 0.3894418000865426, "eval_loss": 0.044286634773015976, "eval_runtime": 2.2358, "eval_samples_per_second": 14.312, "eval_steps_per_second": 0.447, "num_input_tokens_seen": 117936128, "step": 1800 }, { "epoch": 0.3894418000865426, "eval_byte_accuracy": 0.9892433234421365, "eval_chrf": 91.74603528929435, "eval_sacrebleu": 87.57438387373494, "eval_word_accuracy": 0.9711538461538461, "num_input_tokens_seen": 117936128, "perplexity": 1.045281926100255, "step": 1800 }, { "epoch": 0.391605365642579, "grad_norm": 1.1328125, "learning_rate": 0.0005173263157894736, "loss": 0.0646, "num_input_tokens_seen": 118591488, "step": 1810, "train_runtime": 800.6612, "train_tokens_per_second": 148116.933 }, { "epoch": 0.39376893119861534, "grad_norm": 1.140625, "learning_rate": 0.0005166947368421052, "loss": 0.0613, "num_input_tokens_seen": 119246848, "step": 1820, "train_runtime": 804.478, "train_tokens_per_second": 148228.853 }, { "epoch": 0.39593249675465164, "grad_norm": 1.125, "learning_rate": 0.0005160631578947368, "loss": 0.0591, "num_input_tokens_seen": 119902208, "step": 1830, "train_runtime": 808.2907, "train_tokens_per_second": 148340.459 }, { "epoch": 0.398096062310688, "grad_norm": 1.0703125, "learning_rate": 0.0005154315789473684, "loss": 0.0627, "num_input_tokens_seen": 120557568, "step": 1840, "train_runtime": 812.0918, "train_tokens_per_second": 148453.136 }, { "epoch": 0.40025962786672437, "grad_norm": 1.2890625, "learning_rate": 0.0005147999999999999, "loss": 0.0585, "num_input_tokens_seen": 121212928, "step": 1850, "train_runtime": 815.9044, "train_tokens_per_second": 148562.656 }, { "epoch": 0.4024231934227607, "grad_norm": 0.83203125, "learning_rate": 0.0005141684210526315, "loss": 0.0587, "num_input_tokens_seen": 121868288, "step": 1860, "train_runtime": 821.7746, "train_tokens_per_second": 148298.931 }, { "epoch": 0.40458675897879703, "grad_norm": 0.84375, "learning_rate": 0.0005135368421052631, "loss": 0.0587, "num_input_tokens_seen": 122523648, "step": 1870, "train_runtime": 825.5838, "train_tokens_per_second": 148408.498 }, { "epoch": 0.4067503245348334, "grad_norm": 1.484375, "learning_rate": 0.0005129052631578947, "loss": 0.0578, "num_input_tokens_seen": 123179008, "step": 1880, "train_runtime": 829.3992, "train_tokens_per_second": 148515.94 }, { "epoch": 0.40891389009086976, "grad_norm": 1.3046875, "learning_rate": 0.0005122736842105263, "loss": 0.0583, "num_input_tokens_seen": 123834368, "step": 1890, "train_runtime": 833.208, "train_tokens_per_second": 148623.591 }, { "epoch": 0.4110774556469061, "grad_norm": 1.0078125, "learning_rate": 0.0005116421052631579, "loss": 0.0633, "num_input_tokens_seen": 124489728, "step": 1900, "train_runtime": 837.0084, "train_tokens_per_second": 148731.755 }, { "epoch": 0.4110774556469061, "eval_loss": 0.043784890323877335, "eval_runtime": 2.1418, "eval_samples_per_second": 14.94, "eval_steps_per_second": 0.467, "num_input_tokens_seen": 124489728, "step": 1900 }, { "epoch": 0.4110774556469061, "eval_byte_accuracy": 0.987759643916914, "eval_chrf": 92.28593517149247, "eval_sacrebleu": 88.12775358804903, "eval_word_accuracy": 0.9739010989010989, "num_input_tokens_seen": 124489728, "perplexity": 1.0447575932475983, "step": 1900 }, { "epoch": 0.4132410212029424, "grad_norm": 0.9375, "learning_rate": 0.0005110105263157895, "loss": 0.0548, "num_input_tokens_seen": 125145088, "step": 1910, "train_runtime": 845.0802, "train_tokens_per_second": 148086.646 }, { "epoch": 0.4154045867589788, "grad_norm": 0.7265625, "learning_rate": 0.000510378947368421, "loss": 0.0515, "num_input_tokens_seen": 125800448, "step": 1920, "train_runtime": 848.8975, "train_tokens_per_second": 148192.739 }, { "epoch": 0.41756815231501515, "grad_norm": 1.171875, "learning_rate": 0.0005097473684210526, "loss": 0.0573, "num_input_tokens_seen": 126455808, "step": 1930, "train_runtime": 852.6999, "train_tokens_per_second": 148300.488 }, { "epoch": 0.4197317178710515, "grad_norm": 0.7734375, "learning_rate": 0.0005091157894736842, "loss": 0.0562, "num_input_tokens_seen": 127111168, "step": 1940, "train_runtime": 856.5073, "train_tokens_per_second": 148406.397 }, { "epoch": 0.4218952834270878, "grad_norm": 0.76171875, "learning_rate": 0.0005084842105263157, "loss": 0.0565, "num_input_tokens_seen": 127766528, "step": 1950, "train_runtime": 860.3189, "train_tokens_per_second": 148510.659 }, { "epoch": 0.4240588489831242, "grad_norm": 0.71875, "learning_rate": 0.0005078526315789474, "loss": 0.0509, "num_input_tokens_seen": 128421888, "step": 1960, "train_runtime": 866.227, "train_tokens_per_second": 148254.317 }, { "epoch": 0.42622241453916054, "grad_norm": 0.99609375, "learning_rate": 0.0005072210526315789, "loss": 0.0567, "num_input_tokens_seen": 129077248, "step": 1970, "train_runtime": 870.0425, "train_tokens_per_second": 148357.399 }, { "epoch": 0.4283859800951969, "grad_norm": 0.80859375, "learning_rate": 0.0005065894736842105, "loss": 0.0578, "num_input_tokens_seen": 129732608, "step": 1980, "train_runtime": 873.8588, "train_tokens_per_second": 148459.456 }, { "epoch": 0.4305495456512332, "grad_norm": 0.78515625, "learning_rate": 0.000505957894736842, "loss": 0.0546, "num_input_tokens_seen": 130387968, "step": 1990, "train_runtime": 877.6726, "train_tokens_per_second": 148561.062 }, { "epoch": 0.4327131112072696, "grad_norm": 1.0625, "learning_rate": 0.0005053263157894736, "loss": 0.0545, "num_input_tokens_seen": 131043328, "step": 2000, "train_runtime": 881.4783, "train_tokens_per_second": 148663.139 }, { "epoch": 0.4327131112072696, "eval_loss": 0.038986582309007645, "eval_runtime": 2.2975, "eval_samples_per_second": 13.928, "eval_steps_per_second": 0.435, "num_input_tokens_seen": 131043328, "step": 2000 }, { "epoch": 0.4327131112072696, "eval_byte_accuracy": 0.9873887240356083, "eval_chrf": 89.9749087738313, "eval_sacrebleu": 86.45994331005899, "eval_word_accuracy": 0.9697802197802198, "num_input_tokens_seen": 131043328, "perplexity": 1.039756532424702, "step": 2000 }, { "epoch": 0.43487667676330594, "grad_norm": 0.90625, "learning_rate": 0.0005046947368421052, "loss": 0.0533, "num_input_tokens_seen": 131698688, "step": 2010, "train_runtime": 889.92, "train_tokens_per_second": 147989.351 }, { "epoch": 0.4370402423193423, "grad_norm": 0.79296875, "learning_rate": 0.0005040631578947368, "loss": 0.052, "num_input_tokens_seen": 132354048, "step": 2020, "train_runtime": 893.7334, "train_tokens_per_second": 148091.201 }, { "epoch": 0.4392038078753786, "grad_norm": 0.76171875, "learning_rate": 0.0005034315789473684, "loss": 0.0495, "num_input_tokens_seen": 133009408, "step": 2030, "train_runtime": 897.5434, "train_tokens_per_second": 148192.726 }, { "epoch": 0.44136737343141497, "grad_norm": 0.66015625, "learning_rate": 0.0005028, "loss": 0.0528, "num_input_tokens_seen": 133664768, "step": 2040, "train_runtime": 901.3573, "train_tokens_per_second": 148292.767 }, { "epoch": 0.44353093898745133, "grad_norm": 1.078125, "learning_rate": 0.0005021684210526316, "loss": 0.0521, "num_input_tokens_seen": 134320128, "step": 2050, "train_runtime": 905.1614, "train_tokens_per_second": 148393.559 }, { "epoch": 0.4456945045434877, "grad_norm": 0.8515625, "learning_rate": 0.0005015368421052631, "loss": 0.0462, "num_input_tokens_seen": 134975488, "step": 2060, "train_runtime": 911.0618, "train_tokens_per_second": 148151.843 }, { "epoch": 0.447858070099524, "grad_norm": 0.76171875, "learning_rate": 0.0005009052631578947, "loss": 0.0527, "num_input_tokens_seen": 135630848, "step": 2070, "train_runtime": 914.8729, "train_tokens_per_second": 148251.033 }, { "epoch": 0.45002163565556036, "grad_norm": 1.0234375, "learning_rate": 0.0005002736842105263, "loss": 0.0515, "num_input_tokens_seen": 136286208, "step": 2080, "train_runtime": 918.6826, "train_tokens_per_second": 148349.607 }, { "epoch": 0.4521852012115967, "grad_norm": 0.95703125, "learning_rate": 0.0004996421052631578, "loss": 0.052, "num_input_tokens_seen": 136941568, "step": 2090, "train_runtime": 922.4928, "train_tokens_per_second": 148447.301 }, { "epoch": 0.4543487667676331, "grad_norm": 1.125, "learning_rate": 0.0004990105263157895, "loss": 0.0549, "num_input_tokens_seen": 137596928, "step": 2100, "train_runtime": 926.3048, "train_tokens_per_second": 148543.903 }, { "epoch": 0.4543487667676331, "eval_loss": 0.039272863417863846, "eval_runtime": 2.2223, "eval_samples_per_second": 14.4, "eval_steps_per_second": 0.45, "num_input_tokens_seen": 137596928, "step": 2100 }, { "epoch": 0.4543487667676331, "eval_byte_accuracy": 0.9885014836795252, "eval_chrf": 90.0025148718611, "eval_sacrebleu": 87.5087779219263, "eval_word_accuracy": 0.9711538461538461, "num_input_tokens_seen": 137596928, "perplexity": 1.0400542376894084, "step": 2100 }, { "epoch": 0.4565123323236694, "grad_norm": 1.15625, "learning_rate": 0.000498378947368421, "loss": 0.0519, "num_input_tokens_seen": 138252288, "step": 2110, "train_runtime": 934.4617, "train_tokens_per_second": 147948.586 }, { "epoch": 0.45867589787970575, "grad_norm": 0.8125, "learning_rate": 0.0004977473684210526, "loss": 0.0526, "num_input_tokens_seen": 138907648, "step": 2120, "train_runtime": 938.2686, "train_tokens_per_second": 148046.777 }, { "epoch": 0.4608394634357421, "grad_norm": 1.109375, "learning_rate": 0.0004971157894736841, "loss": 0.0519, "num_input_tokens_seen": 139563008, "step": 2130, "train_runtime": 942.0767, "train_tokens_per_second": 148143.991 }, { "epoch": 0.4630030289917785, "grad_norm": 0.578125, "learning_rate": 0.0004964842105263157, "loss": 0.0491, "num_input_tokens_seen": 140218368, "step": 2140, "train_runtime": 945.8626, "train_tokens_per_second": 148243.903 }, { "epoch": 0.4651665945478148, "grad_norm": 0.5390625, "learning_rate": 0.0004958526315789473, "loss": 0.0491, "num_input_tokens_seen": 140873728, "step": 2150, "train_runtime": 949.6741, "train_tokens_per_second": 148339.015 }, { "epoch": 0.46733016010385114, "grad_norm": 0.640625, "learning_rate": 0.0004952210526315789, "loss": 0.0507, "num_input_tokens_seen": 141529088, "step": 2160, "train_runtime": 955.6014, "train_tokens_per_second": 148104.727 }, { "epoch": 0.4694937256598875, "grad_norm": 1.0703125, "learning_rate": 0.0004945894736842105, "loss": 0.0513, "num_input_tokens_seen": 142184448, "step": 2170, "train_runtime": 959.4008, "train_tokens_per_second": 148201.3 }, { "epoch": 0.47165729121592387, "grad_norm": 0.98828125, "learning_rate": 0.0004939578947368421, "loss": 0.0546, "num_input_tokens_seen": 142839808, "step": 2180, "train_runtime": 963.2144, "train_tokens_per_second": 148294.929 }, { "epoch": 0.4738208567719602, "grad_norm": 0.7578125, "learning_rate": 0.0004933263157894737, "loss": 0.0487, "num_input_tokens_seen": 143495168, "step": 2190, "train_runtime": 967.0254, "train_tokens_per_second": 148388.209 }, { "epoch": 0.47598442232799654, "grad_norm": 0.9140625, "learning_rate": 0.0004926947368421052, "loss": 0.0505, "num_input_tokens_seen": 144150528, "step": 2200, "train_runtime": 970.8322, "train_tokens_per_second": 148481.401 }, { "epoch": 0.47598442232799654, "eval_loss": 0.03245285898447037, "eval_runtime": 2.3707, "eval_samples_per_second": 13.498, "eval_steps_per_second": 0.422, "num_input_tokens_seen": 144150528, "step": 2200 }, { "epoch": 0.47598442232799654, "eval_byte_accuracy": 0.9903560830860534, "eval_chrf": 92.28294859196265, "eval_sacrebleu": 89.4412322999391, "eval_word_accuracy": 0.9766483516483516, "num_input_tokens_seen": 144150528, "perplexity": 1.032985196024998, "step": 2200 }, { "epoch": 0.4781479878840329, "grad_norm": 0.87890625, "learning_rate": 0.0004920631578947368, "loss": 0.0504, "num_input_tokens_seen": 144805888, "step": 2210, "train_runtime": 979.3143, "train_tokens_per_second": 147864.564 }, { "epoch": 0.48031155344006926, "grad_norm": 0.74609375, "learning_rate": 0.0004914315789473684, "loss": 0.0489, "num_input_tokens_seen": 145461248, "step": 2220, "train_runtime": 983.1232, "train_tokens_per_second": 147958.314 }, { "epoch": 0.48247511899610557, "grad_norm": 0.546875, "learning_rate": 0.0004907999999999999, "loss": 0.0493, "num_input_tokens_seen": 146116608, "step": 2230, "train_runtime": 986.9267, "train_tokens_per_second": 148052.14 }, { "epoch": 0.48463868455214193, "grad_norm": 0.69140625, "learning_rate": 0.0004901684210526316, "loss": 0.0488, "num_input_tokens_seen": 146771968, "step": 2240, "train_runtime": 990.7291, "train_tokens_per_second": 148145.403 }, { "epoch": 0.4868022501081783, "grad_norm": 0.87109375, "learning_rate": 0.0004895368421052631, "loss": 0.0489, "num_input_tokens_seen": 147427328, "step": 2250, "train_runtime": 994.5395, "train_tokens_per_second": 148236.777 }, { "epoch": 0.4889658156642146, "grad_norm": 0.69921875, "learning_rate": 0.0004889052631578948, "loss": 0.0454, "num_input_tokens_seen": 148074496, "step": 2260, "train_runtime": 1000.4269, "train_tokens_per_second": 148011.307 }, { "epoch": 0.49112938122025096, "grad_norm": 0.66796875, "learning_rate": 0.00048827368421052624, "loss": 0.0432, "num_input_tokens_seen": 148729856, "step": 2270, "train_runtime": 1004.2395, "train_tokens_per_second": 148101.981 }, { "epoch": 0.4932929467762873, "grad_norm": 0.90234375, "learning_rate": 0.0004876421052631579, "loss": 0.0466, "num_input_tokens_seen": 149385216, "step": 2280, "train_runtime": 1008.0401, "train_tokens_per_second": 148193.725 }, { "epoch": 0.4954565123323237, "grad_norm": 0.8515625, "learning_rate": 0.0004870105263157894, "loss": 0.0506, "num_input_tokens_seen": 150040576, "step": 2290, "train_runtime": 1011.8488, "train_tokens_per_second": 148283.587 }, { "epoch": 0.49762007788836, "grad_norm": 0.45703125, "learning_rate": 0.000486378947368421, "loss": 0.0476, "num_input_tokens_seen": 150695936, "step": 2300, "train_runtime": 1015.6644, "train_tokens_per_second": 148371.775 }, { "epoch": 0.49762007788836, "eval_loss": 0.035914402455091476, "eval_runtime": 1.9959, "eval_samples_per_second": 16.032, "eval_steps_per_second": 0.501, "num_input_tokens_seen": 150695936, "step": 2300 }, { "epoch": 0.49762007788836, "eval_byte_accuracy": 0.9888724035608308, "eval_chrf": 92.1404853165006, "eval_sacrebleu": 90.1751620485485, "eval_word_accuracy": 0.9752747252747253, "num_input_tokens_seen": 150695936, "perplexity": 1.036567115093198, "step": 2300 }, { "epoch": 0.49978364344439635, "grad_norm": 0.86328125, "learning_rate": 0.0004857473684210526, "loss": 0.0472, "num_input_tokens_seen": 151351296, "step": 2310, "train_runtime": 1023.6035, "train_tokens_per_second": 147861.259 }, { "epoch": 0.5019472090004327, "grad_norm": 0.71484375, "learning_rate": 0.0004851157894736842, "loss": 0.0497, "num_input_tokens_seen": 152006656, "step": 2320, "train_runtime": 1027.3926, "train_tokens_per_second": 147953.823 }, { "epoch": 0.504110774556469, "grad_norm": 1.0859375, "learning_rate": 0.00048448421052631576, "loss": 0.0512, "num_input_tokens_seen": 152662016, "step": 2330, "train_runtime": 1031.1996, "train_tokens_per_second": 148043.133 }, { "epoch": 0.5062743401125054, "grad_norm": 1.296875, "learning_rate": 0.0004838526315789473, "loss": 0.0481, "num_input_tokens_seen": 153317376, "step": 2340, "train_runtime": 1035.0087, "train_tokens_per_second": 148131.485 }, { "epoch": 0.5084379056685417, "grad_norm": 0.8828125, "learning_rate": 0.00048322105263157893, "loss": 0.0474, "num_input_tokens_seen": 153968640, "step": 2350, "train_runtime": 1038.8024, "train_tokens_per_second": 148217.443 }, { "epoch": 0.5106014712245781, "grad_norm": 1.0546875, "learning_rate": 0.00048258947368421046, "loss": 0.0468, "num_input_tokens_seen": 154624000, "step": 2360, "train_runtime": 1044.7364, "train_tokens_per_second": 148002.883 }, { "epoch": 0.5127650367806145, "grad_norm": 0.98046875, "learning_rate": 0.00048195789473684205, "loss": 0.0494, "num_input_tokens_seen": 155279360, "step": 2370, "train_runtime": 1048.5496, "train_tokens_per_second": 148089.669 }, { "epoch": 0.5149286023366508, "grad_norm": 1.0546875, "learning_rate": 0.00048132631578947364, "loss": 0.048, "num_input_tokens_seen": 155934720, "step": 2380, "train_runtime": 1052.3567, "train_tokens_per_second": 148176.674 }, { "epoch": 0.5170921678926872, "grad_norm": 0.7890625, "learning_rate": 0.0004806947368421052, "loss": 0.0471, "num_input_tokens_seen": 156590080, "step": 2390, "train_runtime": 1056.1663, "train_tokens_per_second": 148262.709 }, { "epoch": 0.5192557334487234, "grad_norm": 0.83984375, "learning_rate": 0.0004800631578947368, "loss": 0.0468, "num_input_tokens_seen": 157245440, "step": 2400, "train_runtime": 1059.976, "train_tokens_per_second": 148348.107 }, { "epoch": 0.5192557334487234, "eval_loss": 0.0342160128057003, "eval_runtime": 2.3939, "eval_samples_per_second": 13.367, "eval_steps_per_second": 0.418, "num_input_tokens_seen": 157245440, "step": 2400 }, { "epoch": 0.5192557334487234, "eval_byte_accuracy": 0.9888724035608308, "eval_chrf": 91.05569648119452, "eval_sacrebleu": 88.20971713831192, "eval_word_accuracy": 0.9752747252747253, "num_input_tokens_seen": 157245440, "perplexity": 1.0348081143911412, "step": 2400 }, { "epoch": 0.5214192990047598, "grad_norm": 1.125, "learning_rate": 0.00047943157894736834, "loss": 0.0463, "num_input_tokens_seen": 157900800, "step": 2410, "train_runtime": 1068.0988, "train_tokens_per_second": 147833.513 }, { "epoch": 0.5235828645607962, "grad_norm": 1.28125, "learning_rate": 0.0004788, "loss": 0.0483, "num_input_tokens_seen": 158556160, "step": 2420, "train_runtime": 1071.9094, "train_tokens_per_second": 147919.371 }, { "epoch": 0.5257464301168325, "grad_norm": 0.7109375, "learning_rate": 0.0004781684210526315, "loss": 0.0472, "num_input_tokens_seen": 159211520, "step": 2430, "train_runtime": 1075.7223, "train_tokens_per_second": 148004.294 }, { "epoch": 0.5279099956728689, "grad_norm": 0.8203125, "learning_rate": 0.0004775368421052631, "loss": 0.0492, "num_input_tokens_seen": 159866880, "step": 2440, "train_runtime": 1079.5268, "train_tokens_per_second": 148089.771 }, { "epoch": 0.5300735612289053, "grad_norm": 0.59765625, "learning_rate": 0.0004769052631578947, "loss": 0.0464, "num_input_tokens_seen": 160522240, "step": 2450, "train_runtime": 1083.3371, "train_tokens_per_second": 148173.859 }, { "epoch": 0.5322371267849416, "grad_norm": 0.703125, "learning_rate": 0.0004762736842105263, "loss": 0.0459, "num_input_tokens_seen": 161177600, "step": 2460, "train_runtime": 1089.2108, "train_tokens_per_second": 147976.498 }, { "epoch": 0.534400692340978, "grad_norm": 0.94140625, "learning_rate": 0.00047564210526315786, "loss": 0.0445, "num_input_tokens_seen": 161832960, "step": 2470, "train_runtime": 1093.0188, "train_tokens_per_second": 148060.54 }, { "epoch": 0.5365642578970142, "grad_norm": 0.9453125, "learning_rate": 0.0004750105263157894, "loss": 0.0444, "num_input_tokens_seen": 162488320, "step": 2480, "train_runtime": 1096.8243, "train_tokens_per_second": 148144.343 }, { "epoch": 0.5387278234530506, "grad_norm": 0.8828125, "learning_rate": 0.00047437894736842103, "loss": 0.0466, "num_input_tokens_seen": 163139584, "step": 2490, "train_runtime": 1100.616, "train_tokens_per_second": 148225.706 }, { "epoch": 0.540891389009087, "grad_norm": 0.71484375, "learning_rate": 0.00047374736842105257, "loss": 0.0436, "num_input_tokens_seen": 163794944, "step": 2500, "train_runtime": 1104.4217, "train_tokens_per_second": 148308.341 }, { "epoch": 0.540891389009087, "eval_loss": 0.03495877608656883, "eval_runtime": 1.9919, "eval_samples_per_second": 16.065, "eval_steps_per_second": 0.502, "num_input_tokens_seen": 163794944, "step": 2500 }, { "epoch": 0.540891389009087, "eval_byte_accuracy": 0.990727002967359, "eval_chrf": 94.08434578822838, "eval_sacrebleu": 91.39427252834908, "eval_word_accuracy": 0.9807692307692307, "num_input_tokens_seen": 163794944, "perplexity": 1.0355770173823597, "step": 2500 }, { "epoch": 0.5430549545651233, "grad_norm": 0.56640625, "learning_rate": 0.00047311578947368415, "loss": 0.0461, "num_input_tokens_seen": 164450304, "step": 2510, "train_runtime": 1112.3764, "train_tokens_per_second": 147836.922 }, { "epoch": 0.5452185201211597, "grad_norm": 0.73046875, "learning_rate": 0.00047248421052631574, "loss": 0.0425, "num_input_tokens_seen": 165105664, "step": 2520, "train_runtime": 1116.5491, "train_tokens_per_second": 147871.39 }, { "epoch": 0.547382085677196, "grad_norm": 1.09375, "learning_rate": 0.0004718526315789473, "loss": 0.0446, "num_input_tokens_seen": 165761024, "step": 2530, "train_runtime": 1120.3597, "train_tokens_per_second": 147953.403 }, { "epoch": 0.5495456512332324, "grad_norm": 0.8125, "learning_rate": 0.0004712210526315789, "loss": 0.0441, "num_input_tokens_seen": 166412288, "step": 2540, "train_runtime": 1124.1458, "train_tokens_per_second": 148034.431 }, { "epoch": 0.5517092167892688, "grad_norm": 0.6796875, "learning_rate": 0.0004705894736842105, "loss": 0.0481, "num_input_tokens_seen": 167067648, "step": 2550, "train_runtime": 1127.9533, "train_tokens_per_second": 148115.745 }, { "epoch": 0.553872782345305, "grad_norm": 0.65625, "learning_rate": 0.0004699578947368421, "loss": 0.0436, "num_input_tokens_seen": 167723008, "step": 2560, "train_runtime": 1133.6639, "train_tokens_per_second": 147947.74 }, { "epoch": 0.5560363479013414, "grad_norm": 0.66796875, "learning_rate": 0.0004693263157894736, "loss": 0.0436, "num_input_tokens_seen": 168378368, "step": 2570, "train_runtime": 1137.4754, "train_tokens_per_second": 148028.141 }, { "epoch": 0.5581999134573777, "grad_norm": 0.5703125, "learning_rate": 0.0004686947368421052, "loss": 0.0451, "num_input_tokens_seen": 169033728, "step": 2580, "train_runtime": 1141.2806, "train_tokens_per_second": 148108.818 }, { "epoch": 0.5603634790134141, "grad_norm": 0.73828125, "learning_rate": 0.0004680631578947368, "loss": 0.0413, "num_input_tokens_seen": 169684992, "step": 2590, "train_runtime": 1145.0582, "train_tokens_per_second": 148188.968 }, { "epoch": 0.5625270445694505, "grad_norm": 1.0625, "learning_rate": 0.0004674315789473684, "loss": 0.0466, "num_input_tokens_seen": 170340352, "step": 2600, "train_runtime": 1148.8626, "train_tokens_per_second": 148268.694 }, { "epoch": 0.5625270445694505, "eval_loss": 0.035643137991428375, "eval_runtime": 2.2313, "eval_samples_per_second": 14.342, "eval_steps_per_second": 0.448, "num_input_tokens_seen": 170340352, "step": 2600 }, { "epoch": 0.5625270445694505, "eval_byte_accuracy": 0.9888724035608308, "eval_chrf": 92.92340614755764, "eval_sacrebleu": 90.15152547055455, "eval_word_accuracy": 0.9766483516483516, "num_input_tokens_seen": 170340352, "perplexity": 1.0362859694048128, "step": 2600 }, { "epoch": 0.5646906101254868, "grad_norm": 0.859375, "learning_rate": 0.00046679999999999996, "loss": 0.0478, "num_input_tokens_seen": 170995712, "step": 2610, "train_runtime": 1156.9829, "train_tokens_per_second": 147794.504 }, { "epoch": 0.5668541756815232, "grad_norm": 0.9609375, "learning_rate": 0.00046616842105263155, "loss": 0.0508, "num_input_tokens_seen": 171651072, "step": 2620, "train_runtime": 1160.7918, "train_tokens_per_second": 147874.125 }, { "epoch": 0.5690177412375595, "grad_norm": 0.83203125, "learning_rate": 0.00046553684210526314, "loss": 0.048, "num_input_tokens_seen": 172306432, "step": 2630, "train_runtime": 1164.6014, "train_tokens_per_second": 147953.141 }, { "epoch": 0.5711813067935958, "grad_norm": 0.6328125, "learning_rate": 0.00046490526315789467, "loss": 0.0451, "num_input_tokens_seen": 172961792, "step": 2640, "train_runtime": 1168.4095, "train_tokens_per_second": 148031.825 }, { "epoch": 0.5733448723496322, "grad_norm": 1.171875, "learning_rate": 0.00046427368421052625, "loss": 0.0409, "num_input_tokens_seen": 173617152, "step": 2650, "train_runtime": 1172.221, "train_tokens_per_second": 148109.573 }, { "epoch": 0.5755084379056685, "grad_norm": 1.03125, "learning_rate": 0.00046364210526315784, "loss": 0.0424, "num_input_tokens_seen": 174268416, "step": 2660, "train_runtime": 1178.2999, "train_tokens_per_second": 147898.18 }, { "epoch": 0.5776720034617049, "grad_norm": 0.9765625, "learning_rate": 0.0004630105263157894, "loss": 0.0472, "num_input_tokens_seen": 174923776, "step": 2670, "train_runtime": 1182.1031, "train_tokens_per_second": 147976.753 }, { "epoch": 0.5798355690177412, "grad_norm": 0.95703125, "learning_rate": 0.000462378947368421, "loss": 0.0511, "num_input_tokens_seen": 175579136, "step": 2680, "train_runtime": 1185.9109, "train_tokens_per_second": 148054.242 }, { "epoch": 0.5819991345737776, "grad_norm": 0.953125, "learning_rate": 0.0004617473684210526, "loss": 0.0438, "num_input_tokens_seen": 176234496, "step": 2690, "train_runtime": 1189.7267, "train_tokens_per_second": 148130.241 }, { "epoch": 0.584162700129814, "grad_norm": 1.265625, "learning_rate": 0.0004611157894736842, "loss": 0.0485, "num_input_tokens_seen": 176889856, "step": 2700, "train_runtime": 1193.5458, "train_tokens_per_second": 148205.339 }, { "epoch": 0.584162700129814, "eval_loss": 0.031199950724840164, "eval_runtime": 1.9652, "eval_samples_per_second": 16.283, "eval_steps_per_second": 0.509, "num_input_tokens_seen": 176889856, "step": 2700 }, { "epoch": 0.584162700129814, "eval_byte_accuracy": 0.9918397626112759, "eval_chrf": 92.79812823458536, "eval_sacrebleu": 90.75543017558854, "eval_word_accuracy": 0.978021978021978, "num_input_tokens_seen": 176889856, "perplexity": 1.0316917707816073, "step": 2700 }, { "epoch": 0.5863262656858503, "grad_norm": 0.65234375, "learning_rate": 0.0004604842105263157, "loss": 0.0387, "num_input_tokens_seen": 177545216, "step": 2710, "train_runtime": 1201.2331, "train_tokens_per_second": 147802.468 }, { "epoch": 0.5884898312418866, "grad_norm": 0.66015625, "learning_rate": 0.00045985263157894736, "loss": 0.0434, "num_input_tokens_seen": 178200576, "step": 2720, "train_runtime": 1205.3285, "train_tokens_per_second": 147843.994 }, { "epoch": 0.590653396797923, "grad_norm": 0.75390625, "learning_rate": 0.0004592210526315789, "loss": 0.042, "num_input_tokens_seen": 178855936, "step": 2730, "train_runtime": 1209.1463, "train_tokens_per_second": 147919.183 }, { "epoch": 0.5928169623539593, "grad_norm": 0.90234375, "learning_rate": 0.0004585894736842105, "loss": 0.0414, "num_input_tokens_seen": 179503104, "step": 2740, "train_runtime": 1212.915, "train_tokens_per_second": 147993.142 }, { "epoch": 0.5949805279099957, "grad_norm": 0.72265625, "learning_rate": 0.00045795789473684206, "loss": 0.038, "num_input_tokens_seen": 180158464, "step": 2750, "train_runtime": 1216.7277, "train_tokens_per_second": 148068.023 }, { "epoch": 0.597144093466032, "grad_norm": 0.84375, "learning_rate": 0.00045732631578947365, "loss": 0.0431, "num_input_tokens_seen": 180813824, "step": 2760, "train_runtime": 1222.4427, "train_tokens_per_second": 147911.901 }, { "epoch": 0.5993076590220684, "grad_norm": 0.6875, "learning_rate": 0.00045669473684210524, "loss": 0.0422, "num_input_tokens_seen": 181469184, "step": 2770, "train_runtime": 1226.2503, "train_tokens_per_second": 147987.072 }, { "epoch": 0.6014712245781048, "grad_norm": 1.109375, "learning_rate": 0.0004560631578947368, "loss": 0.0398, "num_input_tokens_seen": 182124544, "step": 2780, "train_runtime": 1230.0502, "train_tokens_per_second": 148062.688 }, { "epoch": 0.6036347901341411, "grad_norm": 0.79296875, "learning_rate": 0.0004554315789473684, "loss": 0.0384, "num_input_tokens_seen": 182779904, "step": 2790, "train_runtime": 1233.8548, "train_tokens_per_second": 148137.288 }, { "epoch": 0.6057983556901774, "grad_norm": 0.96484375, "learning_rate": 0.00045479999999999994, "loss": 0.0455, "num_input_tokens_seen": 183435264, "step": 2800, "train_runtime": 1237.6628, "train_tokens_per_second": 148211.014 }, { "epoch": 0.6057983556901774, "eval_loss": 0.03436446934938431, "eval_runtime": 2.0082, "eval_samples_per_second": 15.935, "eval_steps_per_second": 0.498, "num_input_tokens_seen": 183435264, "step": 2800 }, { "epoch": 0.6057983556901774, "eval_byte_accuracy": 0.9896142433234422, "eval_chrf": 92.24485602910795, "eval_sacrebleu": 89.54669354297549, "eval_word_accuracy": 0.9766483516483516, "num_input_tokens_seen": 183435264, "perplexity": 1.0349617498309909, "step": 2800 }, { "epoch": 0.6079619212462137, "grad_norm": 0.76171875, "learning_rate": 0.00045416842105263153, "loss": 0.0402, "num_input_tokens_seen": 184090624, "step": 2810, "train_runtime": 1245.5671, "train_tokens_per_second": 147796.639 }, { "epoch": 0.6101254868022501, "grad_norm": 0.66015625, "learning_rate": 0.0004535368421052631, "loss": 0.041, "num_input_tokens_seen": 184745984, "step": 2820, "train_runtime": 1249.3742, "train_tokens_per_second": 147870.819 }, { "epoch": 0.6122890523582865, "grad_norm": 0.67578125, "learning_rate": 0.0004529052631578947, "loss": 0.0399, "num_input_tokens_seen": 185401344, "step": 2830, "train_runtime": 1253.1873, "train_tokens_per_second": 147943.844 }, { "epoch": 0.6144526179143228, "grad_norm": 0.84765625, "learning_rate": 0.0004522736842105263, "loss": 0.0418, "num_input_tokens_seen": 186056704, "step": 2840, "train_runtime": 1256.9935, "train_tokens_per_second": 148017.236 }, { "epoch": 0.6166161834703592, "grad_norm": 0.5546875, "learning_rate": 0.0004516421052631579, "loss": 0.0414, "num_input_tokens_seen": 186712064, "step": 2850, "train_runtime": 1260.8015, "train_tokens_per_second": 148089.971 }, { "epoch": 0.6187797490263955, "grad_norm": 0.55859375, "learning_rate": 0.00045101052631578946, "loss": 0.0415, "num_input_tokens_seen": 187367424, "step": 2860, "train_runtime": 1266.8963, "train_tokens_per_second": 147894.834 }, { "epoch": 0.6209433145824318, "grad_norm": 0.7421875, "learning_rate": 0.000450378947368421, "loss": 0.0428, "num_input_tokens_seen": 188022784, "step": 2870, "train_runtime": 1270.7058, "train_tokens_per_second": 147967.208 }, { "epoch": 0.6231068801384682, "grad_norm": 0.494140625, "learning_rate": 0.0004497473684210526, "loss": 0.0416, "num_input_tokens_seen": 188678144, "step": 2880, "train_runtime": 1274.5129, "train_tokens_per_second": 148039.413 }, { "epoch": 0.6252704456945045, "grad_norm": 0.97265625, "learning_rate": 0.00044911578947368417, "loss": 0.0411, "num_input_tokens_seen": 189333504, "step": 2890, "train_runtime": 1278.3223, "train_tokens_per_second": 148110.924 }, { "epoch": 0.6274340112505409, "grad_norm": 0.640625, "learning_rate": 0.00044848421052631575, "loss": 0.0402, "num_input_tokens_seen": 189988864, "step": 2900, "train_runtime": 1282.1268, "train_tokens_per_second": 148182.58 }, { "epoch": 0.6274340112505409, "eval_loss": 0.02733401581645012, "eval_runtime": 1.9425, "eval_samples_per_second": 16.474, "eval_steps_per_second": 0.515, "num_input_tokens_seen": 189988864, "step": 2900 }, { "epoch": 0.6274340112505409, "eval_byte_accuracy": 0.9914688427299704, "eval_chrf": 92.14666454548373, "eval_sacrebleu": 90.64406606563134, "eval_word_accuracy": 0.978021978021978, "num_input_tokens_seen": 189988864, "perplexity": 1.0277110171752541, "step": 2900 }, { "epoch": 0.6295975768065772, "grad_norm": 0.79296875, "learning_rate": 0.00044785263157894734, "loss": 0.0409, "num_input_tokens_seen": 190644224, "step": 2910, "train_runtime": 1290.0163, "train_tokens_per_second": 147784.353 }, { "epoch": 0.6317611423626136, "grad_norm": 0.8203125, "learning_rate": 0.0004472210526315789, "loss": 0.0421, "num_input_tokens_seen": 191299584, "step": 2920, "train_runtime": 1293.8198, "train_tokens_per_second": 147856.432 }, { "epoch": 0.63392470791865, "grad_norm": 0.80078125, "learning_rate": 0.0004465894736842105, "loss": 0.0406, "num_input_tokens_seen": 191954944, "step": 2930, "train_runtime": 1297.6244, "train_tokens_per_second": 147927.965 }, { "epoch": 0.6360882734746863, "grad_norm": 1.0703125, "learning_rate": 0.00044595789473684204, "loss": 0.0414, "num_input_tokens_seen": 192610304, "step": 2940, "train_runtime": 1301.4321, "train_tokens_per_second": 147998.74 }, { "epoch": 0.6382518390307226, "grad_norm": 0.6796875, "learning_rate": 0.00044532631578947363, "loss": 0.0393, "num_input_tokens_seen": 193265664, "step": 2950, "train_runtime": 1305.2438, "train_tokens_per_second": 148068.632 }, { "epoch": 0.6404154045867589, "grad_norm": 0.796875, "learning_rate": 0.0004446947368421052, "loss": 0.0414, "num_input_tokens_seen": 193921024, "step": 2960, "train_runtime": 1311.2552, "train_tokens_per_second": 147889.609 }, { "epoch": 0.6425789701427953, "grad_norm": 0.71484375, "learning_rate": 0.0004440631578947368, "loss": 0.0424, "num_input_tokens_seen": 194576384, "step": 2970, "train_runtime": 1315.0601, "train_tokens_per_second": 147960.072 }, { "epoch": 0.6447425356988317, "grad_norm": 1.1171875, "learning_rate": 0.0004434315789473684, "loss": 0.04, "num_input_tokens_seen": 195231744, "step": 2980, "train_runtime": 1318.8693, "train_tokens_per_second": 148029.641 }, { "epoch": 0.646906101254868, "grad_norm": 0.51171875, "learning_rate": 0.0004428, "loss": 0.0398, "num_input_tokens_seen": 195887104, "step": 2990, "train_runtime": 1322.6799, "train_tokens_per_second": 148098.645 }, { "epoch": 0.6490696668109044, "grad_norm": 0.5625, "learning_rate": 0.00044216842105263156, "loss": 0.0389, "num_input_tokens_seen": 196542464, "step": 3000, "train_runtime": 1326.4813, "train_tokens_per_second": 148168.289 }, { "epoch": 0.6490696668109044, "eval_loss": 0.028482545167207718, "eval_runtime": 1.9015, "eval_samples_per_second": 16.829, "eval_steps_per_second": 0.526, "num_input_tokens_seen": 196542464, "step": 3000 }, { "epoch": 0.6490696668109044, "eval_byte_accuracy": 0.9910979228486647, "eval_chrf": 94.2210646218814, "eval_sacrebleu": 91.0806734556956, "eval_word_accuracy": 0.9793956043956044, "num_input_tokens_seen": 196542464, "perplexity": 1.0288920515390652, "step": 3000 }, { "epoch": 0.6512332323669408, "grad_norm": 0.90234375, "learning_rate": 0.00044153684210526315, "loss": 0.0402, "num_input_tokens_seen": 197197824, "step": 3010, "train_runtime": 1334.1984, "train_tokens_per_second": 147802.479 }, { "epoch": 0.6533967979229771, "grad_norm": 1.03125, "learning_rate": 0.0004409052631578947, "loss": 0.0398, "num_input_tokens_seen": 197853184, "step": 3020, "train_runtime": 1338.0111, "train_tokens_per_second": 147871.114 }, { "epoch": 0.6555603634790134, "grad_norm": 0.65234375, "learning_rate": 0.00044027368421052627, "loss": 0.0432, "num_input_tokens_seen": 198508544, "step": 3030, "train_runtime": 1341.8307, "train_tokens_per_second": 147938.589 }, { "epoch": 0.6577239290350497, "grad_norm": 0.85546875, "learning_rate": 0.00043964210526315785, "loss": 0.0371, "num_input_tokens_seen": 199163904, "step": 3040, "train_runtime": 1345.648, "train_tokens_per_second": 148005.948 }, { "epoch": 0.6598874945910861, "grad_norm": 0.8515625, "learning_rate": 0.00043901052631578944, "loss": 0.0386, "num_input_tokens_seen": 199819264, "step": 3050, "train_runtime": 1349.4633, "train_tokens_per_second": 148073.135 }, { "epoch": 0.6620510601471224, "grad_norm": 0.89453125, "learning_rate": 0.000438378947368421, "loss": 0.0368, "num_input_tokens_seen": 200474624, "step": 3060, "train_runtime": 1355.5535, "train_tokens_per_second": 147891.339 }, { "epoch": 0.6642146257031588, "grad_norm": 0.6875, "learning_rate": 0.0004377473684210526, "loss": 0.0402, "num_input_tokens_seen": 201129984, "step": 3070, "train_runtime": 1359.3648, "train_tokens_per_second": 147958.801 }, { "epoch": 0.6663781912591952, "grad_norm": 1.0703125, "learning_rate": 0.0004371157894736842, "loss": 0.0403, "num_input_tokens_seen": 201785344, "step": 3080, "train_runtime": 1363.1664, "train_tokens_per_second": 148026.936 }, { "epoch": 0.6685417568152315, "grad_norm": 0.8203125, "learning_rate": 0.00043648421052631573, "loss": 0.04, "num_input_tokens_seen": 202440704, "step": 3090, "train_runtime": 1366.9795, "train_tokens_per_second": 148093.446 }, { "epoch": 0.6707053223712679, "grad_norm": 0.63671875, "learning_rate": 0.0004358526315789473, "loss": 0.0369, "num_input_tokens_seen": 203096064, "step": 3100, "train_runtime": 1370.7841, "train_tokens_per_second": 148160.506 }, { "epoch": 0.6707053223712679, "eval_loss": 0.0291756484657526, "eval_runtime": 2.1304, "eval_samples_per_second": 15.021, "eval_steps_per_second": 0.469, "num_input_tokens_seen": 203096064, "step": 3100 }, { "epoch": 0.6707053223712679, "eval_byte_accuracy": 0.9914688427299704, "eval_chrf": 93.86777853094796, "eval_sacrebleu": 93.01282984759166, "eval_word_accuracy": 0.9807692307692307, "num_input_tokens_seen": 203096064, "perplexity": 1.0296054272067894, "step": 3100 }, { "epoch": 0.6728688879273041, "grad_norm": 0.890625, "learning_rate": 0.0004352210526315789, "loss": 0.0383, "num_input_tokens_seen": 203751424, "step": 3110, "train_runtime": 1378.8411, "train_tokens_per_second": 147770.049 }, { "epoch": 0.6750324534833405, "grad_norm": 0.703125, "learning_rate": 0.0004345894736842105, "loss": 0.0387, "num_input_tokens_seen": 204406784, "step": 3120, "train_runtime": 1382.6559, "train_tokens_per_second": 147836.333 }, { "epoch": 0.6771960190393769, "grad_norm": 1.0, "learning_rate": 0.0004339578947368421, "loss": 0.0366, "num_input_tokens_seen": 205062144, "step": 3130, "train_runtime": 1386.4677, "train_tokens_per_second": 147902.572 }, { "epoch": 0.6793595845954132, "grad_norm": 0.56640625, "learning_rate": 0.00043332631578947366, "loss": 0.0396, "num_input_tokens_seen": 205717504, "step": 3140, "train_runtime": 1390.2775, "train_tokens_per_second": 147968.665 }, { "epoch": 0.6815231501514496, "grad_norm": 0.69140625, "learning_rate": 0.00043269473684210525, "loss": 0.0368, "num_input_tokens_seen": 206372864, "step": 3150, "train_runtime": 1394.0902, "train_tokens_per_second": 148034.082 }, { "epoch": 0.683686715707486, "grad_norm": 0.392578125, "learning_rate": 0.0004320631578947368, "loss": 0.0364, "num_input_tokens_seen": 207028224, "step": 3160, "train_runtime": 1400.0318, "train_tokens_per_second": 147873.939 }, { "epoch": 0.6858502812635223, "grad_norm": 0.7734375, "learning_rate": 0.00043143157894736837, "loss": 0.0365, "num_input_tokens_seen": 207683584, "step": 3170, "train_runtime": 1403.8501, "train_tokens_per_second": 147938.576 }, { "epoch": 0.6880138468195587, "grad_norm": 0.578125, "learning_rate": 0.00043079999999999995, "loss": 0.0381, "num_input_tokens_seen": 208338944, "step": 3180, "train_runtime": 1407.6691, "train_tokens_per_second": 148002.779 }, { "epoch": 0.6901774123755949, "grad_norm": 0.84375, "learning_rate": 0.00043016842105263154, "loss": 0.0363, "num_input_tokens_seen": 208994304, "step": 3190, "train_runtime": 1411.485, "train_tokens_per_second": 148066.971 }, { "epoch": 0.6923409779316313, "grad_norm": 0.96875, "learning_rate": 0.00042953684210526313, "loss": 0.0378, "num_input_tokens_seen": 209649664, "step": 3200, "train_runtime": 1415.2661, "train_tokens_per_second": 148134.448 }, { "epoch": 0.6923409779316313, "eval_loss": 0.025803592056035995, "eval_runtime": 2.1635, "eval_samples_per_second": 14.791, "eval_steps_per_second": 0.462, "num_input_tokens_seen": 209649664, "step": 3200 }, { "epoch": 0.6923409779316313, "eval_byte_accuracy": 0.9936943620178041, "eval_chrf": 95.77357270335774, "eval_sacrebleu": 95.12443110170639, "eval_word_accuracy": 0.9862637362637363, "num_input_tokens_seen": 209649664, "perplexity": 1.026139386752756, "step": 3200 }, { "epoch": 0.6945045434876677, "grad_norm": 0.765625, "learning_rate": 0.0004289052631578947, "loss": 0.0383, "num_input_tokens_seen": 210305024, "step": 3210, "train_runtime": 1423.5396, "train_tokens_per_second": 147733.872 }, { "epoch": 0.696668109043704, "grad_norm": 1.3125, "learning_rate": 0.0004282736842105263, "loss": 0.0417, "num_input_tokens_seen": 210960384, "step": 3220, "train_runtime": 1427.3546, "train_tokens_per_second": 147798.161 }, { "epoch": 0.6988316745997404, "grad_norm": 1.0078125, "learning_rate": 0.00042764210526315783, "loss": 0.0363, "num_input_tokens_seen": 211615744, "step": 3230, "train_runtime": 1431.1454, "train_tokens_per_second": 147864.603 }, { "epoch": 0.7009952401557767, "grad_norm": 0.36328125, "learning_rate": 0.0004270105263157895, "loss": 0.0377, "num_input_tokens_seen": 212271104, "step": 3240, "train_runtime": 1434.9622, "train_tokens_per_second": 147928.013 }, { "epoch": 0.7031588057118131, "grad_norm": 0.53125, "learning_rate": 0.000426378947368421, "loss": 0.036, "num_input_tokens_seen": 212926464, "step": 3250, "train_runtime": 1438.7773, "train_tokens_per_second": 147991.258 }, { "epoch": 0.7053223712678495, "grad_norm": 0.58203125, "learning_rate": 0.0004257473684210526, "loss": 0.0322, "num_input_tokens_seen": 213581824, "step": 3260, "train_runtime": 1444.7363, "train_tokens_per_second": 147834.469 }, { "epoch": 0.7074859368238857, "grad_norm": 0.6953125, "learning_rate": 0.0004251157894736842, "loss": 0.0373, "num_input_tokens_seen": 214237184, "step": 3270, "train_runtime": 1448.5508, "train_tokens_per_second": 147897.598 }, { "epoch": 0.7096495023799221, "grad_norm": 0.6875, "learning_rate": 0.00042448421052631576, "loss": 0.0358, "num_input_tokens_seen": 214892544, "step": 3280, "train_runtime": 1452.3774, "train_tokens_per_second": 147959.161 }, { "epoch": 0.7118130679359584, "grad_norm": 0.53125, "learning_rate": 0.00042385263157894735, "loss": 0.0407, "num_input_tokens_seen": 215547904, "step": 3290, "train_runtime": 1456.1897, "train_tokens_per_second": 148021.855 }, { "epoch": 0.7139766334919948, "grad_norm": 0.55859375, "learning_rate": 0.0004232210526315789, "loss": 0.0365, "num_input_tokens_seen": 216203264, "step": 3300, "train_runtime": 1460.0024, "train_tokens_per_second": 148084.185 }, { "epoch": 0.7139766334919948, "eval_loss": 0.025559077039361, "eval_runtime": 2.0916, "eval_samples_per_second": 15.3, "eval_steps_per_second": 0.478, "num_input_tokens_seen": 216203264, "step": 3300 }, { "epoch": 0.7139766334919948, "eval_byte_accuracy": 0.9910979228486647, "eval_chrf": 93.62674522126692, "eval_sacrebleu": 92.43274818870434, "eval_word_accuracy": 0.9807692307692307, "num_input_tokens_seen": 216203264, "perplexity": 1.0258885109361953, "step": 3300 }, { "epoch": 0.7161401990480312, "grad_norm": 0.7265625, "learning_rate": 0.0004225894736842105, "loss": 0.0393, "num_input_tokens_seen": 216858624, "step": 3310, "train_runtime": 1468.0444, "train_tokens_per_second": 147719.393 }, { "epoch": 0.7183037646040675, "grad_norm": 0.91015625, "learning_rate": 0.00042195789473684206, "loss": 0.0393, "num_input_tokens_seen": 217513984, "step": 3320, "train_runtime": 1471.8608, "train_tokens_per_second": 147781.623 }, { "epoch": 0.7204673301601039, "grad_norm": 0.8359375, "learning_rate": 0.00042132631578947364, "loss": 0.0373, "num_input_tokens_seen": 218169344, "step": 3330, "train_runtime": 1475.669, "train_tokens_per_second": 147844.362 }, { "epoch": 0.7226308957161403, "grad_norm": 0.76953125, "learning_rate": 0.00042069473684210523, "loss": 0.0358, "num_input_tokens_seen": 218824704, "step": 3340, "train_runtime": 1479.4858, "train_tokens_per_second": 147905.919 }, { "epoch": 0.7247944612721765, "grad_norm": 0.6875, "learning_rate": 0.0004200631578947368, "loss": 0.0347, "num_input_tokens_seen": 219480064, "step": 3350, "train_runtime": 1483.2947, "train_tokens_per_second": 147967.941 }, { "epoch": 0.7269580268282129, "grad_norm": 0.58203125, "learning_rate": 0.0004194315789473684, "loss": 0.0365, "num_input_tokens_seen": 220135424, "step": 3360, "train_runtime": 1489.4844, "train_tokens_per_second": 147793.038 }, { "epoch": 0.7291215923842492, "grad_norm": 1.1171875, "learning_rate": 0.00041879999999999993, "loss": 0.0389, "num_input_tokens_seen": 220790784, "step": 3370, "train_runtime": 1493.2964, "train_tokens_per_second": 147854.625 }, { "epoch": 0.7312851579402856, "grad_norm": 0.65234375, "learning_rate": 0.0004181684210526316, "loss": 0.0361, "num_input_tokens_seen": 221446144, "step": 3380, "train_runtime": 1497.1061, "train_tokens_per_second": 147916.135 }, { "epoch": 0.733448723496322, "grad_norm": 0.76171875, "learning_rate": 0.0004175368421052631, "loss": 0.0333, "num_input_tokens_seen": 222101504, "step": 3390, "train_runtime": 1500.9269, "train_tokens_per_second": 147976.229 }, { "epoch": 0.7356122890523583, "grad_norm": 0.53515625, "learning_rate": 0.0004169052631578947, "loss": 0.0331, "num_input_tokens_seen": 222756864, "step": 3400, "train_runtime": 1504.7401, "train_tokens_per_second": 148036.77 }, { "epoch": 0.7356122890523583, "eval_loss": 0.02346055395901203, "eval_runtime": 2.1561, "eval_samples_per_second": 14.841, "eval_steps_per_second": 0.464, "num_input_tokens_seen": 222756864, "step": 3400 }, { "epoch": 0.7356122890523583, "eval_byte_accuracy": 0.9933234421364985, "eval_chrf": 95.48377466124089, "eval_sacrebleu": 94.0722064625731, "eval_word_accuracy": 0.9848901098901099, "num_input_tokens_seen": 222756864, "perplexity": 1.0237379175422994, "step": 3400 }, { "epoch": 0.7377758546083947, "grad_norm": 0.98046875, "learning_rate": 0.0004162736842105263, "loss": 0.0319, "num_input_tokens_seen": 223412224, "step": 3410, "train_runtime": 1512.839, "train_tokens_per_second": 147677.464 }, { "epoch": 0.7399394201644309, "grad_norm": 0.83984375, "learning_rate": 0.00041564210526315787, "loss": 0.0348, "num_input_tokens_seen": 224067584, "step": 3420, "train_runtime": 1516.657, "train_tokens_per_second": 147737.809 }, { "epoch": 0.7421029857204673, "grad_norm": 0.6171875, "learning_rate": 0.00041501052631578945, "loss": 0.0341, "num_input_tokens_seen": 224722944, "step": 3430, "train_runtime": 1520.4755, "train_tokens_per_second": 147797.808 }, { "epoch": 0.7442665512765037, "grad_norm": 0.6171875, "learning_rate": 0.000414378947368421, "loss": 0.0337, "num_input_tokens_seen": 225378304, "step": 3440, "train_runtime": 1524.2832, "train_tokens_per_second": 147858.554 }, { "epoch": 0.74643011683254, "grad_norm": 0.62890625, "learning_rate": 0.0004137473684210526, "loss": 0.038, "num_input_tokens_seen": 226033664, "step": 3450, "train_runtime": 1528.0999, "train_tokens_per_second": 147918.116 }, { "epoch": 0.7485936823885764, "grad_norm": 0.625, "learning_rate": 0.00041311578947368416, "loss": 0.0326, "num_input_tokens_seen": 226684928, "step": 3460, "train_runtime": 1534.0356, "train_tokens_per_second": 147770.314 }, { "epoch": 0.7507572479446127, "grad_norm": 0.92578125, "learning_rate": 0.0004124842105263158, "loss": 0.0351, "num_input_tokens_seen": 227340288, "step": 3470, "train_runtime": 1537.8508, "train_tokens_per_second": 147829.871 }, { "epoch": 0.7529208135006491, "grad_norm": 0.58984375, "learning_rate": 0.00041185263157894733, "loss": 0.0312, "num_input_tokens_seen": 227995648, "step": 3480, "train_runtime": 1541.6591, "train_tokens_per_second": 147889.795 }, { "epoch": 0.7550843790566855, "grad_norm": 0.78125, "learning_rate": 0.0004112210526315789, "loss": 0.0345, "num_input_tokens_seen": 228651008, "step": 3490, "train_runtime": 1545.4724, "train_tokens_per_second": 147948.943 }, { "epoch": 0.7572479446127217, "grad_norm": 0.95703125, "learning_rate": 0.0004105894736842105, "loss": 0.0375, "num_input_tokens_seen": 229306368, "step": 3500, "train_runtime": 1549.2818, "train_tokens_per_second": 148008.174 }, { "epoch": 0.7572479446127217, "eval_loss": 0.02329455316066742, "eval_runtime": 2.0292, "eval_samples_per_second": 15.77, "eval_steps_per_second": 0.493, "num_input_tokens_seen": 229306368, "step": 3500 }, { "epoch": 0.7572479446127217, "eval_byte_accuracy": 0.9940652818991098, "eval_chrf": 95.63532154828998, "eval_sacrebleu": 94.57672912284198, "eval_word_accuracy": 0.9835164835164835, "num_input_tokens_seen": 229306368, "perplexity": 1.023567990335108, "step": 3500 }, { "epoch": 0.7594115101687581, "grad_norm": 0.62890625, "learning_rate": 0.00040995789473684204, "loss": 0.0336, "num_input_tokens_seen": 229961728, "step": 3510, "train_runtime": 1557.3127, "train_tokens_per_second": 147665.739 }, { "epoch": 0.7615750757247944, "grad_norm": 0.67578125, "learning_rate": 0.0004093263157894737, "loss": 0.0311, "num_input_tokens_seen": 230617088, "step": 3520, "train_runtime": 1561.1204, "train_tokens_per_second": 147725.367 }, { "epoch": 0.7637386412808308, "grad_norm": 0.68359375, "learning_rate": 0.0004086947368421052, "loss": 0.0314, "num_input_tokens_seen": 231272448, "step": 3530, "train_runtime": 1564.9367, "train_tokens_per_second": 147783.895 }, { "epoch": 0.7659022068368672, "grad_norm": 0.8515625, "learning_rate": 0.00040806315789473685, "loss": 0.0365, "num_input_tokens_seen": 231927808, "step": 3540, "train_runtime": 1568.7575, "train_tokens_per_second": 147841.718 }, { "epoch": 0.7680657723929035, "grad_norm": 0.5234375, "learning_rate": 0.0004074315789473684, "loss": 0.032, "num_input_tokens_seen": 232583168, "step": 3550, "train_runtime": 1572.5709, "train_tokens_per_second": 147899.954 }, { "epoch": 0.7702293379489399, "grad_norm": 0.80078125, "learning_rate": 0.00040679999999999997, "loss": 0.0318, "num_input_tokens_seen": 233238528, "step": 3560, "train_runtime": 1578.5239, "train_tokens_per_second": 147757.366 }, { "epoch": 0.7723929035049762, "grad_norm": 0.74609375, "learning_rate": 0.00040616842105263155, "loss": 0.0362, "num_input_tokens_seen": 233893888, "step": 3570, "train_runtime": 1582.3272, "train_tokens_per_second": 147816.385 }, { "epoch": 0.7745564690610125, "grad_norm": 0.451171875, "learning_rate": 0.0004055368421052631, "loss": 0.0354, "num_input_tokens_seen": 234549248, "step": 3580, "train_runtime": 1586.1479, "train_tokens_per_second": 147873.501 }, { "epoch": 0.7767200346170489, "grad_norm": 0.470703125, "learning_rate": 0.0004049052631578947, "loss": 0.0333, "num_input_tokens_seen": 235204608, "step": 3590, "train_runtime": 1589.9649, "train_tokens_per_second": 147930.69 }, { "epoch": 0.7788836001730852, "grad_norm": 1.0703125, "learning_rate": 0.00040427368421052626, "loss": 0.0375, "num_input_tokens_seen": 235859968, "step": 3600, "train_runtime": 1593.7746, "train_tokens_per_second": 147988.287 }, { "epoch": 0.7788836001730852, "eval_loss": 0.020246392115950584, "eval_runtime": 2.1515, "eval_samples_per_second": 14.873, "eval_steps_per_second": 0.465, "num_input_tokens_seen": 235859968, "step": 3600 }, { "epoch": 0.7788836001730852, "eval_byte_accuracy": 0.9944362017804155, "eval_chrf": 96.81607520480924, "eval_sacrebleu": 96.20892605110268, "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 235859968, "perplexity": 1.020452740563904, "step": 3600 }, { "epoch": 0.7810471657291216, "grad_norm": 0.64453125, "learning_rate": 0.0004036421052631579, "loss": 0.0331, "num_input_tokens_seen": 236515328, "step": 3610, "train_runtime": 1602.0598, "train_tokens_per_second": 147632.018 }, { "epoch": 0.783210731285158, "grad_norm": 0.75, "learning_rate": 0.00040301052631578943, "loss": 0.0335, "num_input_tokens_seen": 237170688, "step": 3620, "train_runtime": 1605.8778, "train_tokens_per_second": 147689.128 }, { "epoch": 0.7853742968411943, "grad_norm": 0.62109375, "learning_rate": 0.000402378947368421, "loss": 0.0363, "num_input_tokens_seen": 237821952, "step": 3630, "train_runtime": 1609.6669, "train_tokens_per_second": 147746.067 }, { "epoch": 0.7875378623972307, "grad_norm": 0.734375, "learning_rate": 0.0004017473684210526, "loss": 0.0356, "num_input_tokens_seen": 238477312, "step": 3640, "train_runtime": 1613.462, "train_tokens_per_second": 147804.73 }, { "epoch": 0.789701427953267, "grad_norm": 0.60546875, "learning_rate": 0.00040111578947368414, "loss": 0.0343, "num_input_tokens_seen": 239132672, "step": 3650, "train_runtime": 1617.2673, "train_tokens_per_second": 147862.178 }, { "epoch": 0.7918649935093033, "grad_norm": 0.984375, "learning_rate": 0.0004004842105263158, "loss": 0.0359, "num_input_tokens_seen": 239788032, "step": 3660, "train_runtime": 1623.2242, "train_tokens_per_second": 147723.298 }, { "epoch": 0.7940285590653396, "grad_norm": 0.50390625, "learning_rate": 0.0003998526315789473, "loss": 0.0372, "num_input_tokens_seen": 240443392, "step": 3670, "train_runtime": 1627.027, "train_tokens_per_second": 147780.827 }, { "epoch": 0.796192124621376, "grad_norm": 0.9921875, "learning_rate": 0.00039922105263157895, "loss": 0.0322, "num_input_tokens_seen": 241098752, "step": 3680, "train_runtime": 1630.8244, "train_tokens_per_second": 147838.573 }, { "epoch": 0.7983556901774124, "grad_norm": 0.859375, "learning_rate": 0.0003985894736842105, "loss": 0.0368, "num_input_tokens_seen": 241754112, "step": 3690, "train_runtime": 1634.6421, "train_tokens_per_second": 147894.218 }, { "epoch": 0.8005192557334487, "grad_norm": 0.65234375, "learning_rate": 0.0003979578947368421, "loss": 0.0657, "num_input_tokens_seen": 242409472, "step": 3700, "train_runtime": 1638.4625, "train_tokens_per_second": 147949.357 }, { "epoch": 0.8005192557334487, "eval_loss": 0.027874412015080452, "eval_runtime": 2.3954, "eval_samples_per_second": 13.359, "eval_steps_per_second": 0.417, "num_input_tokens_seen": 242409472, "step": 3700 }, { "epoch": 0.8005192557334487, "eval_byte_accuracy": 0.9925816023738873, "eval_chrf": 93.66102977336031, "eval_sacrebleu": 92.05374753299421, "eval_word_accuracy": 0.9821428571428571, "num_input_tokens_seen": 242409472, "perplexity": 1.0282665383894833, "step": 3700 }, { "epoch": 0.8026828212894851, "grad_norm": 0.63671875, "learning_rate": 0.00039732631578947366, "loss": 0.0331, "num_input_tokens_seen": 243064832, "step": 3710, "train_runtime": 1646.8333, "train_tokens_per_second": 147595.283 }, { "epoch": 0.8048463868455215, "grad_norm": 0.6484375, "learning_rate": 0.0003966947368421052, "loss": 0.0335, "num_input_tokens_seen": 243720192, "step": 3720, "train_runtime": 1650.6505, "train_tokens_per_second": 147650.997 }, { "epoch": 0.8070099524015578, "grad_norm": 0.35546875, "learning_rate": 0.00039606315789473683, "loss": 0.0308, "num_input_tokens_seen": 244375552, "step": 3730, "train_runtime": 1654.463, "train_tokens_per_second": 147706.872 }, { "epoch": 0.8091735179575941, "grad_norm": 0.66015625, "learning_rate": 0.00039543157894736836, "loss": 0.0319, "num_input_tokens_seen": 245030912, "step": 3740, "train_runtime": 1658.2795, "train_tokens_per_second": 147762.13 }, { "epoch": 0.8113370835136304, "grad_norm": 0.51953125, "learning_rate": 0.0003948, "loss": 0.0321, "num_input_tokens_seen": 245686272, "step": 3750, "train_runtime": 1662.0901, "train_tokens_per_second": 147817.663 }, { "epoch": 0.8135006490696668, "grad_norm": 0.63671875, "learning_rate": 0.00039416842105263153, "loss": 0.0323, "num_input_tokens_seen": 246337536, "step": 3760, "train_runtime": 1668.0409, "train_tokens_per_second": 147680.756 }, { "epoch": 0.8156642146257032, "grad_norm": 0.56640625, "learning_rate": 0.0003935368421052632, "loss": 0.0361, "num_input_tokens_seen": 246988800, "step": 3770, "train_runtime": 1671.8351, "train_tokens_per_second": 147735.145 }, { "epoch": 0.8178277801817395, "grad_norm": 0.94140625, "learning_rate": 0.0003929052631578947, "loss": 0.033, "num_input_tokens_seen": 247644160, "step": 3780, "train_runtime": 1675.649, "train_tokens_per_second": 147789.994 }, { "epoch": 0.8199913457377759, "grad_norm": 0.6875, "learning_rate": 0.00039227368421052624, "loss": 0.0325, "num_input_tokens_seen": 248299520, "step": 3790, "train_runtime": 1679.4499, "train_tokens_per_second": 147845.743 }, { "epoch": 0.8221549112938122, "grad_norm": 0.6328125, "learning_rate": 0.0003916421052631579, "loss": 0.0329, "num_input_tokens_seen": 248954880, "step": 3800, "train_runtime": 1683.2644, "train_tokens_per_second": 147900.044 }, { "epoch": 0.8221549112938122, "eval_loss": 0.023587677627801895, "eval_runtime": 2.0373, "eval_samples_per_second": 15.707, "eval_steps_per_second": 0.491, "num_input_tokens_seen": 248954880, "step": 3800 }, { "epoch": 0.8221549112938122, "eval_byte_accuracy": 0.9940652818991098, "eval_chrf": 95.56456555243946, "eval_sacrebleu": 94.62541130797075, "eval_word_accuracy": 0.9848901098901099, "num_input_tokens_seen": 248954880, "perplexity": 1.0238680671346283, "step": 3800 }, { "epoch": 0.8243184768498486, "grad_norm": 0.5390625, "learning_rate": 0.0003910105263157894, "loss": 0.0315, "num_input_tokens_seen": 249610240, "step": 3810, "train_runtime": 1691.2895, "train_tokens_per_second": 147585.76 }, { "epoch": 0.8264820424058849, "grad_norm": 0.56640625, "learning_rate": 0.00039037894736842105, "loss": 0.032, "num_input_tokens_seen": 250265600, "step": 3820, "train_runtime": 1695.1146, "train_tokens_per_second": 147639.343 }, { "epoch": 0.8286456079619212, "grad_norm": 0.453125, "learning_rate": 0.0003897473684210526, "loss": 0.0306, "num_input_tokens_seen": 250920960, "step": 3830, "train_runtime": 1698.9403, "train_tokens_per_second": 147692.632 }, { "epoch": 0.8308091735179576, "grad_norm": 0.66796875, "learning_rate": 0.0003891157894736842, "loss": 0.0326, "num_input_tokens_seen": 251576320, "step": 3840, "train_runtime": 1702.7553, "train_tokens_per_second": 147746.612 }, { "epoch": 0.8329727390739939, "grad_norm": 0.5859375, "learning_rate": 0.00038848421052631576, "loss": 0.0306, "num_input_tokens_seen": 252231680, "step": 3850, "train_runtime": 1706.5717, "train_tokens_per_second": 147800.22 }, { "epoch": 0.8351363046300303, "grad_norm": 0.859375, "learning_rate": 0.0003878526315789473, "loss": 0.0349, "num_input_tokens_seen": 252887040, "step": 3860, "train_runtime": 1712.5469, "train_tokens_per_second": 147667.224 }, { "epoch": 0.8372998701860667, "grad_norm": 0.7109375, "learning_rate": 0.00038722105263157893, "loss": 0.0327, "num_input_tokens_seen": 253542400, "step": 3870, "train_runtime": 1716.3594, "train_tokens_per_second": 147721.046 }, { "epoch": 0.839463435742103, "grad_norm": 0.8515625, "learning_rate": 0.00038658947368421046, "loss": 0.0721, "num_input_tokens_seen": 254197760, "step": 3880, "train_runtime": 1720.1745, "train_tokens_per_second": 147774.405 }, { "epoch": 0.8416270012981394, "grad_norm": 0.6953125, "learning_rate": 0.0003859578947368421, "loss": 0.0352, "num_input_tokens_seen": 254853120, "step": 3890, "train_runtime": 1723.9877, "train_tokens_per_second": 147827.693 }, { "epoch": 0.8437905668541756, "grad_norm": 0.68359375, "learning_rate": 0.00038532631578947363, "loss": 0.032, "num_input_tokens_seen": 255508480, "step": 3900, "train_runtime": 1727.7959, "train_tokens_per_second": 147881.17 }, { "epoch": 0.8437905668541756, "eval_loss": 0.021019412204623222, "eval_runtime": 2.1668, "eval_samples_per_second": 14.769, "eval_steps_per_second": 0.462, "num_input_tokens_seen": 255508480, "step": 3900 }, { "epoch": 0.8437905668541756, "eval_byte_accuracy": 0.9940652818991098, "eval_chrf": 95.6553581195663, "eval_sacrebleu": 94.6463989521252, "eval_word_accuracy": 0.9835164835164835, "num_input_tokens_seen": 255508480, "perplexity": 1.0212418760013775, "step": 3900 }, { "epoch": 0.845954132410212, "grad_norm": 0.74609375, "learning_rate": 0.0003846947368421053, "loss": 0.032, "num_input_tokens_seen": 256163840, "step": 3910, "train_runtime": 1735.9497, "train_tokens_per_second": 147564.095 }, { "epoch": 0.8481176979662484, "grad_norm": 0.6796875, "learning_rate": 0.0003840631578947368, "loss": 0.033, "num_input_tokens_seen": 256819200, "step": 3920, "train_runtime": 1739.7714, "train_tokens_per_second": 147616.637 }, { "epoch": 0.8502812635222847, "grad_norm": 0.75, "learning_rate": 0.00038343157894736834, "loss": 0.0322, "num_input_tokens_seen": 257474560, "step": 3930, "train_runtime": 1743.5864, "train_tokens_per_second": 147669.512 }, { "epoch": 0.8524448290783211, "grad_norm": 0.6484375, "learning_rate": 0.0003828, "loss": 0.0341, "num_input_tokens_seen": 258129920, "step": 3940, "train_runtime": 1747.3972, "train_tokens_per_second": 147722.52 }, { "epoch": 0.8546083946343574, "grad_norm": 0.64453125, "learning_rate": 0.0003821684210526315, "loss": 0.0287, "num_input_tokens_seen": 258785280, "step": 3950, "train_runtime": 1751.1973, "train_tokens_per_second": 147776.198 }, { "epoch": 0.8567719601903938, "grad_norm": 0.52734375, "learning_rate": 0.00038153684210526315, "loss": 0.0342, "num_input_tokens_seen": 259440640, "step": 3960, "train_runtime": 1756.985, "train_tokens_per_second": 147662.414 }, { "epoch": 0.8589355257464301, "grad_norm": 0.58203125, "learning_rate": 0.0003809052631578947, "loss": 0.0253, "num_input_tokens_seen": 260096000, "step": 3970, "train_runtime": 1760.7782, "train_tokens_per_second": 147716.507 }, { "epoch": 0.8610990913024664, "grad_norm": 0.71875, "learning_rate": 0.0003802736842105263, "loss": 0.0289, "num_input_tokens_seen": 260751360, "step": 3980, "train_runtime": 1764.5928, "train_tokens_per_second": 147768.576 }, { "epoch": 0.8632626568585028, "grad_norm": 0.78125, "learning_rate": 0.00037964210526315786, "loss": 0.0324, "num_input_tokens_seen": 261406720, "step": 3990, "train_runtime": 1768.3999, "train_tokens_per_second": 147821.046 }, { "epoch": 0.8654262224145391, "grad_norm": 0.427734375, "learning_rate": 0.0003790105263157894, "loss": 0.0306, "num_input_tokens_seen": 262062080, "step": 4000, "train_runtime": 1772.2207, "train_tokens_per_second": 147872.143 }, { "epoch": 0.8654262224145391, "eval_loss": 0.018921982496976852, "eval_runtime": 2.1514, "eval_samples_per_second": 14.874, "eval_steps_per_second": 0.465, "num_input_tokens_seen": 262062080, "step": 4000 }, { "epoch": 0.8654262224145391, "eval_byte_accuracy": 0.9936943620178041, "eval_chrf": 95.4335711604272, "eval_sacrebleu": 93.94661550016679, "eval_word_accuracy": 0.9835164835164835, "num_input_tokens_seen": 262062080, "perplexity": 1.0191021377117173, "step": 4000 }, { "epoch": 0.8675897879705755, "grad_norm": 0.5390625, "learning_rate": 0.00037837894736842103, "loss": 0.0315, "num_input_tokens_seen": 262717440, "step": 4010, "train_runtime": 1780.524, "train_tokens_per_second": 147550.629 }, { "epoch": 0.8697533535266119, "grad_norm": 0.47265625, "learning_rate": 0.00037774736842105256, "loss": 0.0305, "num_input_tokens_seen": 263372800, "step": 4020, "train_runtime": 1784.3244, "train_tokens_per_second": 147603.655 }, { "epoch": 0.8719169190826482, "grad_norm": 0.63671875, "learning_rate": 0.0003771157894736842, "loss": 0.0286, "num_input_tokens_seen": 264028160, "step": 4030, "train_runtime": 1788.12, "train_tokens_per_second": 147656.847 }, { "epoch": 0.8740804846386846, "grad_norm": 0.494140625, "learning_rate": 0.00037648421052631574, "loss": 0.0289, "num_input_tokens_seen": 264683520, "step": 4040, "train_runtime": 1791.923, "train_tokens_per_second": 147709.207 }, { "epoch": 0.8762440501947208, "grad_norm": 0.609375, "learning_rate": 0.0003758526315789474, "loss": 0.031, "num_input_tokens_seen": 265338880, "step": 4050, "train_runtime": 1795.7271, "train_tokens_per_second": 147761.251 }, { "epoch": 0.8784076157507572, "grad_norm": 0.59375, "learning_rate": 0.0003752210526315789, "loss": 0.0295, "num_input_tokens_seen": 265994240, "step": 4060, "train_runtime": 1801.6832, "train_tokens_per_second": 147636.518 }, { "epoch": 0.8805711813067936, "grad_norm": 0.703125, "learning_rate": 0.00037458947368421044, "loss": 0.0298, "num_input_tokens_seen": 266649600, "step": 4070, "train_runtime": 1805.4894, "train_tokens_per_second": 147688.264 }, { "epoch": 0.8827347468628299, "grad_norm": 0.81640625, "learning_rate": 0.0003739578947368421, "loss": 0.0309, "num_input_tokens_seen": 267304960, "step": 4080, "train_runtime": 1809.3053, "train_tokens_per_second": 147739.004 }, { "epoch": 0.8848983124188663, "grad_norm": 0.4609375, "learning_rate": 0.0003733263157894736, "loss": 0.0295, "num_input_tokens_seen": 267960320, "step": 4090, "train_runtime": 1813.1124, "train_tokens_per_second": 147790.242 }, { "epoch": 0.8870618779749027, "grad_norm": 0.4765625, "learning_rate": 0.00037269473684210525, "loss": 0.0302, "num_input_tokens_seen": 268615680, "step": 4100, "train_runtime": 1816.9268, "train_tokens_per_second": 147840.673 }, { "epoch": 0.8870618779749027, "eval_loss": 0.018717404454946518, "eval_runtime": 2.135, "eval_samples_per_second": 14.988, "eval_steps_per_second": 0.468, "num_input_tokens_seen": 268615680, "step": 4100 }, { "epoch": 0.8870618779749027, "eval_byte_accuracy": 0.9944362017804155, "eval_chrf": 95.60721087616592, "eval_sacrebleu": 95.5313400244968, "eval_word_accuracy": 0.9862637362637363, "num_input_tokens_seen": 268615680, "perplexity": 1.0188936731161218, "step": 4100 }, { "epoch": 0.889225443530939, "grad_norm": 0.62109375, "learning_rate": 0.0003720631578947368, "loss": 0.028, "num_input_tokens_seen": 269271040, "step": 4110, "train_runtime": 1825.0501, "train_tokens_per_second": 147541.722 }, { "epoch": 0.8913890090869754, "grad_norm": 0.4609375, "learning_rate": 0.00037143157894736843, "loss": 0.0317, "num_input_tokens_seen": 269926400, "step": 4120, "train_runtime": 1828.8662, "train_tokens_per_second": 147592.21 }, { "epoch": 0.8935525746430116, "grad_norm": 0.90625, "learning_rate": 0.00037079999999999996, "loss": 0.031, "num_input_tokens_seen": 270581760, "step": 4130, "train_runtime": 1832.6868, "train_tokens_per_second": 147642.115 }, { "epoch": 0.895716140199048, "grad_norm": 0.78125, "learning_rate": 0.0003701684210526315, "loss": 0.0334, "num_input_tokens_seen": 271237120, "step": 4140, "train_runtime": 1836.4971, "train_tokens_per_second": 147692.644 }, { "epoch": 0.8978797057550844, "grad_norm": 0.6640625, "learning_rate": 0.00036953684210526313, "loss": 0.0298, "num_input_tokens_seen": 271892480, "step": 4150, "train_runtime": 1840.3089, "train_tokens_per_second": 147742.851 }, { "epoch": 0.9000432713111207, "grad_norm": 0.515625, "learning_rate": 0.00036890526315789466, "loss": 0.0329, "num_input_tokens_seen": 272547840, "step": 4160, "train_runtime": 1846.4577, "train_tokens_per_second": 147605.789 }, { "epoch": 0.9022068368671571, "grad_norm": 0.5078125, "learning_rate": 0.0003682736842105263, "loss": 0.0292, "num_input_tokens_seen": 273203200, "step": 4170, "train_runtime": 1850.249, "train_tokens_per_second": 147657.532 }, { "epoch": 0.9043704024231934, "grad_norm": 0.48828125, "learning_rate": 0.00036764210526315784, "loss": 0.0287, "num_input_tokens_seen": 273858560, "step": 4180, "train_runtime": 1854.0593, "train_tokens_per_second": 147707.553 }, { "epoch": 0.9065339679792298, "grad_norm": 0.578125, "learning_rate": 0.0003670105263157895, "loss": 0.0327, "num_input_tokens_seen": 274513920, "step": 4190, "train_runtime": 1857.8704, "train_tokens_per_second": 147757.306 }, { "epoch": 0.9086975335352662, "grad_norm": 0.6015625, "learning_rate": 0.000366378947368421, "loss": 0.031, "num_input_tokens_seen": 275165184, "step": 4200, "train_runtime": 1861.6603, "train_tokens_per_second": 147806.335 }, { "epoch": 0.9086975335352662, "eval_loss": 0.017941121011972427, "eval_runtime": 1.9902, "eval_samples_per_second": 16.079, "eval_steps_per_second": 0.502, "num_input_tokens_seen": 275165184, "step": 4200 }, { "epoch": 0.9086975335352662, "eval_byte_accuracy": 0.9955489614243324, "eval_chrf": 97.07386912321206, "eval_sacrebleu": 95.99172930598161, "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 275165184, "perplexity": 1.0181030297489129, "step": 4200 }, { "epoch": 0.9108610990913024, "grad_norm": 0.70703125, "learning_rate": 0.00036574736842105254, "loss": 0.0324, "num_input_tokens_seen": 275820544, "step": 4210, "train_runtime": 1869.6358, "train_tokens_per_second": 147526.35 }, { "epoch": 0.9130246646473388, "grad_norm": 0.46484375, "learning_rate": 0.0003651157894736842, "loss": 0.0293, "num_input_tokens_seen": 276475904, "step": 4220, "train_runtime": 1873.4536, "train_tokens_per_second": 147575.532 }, { "epoch": 0.9151882302033751, "grad_norm": 0.56640625, "learning_rate": 0.0003644842105263157, "loss": 0.0647, "num_input_tokens_seen": 277131264, "step": 4230, "train_runtime": 1877.2653, "train_tokens_per_second": 147624.983 }, { "epoch": 0.9173517957594115, "grad_norm": 0.466796875, "learning_rate": 0.00036385263157894736, "loss": 0.0344, "num_input_tokens_seen": 277786624, "step": 4240, "train_runtime": 1881.0625, "train_tokens_per_second": 147675.379 }, { "epoch": 0.9195153613154479, "grad_norm": 0.52734375, "learning_rate": 0.0003632210526315789, "loss": 0.0298, "num_input_tokens_seen": 278441984, "step": 4250, "train_runtime": 1884.8718, "train_tokens_per_second": 147724.625 }, { "epoch": 0.9216789268714842, "grad_norm": 0.703125, "learning_rate": 0.00036258947368421053, "loss": 0.0272, "num_input_tokens_seen": 279097344, "step": 4260, "train_runtime": 1890.8449, "train_tokens_per_second": 147604.568 }, { "epoch": 0.9238424924275206, "grad_norm": 0.5, "learning_rate": 0.00036195789473684206, "loss": 0.0279, "num_input_tokens_seen": 279748608, "step": 4270, "train_runtime": 1894.6384, "train_tokens_per_second": 147652.772 }, { "epoch": 0.926006057983557, "grad_norm": 0.67578125, "learning_rate": 0.0003613263157894736, "loss": 0.0305, "num_input_tokens_seen": 280403968, "step": 4280, "train_runtime": 1898.4464, "train_tokens_per_second": 147701.806 }, { "epoch": 0.9281696235395932, "grad_norm": 0.69140625, "learning_rate": 0.00036069473684210523, "loss": 0.0312, "num_input_tokens_seen": 281059328, "step": 4290, "train_runtime": 1902.2624, "train_tokens_per_second": 147750.028 }, { "epoch": 0.9303331890956296, "grad_norm": 0.59765625, "learning_rate": 0.00036006315789473677, "loss": 0.0315, "num_input_tokens_seen": 281714688, "step": 4300, "train_runtime": 1906.0694, "train_tokens_per_second": 147798.754 }, { "epoch": 0.9303331890956296, "eval_loss": 0.01897530071437359, "eval_runtime": 2.0156, "eval_samples_per_second": 15.876, "eval_steps_per_second": 0.496, "num_input_tokens_seen": 281714688, "step": 4300 }, { "epoch": 0.9303331890956296, "eval_byte_accuracy": 0.9936943620178041, "eval_chrf": 95.0236625071108, "eval_sacrebleu": 94.9115842118578, "eval_word_accuracy": 0.9848901098901099, "num_input_tokens_seen": 281714688, "perplexity": 1.0191564758696394, "step": 4300 }, { "epoch": 0.9324967546516659, "grad_norm": 0.6640625, "learning_rate": 0.0003594315789473684, "loss": 0.031, "num_input_tokens_seen": 282370048, "step": 4310, "train_runtime": 1914.2808, "train_tokens_per_second": 147507.122 }, { "epoch": 0.9346603202077023, "grad_norm": 0.51171875, "learning_rate": 0.00035879999999999994, "loss": 0.0299, "num_input_tokens_seen": 283025408, "step": 4320, "train_runtime": 1918.0892, "train_tokens_per_second": 147555.918 }, { "epoch": 0.9368238857637387, "grad_norm": 0.52734375, "learning_rate": 0.0003581684210526316, "loss": 0.0344, "num_input_tokens_seen": 283680768, "step": 4330, "train_runtime": 1921.9064, "train_tokens_per_second": 147603.843 }, { "epoch": 0.938987451319775, "grad_norm": 0.5234375, "learning_rate": 0.0003575368421052631, "loss": 0.026, "num_input_tokens_seen": 284336128, "step": 4340, "train_runtime": 1925.715, "train_tokens_per_second": 147652.234 }, { "epoch": 0.9411510168758114, "grad_norm": 0.50390625, "learning_rate": 0.00035690526315789475, "loss": 0.0288, "num_input_tokens_seen": 284991488, "step": 4350, "train_runtime": 1929.5298, "train_tokens_per_second": 147699.968 }, { "epoch": 0.9433145824318477, "grad_norm": 0.59765625, "learning_rate": 0.0003562736842105263, "loss": 0.0342, "num_input_tokens_seen": 285646848, "step": 4360, "train_runtime": 1935.5282, "train_tokens_per_second": 147580.828 }, { "epoch": 0.945478147987884, "grad_norm": 0.6953125, "learning_rate": 0.0003556421052631578, "loss": 0.0295, "num_input_tokens_seen": 286302208, "step": 4370, "train_runtime": 1939.3378, "train_tokens_per_second": 147628.849 }, { "epoch": 0.9476417135439203, "grad_norm": 0.76953125, "learning_rate": 0.00035501052631578946, "loss": 0.0318, "num_input_tokens_seen": 286957568, "step": 4380, "train_runtime": 1943.1309, "train_tokens_per_second": 147677.942 }, { "epoch": 0.9498052790999567, "grad_norm": 0.6015625, "learning_rate": 0.000354378947368421, "loss": 0.0308, "num_input_tokens_seen": 287612928, "step": 4390, "train_runtime": 1946.9273, "train_tokens_per_second": 147726.585 }, { "epoch": 0.9519688446559931, "grad_norm": 0.51953125, "learning_rate": 0.00035374736842105263, "loss": 0.0322, "num_input_tokens_seen": 288268288, "step": 4400, "train_runtime": 1950.7432, "train_tokens_per_second": 147773.567 }, { "epoch": 0.9519688446559931, "eval_loss": 0.019506528973579407, "eval_runtime": 2.1658, "eval_samples_per_second": 14.775, "eval_steps_per_second": 0.462, "num_input_tokens_seen": 288268288, "step": 4400 }, { "epoch": 0.9519688446559931, "eval_byte_accuracy": 0.9933234421364985, "eval_chrf": 94.24507398874036, "eval_sacrebleu": 92.8561495846159, "eval_word_accuracy": 0.9821428571428571, "num_input_tokens_seen": 288268288, "perplexity": 1.0196980244203853, "step": 4400 }, { "epoch": 0.9541324102120294, "grad_norm": 0.75, "learning_rate": 0.00035311578947368416, "loss": 0.0307, "num_input_tokens_seen": 288923648, "step": 4410, "train_runtime": 1958.9228, "train_tokens_per_second": 147491.081 }, { "epoch": 0.9562959757680658, "grad_norm": 0.408203125, "learning_rate": 0.0003524842105263158, "loss": 0.0283, "num_input_tokens_seen": 289579008, "step": 4420, "train_runtime": 1962.7389, "train_tokens_per_second": 147538.225 }, { "epoch": 0.9584595413241022, "grad_norm": 0.62109375, "learning_rate": 0.00035185263157894734, "loss": 0.0303, "num_input_tokens_seen": 290234368, "step": 4430, "train_runtime": 1966.5423, "train_tokens_per_second": 147586.133 }, { "epoch": 0.9606231068801385, "grad_norm": 0.6953125, "learning_rate": 0.00035122105263157887, "loss": 0.031, "num_input_tokens_seen": 290889728, "step": 4440, "train_runtime": 1970.3612, "train_tokens_per_second": 147632.695 }, { "epoch": 0.9627866724361748, "grad_norm": 0.47265625, "learning_rate": 0.0003505894736842105, "loss": 0.0316, "num_input_tokens_seen": 291545088, "step": 4450, "train_runtime": 1974.1835, "train_tokens_per_second": 147678.822 }, { "epoch": 0.9649502379922111, "grad_norm": 0.59375, "learning_rate": 0.00034995789473684204, "loss": 0.0284, "num_input_tokens_seen": 292200448, "step": 4460, "train_runtime": 1980.166, "train_tokens_per_second": 147563.609 }, { "epoch": 0.9671138035482475, "grad_norm": 0.60546875, "learning_rate": 0.0003493263157894737, "loss": 0.0303, "num_input_tokens_seen": 292855808, "step": 4470, "train_runtime": 1983.9865, "train_tokens_per_second": 147609.78 }, { "epoch": 0.9692773691042839, "grad_norm": 0.53515625, "learning_rate": 0.0003486947368421052, "loss": 0.0268, "num_input_tokens_seen": 293511168, "step": 4480, "train_runtime": 1987.7995, "train_tokens_per_second": 147656.327 }, { "epoch": 0.9714409346603202, "grad_norm": 0.6875, "learning_rate": 0.00034806315789473685, "loss": 0.0304, "num_input_tokens_seen": 294166528, "step": 4490, "train_runtime": 1991.6159, "train_tokens_per_second": 147702.439 }, { "epoch": 0.9736045002163566, "grad_norm": 0.55859375, "learning_rate": 0.0003474315789473684, "loss": 0.0309, "num_input_tokens_seen": 294821888, "step": 4500, "train_runtime": 1995.424, "train_tokens_per_second": 147748.994 }, { "epoch": 0.9736045002163566, "eval_loss": 0.017310751602053642, "eval_runtime": 2.0408, "eval_samples_per_second": 15.68, "eval_steps_per_second": 0.49, "num_input_tokens_seen": 294821888, "step": 4500 }, { "epoch": 0.9736045002163566, "eval_byte_accuracy": 0.9940652818991098, "eval_chrf": 95.3397660833719, "eval_sacrebleu": 95.29534882340593, "eval_word_accuracy": 0.9876373626373627, "num_input_tokens_seen": 294821888, "perplexity": 1.0174614509798736, "step": 4500 }, { "epoch": 0.975768065772393, "grad_norm": 0.45703125, "learning_rate": 0.0003467999999999999, "loss": 0.0277, "num_input_tokens_seen": 295477248, "step": 4510, "train_runtime": 2003.6989, "train_tokens_per_second": 147465.895 }, { "epoch": 0.9779316313284292, "grad_norm": 0.61328125, "learning_rate": 0.00034616842105263156, "loss": 0.0323, "num_input_tokens_seen": 296132608, "step": 4520, "train_runtime": 2007.5052, "train_tokens_per_second": 147512.751 }, { "epoch": 0.9800951968844656, "grad_norm": 0.5078125, "learning_rate": 0.0003455368421052631, "loss": 0.0314, "num_input_tokens_seen": 296787968, "step": 4530, "train_runtime": 2011.3248, "train_tokens_per_second": 147558.447 }, { "epoch": 0.9822587624405019, "grad_norm": 0.6875, "learning_rate": 0.00034490526315789473, "loss": 0.0275, "num_input_tokens_seen": 297443328, "step": 4540, "train_runtime": 2015.1382, "train_tokens_per_second": 147604.429 }, { "epoch": 0.9844223279965383, "grad_norm": 0.5546875, "learning_rate": 0.00034427368421052626, "loss": 0.0296, "num_input_tokens_seen": 298098688, "step": 4550, "train_runtime": 2018.9595, "train_tokens_per_second": 147649.663 }, { "epoch": 0.9865858935525746, "grad_norm": 0.69921875, "learning_rate": 0.0003436421052631579, "loss": 0.0294, "num_input_tokens_seen": 298754048, "step": 4560, "train_runtime": 2024.9701, "train_tokens_per_second": 147535.042 }, { "epoch": 0.988749459108611, "grad_norm": 0.640625, "learning_rate": 0.00034301052631578944, "loss": 0.0292, "num_input_tokens_seen": 299405312, "step": 4570, "train_runtime": 2028.7655, "train_tokens_per_second": 147580.048 }, { "epoch": 0.9909130246646474, "grad_norm": 0.5078125, "learning_rate": 0.000342378947368421, "loss": 0.0277, "num_input_tokens_seen": 300056576, "step": 4580, "train_runtime": 2032.5336, "train_tokens_per_second": 147626.871 }, { "epoch": 0.9930765902206837, "grad_norm": 0.55078125, "learning_rate": 0.0003417473684210526, "loss": 0.0321, "num_input_tokens_seen": 300711936, "step": 4590, "train_runtime": 2036.3318, "train_tokens_per_second": 147673.349 }, { "epoch": 0.99524015577672, "grad_norm": 0.640625, "learning_rate": 0.00034111578947368414, "loss": 0.0275, "num_input_tokens_seen": 301367296, "step": 4600, "train_runtime": 2040.1253, "train_tokens_per_second": 147719.992 }, { "epoch": 0.99524015577672, "eval_loss": 0.017312878742814064, "eval_runtime": 2.0657, "eval_samples_per_second": 15.491, "eval_steps_per_second": 0.484, "num_input_tokens_seen": 301367296, "step": 4600 }, { "epoch": 0.99524015577672, "eval_byte_accuracy": 0.9951780415430267, "eval_chrf": 96.36001642042964, "eval_sacrebleu": 95.65640954170594, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 301367296, "perplexity": 1.0174636152659, "step": 4600 }, { "epoch": 0.9974037213327563, "grad_norm": 0.796875, "learning_rate": 0.0003404842105263158, "loss": 0.0316, "num_input_tokens_seen": 302022656, "step": 4610, "train_runtime": 2048.1812, "train_tokens_per_second": 147458.95 }, { "epoch": 0.9995672868887927, "grad_norm": 0.6796875, "learning_rate": 0.0003398526315789473, "loss": 0.0275, "num_input_tokens_seen": 302678016, "step": 4620, "train_runtime": 2051.9727, "train_tokens_per_second": 147505.87 }, { "epoch": 1.0017308524448292, "grad_norm": 0.58984375, "learning_rate": 0.00033922105263157896, "loss": 0.0268, "num_input_tokens_seen": 303321088, "step": 4630, "train_runtime": 2055.7464, "train_tokens_per_second": 147547.911 }, { "epoch": 1.0038944180008653, "grad_norm": 0.69921875, "learning_rate": 0.0003385894736842105, "loss": 0.0278, "num_input_tokens_seen": 303976448, "step": 4640, "train_runtime": 2059.555, "train_tokens_per_second": 147593.262 }, { "epoch": 1.0060579835569017, "grad_norm": 0.66015625, "learning_rate": 0.0003379578947368421, "loss": 0.0257, "num_input_tokens_seen": 304631808, "step": 4650, "train_runtime": 2063.3612, "train_tokens_per_second": 147638.623 }, { "epoch": 1.008221549112938, "grad_norm": 0.55078125, "learning_rate": 0.00033732631578947366, "loss": 0.0265, "num_input_tokens_seen": 305287168, "step": 4660, "train_runtime": 2069.364, "train_tokens_per_second": 147527.051 }, { "epoch": 1.0103851146689744, "grad_norm": 0.52734375, "learning_rate": 0.0003366947368421052, "loss": 0.0247, "num_input_tokens_seen": 305942528, "step": 4670, "train_runtime": 2073.1806, "train_tokens_per_second": 147571.575 }, { "epoch": 1.0125486802250108, "grad_norm": 0.51171875, "learning_rate": 0.00033606315789473683, "loss": 0.026, "num_input_tokens_seen": 306597888, "step": 4680, "train_runtime": 2076.9852, "train_tokens_per_second": 147616.792 }, { "epoch": 1.0147122457810471, "grad_norm": 0.70703125, "learning_rate": 0.00033543157894736837, "loss": 0.027, "num_input_tokens_seen": 307253248, "step": 4690, "train_runtime": 2080.8031, "train_tokens_per_second": 147660.893 }, { "epoch": 1.0168758113370835, "grad_norm": 0.5078125, "learning_rate": 0.0003348, "loss": 0.0268, "num_input_tokens_seen": 307908608, "step": 4700, "train_runtime": 2084.6037, "train_tokens_per_second": 147706.064 }, { "epoch": 1.0168758113370835, "eval_loss": 0.015669453889131546, "eval_runtime": 2.0897, "eval_samples_per_second": 15.313, "eval_steps_per_second": 0.479, "num_input_tokens_seen": 307908608, "step": 4700 }, { "epoch": 1.0168758113370835, "eval_byte_accuracy": 0.9955489614243324, "eval_chrf": 96.59396178137179, "eval_sacrebleu": 95.77338932371731, "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 307908608, "perplexity": 1.0157928635263578, "step": 4700 }, { "epoch": 1.0190393768931199, "grad_norm": 0.67578125, "learning_rate": 0.00033416842105263154, "loss": 0.0275, "num_input_tokens_seen": 308563968, "step": 4710, "train_runtime": 2092.891, "train_tokens_per_second": 147434.321 }, { "epoch": 1.0212029424491562, "grad_norm": 0.765625, "learning_rate": 0.0003335368421052631, "loss": 0.0276, "num_input_tokens_seen": 309219328, "step": 4720, "train_runtime": 2096.697, "train_tokens_per_second": 147479.265 }, { "epoch": 1.0233665080051926, "grad_norm": 0.494140625, "learning_rate": 0.0003329052631578947, "loss": 0.0276, "num_input_tokens_seen": 309874688, "step": 4730, "train_runtime": 2100.5115, "train_tokens_per_second": 147523.446 }, { "epoch": 1.025530073561229, "grad_norm": 0.373046875, "learning_rate": 0.00033227368421052624, "loss": 0.0241, "num_input_tokens_seen": 310530048, "step": 4740, "train_runtime": 2104.32, "train_tokens_per_second": 147567.887 }, { "epoch": 1.0276936391172653, "grad_norm": 0.5234375, "learning_rate": 0.0003316421052631579, "loss": 0.0285, "num_input_tokens_seen": 311185408, "step": 4750, "train_runtime": 2108.1335, "train_tokens_per_second": 147611.81 }, { "epoch": 1.0298572046733017, "grad_norm": 0.55859375, "learning_rate": 0.0003310105263157894, "loss": 0.0266, "num_input_tokens_seen": 311840768, "step": 4760, "train_runtime": 2114.1392, "train_tokens_per_second": 147502.479 }, { "epoch": 1.032020770229338, "grad_norm": 0.5703125, "learning_rate": 0.00033037894736842106, "loss": 0.027, "num_input_tokens_seen": 312496128, "step": 4770, "train_runtime": 2117.9312, "train_tokens_per_second": 147547.821 }, { "epoch": 1.0341843357853744, "grad_norm": 0.55078125, "learning_rate": 0.0003297473684210526, "loss": 0.023, "num_input_tokens_seen": 313151488, "step": 4780, "train_runtime": 2121.7355, "train_tokens_per_second": 147592.141 }, { "epoch": 1.0363479013414107, "grad_norm": 0.421875, "learning_rate": 0.0003291157894736842, "loss": 0.0247, "num_input_tokens_seen": 313806848, "step": 4790, "train_runtime": 2125.5459, "train_tokens_per_second": 147635.883 }, { "epoch": 1.0385114668974469, "grad_norm": 0.66015625, "learning_rate": 0.00032848421052631576, "loss": 0.0256, "num_input_tokens_seen": 314462208, "step": 4800, "train_runtime": 2129.3486, "train_tokens_per_second": 147680.003 }, { "epoch": 1.0385114668974469, "eval_loss": 0.015679502859711647, "eval_runtime": 2.1061, "eval_samples_per_second": 15.194, "eval_steps_per_second": 0.475, "num_input_tokens_seen": 314462208, "step": 4800 }, { "epoch": 1.0385114668974469, "eval_byte_accuracy": 0.9955489614243324, "eval_chrf": 96.5138850641643, "eval_sacrebleu": 95.39446008456702, "eval_word_accuracy": 0.9876373626373627, "num_input_tokens_seen": 314462208, "perplexity": 1.0158030712502473, "step": 4800 }, { "epoch": 1.0406750324534833, "grad_norm": 0.78515625, "learning_rate": 0.00032785263157894735, "loss": 0.0256, "num_input_tokens_seen": 315117568, "step": 4810, "train_runtime": 2137.468, "train_tokens_per_second": 147425.63 }, { "epoch": 1.0428385980095196, "grad_norm": 0.6484375, "learning_rate": 0.00032722105263157894, "loss": 0.026, "num_input_tokens_seen": 315772928, "step": 4820, "train_runtime": 2141.2729, "train_tokens_per_second": 147469.726 }, { "epoch": 1.045002163565556, "grad_norm": 0.5625, "learning_rate": 0.00032658947368421047, "loss": 0.0275, "num_input_tokens_seen": 316428288, "step": 4830, "train_runtime": 2145.0816, "train_tokens_per_second": 147513.402 }, { "epoch": 1.0471657291215923, "grad_norm": 0.578125, "learning_rate": 0.0003259578947368421, "loss": 0.0273, "num_input_tokens_seen": 317083648, "step": 4840, "train_runtime": 2148.8867, "train_tokens_per_second": 147557.175 }, { "epoch": 1.0493292946776287, "grad_norm": 0.50390625, "learning_rate": 0.00032532631578947364, "loss": 0.0252, "num_input_tokens_seen": 317739008, "step": 4850, "train_runtime": 2152.6994, "train_tokens_per_second": 147600.267 }, { "epoch": 1.051492860233665, "grad_norm": 0.4375, "learning_rate": 0.0003246947368421052, "loss": 0.0256, "num_input_tokens_seen": 318394368, "step": 4860, "train_runtime": 2158.693, "train_tokens_per_second": 147494.048 }, { "epoch": 1.0536564257897014, "grad_norm": 0.57421875, "learning_rate": 0.0003240631578947368, "loss": 0.026, "num_input_tokens_seen": 319049728, "step": 4870, "train_runtime": 2162.5068, "train_tokens_per_second": 147536.981 }, { "epoch": 1.0558199913457378, "grad_norm": 0.66796875, "learning_rate": 0.0003234315789473684, "loss": 0.0249, "num_input_tokens_seen": 319705088, "step": 4880, "train_runtime": 2166.3052, "train_tokens_per_second": 147580.812 }, { "epoch": 1.0579835569017741, "grad_norm": 0.65234375, "learning_rate": 0.0003228, "loss": 0.025, "num_input_tokens_seen": 320360448, "step": 4890, "train_runtime": 2170.1088, "train_tokens_per_second": 147624.142 }, { "epoch": 1.0601471224578105, "grad_norm": 0.63671875, "learning_rate": 0.0003221684210526315, "loss": 0.0277, "num_input_tokens_seen": 321015808, "step": 4900, "train_runtime": 2173.8996, "train_tokens_per_second": 147668.186 }, { "epoch": 1.0601471224578105, "eval_loss": 0.01509543601423502, "eval_runtime": 2.2745, "eval_samples_per_second": 14.069, "eval_steps_per_second": 0.44, "num_input_tokens_seen": 321015808, "step": 4900 }, { "epoch": 1.0601471224578105, "eval_byte_accuracy": 0.995919881305638, "eval_chrf": 96.76141864085723, "eval_sacrebleu": 96.18078243947666, "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 321015808, "perplexity": 1.0152099475835925, "step": 4900 }, { "epoch": 1.0623106880138469, "grad_norm": 0.75, "learning_rate": 0.00032153684210526316, "loss": 0.0278, "num_input_tokens_seen": 321671168, "step": 4910, "train_runtime": 2182.1878, "train_tokens_per_second": 147407.643 }, { "epoch": 1.0644742535698832, "grad_norm": 0.5234375, "learning_rate": 0.0003209052631578947, "loss": 0.0273, "num_input_tokens_seen": 322326528, "step": 4920, "train_runtime": 2186.0007, "train_tokens_per_second": 147450.33 }, { "epoch": 1.0666378191259196, "grad_norm": 0.498046875, "learning_rate": 0.0003202736842105263, "loss": 0.0259, "num_input_tokens_seen": 322981888, "step": 4930, "train_runtime": 2189.8029, "train_tokens_per_second": 147493.591 }, { "epoch": 1.068801384681956, "grad_norm": 0.7578125, "learning_rate": 0.00031964210526315786, "loss": 0.0242, "num_input_tokens_seen": 323637248, "step": 4940, "train_runtime": 2193.6076, "train_tokens_per_second": 147536.527 }, { "epoch": 1.0709649502379923, "grad_norm": 0.45703125, "learning_rate": 0.00031901052631578945, "loss": 0.0245, "num_input_tokens_seen": 324292608, "step": 4950, "train_runtime": 2197.4066, "train_tokens_per_second": 147579.7 }, { "epoch": 1.0731285157940285, "grad_norm": 0.423828125, "learning_rate": 0.00031837894736842104, "loss": 0.0252, "num_input_tokens_seen": 324947968, "step": 4960, "train_runtime": 2203.6181, "train_tokens_per_second": 147461.111 }, { "epoch": 1.0752920813500648, "grad_norm": 0.55078125, "learning_rate": 0.00031774736842105257, "loss": 0.0254, "num_input_tokens_seen": 325603328, "step": 4970, "train_runtime": 2207.4386, "train_tokens_per_second": 147502.782 }, { "epoch": 1.0774556469061012, "grad_norm": 0.703125, "learning_rate": 0.0003171157894736842, "loss": 0.0258, "num_input_tokens_seen": 326258688, "step": 4980, "train_runtime": 2211.2502, "train_tokens_per_second": 147544.899 }, { "epoch": 1.0796192124621375, "grad_norm": 0.455078125, "learning_rate": 0.00031648421052631574, "loss": 0.0241, "num_input_tokens_seen": 326914048, "step": 4990, "train_runtime": 2215.0686, "train_tokens_per_second": 147586.421 }, { "epoch": 1.081782778018174, "grad_norm": 0.73046875, "learning_rate": 0.00031585263157894733, "loss": 0.0241, "num_input_tokens_seen": 327569408, "step": 5000, "train_runtime": 2218.8662, "train_tokens_per_second": 147629.183 }, { "epoch": 1.081782778018174, "eval_loss": 0.01370089128613472, "eval_runtime": 2.0517, "eval_samples_per_second": 15.597, "eval_steps_per_second": 0.487, "num_input_tokens_seen": 327569408, "step": 5000 }, { "epoch": 1.081782778018174, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 98.56639547519084, "eval_sacrebleu": 97.89877731533007, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 327569408, "perplexity": 1.0137951786118617, "step": 5000 }, { "epoch": 1.0839463435742103, "grad_norm": 0.3984375, "learning_rate": 0.0003152210526315789, "loss": 0.0227, "num_input_tokens_seen": 328224768, "step": 5010, "train_runtime": 2226.993, "train_tokens_per_second": 147384.73 }, { "epoch": 1.0861099091302466, "grad_norm": 0.431640625, "learning_rate": 0.0003145894736842105, "loss": 0.0251, "num_input_tokens_seen": 328880128, "step": 5020, "train_runtime": 2230.8082, "train_tokens_per_second": 147426.446 }, { "epoch": 1.088273474686283, "grad_norm": 0.515625, "learning_rate": 0.0003139578947368421, "loss": 0.0244, "num_input_tokens_seen": 329535488, "step": 5030, "train_runtime": 2234.6284, "train_tokens_per_second": 147467.688 }, { "epoch": 1.0904370402423194, "grad_norm": 0.52734375, "learning_rate": 0.0003133263157894737, "loss": 0.026, "num_input_tokens_seen": 330186752, "step": 5040, "train_runtime": 2238.4126, "train_tokens_per_second": 147509.335 }, { "epoch": 1.0926006057983557, "grad_norm": 0.373046875, "learning_rate": 0.00031269473684210526, "loss": 0.0257, "num_input_tokens_seen": 330842112, "step": 5050, "train_runtime": 2242.2218, "train_tokens_per_second": 147551.017 }, { "epoch": 1.094764171354392, "grad_norm": 0.37109375, "learning_rate": 0.0003120631578947368, "loss": 0.0228, "num_input_tokens_seen": 331497472, "step": 5060, "train_runtime": 2248.2359, "train_tokens_per_second": 147447.817 }, { "epoch": 1.0969277369104284, "grad_norm": 0.52734375, "learning_rate": 0.0003114315789473684, "loss": 0.0237, "num_input_tokens_seen": 332152832, "step": 5070, "train_runtime": 2252.0457, "train_tokens_per_second": 147489.382 }, { "epoch": 1.0990913024664648, "grad_norm": 0.54296875, "learning_rate": 0.00031079999999999997, "loss": 0.0259, "num_input_tokens_seen": 332808192, "step": 5080, "train_runtime": 2255.8557, "train_tokens_per_second": 147530.798 }, { "epoch": 1.1012548680225012, "grad_norm": 0.65234375, "learning_rate": 0.00031016842105263155, "loss": 0.0255, "num_input_tokens_seen": 333463552, "step": 5090, "train_runtime": 2259.6673, "train_tokens_per_second": 147571.971 }, { "epoch": 1.1034184335785375, "grad_norm": 0.76171875, "learning_rate": 0.00030953684210526314, "loss": 0.0251, "num_input_tokens_seen": 334118912, "step": 5100, "train_runtime": 2263.4742, "train_tokens_per_second": 147613.306 }, { "epoch": 1.1034184335785375, "eval_loss": 0.013377854600548744, "eval_runtime": 2.0698, "eval_samples_per_second": 15.46, "eval_steps_per_second": 0.483, "num_input_tokens_seen": 334118912, "step": 5100 }, { "epoch": 1.1034184335785375, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 96.85382382959251, "eval_sacrebleu": 96.41686458971391, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 334118912, "perplexity": 1.013467738467937, "step": 5100 }, { "epoch": 1.1055819991345737, "grad_norm": 0.462890625, "learning_rate": 0.0003089052631578947, "loss": 0.0233, "num_input_tokens_seen": 334774272, "step": 5110, "train_runtime": 2271.5876, "train_tokens_per_second": 147374.581 }, { "epoch": 1.10774556469061, "grad_norm": 0.65625, "learning_rate": 0.0003082736842105263, "loss": 0.0231, "num_input_tokens_seen": 335429632, "step": 5120, "train_runtime": 2275.3995, "train_tokens_per_second": 147415.708 }, { "epoch": 1.1099091302466464, "grad_norm": 0.408203125, "learning_rate": 0.00030764210526315784, "loss": 0.0255, "num_input_tokens_seen": 336084992, "step": 5130, "train_runtime": 2279.2058, "train_tokens_per_second": 147457.06 }, { "epoch": 1.1120726958026828, "grad_norm": 0.58984375, "learning_rate": 0.00030701052631578943, "loss": 0.0224, "num_input_tokens_seen": 336740352, "step": 5140, "train_runtime": 2283.014, "train_tokens_per_second": 147498.152 }, { "epoch": 1.1142362613587191, "grad_norm": 0.474609375, "learning_rate": 0.000306378947368421, "loss": 0.0243, "num_input_tokens_seen": 337395712, "step": 5150, "train_runtime": 2286.8196, "train_tokens_per_second": 147539.279 }, { "epoch": 1.1163998269147555, "grad_norm": 0.51171875, "learning_rate": 0.0003057473684210526, "loss": 0.0232, "num_input_tokens_seen": 338051072, "step": 5160, "train_runtime": 2292.84, "train_tokens_per_second": 147437.708 }, { "epoch": 1.1185633924707918, "grad_norm": 0.4609375, "learning_rate": 0.0003051157894736842, "loss": 0.0235, "num_input_tokens_seen": 338706432, "step": 5170, "train_runtime": 2296.6595, "train_tokens_per_second": 147477.859 }, { "epoch": 1.1207269580268282, "grad_norm": 0.55078125, "learning_rate": 0.0003044842105263158, "loss": 0.0243, "num_input_tokens_seen": 339361792, "step": 5180, "train_runtime": 2300.471, "train_tokens_per_second": 147518.394 }, { "epoch": 1.1228905235828646, "grad_norm": 0.388671875, "learning_rate": 0.00030385263157894736, "loss": 0.0236, "num_input_tokens_seen": 340017152, "step": 5190, "train_runtime": 2304.2794, "train_tokens_per_second": 147558.997 }, { "epoch": 1.125054089138901, "grad_norm": 0.490234375, "learning_rate": 0.0003032210526315789, "loss": 0.0249, "num_input_tokens_seen": 340672512, "step": 5200, "train_runtime": 2308.0903, "train_tokens_per_second": 147599.302 }, { "epoch": 1.125054089138901, "eval_loss": 0.012978856451809406, "eval_runtime": 2.1092, "eval_samples_per_second": 15.172, "eval_steps_per_second": 0.474, "num_input_tokens_seen": 340672512, "step": 5200 }, { "epoch": 1.125054089138901, "eval_byte_accuracy": 0.9970326409495549, "eval_chrf": 97.87302942819403, "eval_sacrebleu": 97.56427984542309, "eval_word_accuracy": 0.9931318681318682, "num_input_tokens_seen": 340672512, "perplexity": 1.013063447377543, "step": 5200 }, { "epoch": 1.1272176546949373, "grad_norm": 0.58984375, "learning_rate": 0.0003025894736842105, "loss": 0.0216, "num_input_tokens_seen": 341327872, "step": 5210, "train_runtime": 2316.2667, "train_tokens_per_second": 147361.212 }, { "epoch": 1.1293812202509736, "grad_norm": 0.5078125, "learning_rate": 0.00030195789473684207, "loss": 0.0239, "num_input_tokens_seen": 341983232, "step": 5220, "train_runtime": 2320.082, "train_tokens_per_second": 147401.356 }, { "epoch": 1.13154478580701, "grad_norm": 0.5390625, "learning_rate": 0.00030132631578947365, "loss": 0.0276, "num_input_tokens_seen": 342638592, "step": 5230, "train_runtime": 2323.8936, "train_tokens_per_second": 147441.603 }, { "epoch": 1.1337083513630464, "grad_norm": 0.640625, "learning_rate": 0.00030069473684210524, "loss": 0.0234, "num_input_tokens_seen": 343293952, "step": 5240, "train_runtime": 2327.7124, "train_tokens_per_second": 147481.259 }, { "epoch": 1.1358719169190827, "grad_norm": 0.6484375, "learning_rate": 0.0003000631578947368, "loss": 0.0263, "num_input_tokens_seen": 343949312, "step": 5250, "train_runtime": 2331.5303, "train_tokens_per_second": 147520.839 }, { "epoch": 1.1380354824751189, "grad_norm": 0.640625, "learning_rate": 0.0002994315789473684, "loss": 0.0239, "num_input_tokens_seen": 344604672, "step": 5260, "train_runtime": 2337.5477, "train_tokens_per_second": 147421.452 }, { "epoch": 1.1401990480311555, "grad_norm": 0.53125, "learning_rate": 0.0002988, "loss": 0.0253, "num_input_tokens_seen": 345260032, "step": 5270, "train_runtime": 2341.3694, "train_tokens_per_second": 147460.726 }, { "epoch": 1.1423626135871916, "grad_norm": 0.53125, "learning_rate": 0.00029816842105263153, "loss": 0.0227, "num_input_tokens_seen": 345915392, "step": 5280, "train_runtime": 2345.1866, "train_tokens_per_second": 147500.159 }, { "epoch": 1.144526179143228, "grad_norm": 0.5859375, "learning_rate": 0.0002975368421052631, "loss": 0.024, "num_input_tokens_seen": 346570752, "step": 5290, "train_runtime": 2348.9992, "train_tokens_per_second": 147539.746 }, { "epoch": 1.1466897446992643, "grad_norm": 0.46875, "learning_rate": 0.0002969052631578947, "loss": 0.0259, "num_input_tokens_seen": 347226112, "step": 5300, "train_runtime": 2352.8085, "train_tokens_per_second": 147579.42 }, { "epoch": 1.1466897446992643, "eval_loss": 0.015913764014840126, "eval_runtime": 2.0214, "eval_samples_per_second": 15.831, "eval_steps_per_second": 0.495, "num_input_tokens_seen": 347226112, "step": 5300 }, { "epoch": 1.1466897446992643, "eval_byte_accuracy": 0.9944362017804155, "eval_chrf": 96.27201255408502, "eval_sacrebleu": 96.39469981452005, "eval_word_accuracy": 0.9876373626373627, "num_input_tokens_seen": 347226112, "perplexity": 1.0160410623260452, "step": 5300 }, { "epoch": 1.1488533102553007, "grad_norm": 0.416015625, "learning_rate": 0.0002962736842105263, "loss": 0.025, "num_input_tokens_seen": 347881472, "step": 5310, "train_runtime": 2360.8879, "train_tokens_per_second": 147351.964 }, { "epoch": 1.151016875811337, "grad_norm": 0.376953125, "learning_rate": 0.0002956421052631579, "loss": 0.0249, "num_input_tokens_seen": 348536832, "step": 5320, "train_runtime": 2364.6967, "train_tokens_per_second": 147391.77 }, { "epoch": 1.1531804413673734, "grad_norm": 0.390625, "learning_rate": 0.00029501052631578946, "loss": 0.0252, "num_input_tokens_seen": 349192192, "step": 5330, "train_runtime": 2368.5236, "train_tokens_per_second": 147430.318 }, { "epoch": 1.1553440069234098, "grad_norm": 0.62890625, "learning_rate": 0.00029437894736842105, "loss": 0.0246, "num_input_tokens_seen": 349847552, "step": 5340, "train_runtime": 2372.3427, "train_tokens_per_second": 147469.233 }, { "epoch": 1.1575075724794461, "grad_norm": 0.46875, "learning_rate": 0.00029374736842105264, "loss": 0.0256, "num_input_tokens_seen": 350502912, "step": 5350, "train_runtime": 2376.1584, "train_tokens_per_second": 147508.228 }, { "epoch": 1.1596711380354825, "grad_norm": 0.7421875, "learning_rate": 0.00029311578947368417, "loss": 0.0247, "num_input_tokens_seen": 351158272, "step": 5360, "train_runtime": 2382.1833, "train_tokens_per_second": 147410.268 }, { "epoch": 1.1618347035915189, "grad_norm": 0.6328125, "learning_rate": 0.00029248421052631575, "loss": 0.024, "num_input_tokens_seen": 351813632, "step": 5370, "train_runtime": 2385.9801, "train_tokens_per_second": 147450.361 }, { "epoch": 1.1639982691475552, "grad_norm": 0.53515625, "learning_rate": 0.00029185263157894734, "loss": 0.0247, "num_input_tokens_seen": 352468992, "step": 5380, "train_runtime": 2389.7895, "train_tokens_per_second": 147489.558 }, { "epoch": 1.1661618347035916, "grad_norm": 0.5390625, "learning_rate": 0.00029122105263157893, "loss": 0.0234, "num_input_tokens_seen": 353124352, "step": 5390, "train_runtime": 2393.6014, "train_tokens_per_second": 147528.467 }, { "epoch": 1.168325400259628, "grad_norm": 0.53125, "learning_rate": 0.0002905894736842105, "loss": 0.0236, "num_input_tokens_seen": 353779712, "step": 5400, "train_runtime": 2397.4054, "train_tokens_per_second": 147567.749 }, { "epoch": 1.168325400259628, "eval_loss": 0.015481002628803253, "eval_runtime": 2.2867, "eval_samples_per_second": 13.994, "eval_steps_per_second": 0.437, "num_input_tokens_seen": 353779712, "step": 5400 }, { "epoch": 1.168325400259628, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.64325335494277, "eval_sacrebleu": 95.8953860263406, "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 353779712, "perplexity": 1.0156014541172325, "step": 5400 }, { "epoch": 1.1704889658156643, "grad_norm": 0.640625, "learning_rate": 0.0002899578947368421, "loss": 0.0211, "num_input_tokens_seen": 354435072, "step": 5410, "train_runtime": 2405.5282, "train_tokens_per_second": 147341.887 }, { "epoch": 1.1726525313717007, "grad_norm": 0.6328125, "learning_rate": 0.0002893263157894737, "loss": 0.0224, "num_input_tokens_seen": 355090432, "step": 5420, "train_runtime": 2409.3446, "train_tokens_per_second": 147380.51 }, { "epoch": 1.1748160969277368, "grad_norm": 0.671875, "learning_rate": 0.0002886947368421052, "loss": 0.025, "num_input_tokens_seen": 355745792, "step": 5430, "train_runtime": 2413.1608, "train_tokens_per_second": 147419.019 }, { "epoch": 1.1769796624837732, "grad_norm": 0.69140625, "learning_rate": 0.0002880631578947368, "loss": 0.0235, "num_input_tokens_seen": 356401152, "step": 5440, "train_runtime": 2416.9521, "train_tokens_per_second": 147458.924 }, { "epoch": 1.1791432280398095, "grad_norm": 0.373046875, "learning_rate": 0.0002874315789473684, "loss": 0.0233, "num_input_tokens_seen": 357056512, "step": 5450, "train_runtime": 2420.7669, "train_tokens_per_second": 147497.268 }, { "epoch": 1.181306793595846, "grad_norm": 0.58984375, "learning_rate": 0.0002868, "loss": 0.0217, "num_input_tokens_seen": 357711872, "step": 5460, "train_runtime": 2426.9173, "train_tokens_per_second": 147393.515 }, { "epoch": 1.1834703591518823, "grad_norm": 0.439453125, "learning_rate": 0.00028616842105263156, "loss": 0.0211, "num_input_tokens_seen": 358367232, "step": 5470, "train_runtime": 2430.738, "train_tokens_per_second": 147431.449 }, { "epoch": 1.1856339247079186, "grad_norm": 0.609375, "learning_rate": 0.00028553684210526315, "loss": 0.0228, "num_input_tokens_seen": 359022592, "step": 5480, "train_runtime": 2434.5618, "train_tokens_per_second": 147469.083 }, { "epoch": 1.187797490263955, "grad_norm": 0.296875, "learning_rate": 0.00028490526315789474, "loss": 0.0231, "num_input_tokens_seen": 359673856, "step": 5490, "train_runtime": 2438.3485, "train_tokens_per_second": 147507.158 }, { "epoch": 1.1899610558199913, "grad_norm": 0.392578125, "learning_rate": 0.0002842736842105263, "loss": 0.0267, "num_input_tokens_seen": 360329216, "step": 5500, "train_runtime": 2442.168, "train_tokens_per_second": 147544.813 }, { "epoch": 1.1899610558199913, "eval_loss": 0.01402247790247202, "eval_runtime": 2.1152, "eval_samples_per_second": 15.129, "eval_steps_per_second": 0.473, "num_input_tokens_seen": 360329216, "step": 5500 }, { "epoch": 1.1899610558199913, "eval_byte_accuracy": 0.9955489614243324, "eval_chrf": 96.96256197348622, "eval_sacrebleu": 95.41032845057128, "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 360329216, "perplexity": 1.014121254000941, "step": 5500 }, { "epoch": 1.1921246213760277, "grad_norm": 0.4765625, "learning_rate": 0.00028364210526315786, "loss": 0.0232, "num_input_tokens_seen": 360984576, "step": 5510, "train_runtime": 2450.3602, "train_tokens_per_second": 147318.987 }, { "epoch": 1.194288186932064, "grad_norm": 0.474609375, "learning_rate": 0.00028301052631578944, "loss": 0.0224, "num_input_tokens_seen": 361639936, "step": 5520, "train_runtime": 2454.1587, "train_tokens_per_second": 147358.01 }, { "epoch": 1.1964517524881004, "grad_norm": 0.6640625, "learning_rate": 0.00028237894736842103, "loss": 0.0225, "num_input_tokens_seen": 362295296, "step": 5530, "train_runtime": 2457.9704, "train_tokens_per_second": 147396.119 }, { "epoch": 1.1986153180441368, "grad_norm": 0.423828125, "learning_rate": 0.0002817473684210526, "loss": 0.0207, "num_input_tokens_seen": 362950656, "step": 5540, "train_runtime": 2461.7834, "train_tokens_per_second": 147434.036 }, { "epoch": 1.2007788836001732, "grad_norm": 0.451171875, "learning_rate": 0.0002811157894736842, "loss": 0.0224, "num_input_tokens_seen": 363606016, "step": 5550, "train_runtime": 2465.5888, "train_tokens_per_second": 147472.285 }, { "epoch": 1.2029424491562095, "grad_norm": 0.47265625, "learning_rate": 0.0002804842105263158, "loss": 0.023, "num_input_tokens_seen": 364261376, "step": 5560, "train_runtime": 2471.6087, "train_tokens_per_second": 147378.254 }, { "epoch": 1.2051060147122459, "grad_norm": 0.318359375, "learning_rate": 0.0002798526315789474, "loss": 0.0243, "num_input_tokens_seen": 364916736, "step": 5570, "train_runtime": 2475.4093, "train_tokens_per_second": 147416.726 }, { "epoch": 1.207269580268282, "grad_norm": 0.474609375, "learning_rate": 0.00027922105263157896, "loss": 0.0217, "num_input_tokens_seen": 365572096, "step": 5580, "train_runtime": 2479.2175, "train_tokens_per_second": 147454.628 }, { "epoch": 1.2094331458243186, "grad_norm": 0.365234375, "learning_rate": 0.0002785894736842105, "loss": 0.0226, "num_input_tokens_seen": 366227456, "step": 5590, "train_runtime": 2483.0274, "train_tokens_per_second": 147492.311 }, { "epoch": 1.2115967113803547, "grad_norm": 0.47265625, "learning_rate": 0.0002779578947368421, "loss": 0.0228, "num_input_tokens_seen": 366878720, "step": 5600, "train_runtime": 2486.8135, "train_tokens_per_second": 147529.65 }, { "epoch": 1.2115967113803547, "eval_loss": 0.01578563265502453, "eval_runtime": 2.1536, "eval_samples_per_second": 14.859, "eval_steps_per_second": 0.464, "num_input_tokens_seen": 366878720, "step": 5600 }, { "epoch": 1.2115967113803547, "eval_byte_accuracy": 0.9951780415430267, "eval_chrf": 97.24990341576824, "eval_sacrebleu": 96.77994309150014, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 366878720, "perplexity": 1.0159108839432456, "step": 5600 }, { "epoch": 1.213760276936391, "grad_norm": 0.3984375, "learning_rate": 0.00027732631578947367, "loss": 0.0214, "num_input_tokens_seen": 367534080, "step": 5610, "train_runtime": 2495.1977, "train_tokens_per_second": 147296.576 }, { "epoch": 1.2159238424924275, "grad_norm": 0.427734375, "learning_rate": 0.00027669473684210525, "loss": 0.0218, "num_input_tokens_seen": 368189440, "step": 5620, "train_runtime": 2499.0073, "train_tokens_per_second": 147334.277 }, { "epoch": 1.2180874080484638, "grad_norm": 0.400390625, "learning_rate": 0.00027606315789473684, "loss": 0.024, "num_input_tokens_seen": 368844800, "step": 5630, "train_runtime": 2502.8184, "train_tokens_per_second": 147371.776 }, { "epoch": 1.2202509736045002, "grad_norm": 0.51953125, "learning_rate": 0.0002754315789473684, "loss": 0.0247, "num_input_tokens_seen": 369500160, "step": 5640, "train_runtime": 2506.6357, "train_tokens_per_second": 147408.8 }, { "epoch": 1.2224145391605366, "grad_norm": 0.5, "learning_rate": 0.0002748, "loss": 0.0228, "num_input_tokens_seen": 370151424, "step": 5650, "train_runtime": 2510.4299, "train_tokens_per_second": 147445.433 }, { "epoch": 1.224578104716573, "grad_norm": 0.412109375, "learning_rate": 0.00027416842105263154, "loss": 0.0258, "num_input_tokens_seen": 370806784, "step": 5660, "train_runtime": 2516.4496, "train_tokens_per_second": 147353.151 }, { "epoch": 1.2267416702726093, "grad_norm": 0.68359375, "learning_rate": 0.00027353684210526313, "loss": 0.0263, "num_input_tokens_seen": 371462144, "step": 5670, "train_runtime": 2520.2461, "train_tokens_per_second": 147391.221 }, { "epoch": 1.2289052358286456, "grad_norm": 0.62109375, "learning_rate": 0.0002729052631578947, "loss": 0.0254, "num_input_tokens_seen": 372117504, "step": 5680, "train_runtime": 2524.0507, "train_tokens_per_second": 147428.695 }, { "epoch": 1.231068801384682, "grad_norm": 0.5234375, "learning_rate": 0.0002722736842105263, "loss": 0.024, "num_input_tokens_seen": 372768768, "step": 5690, "train_runtime": 2527.8427, "train_tokens_per_second": 147465.177 }, { "epoch": 1.2332323669407184, "grad_norm": 0.53515625, "learning_rate": 0.0002716421052631579, "loss": 0.0227, "num_input_tokens_seen": 373424128, "step": 5700, "train_runtime": 2531.6566, "train_tokens_per_second": 147501.888 }, { "epoch": 1.2332323669407184, "eval_loss": 0.01439273077994585, "eval_runtime": 2.1944, "eval_samples_per_second": 14.583, "eval_steps_per_second": 0.456, "num_input_tokens_seen": 373424128, "step": 5700 }, { "epoch": 1.2332323669407184, "eval_byte_accuracy": 0.9951780415430267, "eval_chrf": 96.76431117455587, "eval_sacrebleu": 96.44528222520958, "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 373424128, "perplexity": 1.0144968048334402, "step": 5700 }, { "epoch": 1.2353959324967547, "grad_norm": 0.56640625, "learning_rate": 0.0002710105263157895, "loss": 0.0232, "num_input_tokens_seen": 374079488, "step": 5710, "train_runtime": 2539.9047, "train_tokens_per_second": 147280.912 }, { "epoch": 1.237559498052791, "grad_norm": 0.53515625, "learning_rate": 0.00027037894736842106, "loss": 0.0242, "num_input_tokens_seen": 374734848, "step": 5720, "train_runtime": 2543.7103, "train_tokens_per_second": 147318.211 }, { "epoch": 1.2397230636088272, "grad_norm": 0.53515625, "learning_rate": 0.00026974736842105265, "loss": 0.0223, "num_input_tokens_seen": 375386112, "step": 5730, "train_runtime": 2547.5088, "train_tokens_per_second": 147354.199 }, { "epoch": 1.2418866291648638, "grad_norm": 0.390625, "learning_rate": 0.0002691157894736842, "loss": 0.022, "num_input_tokens_seen": 376041472, "step": 5740, "train_runtime": 2551.311, "train_tokens_per_second": 147391.465 }, { "epoch": 1.2440501947209, "grad_norm": 0.30859375, "learning_rate": 0.00026848421052631577, "loss": 0.0248, "num_input_tokens_seen": 376696832, "step": 5750, "train_runtime": 2555.1174, "train_tokens_per_second": 147428.384 }, { "epoch": 1.2462137602769363, "grad_norm": 0.53515625, "learning_rate": 0.00026785263157894735, "loss": 0.0215, "num_input_tokens_seen": 377352192, "step": 5760, "train_runtime": 2561.1355, "train_tokens_per_second": 147337.848 }, { "epoch": 1.2483773258329727, "grad_norm": 0.408203125, "learning_rate": 0.00026722105263157894, "loss": 0.0259, "num_input_tokens_seen": 378007552, "step": 5770, "train_runtime": 2564.9477, "train_tokens_per_second": 147374.368 }, { "epoch": 1.250540891389009, "grad_norm": 0.5, "learning_rate": 0.0002665894736842105, "loss": 0.0228, "num_input_tokens_seen": 378662912, "step": 5780, "train_runtime": 2568.7519, "train_tokens_per_second": 147411.241 }, { "epoch": 1.2527044569450454, "grad_norm": 0.3359375, "learning_rate": 0.0002659578947368421, "loss": 0.0221, "num_input_tokens_seen": 379318272, "step": 5790, "train_runtime": 2572.5685, "train_tokens_per_second": 147447.299 }, { "epoch": 1.2548680225010818, "grad_norm": 0.5390625, "learning_rate": 0.0002653263157894737, "loss": 0.0235, "num_input_tokens_seen": 379973632, "step": 5800, "train_runtime": 2576.363, "train_tokens_per_second": 147484.511 }, { "epoch": 1.2548680225010818, "eval_loss": 0.013111169449985027, "eval_runtime": 2.2119, "eval_samples_per_second": 14.467, "eval_steps_per_second": 0.452, "num_input_tokens_seen": 379973632, "step": 5800 }, { "epoch": 1.2548680225010818, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 96.67853700001116, "eval_sacrebleu": 96.53538171173645, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 379973632, "perplexity": 1.0131974977077127, "step": 5800 }, { "epoch": 1.2570315880571181, "grad_norm": 0.515625, "learning_rate": 0.00026469473684210523, "loss": 0.0222, "num_input_tokens_seen": 380628992, "step": 5810, "train_runtime": 2584.6272, "train_tokens_per_second": 147266.499 }, { "epoch": 1.2591951536131545, "grad_norm": 0.390625, "learning_rate": 0.0002640631578947368, "loss": 0.0233, "num_input_tokens_seen": 381284352, "step": 5820, "train_runtime": 2588.4283, "train_tokens_per_second": 147303.427 }, { "epoch": 1.2613587191691908, "grad_norm": 0.5078125, "learning_rate": 0.0002634315789473684, "loss": 0.0214, "num_input_tokens_seen": 381939712, "step": 5830, "train_runtime": 2592.2337, "train_tokens_per_second": 147340.002 }, { "epoch": 1.2635222847252272, "grad_norm": 0.486328125, "learning_rate": 0.0002628, "loss": 0.0215, "num_input_tokens_seen": 382595072, "step": 5840, "train_runtime": 2596.0288, "train_tokens_per_second": 147377.052 }, { "epoch": 1.2656858502812636, "grad_norm": 0.57421875, "learning_rate": 0.0002621684210526316, "loss": 0.0221, "num_input_tokens_seen": 383250432, "step": 5850, "train_runtime": 2599.8368, "train_tokens_per_second": 147413.267 }, { "epoch": 1.2678494158373, "grad_norm": 0.66796875, "learning_rate": 0.00026153684210526316, "loss": 0.0248, "num_input_tokens_seen": 383905792, "step": 5860, "train_runtime": 2606.0753, "train_tokens_per_second": 147311.857 }, { "epoch": 1.2700129813933363, "grad_norm": 0.640625, "learning_rate": 0.00026090526315789475, "loss": 0.0237, "num_input_tokens_seen": 384561152, "step": 5870, "train_runtime": 2609.8901, "train_tokens_per_second": 147347.642 }, { "epoch": 1.2721765469493724, "grad_norm": 0.5234375, "learning_rate": 0.0002602736842105263, "loss": 0.0235, "num_input_tokens_seen": 385216512, "step": 5880, "train_runtime": 2613.6997, "train_tokens_per_second": 147383.617 }, { "epoch": 1.274340112505409, "grad_norm": 0.427734375, "learning_rate": 0.00025964210526315787, "loss": 0.0211, "num_input_tokens_seen": 385871872, "step": 5890, "train_runtime": 2617.5106, "train_tokens_per_second": 147419.412 }, { "epoch": 1.2765036780614452, "grad_norm": 0.50390625, "learning_rate": 0.00025901052631578946, "loss": 0.0263, "num_input_tokens_seen": 386527232, "step": 5900, "train_runtime": 2621.3256, "train_tokens_per_second": 147454.874 }, { "epoch": 1.2765036780614452, "eval_loss": 0.014893613755702972, "eval_runtime": 2.2011, "eval_samples_per_second": 14.538, "eval_steps_per_second": 0.454, "num_input_tokens_seen": 386527232, "step": 5900 }, { "epoch": 1.2765036780614452, "eval_byte_accuracy": 0.9951780415430267, "eval_chrf": 97.5606524584088, "eval_sacrebleu": 96.5155008972932, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 386527232, "perplexity": 1.0150050762935756, "step": 5900 }, { "epoch": 1.2786672436174817, "grad_norm": 0.6640625, "learning_rate": 0.00025837894736842104, "loss": 0.0248, "num_input_tokens_seen": 387182592, "step": 5910, "train_runtime": 2629.6051, "train_tokens_per_second": 147239.825 }, { "epoch": 1.2808308091735179, "grad_norm": 0.54296875, "learning_rate": 0.00025774736842105263, "loss": 0.0227, "num_input_tokens_seen": 387837952, "step": 5920, "train_runtime": 2633.4187, "train_tokens_per_second": 147275.462 }, { "epoch": 1.2829943747295542, "grad_norm": 0.58203125, "learning_rate": 0.0002571157894736842, "loss": 0.0251, "num_input_tokens_seen": 388493312, "step": 5930, "train_runtime": 2637.2287, "train_tokens_per_second": 147311.197 }, { "epoch": 1.2851579402855906, "grad_norm": 0.59765625, "learning_rate": 0.0002564842105263158, "loss": 0.0233, "num_input_tokens_seen": 389148672, "step": 5940, "train_runtime": 2641.041, "train_tokens_per_second": 147346.701 }, { "epoch": 1.287321505841627, "grad_norm": 0.4140625, "learning_rate": 0.00025585263157894733, "loss": 0.0225, "num_input_tokens_seen": 389804032, "step": 5950, "train_runtime": 2644.8514, "train_tokens_per_second": 147382.205 }, { "epoch": 1.2894850713976633, "grad_norm": 0.431640625, "learning_rate": 0.0002552210526315789, "loss": 0.0232, "num_input_tokens_seen": 390459392, "step": 5960, "train_runtime": 2650.9057, "train_tokens_per_second": 147292.823 }, { "epoch": 1.2916486369536997, "grad_norm": 0.326171875, "learning_rate": 0.0002545894736842105, "loss": 0.0216, "num_input_tokens_seen": 391114752, "step": 5970, "train_runtime": 2654.7136, "train_tokens_per_second": 147328.415 }, { "epoch": 1.293812202509736, "grad_norm": 0.5, "learning_rate": 0.0002539578947368421, "loss": 0.0233, "num_input_tokens_seen": 391770112, "step": 5980, "train_runtime": 2658.5272, "train_tokens_per_second": 147363.588 }, { "epoch": 1.2959757680657724, "grad_norm": 0.52734375, "learning_rate": 0.0002533263157894737, "loss": 0.024, "num_input_tokens_seen": 392425472, "step": 5990, "train_runtime": 2662.3361, "train_tokens_per_second": 147398.924 }, { "epoch": 1.2981393336218088, "grad_norm": 0.5234375, "learning_rate": 0.00025269473684210527, "loss": 0.0242, "num_input_tokens_seen": 393080832, "step": 6000, "train_runtime": 2666.1611, "train_tokens_per_second": 147433.261 }, { "epoch": 1.2981393336218088, "eval_loss": 0.014728213660418987, "eval_runtime": 2.1539, "eval_samples_per_second": 14.857, "eval_steps_per_second": 0.464, "num_input_tokens_seen": 393080832, "step": 6000 }, { "epoch": 1.2981393336218088, "eval_byte_accuracy": 0.995919881305638, "eval_chrf": 97.03697158984805, "eval_sacrebleu": 97.0441363353587, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 393080832, "perplexity": 1.0148372082403216, "step": 6000 }, { "epoch": 1.3003028991778451, "grad_norm": 0.65625, "learning_rate": 0.00025206315789473685, "loss": 0.0226, "num_input_tokens_seen": 393736192, "step": 6010, "train_runtime": 2674.4254, "train_tokens_per_second": 147222.724 }, { "epoch": 1.3024664647338815, "grad_norm": 0.49609375, "learning_rate": 0.0002514315789473684, "loss": 0.0222, "num_input_tokens_seen": 394391552, "step": 6020, "train_runtime": 2678.2398, "train_tokens_per_second": 147257.744 }, { "epoch": 1.3046300302899179, "grad_norm": 0.39453125, "learning_rate": 0.00025079999999999997, "loss": 0.0236, "num_input_tokens_seen": 395046912, "step": 6030, "train_runtime": 2682.0502, "train_tokens_per_second": 147292.888 }, { "epoch": 1.3067935958459542, "grad_norm": 0.470703125, "learning_rate": 0.00025016842105263156, "loss": 0.0262, "num_input_tokens_seen": 395702272, "step": 6040, "train_runtime": 2685.8612, "train_tokens_per_second": 147327.891 }, { "epoch": 1.3089571614019904, "grad_norm": 0.43359375, "learning_rate": 0.00024953684210526314, "loss": 0.0237, "num_input_tokens_seen": 396357632, "step": 6050, "train_runtime": 2689.6805, "train_tokens_per_second": 147362.345 }, { "epoch": 1.311120726958027, "grad_norm": 0.486328125, "learning_rate": 0.00024890526315789473, "loss": 0.0217, "num_input_tokens_seen": 397012992, "step": 6060, "train_runtime": 2695.7306, "train_tokens_per_second": 147274.727 }, { "epoch": 1.313284292514063, "grad_norm": 0.54296875, "learning_rate": 0.0002482736842105263, "loss": 0.024, "num_input_tokens_seen": 397668352, "step": 6070, "train_runtime": 2699.5518, "train_tokens_per_second": 147309.027 }, { "epoch": 1.3154478580700995, "grad_norm": 0.41796875, "learning_rate": 0.0002476421052631579, "loss": 0.0213, "num_input_tokens_seen": 398323712, "step": 6080, "train_runtime": 2703.3717, "train_tokens_per_second": 147343.303 }, { "epoch": 1.3176114236261358, "grad_norm": 0.3828125, "learning_rate": 0.00024701052631578943, "loss": 0.0238, "num_input_tokens_seen": 398974976, "step": 6090, "train_runtime": 2707.1731, "train_tokens_per_second": 147376.97 }, { "epoch": 1.3197749891821722, "grad_norm": 0.451171875, "learning_rate": 0.000246378947368421, "loss": 0.0234, "num_input_tokens_seen": 399626240, "step": 6100, "train_runtime": 2710.974, "train_tokens_per_second": 147410.577 }, { "epoch": 1.3197749891821722, "eval_loss": 0.014207079075276852, "eval_runtime": 2.3254, "eval_samples_per_second": 13.761, "eval_steps_per_second": 0.43, "num_input_tokens_seen": 399626240, "step": 6100 }, { "epoch": 1.3197749891821722, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 96.607979287624, "eval_sacrebleu": 96.44528222520958, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 399626240, "perplexity": 1.014308479254265, "step": 6100 }, { "epoch": 1.3219385547382085, "grad_norm": 0.53515625, "learning_rate": 0.0002457473684210526, "loss": 0.0221, "num_input_tokens_seen": 400281600, "step": 6110, "train_runtime": 2719.3656, "train_tokens_per_second": 147196.683 }, { "epoch": 1.324102120294245, "grad_norm": 0.443359375, "learning_rate": 0.0002451157894736842, "loss": 0.0207, "num_input_tokens_seen": 400936960, "step": 6120, "train_runtime": 2723.1458, "train_tokens_per_second": 147233.014 }, { "epoch": 1.3262656858502813, "grad_norm": 0.58984375, "learning_rate": 0.0002444842105263158, "loss": 0.024, "num_input_tokens_seen": 401592320, "step": 6130, "train_runtime": 2726.9626, "train_tokens_per_second": 147267.263 }, { "epoch": 1.3284292514063176, "grad_norm": 0.474609375, "learning_rate": 0.00024385263157894737, "loss": 0.0214, "num_input_tokens_seen": 402247680, "step": 6140, "train_runtime": 2730.774, "train_tokens_per_second": 147301.709 }, { "epoch": 1.330592816962354, "grad_norm": 0.419921875, "learning_rate": 0.00024322105263157893, "loss": 0.0216, "num_input_tokens_seen": 402903040, "step": 6150, "train_runtime": 2734.5891, "train_tokens_per_second": 147335.863 }, { "epoch": 1.3327563825183903, "grad_norm": 0.3828125, "learning_rate": 0.00024258947368421049, "loss": 0.0216, "num_input_tokens_seen": 403558400, "step": 6160, "train_runtime": 2740.6289, "train_tokens_per_second": 147250.292 }, { "epoch": 1.3349199480744267, "grad_norm": 0.5390625, "learning_rate": 0.00024195789473684207, "loss": 0.0229, "num_input_tokens_seen": 404213760, "step": 6170, "train_runtime": 2744.4401, "train_tokens_per_second": 147284.598 }, { "epoch": 1.337083513630463, "grad_norm": 0.48828125, "learning_rate": 0.00024132631578947366, "loss": 0.0224, "num_input_tokens_seen": 404869120, "step": 6180, "train_runtime": 2748.2532, "train_tokens_per_second": 147318.71 }, { "epoch": 1.3392470791864994, "grad_norm": 0.46875, "learning_rate": 0.00024069473684210524, "loss": 0.0238, "num_input_tokens_seen": 405524480, "step": 6190, "train_runtime": 2752.0697, "train_tokens_per_second": 147352.548 }, { "epoch": 1.3414106447425356, "grad_norm": 0.4140625, "learning_rate": 0.00024006315789473683, "loss": 0.0207, "num_input_tokens_seen": 406179840, "step": 6200, "train_runtime": 2755.863, "train_tokens_per_second": 147387.528 }, { "epoch": 1.3414106447425356, "eval_loss": 0.013366620987653732, "eval_runtime": 2.3739, "eval_samples_per_second": 13.48, "eval_steps_per_second": 0.421, "num_input_tokens_seen": 406179840, "step": 6200 }, { "epoch": 1.3414106447425356, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.6580883112683, "eval_sacrebleu": 96.60546188979983, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 406179840, "perplexity": 1.013456353627628, "step": 6200 }, { "epoch": 1.3435742102985722, "grad_norm": 0.6171875, "learning_rate": 0.00023943157894736842, "loss": 0.0253, "num_input_tokens_seen": 406835200, "step": 6210, "train_runtime": 2764.2874, "train_tokens_per_second": 147175.436 }, { "epoch": 1.3457377758546083, "grad_norm": 0.5703125, "learning_rate": 0.0002388, "loss": 0.0212, "num_input_tokens_seen": 407490560, "step": 6220, "train_runtime": 2768.0945, "train_tokens_per_second": 147209.774 }, { "epoch": 1.3479013414106447, "grad_norm": 0.52734375, "learning_rate": 0.00023816842105263154, "loss": 0.0212, "num_input_tokens_seen": 408145920, "step": 6230, "train_runtime": 2771.8982, "train_tokens_per_second": 147244.197 }, { "epoch": 1.350064906966681, "grad_norm": 0.296875, "learning_rate": 0.00023753684210526312, "loss": 0.0195, "num_input_tokens_seen": 408801280, "step": 6240, "train_runtime": 2775.6897, "train_tokens_per_second": 147279.173 }, { "epoch": 1.3522284725227174, "grad_norm": 0.416015625, "learning_rate": 0.0002369052631578947, "loss": 0.0226, "num_input_tokens_seen": 409456640, "step": 6250, "train_runtime": 2779.506, "train_tokens_per_second": 147312.741 }, { "epoch": 1.3543920380787537, "grad_norm": 0.490234375, "learning_rate": 0.0002362736842105263, "loss": 0.022, "num_input_tokens_seen": 410112000, "step": 6260, "train_runtime": 2785.5514, "train_tokens_per_second": 147228.3 }, { "epoch": 1.35655560363479, "grad_norm": 0.453125, "learning_rate": 0.00023564210526315788, "loss": 0.0229, "num_input_tokens_seen": 410767360, "step": 6270, "train_runtime": 2789.3661, "train_tokens_per_second": 147261.903 }, { "epoch": 1.3587191691908265, "grad_norm": 0.30859375, "learning_rate": 0.00023501052631578947, "loss": 0.0214, "num_input_tokens_seen": 411422720, "step": 6280, "train_runtime": 2793.186, "train_tokens_per_second": 147295.136 }, { "epoch": 1.3608827347468628, "grad_norm": 0.439453125, "learning_rate": 0.00023437894736842105, "loss": 0.0226, "num_input_tokens_seen": 412078080, "step": 6290, "train_runtime": 2797.0053, "train_tokens_per_second": 147328.315 }, { "epoch": 1.3630463003028992, "grad_norm": 0.43359375, "learning_rate": 0.00023374736842105264, "loss": 0.0213, "num_input_tokens_seen": 412733440, "step": 6300, "train_runtime": 2800.8201, "train_tokens_per_second": 147361.637 }, { "epoch": 1.3630463003028992, "eval_loss": 0.011968844570219517, "eval_runtime": 2.2624, "eval_samples_per_second": 14.144, "eval_steps_per_second": 0.442, "num_input_tokens_seen": 412733440, "step": 6300 }, { "epoch": 1.3630463003028992, "eval_byte_accuracy": 0.9970326409495549, "eval_chrf": 97.74613558028801, "eval_sacrebleu": 97.11458584551553, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 412733440, "perplexity": 1.0120407578101336, "step": 6300 }, { "epoch": 1.3652098658589356, "grad_norm": 0.4375, "learning_rate": 0.00023311578947368417, "loss": 0.0218, "num_input_tokens_seen": 413388800, "step": 6310, "train_runtime": 2808.9578, "train_tokens_per_second": 147168.037 }, { "epoch": 1.367373431414972, "grad_norm": 0.384765625, "learning_rate": 0.00023248421052631576, "loss": 0.02, "num_input_tokens_seen": 414044160, "step": 6320, "train_runtime": 2812.7793, "train_tokens_per_second": 147201.084 }, { "epoch": 1.3695369969710083, "grad_norm": 0.486328125, "learning_rate": 0.00023185263157894735, "loss": 0.0223, "num_input_tokens_seen": 414699520, "step": 6330, "train_runtime": 2816.5902, "train_tokens_per_second": 147234.595 }, { "epoch": 1.3717005625270446, "grad_norm": 0.54296875, "learning_rate": 0.00023122105263157893, "loss": 0.0209, "num_input_tokens_seen": 415354880, "step": 6340, "train_runtime": 2820.4056, "train_tokens_per_second": 147267.781 }, { "epoch": 1.3738641280830808, "grad_norm": 0.48828125, "learning_rate": 0.00023058947368421052, "loss": 0.0232, "num_input_tokens_seen": 416010240, "step": 6350, "train_runtime": 2824.215, "train_tokens_per_second": 147301.193 }, { "epoch": 1.3760276936391174, "grad_norm": 0.5859375, "learning_rate": 0.0002299578947368421, "loss": 0.0213, "num_input_tokens_seen": 416665600, "step": 6360, "train_runtime": 2830.3894, "train_tokens_per_second": 147211.405 }, { "epoch": 1.3781912591951535, "grad_norm": 0.52734375, "learning_rate": 0.0002293263157894737, "loss": 0.0228, "num_input_tokens_seen": 417320960, "step": 6370, "train_runtime": 2834.2046, "train_tokens_per_second": 147244.473 }, { "epoch": 1.38035482475119, "grad_norm": 0.57421875, "learning_rate": 0.00022869473684210522, "loss": 0.0231, "num_input_tokens_seen": 417976320, "step": 6380, "train_runtime": 2838.014, "train_tokens_per_second": 147277.752 }, { "epoch": 1.3825183903072262, "grad_norm": 0.470703125, "learning_rate": 0.0002280631578947368, "loss": 0.0196, "num_input_tokens_seen": 418631680, "step": 6390, "train_runtime": 2841.8354, "train_tokens_per_second": 147310.319 }, { "epoch": 1.3846819558632626, "grad_norm": 0.390625, "learning_rate": 0.0002274315789473684, "loss": 0.0215, "num_input_tokens_seen": 419287040, "step": 6400, "train_runtime": 2845.6548, "train_tokens_per_second": 147342.904 }, { "epoch": 1.3846819558632626, "eval_loss": 0.012704423628747463, "eval_runtime": 2.187, "eval_samples_per_second": 14.632, "eval_steps_per_second": 0.457, "num_input_tokens_seen": 419287040, "step": 6400 }, { "epoch": 1.3846819558632626, "eval_byte_accuracy": 0.9970326409495549, "eval_chrf": 98.23489619471393, "eval_sacrebleu": 97.07371933981932, "eval_word_accuracy": 0.9931318681318682, "num_input_tokens_seen": 419287040, "perplexity": 1.012785467660862, "step": 6400 }, { "epoch": 1.386845521419299, "grad_norm": 0.490234375, "learning_rate": 0.00022679999999999998, "loss": 0.0223, "num_input_tokens_seen": 419942400, "step": 6410, "train_runtime": 2853.8879, "train_tokens_per_second": 147147.474 }, { "epoch": 1.3890090869753353, "grad_norm": 0.50390625, "learning_rate": 0.00022616842105263157, "loss": 0.0205, "num_input_tokens_seen": 420597760, "step": 6420, "train_runtime": 2857.696, "train_tokens_per_second": 147180.72 }, { "epoch": 1.3911726525313717, "grad_norm": 0.4296875, "learning_rate": 0.00022553684210526316, "loss": 0.0225, "num_input_tokens_seen": 421253120, "step": 6430, "train_runtime": 2861.5115, "train_tokens_per_second": 147213.5 }, { "epoch": 1.393336218087408, "grad_norm": 0.357421875, "learning_rate": 0.00022490526315789474, "loss": 0.0205, "num_input_tokens_seen": 421908480, "step": 6440, "train_runtime": 2865.3128, "train_tokens_per_second": 147246.916 }, { "epoch": 1.3954997836434444, "grad_norm": 0.62109375, "learning_rate": 0.00022427368421052627, "loss": 0.0231, "num_input_tokens_seen": 422563840, "step": 6450, "train_runtime": 2869.1293, "train_tokens_per_second": 147279.467 }, { "epoch": 1.3976633491994808, "grad_norm": 0.44921875, "learning_rate": 0.00022364210526315786, "loss": 0.0212, "num_input_tokens_seen": 423219200, "step": 6460, "train_runtime": 2875.1773, "train_tokens_per_second": 147197.602 }, { "epoch": 1.3998269147555171, "grad_norm": 0.439453125, "learning_rate": 0.00022301052631578945, "loss": 0.0221, "num_input_tokens_seen": 423874560, "step": 6470, "train_runtime": 2878.9924, "train_tokens_per_second": 147230.179 }, { "epoch": 1.4019904803115535, "grad_norm": 0.60546875, "learning_rate": 0.00022237894736842103, "loss": 0.0195, "num_input_tokens_seen": 424529920, "step": 6480, "train_runtime": 2882.8075, "train_tokens_per_second": 147262.669 }, { "epoch": 1.4041540458675899, "grad_norm": 0.546875, "learning_rate": 0.00022174736842105262, "loss": 0.0224, "num_input_tokens_seen": 425185280, "step": 6490, "train_runtime": 2886.6208, "train_tokens_per_second": 147295.164 }, { "epoch": 1.4063176114236262, "grad_norm": 0.35546875, "learning_rate": 0.0002211157894736842, "loss": 0.0237, "num_input_tokens_seen": 425840640, "step": 6500, "train_runtime": 2890.4362, "train_tokens_per_second": 147327.464 }, { "epoch": 1.4063176114236262, "eval_loss": 0.013664661906659603, "eval_runtime": 2.0687, "eval_samples_per_second": 15.468, "eval_steps_per_second": 0.483, "num_input_tokens_seen": 425840640, "step": 6500 }, { "epoch": 1.4063176114236262, "eval_byte_accuracy": 0.9970326409495549, "eval_chrf": 98.07622351272572, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 425840640, "perplexity": 1.0137584501069554, "step": 6500 }, { "epoch": 1.4084811769796626, "grad_norm": 0.5390625, "learning_rate": 0.0002204842105263158, "loss": 0.0211, "num_input_tokens_seen": 426496000, "step": 6510, "train_runtime": 2898.7837, "train_tokens_per_second": 147129.293 }, { "epoch": 1.4106447425356987, "grad_norm": 0.46875, "learning_rate": 0.00021985263157894733, "loss": 0.0213, "num_input_tokens_seen": 427151360, "step": 6520, "train_runtime": 2902.6053, "train_tokens_per_second": 147161.368 }, { "epoch": 1.4128083080917353, "grad_norm": 0.423828125, "learning_rate": 0.0002192210526315789, "loss": 0.0216, "num_input_tokens_seen": 427806720, "step": 6530, "train_runtime": 2906.4268, "train_tokens_per_second": 147193.358 }, { "epoch": 1.4149718736477714, "grad_norm": 0.5078125, "learning_rate": 0.0002185894736842105, "loss": 0.0191, "num_input_tokens_seen": 428462080, "step": 6540, "train_runtime": 2910.2357, "train_tokens_per_second": 147225.906 }, { "epoch": 1.4171354392038078, "grad_norm": 0.482421875, "learning_rate": 0.00021795789473684208, "loss": 0.0191, "num_input_tokens_seen": 429117440, "step": 6550, "train_runtime": 2914.0494, "train_tokens_per_second": 147258.121 }, { "epoch": 1.4192990047598442, "grad_norm": 0.44140625, "learning_rate": 0.00021732631578947367, "loss": 0.0198, "num_input_tokens_seen": 429772800, "step": 6560, "train_runtime": 2920.0844, "train_tokens_per_second": 147178.211 }, { "epoch": 1.4214625703158805, "grad_norm": 0.408203125, "learning_rate": 0.00021669473684210526, "loss": 0.0212, "num_input_tokens_seen": 430428160, "step": 6570, "train_runtime": 2923.8842, "train_tokens_per_second": 147211.081 }, { "epoch": 1.4236261358719169, "grad_norm": 0.306640625, "learning_rate": 0.00021606315789473684, "loss": 0.0198, "num_input_tokens_seen": 431083520, "step": 6580, "train_runtime": 2927.6985, "train_tokens_per_second": 147243.14 }, { "epoch": 1.4257897014279532, "grad_norm": 0.34765625, "learning_rate": 0.00021543157894736838, "loss": 0.0209, "num_input_tokens_seen": 431738880, "step": 6590, "train_runtime": 2931.5141, "train_tokens_per_second": 147275.046 }, { "epoch": 1.4279532669839896, "grad_norm": 0.50390625, "learning_rate": 0.00021479999999999996, "loss": 0.0207, "num_input_tokens_seen": 432394240, "step": 6600, "train_runtime": 2935.3272, "train_tokens_per_second": 147307.0 }, { "epoch": 1.4279532669839896, "eval_loss": 0.013251814059913158, "eval_runtime": 2.1468, "eval_samples_per_second": 14.906, "eval_steps_per_second": 0.466, "num_input_tokens_seen": 432394240, "step": 6600 }, { "epoch": 1.4279532669839896, "eval_byte_accuracy": 0.9970326409495549, "eval_chrf": 97.51098357963586, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 432394240, "perplexity": 1.01334000849601, "step": 6600 }, { "epoch": 1.430116832540026, "grad_norm": 0.4921875, "learning_rate": 0.00021416842105263155, "loss": 0.0197, "num_input_tokens_seen": 433049600, "step": 6610, "train_runtime": 2943.524, "train_tokens_per_second": 147119.439 }, { "epoch": 1.4322803980960623, "grad_norm": 0.400390625, "learning_rate": 0.00021353684210526314, "loss": 0.0206, "num_input_tokens_seen": 433704960, "step": 6620, "train_runtime": 2947.3456, "train_tokens_per_second": 147151.037 }, { "epoch": 1.4344439636520987, "grad_norm": 0.47265625, "learning_rate": 0.00021290526315789472, "loss": 0.0256, "num_input_tokens_seen": 434360320, "step": 6630, "train_runtime": 2951.1633, "train_tokens_per_second": 147182.748 }, { "epoch": 1.436607529208135, "grad_norm": 0.419921875, "learning_rate": 0.0002122736842105263, "loss": 0.0204, "num_input_tokens_seen": 435015680, "step": 6640, "train_runtime": 2954.9846, "train_tokens_per_second": 147214.194 }, { "epoch": 1.4387710947641714, "grad_norm": 0.4375, "learning_rate": 0.0002116421052631579, "loss": 0.0198, "num_input_tokens_seen": 435671040, "step": 6650, "train_runtime": 2958.7979, "train_tokens_per_second": 147245.961 }, { "epoch": 1.4409346603202078, "grad_norm": 0.4609375, "learning_rate": 0.00021101052631578945, "loss": 0.0188, "num_input_tokens_seen": 436326400, "step": 6660, "train_runtime": 2964.8365, "train_tokens_per_second": 147167.102 }, { "epoch": 1.443098225876244, "grad_norm": 0.349609375, "learning_rate": 0.000210378947368421, "loss": 0.0205, "num_input_tokens_seen": 436981760, "step": 6670, "train_runtime": 2968.6512, "train_tokens_per_second": 147198.754 }, { "epoch": 1.4452617914322805, "grad_norm": 0.59375, "learning_rate": 0.0002097473684210526, "loss": 0.0214, "num_input_tokens_seen": 437637120, "step": 6680, "train_runtime": 2972.4583, "train_tokens_per_second": 147230.699 }, { "epoch": 1.4474253569883166, "grad_norm": 0.4921875, "learning_rate": 0.00020911578947368419, "loss": 0.0209, "num_input_tokens_seen": 438292480, "step": 6690, "train_runtime": 2976.2746, "train_tokens_per_second": 147262.113 }, { "epoch": 1.4495889225443532, "grad_norm": 0.5078125, "learning_rate": 0.00020848421052631577, "loss": 0.0219, "num_input_tokens_seen": 438947840, "step": 6700, "train_runtime": 2980.0894, "train_tokens_per_second": 147293.516 }, { "epoch": 1.4495889225443532, "eval_loss": 0.012278531678020954, "eval_runtime": 2.2, "eval_samples_per_second": 14.546, "eval_steps_per_second": 0.455, "num_input_tokens_seen": 438947840, "step": 6700 }, { "epoch": 1.4495889225443532, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 97.32826860603707, "eval_sacrebleu": 95.31161095839255, "eval_word_accuracy": 0.989010989010989, "num_input_tokens_seen": 438947840, "perplexity": 1.0123542223208506, "step": 6700 }, { "epoch": 1.4517524881003894, "grad_norm": 0.3515625, "learning_rate": 0.00020785263157894736, "loss": 0.0211, "num_input_tokens_seen": 439603200, "step": 6710, "train_runtime": 2988.5337, "train_tokens_per_second": 147096.618 }, { "epoch": 1.4539160536564257, "grad_norm": 0.404296875, "learning_rate": 0.00020722105263157895, "loss": 0.0186, "num_input_tokens_seen": 440258560, "step": 6720, "train_runtime": 2992.3565, "train_tokens_per_second": 147127.71 }, { "epoch": 1.456079619212462, "grad_norm": 0.36328125, "learning_rate": 0.0002065894736842105, "loss": 0.0216, "num_input_tokens_seen": 440913920, "step": 6730, "train_runtime": 2996.1679, "train_tokens_per_second": 147159.282 }, { "epoch": 1.4582431847684985, "grad_norm": 0.462890625, "learning_rate": 0.00020595789473684206, "loss": 0.0203, "num_input_tokens_seen": 441569280, "step": 6740, "train_runtime": 2999.9752, "train_tokens_per_second": 147190.979 }, { "epoch": 1.4604067503245348, "grad_norm": 0.439453125, "learning_rate": 0.00020532631578947365, "loss": 0.0225, "num_input_tokens_seen": 442224640, "step": 6750, "train_runtime": 3003.779, "train_tokens_per_second": 147222.762 }, { "epoch": 1.4625703158805712, "grad_norm": 0.57421875, "learning_rate": 0.00020469473684210524, "loss": 0.0198, "num_input_tokens_seen": 442880000, "step": 6760, "train_runtime": 3009.7963, "train_tokens_per_second": 147146.172 }, { "epoch": 1.4647338814366075, "grad_norm": 0.412109375, "learning_rate": 0.00020406315789473682, "loss": 0.0219, "num_input_tokens_seen": 443535360, "step": 6770, "train_runtime": 3013.6084, "train_tokens_per_second": 147177.502 }, { "epoch": 1.466897446992644, "grad_norm": 0.330078125, "learning_rate": 0.0002034315789473684, "loss": 0.0202, "num_input_tokens_seen": 444190720, "step": 6780, "train_runtime": 3017.3963, "train_tokens_per_second": 147209.935 }, { "epoch": 1.4690610125486803, "grad_norm": 0.359375, "learning_rate": 0.0002028, "loss": 0.0213, "num_input_tokens_seen": 444846080, "step": 6790, "train_runtime": 3021.2137, "train_tokens_per_second": 147240.854 }, { "epoch": 1.4712245781047166, "grad_norm": 0.380859375, "learning_rate": 0.00020216842105263156, "loss": 0.0192, "num_input_tokens_seen": 445497344, "step": 6800, "train_runtime": 3025.0076, "train_tokens_per_second": 147271.478 }, { "epoch": 1.4712245781047166, "eval_loss": 0.011801213026046753, "eval_runtime": 2.0912, "eval_samples_per_second": 15.302, "eval_steps_per_second": 0.478, "num_input_tokens_seen": 445497344, "step": 6800 }, { "epoch": 1.4712245781047166, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.91173752160901, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 445497344, "perplexity": 1.0118711220736836, "step": 6800 }, { "epoch": 1.473388143660753, "grad_norm": 0.40625, "learning_rate": 0.00020153684210526314, "loss": 0.0203, "num_input_tokens_seen": 446152704, "step": 6810, "train_runtime": 3033.2934, "train_tokens_per_second": 147085.246 }, { "epoch": 1.4755517092167891, "grad_norm": 0.44140625, "learning_rate": 0.0002009052631578947, "loss": 0.0204, "num_input_tokens_seen": 446808064, "step": 6820, "train_runtime": 3037.1092, "train_tokens_per_second": 147116.232 }, { "epoch": 1.4777152747728257, "grad_norm": 0.484375, "learning_rate": 0.0002002736842105263, "loss": 0.0207, "num_input_tokens_seen": 447463424, "step": 6830, "train_runtime": 3040.9219, "train_tokens_per_second": 147147.292 }, { "epoch": 1.4798788403288619, "grad_norm": 0.384765625, "learning_rate": 0.00019964210526315787, "loss": 0.0219, "num_input_tokens_seen": 448118784, "step": 6840, "train_runtime": 3044.7287, "train_tokens_per_second": 147178.561 }, { "epoch": 1.4820424058848984, "grad_norm": 0.5234375, "learning_rate": 0.00019901052631578946, "loss": 0.0223, "num_input_tokens_seen": 448774144, "step": 6850, "train_runtime": 3048.5411, "train_tokens_per_second": 147209.476 }, { "epoch": 1.4842059714409346, "grad_norm": 0.30859375, "learning_rate": 0.00019837894736842105, "loss": 0.0214, "num_input_tokens_seen": 449429504, "step": 6860, "train_runtime": 3054.5882, "train_tokens_per_second": 147132.6 }, { "epoch": 1.486369536996971, "grad_norm": 0.50390625, "learning_rate": 0.0001977473684210526, "loss": 0.0488, "num_input_tokens_seen": 450084864, "step": 6870, "train_runtime": 3058.3992, "train_tokens_per_second": 147163.546 }, { "epoch": 1.4885331025530073, "grad_norm": 0.322265625, "learning_rate": 0.0001971157894736842, "loss": 0.02, "num_input_tokens_seen": 450740224, "step": 6880, "train_runtime": 3062.2077, "train_tokens_per_second": 147194.528 }, { "epoch": 1.4906966681090437, "grad_norm": 0.55078125, "learning_rate": 0.00019648421052631578, "loss": 0.0216, "num_input_tokens_seen": 451395584, "step": 6890, "train_runtime": 3066.0215, "train_tokens_per_second": 147225.186 }, { "epoch": 1.49286023366508, "grad_norm": 0.400390625, "learning_rate": 0.00019585263157894734, "loss": 0.0206, "num_input_tokens_seen": 452050944, "step": 6900, "train_runtime": 3069.8357, "train_tokens_per_second": 147255.746 }, { "epoch": 1.49286023366508, "eval_loss": 0.010991868562996387, "eval_runtime": 2.029, "eval_samples_per_second": 15.771, "eval_steps_per_second": 0.493, "num_input_tokens_seen": 452050944, "step": 6900 }, { "epoch": 1.49286023366508, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 97.10549441712193, "eval_sacrebleu": 96.36994641700896, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 452050944, "perplexity": 1.0110525011015739, "step": 6900 }, { "epoch": 1.4950237992211164, "grad_norm": 0.38671875, "learning_rate": 0.00019522105263157892, "loss": 0.0202, "num_input_tokens_seen": 452706304, "step": 6910, "train_runtime": 3078.111, "train_tokens_per_second": 147072.767 }, { "epoch": 1.4971873647771528, "grad_norm": 0.4921875, "learning_rate": 0.0001945894736842105, "loss": 0.0227, "num_input_tokens_seen": 453361664, "step": 6920, "train_runtime": 3081.8971, "train_tokens_per_second": 147104.737 }, { "epoch": 1.4993509303331891, "grad_norm": 0.51171875, "learning_rate": 0.0001939578947368421, "loss": 0.0204, "num_input_tokens_seen": 454017024, "step": 6930, "train_runtime": 3085.7057, "train_tokens_per_second": 147135.556 }, { "epoch": 1.5015144958892255, "grad_norm": 0.392578125, "learning_rate": 0.00019332631578947366, "loss": 0.0195, "num_input_tokens_seen": 454672384, "step": 6940, "train_runtime": 3089.5175, "train_tokens_per_second": 147166.147 }, { "epoch": 1.5036780614452618, "grad_norm": 0.3671875, "learning_rate": 0.00019269473684210524, "loss": 0.0202, "num_input_tokens_seen": 455327744, "step": 6950, "train_runtime": 3093.3221, "train_tokens_per_second": 147197.004 }, { "epoch": 1.5058416270012982, "grad_norm": 0.4375, "learning_rate": 0.00019206315789473683, "loss": 0.0201, "num_input_tokens_seen": 455983104, "step": 6960, "train_runtime": 3099.3571, "train_tokens_per_second": 147121.833 }, { "epoch": 1.5080051925573343, "grad_norm": 0.4453125, "learning_rate": 0.0001914315789473684, "loss": 0.0191, "num_input_tokens_seen": 456638464, "step": 6970, "train_runtime": 3103.1587, "train_tokens_per_second": 147152.793 }, { "epoch": 1.510168758113371, "grad_norm": 0.439453125, "learning_rate": 0.00019079999999999998, "loss": 0.019, "num_input_tokens_seen": 457293824, "step": 6980, "train_runtime": 3106.9748, "train_tokens_per_second": 147182.983 }, { "epoch": 1.512332323669407, "grad_norm": 0.341796875, "learning_rate": 0.00019016842105263156, "loss": 0.0198, "num_input_tokens_seen": 457949184, "step": 6990, "train_runtime": 3110.7791, "train_tokens_per_second": 147213.661 }, { "epoch": 1.5144958892254436, "grad_norm": 0.376953125, "learning_rate": 0.00018953684210526315, "loss": 0.0216, "num_input_tokens_seen": 458604544, "step": 7000, "train_runtime": 3114.5883, "train_tokens_per_second": 147244.034 }, { "epoch": 1.5144958892254436, "eval_loss": 0.010852305218577385, "eval_runtime": 2.1629, "eval_samples_per_second": 14.795, "eval_steps_per_second": 0.462, "num_input_tokens_seen": 458604544, "step": 7000 }, { "epoch": 1.5144958892254436, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 98.28090039473334, "eval_sacrebleu": 97.54443059083923, "eval_word_accuracy": 0.9931318681318682, "num_input_tokens_seen": 458604544, "perplexity": 1.0109114050792825, "step": 7000 }, { "epoch": 1.5166594547814798, "grad_norm": 0.4296875, "learning_rate": 0.0001889052631578947, "loss": 0.02, "num_input_tokens_seen": 459259904, "step": 7010, "train_runtime": 3122.8668, "train_tokens_per_second": 147063.557 }, { "epoch": 1.5188230203375164, "grad_norm": 0.4609375, "learning_rate": 0.0001882736842105263, "loss": 0.0196, "num_input_tokens_seen": 459915264, "step": 7020, "train_runtime": 3126.6547, "train_tokens_per_second": 147094.997 }, { "epoch": 1.5209865858935525, "grad_norm": 0.33984375, "learning_rate": 0.00018764210526315788, "loss": 0.0201, "num_input_tokens_seen": 460570624, "step": 7030, "train_runtime": 3130.458, "train_tokens_per_second": 147125.634 }, { "epoch": 1.5231501514495889, "grad_norm": 0.435546875, "learning_rate": 0.00018701052631578947, "loss": 0.0195, "num_input_tokens_seen": 461225984, "step": 7040, "train_runtime": 3134.2733, "train_tokens_per_second": 147155.635 }, { "epoch": 1.5253137170056252, "grad_norm": 0.400390625, "learning_rate": 0.00018637894736842103, "loss": 0.0209, "num_input_tokens_seen": 461881344, "step": 7050, "train_runtime": 3138.0781, "train_tokens_per_second": 147186.058 }, { "epoch": 1.5274772825616616, "grad_norm": 0.451171875, "learning_rate": 0.0001857473684210526, "loss": 0.0205, "num_input_tokens_seen": 462536704, "step": 7060, "train_runtime": 3144.1331, "train_tokens_per_second": 147111.046 }, { "epoch": 1.529640848117698, "grad_norm": 0.486328125, "learning_rate": 0.0001851157894736842, "loss": 0.0209, "num_input_tokens_seen": 463192064, "step": 7070, "train_runtime": 3147.9443, "train_tokens_per_second": 147141.124 }, { "epoch": 1.5318044136737343, "grad_norm": 0.47265625, "learning_rate": 0.00018448421052631579, "loss": 0.0198, "num_input_tokens_seen": 463847424, "step": 7080, "train_runtime": 3151.7527, "train_tokens_per_second": 147171.262 }, { "epoch": 1.5339679792297707, "grad_norm": 0.37890625, "learning_rate": 0.00018385263157894735, "loss": 0.0201, "num_input_tokens_seen": 464502784, "step": 7090, "train_runtime": 3155.5665, "train_tokens_per_second": 147201.077 }, { "epoch": 1.536131544785807, "grad_norm": 0.412109375, "learning_rate": 0.00018322105263157893, "loss": 0.019, "num_input_tokens_seen": 465158144, "step": 7100, "train_runtime": 3159.3692, "train_tokens_per_second": 147231.333 }, { "epoch": 1.536131544785807, "eval_loss": 0.011366115882992744, "eval_runtime": 2.0634, "eval_samples_per_second": 15.508, "eval_steps_per_second": 0.485, "num_input_tokens_seen": 465158144, "step": 7100 }, { "epoch": 1.536131544785807, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 98.23489619471393, "eval_sacrebleu": 97.07371933981932, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 465158144, "perplexity": 1.0114309556038612, "step": 7100 }, { "epoch": 1.5382951103418434, "grad_norm": 0.4609375, "learning_rate": 0.00018258947368421052, "loss": 0.0198, "num_input_tokens_seen": 465813504, "step": 7110, "train_runtime": 3167.5112, "train_tokens_per_second": 147059.78 }, { "epoch": 1.5404586758978795, "grad_norm": 0.396484375, "learning_rate": 0.0001819578947368421, "loss": 0.0212, "num_input_tokens_seen": 466468864, "step": 7120, "train_runtime": 3171.3238, "train_tokens_per_second": 147089.636 }, { "epoch": 1.5426222414539161, "grad_norm": 0.380859375, "learning_rate": 0.00018132631578947366, "loss": 0.0207, "num_input_tokens_seen": 467124224, "step": 7130, "train_runtime": 3175.134, "train_tokens_per_second": 147119.53 }, { "epoch": 1.5447858070099523, "grad_norm": 0.349609375, "learning_rate": 0.00018069473684210525, "loss": 0.0202, "num_input_tokens_seen": 467779584, "step": 7140, "train_runtime": 3178.9275, "train_tokens_per_second": 147150.128 }, { "epoch": 1.5469493725659889, "grad_norm": 0.396484375, "learning_rate": 0.00018006315789473684, "loss": 0.0197, "num_input_tokens_seen": 468434944, "step": 7150, "train_runtime": 3182.7383, "train_tokens_per_second": 147179.848 }, { "epoch": 1.549112938122025, "grad_norm": 0.384765625, "learning_rate": 0.0001794315789473684, "loss": 0.0192, "num_input_tokens_seen": 469090304, "step": 7160, "train_runtime": 3188.9696, "train_tokens_per_second": 147097.768 }, { "epoch": 1.5512765036780616, "grad_norm": 0.4140625, "learning_rate": 0.00017879999999999998, "loss": 0.0219, "num_input_tokens_seen": 469745664, "step": 7170, "train_runtime": 3192.7902, "train_tokens_per_second": 147127.004 }, { "epoch": 1.5534400692340977, "grad_norm": 0.482421875, "learning_rate": 0.00017816842105263157, "loss": 0.0185, "num_input_tokens_seen": 470401024, "step": 7180, "train_runtime": 3196.6084, "train_tokens_per_second": 147156.289 }, { "epoch": 1.5556036347901343, "grad_norm": 0.50390625, "learning_rate": 0.00017753684210526316, "loss": 0.0193, "num_input_tokens_seen": 471056384, "step": 7190, "train_runtime": 3200.4207, "train_tokens_per_second": 147185.77 }, { "epoch": 1.5577672003461704, "grad_norm": 0.443359375, "learning_rate": 0.00017690526315789471, "loss": 0.0213, "num_input_tokens_seen": 471711744, "step": 7200, "train_runtime": 3204.2436, "train_tokens_per_second": 147214.696 }, { "epoch": 1.5577672003461704, "eval_loss": 0.012076064012944698, "eval_runtime": 2.0546, "eval_samples_per_second": 15.575, "eval_steps_per_second": 0.487, "num_input_tokens_seen": 471711744, "step": 7200 }, { "epoch": 1.5577672003461704, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.89186504468623, "eval_sacrebleu": 96.96578008549908, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 471711744, "perplexity": 1.0121492740736238, "step": 7200 }, { "epoch": 1.5599307659022068, "grad_norm": 0.4375, "learning_rate": 0.0001762736842105263, "loss": 0.0238, "num_input_tokens_seen": 472367104, "step": 7210, "train_runtime": 3212.3634, "train_tokens_per_second": 147046.598 }, { "epoch": 1.5620943314582432, "grad_norm": 0.412109375, "learning_rate": 0.0001756421052631579, "loss": 0.019, "num_input_tokens_seen": 473022464, "step": 7220, "train_runtime": 3216.1804, "train_tokens_per_second": 147075.85 }, { "epoch": 1.5642578970142795, "grad_norm": 0.390625, "learning_rate": 0.00017501052631578945, "loss": 0.0176, "num_input_tokens_seen": 473673728, "step": 7230, "train_runtime": 3219.9613, "train_tokens_per_second": 147105.412 }, { "epoch": 1.566421462570316, "grad_norm": 0.33203125, "learning_rate": 0.00017437894736842103, "loss": 0.0204, "num_input_tokens_seen": 474329088, "step": 7240, "train_runtime": 3223.7734, "train_tokens_per_second": 147134.751 }, { "epoch": 1.5685850281263523, "grad_norm": 0.28515625, "learning_rate": 0.00017374736842105262, "loss": 0.0174, "num_input_tokens_seen": 474984448, "step": 7250, "train_runtime": 3227.5767, "train_tokens_per_second": 147164.42 }, { "epoch": 1.5707485936823886, "grad_norm": 0.3828125, "learning_rate": 0.0001731157894736842, "loss": 0.0202, "num_input_tokens_seen": 475639808, "step": 7260, "train_runtime": 3233.6382, "train_tokens_per_second": 147091.224 }, { "epoch": 1.572912159238425, "grad_norm": 0.43359375, "learning_rate": 0.0001724842105263158, "loss": 0.022, "num_input_tokens_seen": 476295168, "step": 7270, "train_runtime": 3237.4441, "train_tokens_per_second": 147120.739 }, { "epoch": 1.5750757247944613, "grad_norm": 0.4296875, "learning_rate": 0.00017185263157894735, "loss": 0.0214, "num_input_tokens_seen": 476950528, "step": 7280, "train_runtime": 3241.2576, "train_tokens_per_second": 147149.836 }, { "epoch": 1.5772392903504975, "grad_norm": 0.380859375, "learning_rate": 0.00017122105263157894, "loss": 0.0207, "num_input_tokens_seen": 477605888, "step": 7290, "train_runtime": 3245.0654, "train_tokens_per_second": 147179.126 }, { "epoch": 1.579402855906534, "grad_norm": 0.40625, "learning_rate": 0.0001705894736842105, "loss": 0.0201, "num_input_tokens_seen": 478261248, "step": 7300, "train_runtime": 3248.8769, "train_tokens_per_second": 147208.177 }, { "epoch": 1.579402855906534, "eval_loss": 0.011056940071284771, "eval_runtime": 2.0875, "eval_samples_per_second": 15.33, "eval_steps_per_second": 0.479, "num_input_tokens_seen": 478261248, "step": 7300 }, { "epoch": 1.579402855906534, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 97.51098357963586, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 478261248, "perplexity": 1.011118293953376, "step": 7300 }, { "epoch": 1.5815664214625702, "grad_norm": 0.396484375, "learning_rate": 0.00016995789473684208, "loss": 0.0197, "num_input_tokens_seen": 478916608, "step": 7310, "train_runtime": 3257.0094, "train_tokens_per_second": 147041.825 }, { "epoch": 1.5837299870186068, "grad_norm": 0.455078125, "learning_rate": 0.00016932631578947367, "loss": 0.0207, "num_input_tokens_seen": 479567872, "step": 7320, "train_runtime": 3260.806, "train_tokens_per_second": 147070.347 }, { "epoch": 1.585893552574643, "grad_norm": 0.396484375, "learning_rate": 0.00016869473684210526, "loss": 0.0188, "num_input_tokens_seen": 480223232, "step": 7330, "train_runtime": 3264.6033, "train_tokens_per_second": 147100.026 }, { "epoch": 1.5880571181306795, "grad_norm": 0.53125, "learning_rate": 0.00016806315789473684, "loss": 0.0233, "num_input_tokens_seen": 480878592, "step": 7340, "train_runtime": 3268.4136, "train_tokens_per_second": 147129.053 }, { "epoch": 1.5902206836867157, "grad_norm": 0.42578125, "learning_rate": 0.00016743157894736843, "loss": 0.0179, "num_input_tokens_seen": 481533952, "step": 7350, "train_runtime": 3272.2159, "train_tokens_per_second": 147158.367 }, { "epoch": 1.592384249242752, "grad_norm": 0.412109375, "learning_rate": 0.0001668, "loss": 0.0203, "num_input_tokens_seen": 482189312, "step": 7360, "train_runtime": 3278.2597, "train_tokens_per_second": 147086.979 }, { "epoch": 1.5945478147987884, "grad_norm": 0.37109375, "learning_rate": 0.00016616842105263155, "loss": 0.0207, "num_input_tokens_seen": 482844672, "step": 7370, "train_runtime": 3282.076, "train_tokens_per_second": 147115.627 }, { "epoch": 1.5967113803548247, "grad_norm": 0.392578125, "learning_rate": 0.00016553684210526313, "loss": 0.0202, "num_input_tokens_seen": 483500032, "step": 7380, "train_runtime": 3285.8809, "train_tokens_per_second": 147144.724 }, { "epoch": 1.598874945910861, "grad_norm": 0.54296875, "learning_rate": 0.00016490526315789472, "loss": 0.0206, "num_input_tokens_seen": 484155392, "step": 7390, "train_runtime": 3289.6598, "train_tokens_per_second": 147174.913 }, { "epoch": 1.6010385114668975, "grad_norm": 0.46484375, "learning_rate": 0.0001642736842105263, "loss": 0.0205, "num_input_tokens_seen": 484810752, "step": 7400, "train_runtime": 3293.4779, "train_tokens_per_second": 147203.282 }, { "epoch": 1.6010385114668975, "eval_loss": 0.012541618198156357, "eval_runtime": 2.0947, "eval_samples_per_second": 15.277, "eval_steps_per_second": 0.477, "num_input_tokens_seen": 484810752, "step": 7400 }, { "epoch": 1.6010385114668975, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.57226152603681, "eval_sacrebleu": 96.77994309150014, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 484810752, "perplexity": 1.0126205941082222, "step": 7400 }, { "epoch": 1.6032020770229338, "grad_norm": 0.359375, "learning_rate": 0.0001636421052631579, "loss": 0.0205, "num_input_tokens_seen": 485466112, "step": 7410, "train_runtime": 3301.6581, "train_tokens_per_second": 147037.062 }, { "epoch": 1.6053656425789702, "grad_norm": 0.361328125, "learning_rate": 0.00016301052631578948, "loss": 0.02, "num_input_tokens_seen": 486121472, "step": 7420, "train_runtime": 3305.4728, "train_tokens_per_second": 147065.64 }, { "epoch": 1.6075292081350065, "grad_norm": 0.5, "learning_rate": 0.00016237894736842104, "loss": 0.0204, "num_input_tokens_seen": 486776832, "step": 7430, "train_runtime": 3309.3, "train_tokens_per_second": 147093.596 }, { "epoch": 1.6096927736910427, "grad_norm": 0.466796875, "learning_rate": 0.0001617473684210526, "loss": 0.02, "num_input_tokens_seen": 487432192, "step": 7440, "train_runtime": 3313.1196, "train_tokens_per_second": 147121.821 }, { "epoch": 1.6118563392470793, "grad_norm": 0.3515625, "learning_rate": 0.00016111578947368419, "loss": 0.0191, "num_input_tokens_seen": 488087552, "step": 7450, "train_runtime": 3316.9304, "train_tokens_per_second": 147150.375 }, { "epoch": 1.6140199048031154, "grad_norm": 0.34765625, "learning_rate": 0.00016048421052631577, "loss": 0.0175, "num_input_tokens_seen": 488742912, "step": 7460, "train_runtime": 3323.1698, "train_tokens_per_second": 147071.304 }, { "epoch": 1.616183470359152, "grad_norm": 0.53515625, "learning_rate": 0.00015985263157894736, "loss": 0.0214, "num_input_tokens_seen": 489398272, "step": 7470, "train_runtime": 3326.9742, "train_tokens_per_second": 147100.111 }, { "epoch": 1.6183470359151881, "grad_norm": 0.396484375, "learning_rate": 0.00015922105263157894, "loss": 0.0184, "num_input_tokens_seen": 490053632, "step": 7480, "train_runtime": 3330.7823, "train_tokens_per_second": 147128.691 }, { "epoch": 1.6205106014712247, "grad_norm": 0.427734375, "learning_rate": 0.00015858947368421053, "loss": 0.0191, "num_input_tokens_seen": 490708992, "step": 7490, "train_runtime": 3334.5974, "train_tokens_per_second": 147156.895 }, { "epoch": 1.6226741670272609, "grad_norm": 0.3046875, "learning_rate": 0.00015795789473684212, "loss": 0.0193, "num_input_tokens_seen": 491364352, "step": 7500, "train_runtime": 3338.4043, "train_tokens_per_second": 147185.395 }, { "epoch": 1.6226741670272609, "eval_loss": 0.01195054966956377, "eval_runtime": 2.1304, "eval_samples_per_second": 15.021, "eval_steps_per_second": 0.469, "num_input_tokens_seen": 491364352, "step": 7500 }, { "epoch": 1.6226741670272609, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.51098357963586, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 491364352, "perplexity": 1.0120222427943755, "step": 7500 }, { "epoch": 1.6248377325832972, "grad_norm": 0.349609375, "learning_rate": 0.00015732631578947365, "loss": 0.0177, "num_input_tokens_seen": 492019712, "step": 7510, "train_runtime": 3346.6726, "train_tokens_per_second": 147017.581 }, { "epoch": 1.6270012981393336, "grad_norm": 0.306640625, "learning_rate": 0.00015669473684210524, "loss": 0.0198, "num_input_tokens_seen": 492675072, "step": 7520, "train_runtime": 3350.4863, "train_tokens_per_second": 147045.84 }, { "epoch": 1.62916486369537, "grad_norm": 0.404296875, "learning_rate": 0.00015606315789473682, "loss": 0.0191, "num_input_tokens_seen": 493330432, "step": 7530, "train_runtime": 3354.2991, "train_tokens_per_second": 147074.074 }, { "epoch": 1.6313284292514063, "grad_norm": 0.333984375, "learning_rate": 0.0001554315789473684, "loss": 0.0187, "num_input_tokens_seen": 493985792, "step": 7540, "train_runtime": 3358.1088, "train_tokens_per_second": 147102.38 }, { "epoch": 1.6334919948074427, "grad_norm": 0.546875, "learning_rate": 0.0001548, "loss": 0.0203, "num_input_tokens_seen": 494641152, "step": 7550, "train_runtime": 3361.9119, "train_tokens_per_second": 147130.909 }, { "epoch": 1.635655560363479, "grad_norm": 0.458984375, "learning_rate": 0.00015416842105263158, "loss": 0.0216, "num_input_tokens_seen": 495296512, "step": 7560, "train_runtime": 3367.9656, "train_tokens_per_second": 147061.037 }, { "epoch": 1.6378191259195154, "grad_norm": 0.470703125, "learning_rate": 0.00015353684210526317, "loss": 0.0283, "num_input_tokens_seen": 495951872, "step": 7570, "train_runtime": 3371.7831, "train_tokens_per_second": 147088.902 }, { "epoch": 1.6399826914755518, "grad_norm": 0.484375, "learning_rate": 0.0001529052631578947, "loss": 0.0193, "num_input_tokens_seen": 496603136, "step": 7580, "train_runtime": 3375.574, "train_tokens_per_second": 147116.648 }, { "epoch": 1.642146257031588, "grad_norm": 0.431640625, "learning_rate": 0.0001522736842105263, "loss": 0.0434, "num_input_tokens_seen": 497258496, "step": 7590, "train_runtime": 3379.386, "train_tokens_per_second": 147144.627 }, { "epoch": 1.6443098225876245, "grad_norm": 0.6640625, "learning_rate": 0.00015164210526315787, "loss": 0.0233, "num_input_tokens_seen": 497913856, "step": 7600, "train_runtime": 3383.1944, "train_tokens_per_second": 147172.7 }, { "epoch": 1.6443098225876245, "eval_loss": 0.012651835568249226, "eval_runtime": 2.1539, "eval_samples_per_second": 14.857, "eval_steps_per_second": 0.464, "num_input_tokens_seen": 497913856, "step": 7600 }, { "epoch": 1.6443098225876245, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 98.23489619471393, "eval_sacrebleu": 97.07371933981932, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 497913856, "perplexity": 1.0127322086378236, "step": 7600 }, { "epoch": 1.6464733881436606, "grad_norm": 0.482421875, "learning_rate": 0.00015101052631578946, "loss": 0.0188, "num_input_tokens_seen": 498569216, "step": 7610, "train_runtime": 3391.4129, "train_tokens_per_second": 147009.295 }, { "epoch": 1.6486369536996972, "grad_norm": 0.314453125, "learning_rate": 0.00015037894736842105, "loss": 0.0175, "num_input_tokens_seen": 499224576, "step": 7620, "train_runtime": 3395.2172, "train_tokens_per_second": 147037.597 }, { "epoch": 1.6508005192557333, "grad_norm": 0.345703125, "learning_rate": 0.0001497473684210526, "loss": 0.0206, "num_input_tokens_seen": 499879936, "step": 7630, "train_runtime": 3399.0365, "train_tokens_per_second": 147065.187 }, { "epoch": 1.65296408481177, "grad_norm": 0.45703125, "learning_rate": 0.0001491157894736842, "loss": 0.0221, "num_input_tokens_seen": 500535296, "step": 7640, "train_runtime": 3402.8462, "train_tokens_per_second": 147093.129 }, { "epoch": 1.655127650367806, "grad_norm": 0.53125, "learning_rate": 0.00014848421052631578, "loss": 0.0223, "num_input_tokens_seen": 501190656, "step": 7650, "train_runtime": 3406.6556, "train_tokens_per_second": 147121.024 }, { "epoch": 1.6572912159238427, "grad_norm": 0.388671875, "learning_rate": 0.00014785263157894736, "loss": 0.0185, "num_input_tokens_seen": 501846016, "step": 7660, "train_runtime": 3412.9006, "train_tokens_per_second": 147043.841 }, { "epoch": 1.6594547814798788, "grad_norm": 0.3046875, "learning_rate": 0.00014722105263157892, "loss": 0.019, "num_input_tokens_seen": 502501376, "step": 7670, "train_runtime": 3416.71, "train_tokens_per_second": 147071.709 }, { "epoch": 1.6616183470359152, "grad_norm": 0.298828125, "learning_rate": 0.0001465894736842105, "loss": 0.0196, "num_input_tokens_seen": 503156736, "step": 7680, "train_runtime": 3420.5206, "train_tokens_per_second": 147099.46 }, { "epoch": 1.6637819125919515, "grad_norm": 0.416015625, "learning_rate": 0.0001459578947368421, "loss": 0.0261, "num_input_tokens_seen": 503812096, "step": 7690, "train_runtime": 3424.3328, "train_tokens_per_second": 147127.084 }, { "epoch": 1.6659454781479879, "grad_norm": 0.412109375, "learning_rate": 0.00014532631578947368, "loss": 0.0201, "num_input_tokens_seen": 504467456, "step": 7700, "train_runtime": 3428.1363, "train_tokens_per_second": 147155.016 }, { "epoch": 1.6659454781479879, "eval_loss": 0.011731253936886787, "eval_runtime": 2.1236, "eval_samples_per_second": 15.069, "eval_steps_per_second": 0.471, "num_input_tokens_seen": 504467456, "step": 7700 }, { "epoch": 1.6659454781479879, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 98.23489619471393, "eval_sacrebleu": 97.07371933981932, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 504467456, "perplexity": 1.0118003349677656, "step": 7700 }, { "epoch": 1.6681090437040242, "grad_norm": 0.453125, "learning_rate": 0.00014469473684210524, "loss": 0.0218, "num_input_tokens_seen": 505122816, "step": 7710, "train_runtime": 3436.3206, "train_tokens_per_second": 146995.251 }, { "epoch": 1.6702726092600606, "grad_norm": 0.330078125, "learning_rate": 0.00014406315789473683, "loss": 0.0175, "num_input_tokens_seen": 505778176, "step": 7720, "train_runtime": 3440.1328, "train_tokens_per_second": 147022.862 }, { "epoch": 1.672436174816097, "grad_norm": 0.30859375, "learning_rate": 0.00014343157894736842, "loss": 0.0195, "num_input_tokens_seen": 506433536, "step": 7730, "train_runtime": 3443.9218, "train_tokens_per_second": 147051.402 }, { "epoch": 1.6745997403721333, "grad_norm": 0.5078125, "learning_rate": 0.00014279999999999997, "loss": 0.0192, "num_input_tokens_seen": 507088896, "step": 7740, "train_runtime": 3447.7441, "train_tokens_per_second": 147078.462 }, { "epoch": 1.6767633059281697, "grad_norm": 0.4375, "learning_rate": 0.00014216842105263156, "loss": 0.0175, "num_input_tokens_seen": 507744256, "step": 7750, "train_runtime": 3451.5673, "train_tokens_per_second": 147105.42 }, { "epoch": 1.6789268714842058, "grad_norm": 0.35546875, "learning_rate": 0.00014153684210526315, "loss": 0.0192, "num_input_tokens_seen": 508399616, "step": 7760, "train_runtime": 3457.6351, "train_tokens_per_second": 147036.804 }, { "epoch": 1.6810904370402424, "grad_norm": 0.46484375, "learning_rate": 0.00014090526315789473, "loss": 0.0178, "num_input_tokens_seen": 509054976, "step": 7770, "train_runtime": 3461.4502, "train_tokens_per_second": 147064.076 }, { "epoch": 1.6832540025962786, "grad_norm": 0.421875, "learning_rate": 0.0001402736842105263, "loss": 0.0194, "num_input_tokens_seen": 509710336, "step": 7780, "train_runtime": 3465.2701, "train_tokens_per_second": 147091.085 }, { "epoch": 1.6854175681523151, "grad_norm": 0.33203125, "learning_rate": 0.00013964210526315788, "loss": 0.0207, "num_input_tokens_seen": 510365696, "step": 7790, "train_runtime": 3469.0795, "train_tokens_per_second": 147118.476 }, { "epoch": 1.6875811337083513, "grad_norm": 0.44140625, "learning_rate": 0.00013901052631578947, "loss": 0.0214, "num_input_tokens_seen": 511021056, "step": 7800, "train_runtime": 3472.8918, "train_tokens_per_second": 147145.689 }, { "epoch": 1.6875811337083513, "eval_loss": 0.011290564201772213, "eval_runtime": 2.1435, "eval_samples_per_second": 14.929, "eval_steps_per_second": 0.467, "num_input_tokens_seen": 511021056, "step": 7800 }, { "epoch": 1.6875811337083513, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 98.23489619471393, "eval_sacrebleu": 97.07371933981932, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 511021056, "perplexity": 1.0113545431813067, "step": 7800 }, { "epoch": 1.6897446992643879, "grad_norm": 0.326171875, "learning_rate": 0.00013837894736842103, "loss": 0.0209, "num_input_tokens_seen": 511676416, "step": 7810, "train_runtime": 3481.1094, "train_tokens_per_second": 146986.596 }, { "epoch": 1.691908264820424, "grad_norm": 0.3671875, "learning_rate": 0.0001377473684210526, "loss": 0.0199, "num_input_tokens_seen": 512331776, "step": 7820, "train_runtime": 3484.9263, "train_tokens_per_second": 147013.66 }, { "epoch": 1.6940718303764604, "grad_norm": 0.37109375, "learning_rate": 0.0001371157894736842, "loss": 0.0195, "num_input_tokens_seen": 512987136, "step": 7830, "train_runtime": 3488.7424, "train_tokens_per_second": 147040.702 }, { "epoch": 1.6962353959324967, "grad_norm": 0.37109375, "learning_rate": 0.00013648421052631578, "loss": 0.0182, "num_input_tokens_seen": 513642496, "step": 7840, "train_runtime": 3492.5624, "train_tokens_per_second": 147067.522 }, { "epoch": 1.698398961488533, "grad_norm": 0.330078125, "learning_rate": 0.00013585263157894734, "loss": 0.0184, "num_input_tokens_seen": 514297856, "step": 7850, "train_runtime": 3496.381, "train_tokens_per_second": 147094.34 }, { "epoch": 1.7005625270445694, "grad_norm": 0.462890625, "learning_rate": 0.00013522105263157893, "loss": 0.0188, "num_input_tokens_seen": 514953216, "step": 7860, "train_runtime": 3502.2526, "train_tokens_per_second": 147034.859 }, { "epoch": 1.7027260926006058, "grad_norm": 0.486328125, "learning_rate": 0.00013458947368421052, "loss": 0.0187, "num_input_tokens_seen": 515608576, "step": 7870, "train_runtime": 3506.0684, "train_tokens_per_second": 147061.755 }, { "epoch": 1.7048896581566422, "grad_norm": 0.3046875, "learning_rate": 0.00013395789473684208, "loss": 0.0185, "num_input_tokens_seen": 516263936, "step": 7880, "train_runtime": 3509.8938, "train_tokens_per_second": 147088.194 }, { "epoch": 1.7070532237126785, "grad_norm": 0.44921875, "learning_rate": 0.00013332631578947366, "loss": 0.0201, "num_input_tokens_seen": 516919296, "step": 7890, "train_runtime": 3513.7148, "train_tokens_per_second": 147114.755 }, { "epoch": 1.709216789268715, "grad_norm": 0.458984375, "learning_rate": 0.00013269473684210525, "loss": 0.0178, "num_input_tokens_seen": 517574656, "step": 7900, "train_runtime": 3517.5325, "train_tokens_per_second": 147141.402 }, { "epoch": 1.709216789268715, "eval_loss": 0.010886249132454395, "eval_runtime": 2.4894, "eval_samples_per_second": 12.855, "eval_steps_per_second": 0.402, "num_input_tokens_seen": 517574656, "step": 7900 }, { "epoch": 1.709216789268715, "eval_byte_accuracy": 0.9970326409495549, "eval_chrf": 98.32407535469744, "eval_sacrebleu": 97.07371933981932, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 517574656, "perplexity": 1.010945719951341, "step": 7900 }, { "epoch": 1.711380354824751, "grad_norm": 0.3671875, "learning_rate": 0.00013206315789473684, "loss": 0.0193, "num_input_tokens_seen": 518230016, "step": 7910, "train_runtime": 3525.9923, "train_tokens_per_second": 146974.235 }, { "epoch": 1.7135439203807876, "grad_norm": 0.337890625, "learning_rate": 0.0001314315789473684, "loss": 0.0201, "num_input_tokens_seen": 518885376, "step": 7920, "train_runtime": 3529.8039, "train_tokens_per_second": 147001.191 }, { "epoch": 1.7157074859368238, "grad_norm": 0.404296875, "learning_rate": 0.00013079999999999998, "loss": 0.017, "num_input_tokens_seen": 519540736, "step": 7930, "train_runtime": 3533.6263, "train_tokens_per_second": 147027.639 }, { "epoch": 1.7178710514928603, "grad_norm": 0.396484375, "learning_rate": 0.00013016842105263157, "loss": 0.0197, "num_input_tokens_seen": 520196096, "step": 7940, "train_runtime": 3537.4337, "train_tokens_per_second": 147054.655 }, { "epoch": 1.7200346170488965, "grad_norm": 0.357421875, "learning_rate": 0.00012953684210526313, "loss": 0.0199, "num_input_tokens_seen": 520851456, "step": 7950, "train_runtime": 3541.2447, "train_tokens_per_second": 147081.465 }, { "epoch": 1.722198182604933, "grad_norm": 0.310546875, "learning_rate": 0.0001289052631578947, "loss": 0.0204, "num_input_tokens_seen": 521506816, "step": 7960, "train_runtime": 3547.2565, "train_tokens_per_second": 147016.947 }, { "epoch": 1.7243617481609692, "grad_norm": 0.326171875, "learning_rate": 0.0001282736842105263, "loss": 0.0181, "num_input_tokens_seen": 522162176, "step": 7970, "train_runtime": 3551.0783, "train_tokens_per_second": 147043.272 }, { "epoch": 1.7265253137170056, "grad_norm": 0.474609375, "learning_rate": 0.00012764210526315789, "loss": 0.0194, "num_input_tokens_seen": 522817536, "step": 7980, "train_runtime": 3554.8984, "train_tokens_per_second": 147069.614 }, { "epoch": 1.728688879273042, "grad_norm": 0.5625, "learning_rate": 0.00012701052631578945, "loss": 0.02, "num_input_tokens_seen": 523472896, "step": 7990, "train_runtime": 3558.7128, "train_tokens_per_second": 147096.132 }, { "epoch": 1.7308524448290783, "grad_norm": 0.421875, "learning_rate": 0.00012637894736842103, "loss": 0.0204, "num_input_tokens_seen": 524128256, "step": 8000, "train_runtime": 3562.5131, "train_tokens_per_second": 147123.179 }, { "epoch": 1.7308524448290783, "eval_loss": 0.011375652626156807, "eval_runtime": 2.1623, "eval_samples_per_second": 14.799, "eval_steps_per_second": 0.462, "num_input_tokens_seen": 524128256, "step": 8000 }, { "epoch": 1.7308524448290783, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 98.23489619471393, "eval_sacrebleu": 97.07371933981932, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 524128256, "perplexity": 1.0114406014071076, "step": 8000 }, { "epoch": 1.7330160103851147, "grad_norm": 0.478515625, "learning_rate": 0.00012574736842105262, "loss": 0.0184, "num_input_tokens_seen": 524783616, "step": 8010, "train_runtime": 3570.7811, "train_tokens_per_second": 146966.057 }, { "epoch": 1.735179575941151, "grad_norm": 0.296875, "learning_rate": 0.0001251157894736842, "loss": 0.0207, "num_input_tokens_seen": 525438976, "step": 8020, "train_runtime": 3575.036, "train_tokens_per_second": 146974.456 }, { "epoch": 1.7373431414971874, "grad_norm": 0.427734375, "learning_rate": 0.00012448421052631576, "loss": 0.0203, "num_input_tokens_seen": 526094336, "step": 8030, "train_runtime": 3578.8478, "train_tokens_per_second": 147001.035 }, { "epoch": 1.7395067070532237, "grad_norm": 0.310546875, "learning_rate": 0.00012385263157894735, "loss": 0.0193, "num_input_tokens_seen": 526749696, "step": 8040, "train_runtime": 3582.657, "train_tokens_per_second": 147027.666 }, { "epoch": 1.74167027260926, "grad_norm": 0.353515625, "learning_rate": 0.00012322105263157894, "loss": 0.0193, "num_input_tokens_seen": 527405056, "step": 8050, "train_runtime": 3586.4702, "train_tokens_per_second": 147054.076 }, { "epoch": 1.7438338381652962, "grad_norm": 0.337890625, "learning_rate": 0.00012258947368421052, "loss": 0.0207, "num_input_tokens_seen": 528060416, "step": 8060, "train_runtime": 3592.3668, "train_tokens_per_second": 146995.127 }, { "epoch": 1.7459974037213328, "grad_norm": 0.31640625, "learning_rate": 0.0001219578947368421, "loss": 0.0168, "num_input_tokens_seen": 528715776, "step": 8070, "train_runtime": 3596.1775, "train_tokens_per_second": 147021.603 }, { "epoch": 1.748160969277369, "grad_norm": 0.357421875, "learning_rate": 0.00012132631578947368, "loss": 0.0185, "num_input_tokens_seen": 529371136, "step": 8080, "train_runtime": 3599.9884, "train_tokens_per_second": 147048.01 }, { "epoch": 1.7503245348334056, "grad_norm": 0.373046875, "learning_rate": 0.00012069473684210526, "loss": 0.019, "num_input_tokens_seen": 530026496, "step": 8090, "train_runtime": 3603.7949, "train_tokens_per_second": 147074.546 }, { "epoch": 1.7524881003894417, "grad_norm": 0.380859375, "learning_rate": 0.00012006315789473683, "loss": 0.0219, "num_input_tokens_seen": 530681856, "step": 8100, "train_runtime": 3607.6029, "train_tokens_per_second": 147100.961 }, { "epoch": 1.7524881003894417, "eval_loss": 0.010724018327891827, "eval_runtime": 2.0748, "eval_samples_per_second": 15.423, "eval_steps_per_second": 0.482, "num_input_tokens_seen": 530681856, "step": 8100 }, { "epoch": 1.7524881003894417, "eval_byte_accuracy": 0.9970326409495549, "eval_chrf": 98.23489619471393, "eval_sacrebleu": 97.07371933981932, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 530681856, "perplexity": 1.010781726716561, "step": 8100 }, { "epoch": 1.7546516659454783, "grad_norm": 0.515625, "learning_rate": 0.00011943157894736841, "loss": 0.0195, "num_input_tokens_seen": 531337216, "step": 8110, "train_runtime": 3615.6952, "train_tokens_per_second": 146952.99 }, { "epoch": 1.7568152315015144, "grad_norm": 0.40234375, "learning_rate": 0.0001188, "loss": 0.0194, "num_input_tokens_seen": 531992576, "step": 8120, "train_runtime": 3619.5079, "train_tokens_per_second": 146979.257 }, { "epoch": 1.758978797057551, "grad_norm": 0.4296875, "learning_rate": 0.00011816842105263156, "loss": 0.0193, "num_input_tokens_seen": 532647936, "step": 8130, "train_runtime": 3623.3058, "train_tokens_per_second": 147006.066 }, { "epoch": 1.7611423626135871, "grad_norm": 0.4296875, "learning_rate": 0.00011753684210526315, "loss": 0.0191, "num_input_tokens_seen": 533303296, "step": 8140, "train_runtime": 3627.1278, "train_tokens_per_second": 147031.844 }, { "epoch": 1.7633059281696235, "grad_norm": 0.53515625, "learning_rate": 0.00011690526315789473, "loss": 0.0218, "num_input_tokens_seen": 533958656, "step": 8150, "train_runtime": 3630.9307, "train_tokens_per_second": 147058.345 }, { "epoch": 1.7654694937256599, "grad_norm": 0.5, "learning_rate": 0.00011627368421052632, "loss": 0.0247, "num_input_tokens_seen": 534614016, "step": 8160, "train_runtime": 3636.9571, "train_tokens_per_second": 146994.864 }, { "epoch": 1.7676330592816962, "grad_norm": 0.3359375, "learning_rate": 0.00011564210526315788, "loss": 0.0207, "num_input_tokens_seen": 535269376, "step": 8170, "train_runtime": 3640.7675, "train_tokens_per_second": 147021.025 }, { "epoch": 1.7697966248377326, "grad_norm": 0.515625, "learning_rate": 0.00011501052631578947, "loss": 0.023, "num_input_tokens_seen": 535924736, "step": 8180, "train_runtime": 3644.5823, "train_tokens_per_second": 147046.959 }, { "epoch": 1.771960190393769, "grad_norm": 0.345703125, "learning_rate": 0.00011437894736842105, "loss": 0.018, "num_input_tokens_seen": 536580096, "step": 8190, "train_runtime": 3648.3887, "train_tokens_per_second": 147073.17 }, { "epoch": 1.7741237559498053, "grad_norm": 0.373046875, "learning_rate": 0.00011374736842105261, "loss": 0.0172, "num_input_tokens_seen": 537235456, "step": 8200, "train_runtime": 3652.206, "train_tokens_per_second": 147098.892 }, { "epoch": 1.7741237559498053, "eval_loss": 0.01021627802401781, "eval_runtime": 2.0679, "eval_samples_per_second": 15.474, "eval_steps_per_second": 0.484, "num_input_tokens_seen": 537235456, "step": 8200 }, { "epoch": 1.7741237559498053, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 98.29630543849224, "eval_sacrebleu": 97.56427984542309, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 537235456, "perplexity": 1.0102686423633132, "step": 8200 }, { "epoch": 1.7762873215058417, "grad_norm": 0.5390625, "learning_rate": 0.0001131157894736842, "loss": 0.0215, "num_input_tokens_seen": 537890816, "step": 8210, "train_runtime": 3660.4725, "train_tokens_per_second": 146945.734 }, { "epoch": 1.778450887061878, "grad_norm": 0.396484375, "learning_rate": 0.00011248421052631578, "loss": 0.018, "num_input_tokens_seen": 538546176, "step": 8220, "train_runtime": 3664.28, "train_tokens_per_second": 146971.895 }, { "epoch": 1.7806144526179142, "grad_norm": 0.333984375, "learning_rate": 0.00011185263157894737, "loss": 0.0219, "num_input_tokens_seen": 539201536, "step": 8230, "train_runtime": 3668.0854, "train_tokens_per_second": 146998.086 }, { "epoch": 1.7827780181739508, "grad_norm": 0.412109375, "learning_rate": 0.00011122105263157893, "loss": 0.0178, "num_input_tokens_seen": 539856896, "step": 8240, "train_runtime": 3671.8996, "train_tokens_per_second": 147023.871 }, { "epoch": 1.784941583729987, "grad_norm": 0.365234375, "learning_rate": 0.00011058947368421052, "loss": 0.0218, "num_input_tokens_seen": 540512256, "step": 8250, "train_runtime": 3675.7129, "train_tokens_per_second": 147049.638 }, { "epoch": 1.7871051492860235, "grad_norm": 0.369140625, "learning_rate": 0.0001099578947368421, "loss": 0.0184, "num_input_tokens_seen": 541167616, "step": 8260, "train_runtime": 3681.7547, "train_tokens_per_second": 146986.332 }, { "epoch": 1.7892687148420596, "grad_norm": 0.353515625, "learning_rate": 0.00010932631578947366, "loss": 0.0189, "num_input_tokens_seen": 541822976, "step": 8270, "train_runtime": 3685.564, "train_tokens_per_second": 147012.228 }, { "epoch": 1.7914322803980962, "grad_norm": 0.384765625, "learning_rate": 0.00010869473684210525, "loss": 0.0179, "num_input_tokens_seen": 542478336, "step": 8280, "train_runtime": 3689.3687, "train_tokens_per_second": 147038.257 }, { "epoch": 1.7935958459541324, "grad_norm": 0.3671875, "learning_rate": 0.00010806315789473683, "loss": 0.0216, "num_input_tokens_seen": 543133696, "step": 8290, "train_runtime": 3693.18, "train_tokens_per_second": 147063.966 }, { "epoch": 1.7957594115101687, "grad_norm": 0.341796875, "learning_rate": 0.00010743157894736842, "loss": 0.0183, "num_input_tokens_seen": 543789056, "step": 8300, "train_runtime": 3696.986, "train_tokens_per_second": 147089.833 }, { "epoch": 1.7957594115101687, "eval_loss": 0.010247361846268177, "eval_runtime": 2.1361, "eval_samples_per_second": 14.981, "eval_steps_per_second": 0.468, "num_input_tokens_seen": 543789056, "step": 8300 }, { "epoch": 1.7957594115101687, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 98.23489619471393, "eval_sacrebleu": 97.07371933981932, "eval_word_accuracy": 0.9917582417582418, "num_input_tokens_seen": 543789056, "perplexity": 1.0103000458622853, "step": 8300 }, { "epoch": 1.797922977066205, "grad_norm": 0.427734375, "learning_rate": 0.00010679999999999998, "loss": 0.0217, "num_input_tokens_seen": 544444416, "step": 8310, "train_runtime": 3705.3713, "train_tokens_per_second": 146933.836 }, { "epoch": 1.8000865426222414, "grad_norm": 0.3046875, "learning_rate": 0.00010616842105263157, "loss": 0.0188, "num_input_tokens_seen": 545099776, "step": 8320, "train_runtime": 3709.1839, "train_tokens_per_second": 146959.49 }, { "epoch": 1.8022501081782778, "grad_norm": 0.298828125, "learning_rate": 0.00010553684210526315, "loss": 0.019, "num_input_tokens_seen": 545755136, "step": 8330, "train_runtime": 3713.0027, "train_tokens_per_second": 146984.848 }, { "epoch": 1.8044136737343142, "grad_norm": 0.3828125, "learning_rate": 0.00010490526315789473, "loss": 0.0198, "num_input_tokens_seen": 546410496, "step": 8340, "train_runtime": 3716.8139, "train_tokens_per_second": 147010.452 }, { "epoch": 1.8065772392903505, "grad_norm": 0.2490234375, "learning_rate": 0.0001042736842105263, "loss": 0.019, "num_input_tokens_seen": 547065856, "step": 8350, "train_runtime": 3720.6281, "train_tokens_per_second": 147035.889 }, { "epoch": 1.8087408048463869, "grad_norm": 0.345703125, "learning_rate": 0.00010364210526315789, "loss": 0.019, "num_input_tokens_seen": 547721216, "step": 8360, "train_runtime": 3726.5016, "train_tokens_per_second": 146980.002 }, { "epoch": 1.8109043704024232, "grad_norm": 0.439453125, "learning_rate": 0.00010301052631578947, "loss": 0.0196, "num_input_tokens_seen": 548376576, "step": 8370, "train_runtime": 3730.3164, "train_tokens_per_second": 147005.379 }, { "epoch": 1.8130679359584594, "grad_norm": 0.3984375, "learning_rate": 0.00010237894736842104, "loss": 0.0187, "num_input_tokens_seen": 549031936, "step": 8380, "train_runtime": 3734.1271, "train_tokens_per_second": 147030.864 }, { "epoch": 1.815231501514496, "grad_norm": 0.39453125, "learning_rate": 0.00010174736842105262, "loss": 0.0202, "num_input_tokens_seen": 549687296, "step": 8390, "train_runtime": 3737.9352, "train_tokens_per_second": 147056.4 }, { "epoch": 1.817395067070532, "grad_norm": 0.3671875, "learning_rate": 0.0001011157894736842, "loss": 0.0208, "num_input_tokens_seen": 550342656, "step": 8400, "train_runtime": 3741.7466, "train_tokens_per_second": 147081.753 }, { "epoch": 1.817395067070532, "eval_loss": 0.010128295980393887, "eval_runtime": 2.142, "eval_samples_per_second": 14.939, "eval_steps_per_second": 0.467, "num_input_tokens_seen": 550342656, "step": 8400 }, { "epoch": 1.817395067070532, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 98.23489619471393, "eval_sacrebleu": 97.07371933981932, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 550342656, "perplexity": 1.010179760773598, "step": 8400 }, { "epoch": 1.8195586326265687, "grad_norm": 0.41796875, "learning_rate": 0.00010048421052631578, "loss": 0.019, "num_input_tokens_seen": 550993920, "step": 8410, "train_runtime": 3750.089, "train_tokens_per_second": 146928.223 }, { "epoch": 1.8217221981826048, "grad_norm": 0.52734375, "learning_rate": 9.985263157894735e-05, "loss": 0.0171, "num_input_tokens_seen": 551649280, "step": 8420, "train_runtime": 3753.8968, "train_tokens_per_second": 146953.767 }, { "epoch": 1.8238857637386414, "grad_norm": 0.33203125, "learning_rate": 9.922105263157894e-05, "loss": 0.0212, "num_input_tokens_seen": 552304640, "step": 8430, "train_runtime": 3757.7027, "train_tokens_per_second": 146979.333 }, { "epoch": 1.8260493292946776, "grad_norm": 0.41796875, "learning_rate": 9.858947368421052e-05, "loss": 0.0203, "num_input_tokens_seen": 552960000, "step": 8440, "train_runtime": 3761.516, "train_tokens_per_second": 147004.558 }, { "epoch": 1.8282128948507141, "grad_norm": 0.32421875, "learning_rate": 9.79578947368421e-05, "loss": 0.0195, "num_input_tokens_seen": 553615360, "step": 8450, "train_runtime": 3765.3177, "train_tokens_per_second": 147030.187 }, { "epoch": 1.8303764604067503, "grad_norm": 0.296875, "learning_rate": 9.732631578947367e-05, "loss": 0.0182, "num_input_tokens_seen": 554270720, "step": 8460, "train_runtime": 3771.3697, "train_tokens_per_second": 146968.016 }, { "epoch": 1.8325400259627866, "grad_norm": 0.337890625, "learning_rate": 9.669473684210525e-05, "loss": 0.0161, "num_input_tokens_seen": 554926080, "step": 8470, "train_runtime": 3775.1871, "train_tokens_per_second": 146993.0 }, { "epoch": 1.834703591518823, "grad_norm": 0.318359375, "learning_rate": 9.606315789473684e-05, "loss": 0.0169, "num_input_tokens_seen": 555581440, "step": 8480, "train_runtime": 3779.0067, "train_tokens_per_second": 147017.849 }, { "epoch": 1.8368671570748594, "grad_norm": 0.380859375, "learning_rate": 9.543157894736841e-05, "loss": 0.0208, "num_input_tokens_seen": 556236800, "step": 8490, "train_runtime": 3782.8155, "train_tokens_per_second": 147043.07 }, { "epoch": 1.8390307226308957, "grad_norm": 0.474609375, "learning_rate": 9.479999999999999e-05, "loss": 0.0202, "num_input_tokens_seen": 556892160, "step": 8500, "train_runtime": 3786.6204, "train_tokens_per_second": 147068.389 }, { "epoch": 1.8390307226308957, "eval_loss": 0.01045873574912548, "eval_runtime": 2.1368, "eval_samples_per_second": 14.976, "eval_steps_per_second": 0.468, "num_input_tokens_seen": 556892160, "step": 8500 }, { "epoch": 1.8390307226308957, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 97.60016273961939, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 556892160, "perplexity": 1.0105136194971873, "step": 8500 }, { "epoch": 1.841194288186932, "grad_norm": 0.462890625, "learning_rate": 9.416842105263157e-05, "loss": 0.02, "num_input_tokens_seen": 557547520, "step": 8510, "train_runtime": 3794.8809, "train_tokens_per_second": 146920.954 }, { "epoch": 1.8433578537429685, "grad_norm": 0.345703125, "learning_rate": 9.353684210526315e-05, "loss": 0.0176, "num_input_tokens_seen": 558202880, "step": 8520, "train_runtime": 3798.6923, "train_tokens_per_second": 146946.064 }, { "epoch": 1.8455214192990046, "grad_norm": 0.349609375, "learning_rate": 9.290526315789473e-05, "loss": 0.0197, "num_input_tokens_seen": 558858240, "step": 8530, "train_runtime": 3802.5052, "train_tokens_per_second": 146971.064 }, { "epoch": 1.8476849848550412, "grad_norm": 0.314453125, "learning_rate": 9.22736842105263e-05, "loss": 0.0176, "num_input_tokens_seen": 559513600, "step": 8540, "train_runtime": 3806.3149, "train_tokens_per_second": 146996.142 }, { "epoch": 1.8498485504110773, "grad_norm": 0.34375, "learning_rate": 9.164210526315789e-05, "loss": 0.0177, "num_input_tokens_seen": 560168960, "step": 8550, "train_runtime": 3810.127, "train_tokens_per_second": 147021.073 }, { "epoch": 1.852012115967114, "grad_norm": 0.294921875, "learning_rate": 9.101052631578946e-05, "loss": 0.0187, "num_input_tokens_seen": 560824320, "step": 8560, "train_runtime": 3816.3836, "train_tokens_per_second": 146951.77 }, { "epoch": 1.85417568152315, "grad_norm": 0.298828125, "learning_rate": 9.037894736842105e-05, "loss": 0.018, "num_input_tokens_seen": 561479680, "step": 8570, "train_runtime": 3820.1956, "train_tokens_per_second": 146976.682 }, { "epoch": 1.8563392470791866, "grad_norm": 0.328125, "learning_rate": 8.974736842105262e-05, "loss": 0.0181, "num_input_tokens_seen": 562135040, "step": 8580, "train_runtime": 3824.0034, "train_tokens_per_second": 147001.709 }, { "epoch": 1.8585028126352228, "grad_norm": 0.384765625, "learning_rate": 8.91157894736842e-05, "loss": 0.0198, "num_input_tokens_seen": 562790400, "step": 8590, "train_runtime": 3827.8147, "train_tokens_per_second": 147026.553 }, { "epoch": 1.8606663781912594, "grad_norm": 0.353515625, "learning_rate": 8.848421052631578e-05, "loss": 0.0206, "num_input_tokens_seen": 563445760, "step": 8600, "train_runtime": 3831.6313, "train_tokens_per_second": 147051.143 }, { "epoch": 1.8606663781912594, "eval_loss": 0.010154013521969318, "eval_runtime": 2.1562, "eval_samples_per_second": 14.841, "eval_steps_per_second": 0.464, "num_input_tokens_seen": 563445760, "step": 8600 }, { "epoch": 1.8606663781912594, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 98.32407535469744, "eval_sacrebleu": 97.07371933981932, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 563445760, "perplexity": 1.0102057404476597, "step": 8600 }, { "epoch": 1.8628299437472955, "grad_norm": 0.388671875, "learning_rate": 8.785263157894737e-05, "loss": 0.0183, "num_input_tokens_seen": 564101120, "step": 8610, "train_runtime": 3839.8519, "train_tokens_per_second": 146906.999 }, { "epoch": 1.8649935093033319, "grad_norm": 0.322265625, "learning_rate": 8.722105263157894e-05, "loss": 0.0187, "num_input_tokens_seen": 564756480, "step": 8620, "train_runtime": 3843.6679, "train_tokens_per_second": 146931.653 }, { "epoch": 1.8671570748593682, "grad_norm": 0.314453125, "learning_rate": 8.658947368421052e-05, "loss": 0.0165, "num_input_tokens_seen": 565411840, "step": 8630, "train_runtime": 3847.4757, "train_tokens_per_second": 146956.572 }, { "epoch": 1.8693206404154046, "grad_norm": 0.330078125, "learning_rate": 8.59578947368421e-05, "loss": 0.0191, "num_input_tokens_seen": 566063104, "step": 8640, "train_runtime": 3851.2684, "train_tokens_per_second": 146980.955 }, { "epoch": 1.871484205971441, "grad_norm": 0.353515625, "learning_rate": 8.532631578947369e-05, "loss": 0.0185, "num_input_tokens_seen": 566718464, "step": 8650, "train_runtime": 3855.0844, "train_tokens_per_second": 147005.461 }, { "epoch": 1.8736477715274773, "grad_norm": 0.30859375, "learning_rate": 8.469473684210525e-05, "loss": 0.0203, "num_input_tokens_seen": 567373824, "step": 8660, "train_runtime": 3861.1356, "train_tokens_per_second": 146944.805 }, { "epoch": 1.8758113370835137, "grad_norm": 0.3671875, "learning_rate": 8.406315789473683e-05, "loss": 0.0365, "num_input_tokens_seen": 568029184, "step": 8670, "train_runtime": 3864.9452, "train_tokens_per_second": 146969.53 }, { "epoch": 1.87797490263955, "grad_norm": 0.35546875, "learning_rate": 8.343157894736842e-05, "loss": 0.0185, "num_input_tokens_seen": 568684544, "step": 8680, "train_runtime": 3868.7494, "train_tokens_per_second": 146994.411 }, { "epoch": 1.8801384681955864, "grad_norm": 0.3671875, "learning_rate": 8.28e-05, "loss": 0.0189, "num_input_tokens_seen": 569339904, "step": 8690, "train_runtime": 3872.5576, "train_tokens_per_second": 147019.093 }, { "epoch": 1.8823020337516225, "grad_norm": 0.41015625, "learning_rate": 8.216842105263157e-05, "loss": 0.0163, "num_input_tokens_seen": 569995264, "step": 8700, "train_runtime": 3876.3574, "train_tokens_per_second": 147044.043 }, { "epoch": 1.8823020337516225, "eval_loss": 0.010016790591180325, "eval_runtime": 2.1432, "eval_samples_per_second": 14.931, "eval_steps_per_second": 0.467, "num_input_tokens_seen": 569995264, "step": 8700 }, { "epoch": 1.8823020337516225, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 97.60016273961939, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 569995264, "perplexity": 1.0100671265659746, "step": 8700 }, { "epoch": 1.8844655993076591, "grad_norm": 0.2734375, "learning_rate": 8.153684210526315e-05, "loss": 0.0182, "num_input_tokens_seen": 570650624, "step": 8710, "train_runtime": 3884.7322, "train_tokens_per_second": 146895.744 }, { "epoch": 1.8866291648636953, "grad_norm": 0.380859375, "learning_rate": 8.090526315789474e-05, "loss": 0.0173, "num_input_tokens_seen": 571305984, "step": 8720, "train_runtime": 3888.5094, "train_tokens_per_second": 146921.59 }, { "epoch": 1.8887927304197318, "grad_norm": 0.302734375, "learning_rate": 8.02736842105263e-05, "loss": 0.0171, "num_input_tokens_seen": 571961344, "step": 8730, "train_runtime": 3892.3271, "train_tokens_per_second": 146945.86 }, { "epoch": 1.890956295975768, "grad_norm": 0.423828125, "learning_rate": 7.964210526315788e-05, "loss": 0.0193, "num_input_tokens_seen": 572616704, "step": 8740, "train_runtime": 3896.1376, "train_tokens_per_second": 146970.351 }, { "epoch": 1.8931198615318046, "grad_norm": 0.28125, "learning_rate": 7.901052631578947e-05, "loss": 0.0189, "num_input_tokens_seen": 573272064, "step": 8750, "train_runtime": 3899.9488, "train_tokens_per_second": 146994.766 }, { "epoch": 1.8952834270878407, "grad_norm": 0.330078125, "learning_rate": 7.837894736842106e-05, "loss": 0.0174, "num_input_tokens_seen": 573927424, "step": 8760, "train_runtime": 3906.0027, "train_tokens_per_second": 146934.724 }, { "epoch": 1.897446992643877, "grad_norm": 0.318359375, "learning_rate": 7.774736842105262e-05, "loss": 0.0197, "num_input_tokens_seen": 574582784, "step": 8770, "train_runtime": 3909.8136, "train_tokens_per_second": 146959.123 }, { "epoch": 1.8996105581999134, "grad_norm": 0.4453125, "learning_rate": 7.71157894736842e-05, "loss": 0.0175, "num_input_tokens_seen": 575238144, "step": 8780, "train_runtime": 3913.6229, "train_tokens_per_second": 146983.538 }, { "epoch": 1.9017741237559498, "grad_norm": 0.421875, "learning_rate": 7.648421052631579e-05, "loss": 0.0204, "num_input_tokens_seen": 575893504, "step": 8790, "train_runtime": 3917.4364, "train_tokens_per_second": 147007.747 }, { "epoch": 1.9039376893119861, "grad_norm": 0.392578125, "learning_rate": 7.585263157894735e-05, "loss": 0.0205, "num_input_tokens_seen": 576548864, "step": 8800, "train_runtime": 3921.2307, "train_tokens_per_second": 147032.631 }, { "epoch": 1.9039376893119861, "eval_loss": 0.010483089834451675, "eval_runtime": 2.1523, "eval_samples_per_second": 14.868, "eval_steps_per_second": 0.465, "num_input_tokens_seen": 576548864, "step": 8800 }, { "epoch": 1.9039376893119861, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.51098357963586, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 576548864, "perplexity": 1.0105382299317809, "step": 8800 }, { "epoch": 1.9061012548680225, "grad_norm": 0.326171875, "learning_rate": 7.522105263157894e-05, "loss": 0.0181, "num_input_tokens_seen": 577204224, "step": 8810, "train_runtime": 3929.4565, "train_tokens_per_second": 146891.618 }, { "epoch": 1.9082648204240589, "grad_norm": 0.357421875, "learning_rate": 7.458947368421052e-05, "loss": 0.0184, "num_input_tokens_seen": 577859584, "step": 8820, "train_runtime": 3933.2587, "train_tokens_per_second": 146916.243 }, { "epoch": 1.9104283859800952, "grad_norm": 0.365234375, "learning_rate": 7.39578947368421e-05, "loss": 0.0177, "num_input_tokens_seen": 578514944, "step": 8830, "train_runtime": 3937.0791, "train_tokens_per_second": 146940.136 }, { "epoch": 1.9125919515361316, "grad_norm": 0.376953125, "learning_rate": 7.332631578947368e-05, "loss": 0.0181, "num_input_tokens_seen": 579170304, "step": 8840, "train_runtime": 3940.8873, "train_tokens_per_second": 146964.442 }, { "epoch": 1.9147555170921677, "grad_norm": 0.390625, "learning_rate": 7.269473684210525e-05, "loss": 0.0181, "num_input_tokens_seen": 579821568, "step": 8850, "train_runtime": 3944.6768, "train_tokens_per_second": 146988.357 }, { "epoch": 1.9169190826482043, "grad_norm": 0.419921875, "learning_rate": 7.206315789473684e-05, "loss": 0.0174, "num_input_tokens_seen": 580476928, "step": 8860, "train_runtime": 3950.7316, "train_tokens_per_second": 146928.971 }, { "epoch": 1.9190826482042405, "grad_norm": 0.330078125, "learning_rate": 7.143157894736841e-05, "loss": 0.0412, "num_input_tokens_seen": 581132288, "step": 8870, "train_runtime": 3954.5446, "train_tokens_per_second": 146953.023 }, { "epoch": 1.921246213760277, "grad_norm": 0.3125, "learning_rate": 7.079999999999999e-05, "loss": 0.0205, "num_input_tokens_seen": 581787648, "step": 8880, "train_runtime": 3958.3515, "train_tokens_per_second": 146977.258 }, { "epoch": 1.9234097793163132, "grad_norm": 0.29296875, "learning_rate": 7.016842105263157e-05, "loss": 0.0192, "num_input_tokens_seen": 582443008, "step": 8890, "train_runtime": 3962.164, "train_tokens_per_second": 147001.235 }, { "epoch": 1.9255733448723498, "grad_norm": 0.40234375, "learning_rate": 6.953684210526315e-05, "loss": 0.0214, "num_input_tokens_seen": 583098368, "step": 8900, "train_runtime": 3965.9697, "train_tokens_per_second": 147025.422 }, { "epoch": 1.9255733448723498, "eval_loss": 0.01048352662473917, "eval_runtime": 2.1294, "eval_samples_per_second": 15.028, "eval_steps_per_second": 0.47, "num_input_tokens_seen": 583098368, "step": 8900 }, { "epoch": 1.9255733448723498, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 97.60016273961939, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 583098368, "perplexity": 1.0105386713251614, "step": 8900 }, { "epoch": 1.927736910428386, "grad_norm": 0.392578125, "learning_rate": 6.890526315789473e-05, "loss": 0.019, "num_input_tokens_seen": 583753728, "step": 8910, "train_runtime": 3974.3517, "train_tokens_per_second": 146880.241 }, { "epoch": 1.9299004759844225, "grad_norm": 0.35546875, "learning_rate": 6.82736842105263e-05, "loss": 0.0192, "num_input_tokens_seen": 584409088, "step": 8920, "train_runtime": 3978.1649, "train_tokens_per_second": 146904.189 }, { "epoch": 1.9320640415404586, "grad_norm": 0.3125, "learning_rate": 6.764210526315789e-05, "loss": 0.0174, "num_input_tokens_seen": 585064448, "step": 8930, "train_runtime": 3981.9831, "train_tokens_per_second": 146927.909 }, { "epoch": 1.934227607096495, "grad_norm": 0.318359375, "learning_rate": 6.701052631578946e-05, "loss": 0.0183, "num_input_tokens_seen": 585719808, "step": 8940, "train_runtime": 3985.7936, "train_tokens_per_second": 146951.868 }, { "epoch": 1.9363911726525314, "grad_norm": 0.376953125, "learning_rate": 6.637894736842104e-05, "loss": 0.0169, "num_input_tokens_seen": 586375168, "step": 8950, "train_runtime": 3989.5895, "train_tokens_per_second": 146976.314 }, { "epoch": 1.9385547382085677, "grad_norm": 0.3359375, "learning_rate": 6.574736842105262e-05, "loss": 0.0195, "num_input_tokens_seen": 587030528, "step": 8960, "train_runtime": 3995.6481, "train_tokens_per_second": 146917.474 }, { "epoch": 1.940718303764604, "grad_norm": 0.359375, "learning_rate": 6.51157894736842e-05, "loss": 0.0188, "num_input_tokens_seen": 587685888, "step": 8970, "train_runtime": 3999.455, "train_tokens_per_second": 146941.492 }, { "epoch": 1.9428818693206404, "grad_norm": 0.400390625, "learning_rate": 6.448421052631578e-05, "loss": 0.021, "num_input_tokens_seen": 588341248, "step": 8980, "train_runtime": 4003.2606, "train_tokens_per_second": 146965.513 }, { "epoch": 1.9450454348766768, "grad_norm": 0.296875, "learning_rate": 6.385263157894736e-05, "loss": 0.019, "num_input_tokens_seen": 588996608, "step": 8990, "train_runtime": 4007.0732, "train_tokens_per_second": 146989.231 }, { "epoch": 1.9472090004327132, "grad_norm": 0.380859375, "learning_rate": 6.322105263157894e-05, "loss": 0.019, "num_input_tokens_seen": 589651968, "step": 9000, "train_runtime": 4010.88, "train_tokens_per_second": 147013.117 }, { "epoch": 1.9472090004327132, "eval_loss": 0.010117894969880581, "eval_runtime": 2.1514, "eval_samples_per_second": 14.874, "eval_steps_per_second": 0.465, "num_input_tokens_seen": 589651968, "step": 9000 }, { "epoch": 1.9472090004327132, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 97.60016273961939, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 589651968, "perplexity": 1.010169253937927, "step": 9000 }, { "epoch": 1.9493725659887495, "grad_norm": 0.361328125, "learning_rate": 6.258947368421051e-05, "loss": 0.0174, "num_input_tokens_seen": 590307328, "step": 9010, "train_runtime": 4019.172, "train_tokens_per_second": 146872.872 }, { "epoch": 1.9515361315447857, "grad_norm": 0.32421875, "learning_rate": 6.19578947368421e-05, "loss": 0.0177, "num_input_tokens_seen": 590962688, "step": 9020, "train_runtime": 4022.9902, "train_tokens_per_second": 146896.377 }, { "epoch": 1.9536996971008223, "grad_norm": 0.302734375, "learning_rate": 6.132631578947367e-05, "loss": 0.0182, "num_input_tokens_seen": 591618048, "step": 9030, "train_runtime": 4026.7959, "train_tokens_per_second": 146920.297 }, { "epoch": 1.9558632626568584, "grad_norm": 0.359375, "learning_rate": 6.0694736842105254e-05, "loss": 0.0158, "num_input_tokens_seen": 592273408, "step": 9040, "train_runtime": 4030.5968, "train_tokens_per_second": 146944.346 }, { "epoch": 1.958026828212895, "grad_norm": 0.349609375, "learning_rate": 6.006315789473684e-05, "loss": 0.0172, "num_input_tokens_seen": 592928768, "step": 9050, "train_runtime": 4034.4087, "train_tokens_per_second": 146967.95 }, { "epoch": 1.9601903937689311, "grad_norm": 0.337890625, "learning_rate": 5.943157894736841e-05, "loss": 0.0184, "num_input_tokens_seen": 593580032, "step": 9060, "train_runtime": 4040.6374, "train_tokens_per_second": 146902.572 }, { "epoch": 1.9623539593249677, "grad_norm": 0.390625, "learning_rate": 5.88e-05, "loss": 0.0173, "num_input_tokens_seen": 594235392, "step": 9070, "train_runtime": 4044.4294, "train_tokens_per_second": 146926.88 }, { "epoch": 1.9645175248810038, "grad_norm": 0.3125, "learning_rate": 5.816842105263157e-05, "loss": 0.0181, "num_input_tokens_seen": 594890752, "step": 9080, "train_runtime": 4048.2382, "train_tokens_per_second": 146950.53 }, { "epoch": 1.9666810904370402, "grad_norm": 0.296875, "learning_rate": 5.753684210526316e-05, "loss": 0.0167, "num_input_tokens_seen": 595546112, "step": 9090, "train_runtime": 4052.0487, "train_tokens_per_second": 146974.077 }, { "epoch": 1.9688446559930766, "grad_norm": 0.447265625, "learning_rate": 5.690526315789473e-05, "loss": 0.0189, "num_input_tokens_seen": 596201472, "step": 9100, "train_runtime": 4055.8497, "train_tokens_per_second": 146997.921 }, { "epoch": 1.9688446559930766, "eval_loss": 0.010281143710017204, "eval_runtime": 2.1304, "eval_samples_per_second": 15.021, "eval_steps_per_second": 0.469, "num_input_tokens_seen": 596201472, "step": 9100 }, { "epoch": 1.9688446559930766, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.51098357963586, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 596201472, "perplexity": 1.0103341762572713, "step": 9100 }, { "epoch": 1.971008221549113, "grad_norm": 0.41796875, "learning_rate": 5.6273684210526304e-05, "loss": 0.0219, "num_input_tokens_seen": 596856832, "step": 9110, "train_runtime": 4064.0593, "train_tokens_per_second": 146862.236 }, { "epoch": 1.9731717871051493, "grad_norm": 0.267578125, "learning_rate": 5.564210526315789e-05, "loss": 0.0175, "num_input_tokens_seen": 597512192, "step": 9120, "train_runtime": 4067.8568, "train_tokens_per_second": 146886.241 }, { "epoch": 1.9753353526611857, "grad_norm": 0.263671875, "learning_rate": 5.5010526315789464e-05, "loss": 0.0195, "num_input_tokens_seen": 598167552, "step": 9130, "train_runtime": 4071.6745, "train_tokens_per_second": 146909.473 }, { "epoch": 1.977498918217222, "grad_norm": 0.326171875, "learning_rate": 5.437894736842105e-05, "loss": 0.0175, "num_input_tokens_seen": 598822912, "step": 9140, "train_runtime": 4075.4996, "train_tokens_per_second": 146932.393 }, { "epoch": 1.9796624837732584, "grad_norm": 0.412109375, "learning_rate": 5.374736842105262e-05, "loss": 0.0181, "num_input_tokens_seen": 599478272, "step": 9150, "train_runtime": 4079.3078, "train_tokens_per_second": 146955.88 }, { "epoch": 1.9818260493292947, "grad_norm": 0.322265625, "learning_rate": 5.311578947368421e-05, "loss": 0.0182, "num_input_tokens_seen": 600133632, "step": 9160, "train_runtime": 4085.3923, "train_tokens_per_second": 146897.429 }, { "epoch": 1.9839896148853309, "grad_norm": 0.396484375, "learning_rate": 5.248421052631578e-05, "loss": 0.0194, "num_input_tokens_seen": 600788992, "step": 9170, "train_runtime": 4089.2065, "train_tokens_per_second": 146920.679 }, { "epoch": 1.9861531804413675, "grad_norm": 0.25390625, "learning_rate": 5.185263157894737e-05, "loss": 0.02, "num_input_tokens_seen": 601444352, "step": 9180, "train_runtime": 4093.02, "train_tokens_per_second": 146943.909 }, { "epoch": 1.9883167459974036, "grad_norm": 0.357421875, "learning_rate": 5.122105263157894e-05, "loss": 0.019, "num_input_tokens_seen": 602099712, "step": 9190, "train_runtime": 4096.8265, "train_tokens_per_second": 146967.346 }, { "epoch": 1.9904803115534402, "grad_norm": 0.384765625, "learning_rate": 5.058947368421052e-05, "loss": 0.0176, "num_input_tokens_seen": 602755072, "step": 9200, "train_runtime": 4100.6457, "train_tokens_per_second": 146990.281 }, { "epoch": 1.9904803115534402, "eval_loss": 0.010350430384278297, "eval_runtime": 2.1379, "eval_samples_per_second": 14.968, "eval_steps_per_second": 0.468, "num_input_tokens_seen": 602755072, "step": 9200 }, { "epoch": 1.9904803115534402, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.51098357963586, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 602755072, "perplexity": 1.0104041813774194, "step": 9200 }, { "epoch": 1.9926438771094763, "grad_norm": 0.287109375, "learning_rate": 4.99578947368421e-05, "loss": 0.0174, "num_input_tokens_seen": 603410432, "step": 9210, "train_runtime": 4108.873, "train_tokens_per_second": 146855.461 }, { "epoch": 1.994807442665513, "grad_norm": 0.38671875, "learning_rate": 4.932631578947368e-05, "loss": 0.0195, "num_input_tokens_seen": 604065792, "step": 9220, "train_runtime": 4112.6559, "train_tokens_per_second": 146879.73 }, { "epoch": 1.996971008221549, "grad_norm": 0.3046875, "learning_rate": 4.869473684210526e-05, "loss": 0.0187, "num_input_tokens_seen": 604721152, "step": 9230, "train_runtime": 4116.4529, "train_tokens_per_second": 146903.456 }, { "epoch": 1.9991345737775854, "grad_norm": 0.34375, "learning_rate": 4.806315789473684e-05, "loss": 0.0197, "num_input_tokens_seen": 605372416, "step": 9240, "train_runtime": 4120.2142, "train_tokens_per_second": 146927.414 }, { "epoch": 2.0012981393336218, "grad_norm": 0.359375, "learning_rate": 4.743157894736842e-05, "loss": 0.0193, "num_input_tokens_seen": 606015488, "step": 9250, "train_runtime": 4123.9949, "train_tokens_per_second": 146948.652 }, { "epoch": 2.0034617048896584, "grad_norm": 0.291015625, "learning_rate": 4.68e-05, "loss": 0.0166, "num_input_tokens_seen": 606670848, "step": 9260, "train_runtime": 4130.0553, "train_tokens_per_second": 146891.703 }, { "epoch": 2.0056252704456945, "grad_norm": 0.291015625, "learning_rate": 4.616842105263157e-05, "loss": 0.0204, "num_input_tokens_seen": 607326208, "step": 9270, "train_runtime": 4133.8681, "train_tokens_per_second": 146914.751 }, { "epoch": 2.0077888360017306, "grad_norm": 0.41015625, "learning_rate": 4.553684210526315e-05, "loss": 0.0183, "num_input_tokens_seen": 607981568, "step": 9280, "train_runtime": 4137.6785, "train_tokens_per_second": 146937.847 }, { "epoch": 2.009952401557767, "grad_norm": 0.3671875, "learning_rate": 4.490526315789473e-05, "loss": 0.0169, "num_input_tokens_seen": 608636928, "step": 9290, "train_runtime": 4141.487, "train_tokens_per_second": 146960.966 }, { "epoch": 2.0121159671138034, "grad_norm": 0.396484375, "learning_rate": 4.427368421052631e-05, "loss": 0.0183, "num_input_tokens_seen": 609292288, "step": 9300, "train_runtime": 4145.2997, "train_tokens_per_second": 146983.894 }, { "epoch": 2.0121159671138034, "eval_loss": 0.010261913761496544, "eval_runtime": 2.1393, "eval_samples_per_second": 14.958, "eval_steps_per_second": 0.467, "num_input_tokens_seen": 609292288, "step": 9300 }, { "epoch": 2.0121159671138034, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.51098357963586, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 609292288, "perplexity": 1.0103147477698782, "step": 9300 }, { "epoch": 2.01427953266984, "grad_norm": 0.298828125, "learning_rate": 4.364210526315789e-05, "loss": 0.0177, "num_input_tokens_seen": 609947648, "step": 9310, "train_runtime": 4153.5205, "train_tokens_per_second": 146850.76 }, { "epoch": 2.016443098225876, "grad_norm": 0.3203125, "learning_rate": 4.301052631578947e-05, "loss": 0.016, "num_input_tokens_seen": 610603008, "step": 9320, "train_runtime": 4157.3317, "train_tokens_per_second": 146873.777 }, { "epoch": 2.0186066637819127, "grad_norm": 0.283203125, "learning_rate": 4.237894736842105e-05, "loss": 0.0186, "num_input_tokens_seen": 611258368, "step": 9330, "train_runtime": 4161.1347, "train_tokens_per_second": 146897.041 }, { "epoch": 2.020770229337949, "grad_norm": 0.3828125, "learning_rate": 4.174736842105262e-05, "loss": 0.0163, "num_input_tokens_seen": 611913728, "step": 9340, "train_runtime": 4164.9462, "train_tokens_per_second": 146919.959 }, { "epoch": 2.0229337948939854, "grad_norm": 0.353515625, "learning_rate": 4.111578947368421e-05, "loss": 0.0182, "num_input_tokens_seen": 612569088, "step": 9350, "train_runtime": 4168.7566, "train_tokens_per_second": 146942.876 }, { "epoch": 2.0250973604500215, "grad_norm": 0.314453125, "learning_rate": 4.048421052631578e-05, "loss": 0.0176, "num_input_tokens_seen": 613224448, "step": 9360, "train_runtime": 4175.0303, "train_tokens_per_second": 146879.041 }, { "epoch": 2.027260926006058, "grad_norm": 0.310546875, "learning_rate": 3.985263157894737e-05, "loss": 0.0168, "num_input_tokens_seen": 613879808, "step": 9370, "train_runtime": 4178.8392, "train_tokens_per_second": 146901.994 }, { "epoch": 2.0294244915620943, "grad_norm": 0.34375, "learning_rate": 3.922105263157894e-05, "loss": 0.0184, "num_input_tokens_seen": 614535168, "step": 9380, "train_runtime": 4182.6568, "train_tokens_per_second": 146924.597 }, { "epoch": 2.031588057118131, "grad_norm": 0.30078125, "learning_rate": 3.858947368421053e-05, "loss": 0.02, "num_input_tokens_seen": 615190528, "step": 9390, "train_runtime": 4186.4682, "train_tokens_per_second": 146947.38 }, { "epoch": 2.033751622674167, "grad_norm": 0.31640625, "learning_rate": 3.79578947368421e-05, "loss": 0.0194, "num_input_tokens_seen": 615845888, "step": 9400, "train_runtime": 4190.2936, "train_tokens_per_second": 146969.629 }, { "epoch": 2.033751622674167, "eval_loss": 0.01002554688602686, "eval_runtime": 2.1448, "eval_samples_per_second": 14.92, "eval_steps_per_second": 0.466, "num_input_tokens_seen": 615845888, "step": 9400 }, { "epoch": 2.033751622674167, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 97.51098357963586, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 615845888, "perplexity": 1.010075971050272, "step": 9400 }, { "epoch": 2.0359151882302036, "grad_norm": 0.376953125, "learning_rate": 3.732631578947368e-05, "loss": 0.0169, "num_input_tokens_seen": 616501248, "step": 9410, "train_runtime": 4198.5084, "train_tokens_per_second": 146838.16 }, { "epoch": 2.0380787537862397, "grad_norm": 0.373046875, "learning_rate": 3.669473684210526e-05, "loss": 0.0177, "num_input_tokens_seen": 617156608, "step": 9420, "train_runtime": 4202.3302, "train_tokens_per_second": 146860.568 }, { "epoch": 2.0402423193422763, "grad_norm": 0.337890625, "learning_rate": 3.606315789473684e-05, "loss": 0.0201, "num_input_tokens_seen": 617811968, "step": 9430, "train_runtime": 4206.1464, "train_tokens_per_second": 146883.135 }, { "epoch": 2.0424058848983124, "grad_norm": 0.390625, "learning_rate": 3.543157894736842e-05, "loss": 0.0176, "num_input_tokens_seen": 618467328, "step": 9440, "train_runtime": 4209.9584, "train_tokens_per_second": 146905.804 }, { "epoch": 2.0445694504543486, "grad_norm": 0.36328125, "learning_rate": 3.48e-05, "loss": 0.0162, "num_input_tokens_seen": 619122688, "step": 9450, "train_runtime": 4213.7813, "train_tokens_per_second": 146928.053 }, { "epoch": 2.046733016010385, "grad_norm": 0.27734375, "learning_rate": 3.416842105263157e-05, "loss": 0.0168, "num_input_tokens_seen": 619778048, "step": 9460, "train_runtime": 4219.8691, "train_tokens_per_second": 146871.392 }, { "epoch": 2.0488965815664213, "grad_norm": 0.384765625, "learning_rate": 3.353684210526315e-05, "loss": 0.0171, "num_input_tokens_seen": 620433408, "step": 9470, "train_runtime": 4223.6982, "train_tokens_per_second": 146893.405 }, { "epoch": 2.051060147122458, "grad_norm": 0.349609375, "learning_rate": 3.290526315789473e-05, "loss": 0.0165, "num_input_tokens_seen": 621088768, "step": 9480, "train_runtime": 4227.494, "train_tokens_per_second": 146916.535 }, { "epoch": 2.053223712678494, "grad_norm": 0.2890625, "learning_rate": 3.227368421052631e-05, "loss": 0.0161, "num_input_tokens_seen": 621744128, "step": 9490, "train_runtime": 4231.3049, "train_tokens_per_second": 146939.097 }, { "epoch": 2.0553872782345306, "grad_norm": 0.353515625, "learning_rate": 3.164210526315789e-05, "loss": 0.0158, "num_input_tokens_seen": 622399488, "step": 9500, "train_runtime": 4235.1168, "train_tokens_per_second": 146961.587 }, { "epoch": 2.0553872782345306, "eval_loss": 0.010010459460318089, "eval_runtime": 2.1507, "eval_samples_per_second": 14.879, "eval_steps_per_second": 0.465, "num_input_tokens_seen": 622399488, "step": 9500 }, { "epoch": 2.0553872782345306, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 97.60016273961939, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 622399488, "perplexity": 1.01006073171906, "step": 9500 }, { "epoch": 2.0575508437905667, "grad_norm": 0.337890625, "learning_rate": 3.101052631578947e-05, "loss": 0.0186, "num_input_tokens_seen": 623054848, "step": 9510, "train_runtime": 4243.3995, "train_tokens_per_second": 146829.175 }, { "epoch": 2.0597144093466033, "grad_norm": 0.5, "learning_rate": 3.0378947368421053e-05, "loss": 0.0174, "num_input_tokens_seen": 623710208, "step": 9520, "train_runtime": 4247.2255, "train_tokens_per_second": 146851.211 }, { "epoch": 2.0618779749026395, "grad_norm": 0.32421875, "learning_rate": 2.974736842105263e-05, "loss": 0.0158, "num_input_tokens_seen": 624365568, "step": 9530, "train_runtime": 4251.0383, "train_tokens_per_second": 146873.663 }, { "epoch": 2.064041540458676, "grad_norm": 0.345703125, "learning_rate": 2.9115789473684205e-05, "loss": 0.0173, "num_input_tokens_seen": 625020928, "step": 9540, "train_runtime": 4254.8535, "train_tokens_per_second": 146895.993 }, { "epoch": 2.066205106014712, "grad_norm": 0.373046875, "learning_rate": 2.8484210526315785e-05, "loss": 0.0165, "num_input_tokens_seen": 625676288, "step": 9550, "train_runtime": 4258.673, "train_tokens_per_second": 146918.134 }, { "epoch": 2.0683686715707488, "grad_norm": 0.28125, "learning_rate": 2.7852631578947365e-05, "loss": 0.0159, "num_input_tokens_seen": 626331648, "step": 9560, "train_runtime": 4264.758, "train_tokens_per_second": 146862.179 }, { "epoch": 2.070532237126785, "grad_norm": 0.3359375, "learning_rate": 2.7221052631578944e-05, "loss": 0.0158, "num_input_tokens_seen": 626987008, "step": 9570, "train_runtime": 4268.5813, "train_tokens_per_second": 146884.168 }, { "epoch": 2.0726958026828215, "grad_norm": 0.396484375, "learning_rate": 2.6589473684210524e-05, "loss": 0.0184, "num_input_tokens_seen": 627642368, "step": 9580, "train_runtime": 4272.405, "train_tokens_per_second": 146906.103 }, { "epoch": 2.0748593682388576, "grad_norm": 0.296875, "learning_rate": 2.5957894736842104e-05, "loss": 0.0183, "num_input_tokens_seen": 628297728, "step": 9590, "train_runtime": 4276.2131, "train_tokens_per_second": 146928.537 }, { "epoch": 2.0770229337948938, "grad_norm": 0.31640625, "learning_rate": 2.5326315789473683e-05, "loss": 0.0171, "num_input_tokens_seen": 628953088, "step": 9600, "train_runtime": 4280.0277, "train_tokens_per_second": 146950.704 }, { "epoch": 2.0770229337948938, "eval_loss": 0.010010439902544022, "eval_runtime": 2.1518, "eval_samples_per_second": 14.871, "eval_steps_per_second": 0.465, "num_input_tokens_seen": 628953088, "step": 9600 }, { "epoch": 2.0770229337948938, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.51098357963586, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 628953088, "perplexity": 1.0100607119645206, "step": 9600 }, { "epoch": 2.0791864993509304, "grad_norm": 0.287109375, "learning_rate": 2.4694736842105263e-05, "loss": 0.0167, "num_input_tokens_seen": 629608448, "step": 9610, "train_runtime": 4288.0929, "train_tokens_per_second": 146827.146 }, { "epoch": 2.0813500649069665, "grad_norm": 0.296875, "learning_rate": 2.406315789473684e-05, "loss": 0.0178, "num_input_tokens_seen": 630263808, "step": 9620, "train_runtime": 4291.9127, "train_tokens_per_second": 146849.169 }, { "epoch": 2.083513630463003, "grad_norm": 0.34375, "learning_rate": 2.343157894736842e-05, "loss": 0.0161, "num_input_tokens_seen": 630919168, "step": 9630, "train_runtime": 4295.7199, "train_tokens_per_second": 146871.579 }, { "epoch": 2.0856771960190392, "grad_norm": 0.287109375, "learning_rate": 2.28e-05, "loss": 0.0186, "num_input_tokens_seen": 631574528, "step": 9640, "train_runtime": 4299.5368, "train_tokens_per_second": 146893.619 }, { "epoch": 2.087840761575076, "grad_norm": 0.337890625, "learning_rate": 2.2168421052631578e-05, "loss": 0.0188, "num_input_tokens_seen": 632229888, "step": 9650, "train_runtime": 4303.3438, "train_tokens_per_second": 146915.96 }, { "epoch": 2.090004327131112, "grad_norm": 0.318359375, "learning_rate": 2.1536842105263158e-05, "loss": 0.0164, "num_input_tokens_seen": 632885248, "step": 9660, "train_runtime": 4309.5545, "train_tokens_per_second": 146856.303 }, { "epoch": 2.0921678926871485, "grad_norm": 0.390625, "learning_rate": 2.0905263157894737e-05, "loss": 0.0173, "num_input_tokens_seen": 633540608, "step": 9670, "train_runtime": 4313.3662, "train_tokens_per_second": 146878.465 }, { "epoch": 2.0943314582431847, "grad_norm": 0.373046875, "learning_rate": 2.0273684210526317e-05, "loss": 0.0193, "num_input_tokens_seen": 634195968, "step": 9680, "train_runtime": 4317.1835, "train_tokens_per_second": 146900.396 }, { "epoch": 2.0964950237992213, "grad_norm": 0.30078125, "learning_rate": 1.964210526315789e-05, "loss": 0.017, "num_input_tokens_seen": 634851328, "step": 9690, "train_runtime": 4320.9989, "train_tokens_per_second": 146922.353 }, { "epoch": 2.0986585893552574, "grad_norm": 0.423828125, "learning_rate": 1.901052631578947e-05, "loss": 0.0196, "num_input_tokens_seen": 635506688, "step": 9700, "train_runtime": 4324.807, "train_tokens_per_second": 146944.519 }, { "epoch": 2.0986585893552574, "eval_loss": 0.009999026544392109, "eval_runtime": 2.1556, "eval_samples_per_second": 14.845, "eval_steps_per_second": 0.464, "num_input_tokens_seen": 635506688, "step": 9700 }, { "epoch": 2.0986585893552574, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.51098357963586, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 635506688, "perplexity": 1.0100491838456471, "step": 9700 }, { "epoch": 2.100822154911294, "grad_norm": 0.3125, "learning_rate": 1.837894736842105e-05, "loss": 0.0193, "num_input_tokens_seen": 636162048, "step": 9710, "train_runtime": 4333.0188, "train_tokens_per_second": 146817.282 }, { "epoch": 2.10298572046733, "grad_norm": 0.3515625, "learning_rate": 1.774736842105263e-05, "loss": 0.0176, "num_input_tokens_seen": 636817408, "step": 9720, "train_runtime": 4336.7843, "train_tokens_per_second": 146840.924 }, { "epoch": 2.1051492860233667, "grad_norm": 0.384765625, "learning_rate": 1.711578947368421e-05, "loss": 0.0181, "num_input_tokens_seen": 637472768, "step": 9730, "train_runtime": 4340.605, "train_tokens_per_second": 146862.653 }, { "epoch": 2.107312851579403, "grad_norm": 0.373046875, "learning_rate": 1.6484210526315788e-05, "loss": 0.0184, "num_input_tokens_seen": 638128128, "step": 9740, "train_runtime": 4344.4225, "train_tokens_per_second": 146884.455 }, { "epoch": 2.109476417135439, "grad_norm": 0.27734375, "learning_rate": 1.5852631578947364e-05, "loss": 0.0157, "num_input_tokens_seen": 638783488, "step": 9750, "train_runtime": 4348.2274, "train_tokens_per_second": 146906.644 }, { "epoch": 2.1116399826914756, "grad_norm": 0.275390625, "learning_rate": 1.5221052631578946e-05, "loss": 0.0171, "num_input_tokens_seen": 639438848, "step": 9760, "train_runtime": 4354.2945, "train_tokens_per_second": 146852.457 }, { "epoch": 2.1138035482475117, "grad_norm": 0.357421875, "learning_rate": 1.4589473684210525e-05, "loss": 0.0168, "num_input_tokens_seen": 640094208, "step": 9770, "train_runtime": 4358.1147, "train_tokens_per_second": 146874.107 }, { "epoch": 2.1159671138035483, "grad_norm": 0.283203125, "learning_rate": 1.3957894736842105e-05, "loss": 0.0178, "num_input_tokens_seen": 640749568, "step": 9780, "train_runtime": 4361.9276, "train_tokens_per_second": 146895.964 }, { "epoch": 2.1181306793595844, "grad_norm": 0.353515625, "learning_rate": 1.3326315789473681e-05, "loss": 0.0178, "num_input_tokens_seen": 641404928, "step": 9790, "train_runtime": 4365.7451, "train_tokens_per_second": 146917.63 }, { "epoch": 2.120294244915621, "grad_norm": 0.462890625, "learning_rate": 1.2694736842105261e-05, "loss": 0.0184, "num_input_tokens_seen": 642060288, "step": 9800, "train_runtime": 4369.5567, "train_tokens_per_second": 146939.456 }, { "epoch": 2.120294244915621, "eval_loss": 0.009999697096645832, "eval_runtime": 2.1661, "eval_samples_per_second": 14.773, "eval_steps_per_second": 0.462, "num_input_tokens_seen": 642060288, "step": 9800 }, { "epoch": 2.120294244915621, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.51098357963586, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 642060288, "perplexity": 1.0100498611366309, "step": 9800 }, { "epoch": 2.122457810471657, "grad_norm": 0.4140625, "learning_rate": 1.206315789473684e-05, "loss": 0.0172, "num_input_tokens_seen": 642715648, "step": 9810, "train_runtime": 4377.9693, "train_tokens_per_second": 146806.798 }, { "epoch": 2.1246213760276937, "grad_norm": 0.314453125, "learning_rate": 1.143157894736842e-05, "loss": 0.0155, "num_input_tokens_seen": 643371008, "step": 9820, "train_runtime": 4381.7799, "train_tokens_per_second": 146828.693 }, { "epoch": 2.12678494158373, "grad_norm": 0.357421875, "learning_rate": 1.0799999999999998e-05, "loss": 0.0159, "num_input_tokens_seen": 644026368, "step": 9830, "train_runtime": 4385.602, "train_tokens_per_second": 146850.163 }, { "epoch": 2.1289485071397665, "grad_norm": 0.396484375, "learning_rate": 1.0168421052631578e-05, "loss": 0.0168, "num_input_tokens_seen": 644681728, "step": 9840, "train_runtime": 4389.4172, "train_tokens_per_second": 146871.827 }, { "epoch": 2.1311120726958026, "grad_norm": 0.482421875, "learning_rate": 9.536842105263158e-06, "loss": 0.0189, "num_input_tokens_seen": 645337088, "step": 9850, "train_runtime": 4393.2316, "train_tokens_per_second": 146893.482 }, { "epoch": 2.133275638251839, "grad_norm": 0.3046875, "learning_rate": 8.905263157894735e-06, "loss": 0.0183, "num_input_tokens_seen": 645992448, "step": 9860, "train_runtime": 4399.285, "train_tokens_per_second": 146840.326 }, { "epoch": 2.1354392038078753, "grad_norm": 0.462890625, "learning_rate": 8.273684210526315e-06, "loss": 0.0168, "num_input_tokens_seen": 646643712, "step": 9870, "train_runtime": 4403.0615, "train_tokens_per_second": 146862.295 }, { "epoch": 2.137602769363912, "grad_norm": 0.30078125, "learning_rate": 7.642105263157893e-06, "loss": 0.0167, "num_input_tokens_seen": 647299072, "step": 9880, "train_runtime": 4406.8613, "train_tokens_per_second": 146884.377 }, { "epoch": 2.139766334919948, "grad_norm": 0.37109375, "learning_rate": 7.0105263157894736e-06, "loss": 0.0189, "num_input_tokens_seen": 647954432, "step": 9890, "train_runtime": 4410.6797, "train_tokens_per_second": 146905.8 }, { "epoch": 2.1419299004759846, "grad_norm": 0.263671875, "learning_rate": 6.3789473684210515e-06, "loss": 0.0172, "num_input_tokens_seen": 648605696, "step": 9900, "train_runtime": 4414.4742, "train_tokens_per_second": 146927.055 }, { "epoch": 2.1419299004759846, "eval_loss": 0.010031883604824543, "eval_runtime": 2.146, "eval_samples_per_second": 14.912, "eval_steps_per_second": 0.466, "num_input_tokens_seen": 648605696, "step": 9900 }, { "epoch": 2.1419299004759846, "eval_byte_accuracy": 0.9966617210682492, "eval_chrf": 97.51098357963586, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 648605696, "perplexity": 1.0100823716379443, "step": 9900 }, { "epoch": 2.144093466032021, "grad_norm": 0.365234375, "learning_rate": 5.747368421052631e-06, "loss": 0.0188, "num_input_tokens_seen": 649261056, "step": 9910, "train_runtime": 4422.6956, "train_tokens_per_second": 146802.113 }, { "epoch": 2.146257031588057, "grad_norm": 0.412109375, "learning_rate": 5.11578947368421e-06, "loss": 0.0184, "num_input_tokens_seen": 649916416, "step": 9920, "train_runtime": 4426.5062, "train_tokens_per_second": 146823.79 }, { "epoch": 2.1484205971440935, "grad_norm": 0.341796875, "learning_rate": 4.484210526315789e-06, "loss": 0.0164, "num_input_tokens_seen": 650571776, "step": 9930, "train_runtime": 4430.3233, "train_tokens_per_second": 146845.215 }, { "epoch": 2.1505841627001296, "grad_norm": 0.365234375, "learning_rate": 3.8526315789473676e-06, "loss": 0.0168, "num_input_tokens_seen": 651227136, "step": 9940, "train_runtime": 4434.1482, "train_tokens_per_second": 146866.345 }, { "epoch": 2.1527477282561662, "grad_norm": 0.3515625, "learning_rate": 3.2210526315789468e-06, "loss": 0.0406, "num_input_tokens_seen": 651882496, "step": 9950, "train_runtime": 4437.9717, "train_tokens_per_second": 146887.484 }, { "epoch": 2.1549112938122024, "grad_norm": 0.419921875, "learning_rate": 2.589473684210526e-06, "loss": 0.0179, "num_input_tokens_seen": 652537856, "step": 9960, "train_runtime": 4444.2083, "train_tokens_per_second": 146828.818 }, { "epoch": 2.157074859368239, "grad_norm": 0.44140625, "learning_rate": 1.957894736842105e-06, "loss": 0.0176, "num_input_tokens_seen": 653193216, "step": 9970, "train_runtime": 4448.0238, "train_tokens_per_second": 146850.208 }, { "epoch": 2.159238424924275, "grad_norm": 0.318359375, "learning_rate": 1.326315789473684e-06, "loss": 0.0181, "num_input_tokens_seen": 653848576, "step": 9980, "train_runtime": 4451.8424, "train_tokens_per_second": 146871.457 }, { "epoch": 2.1614019904803117, "grad_norm": 0.357421875, "learning_rate": 6.947368421052631e-07, "loss": 0.0172, "num_input_tokens_seen": 654503936, "step": 9990, "train_runtime": 4455.6546, "train_tokens_per_second": 146892.881 }, { "epoch": 2.163565556036348, "grad_norm": 0.294921875, "learning_rate": 6.31578947368421e-08, "loss": 0.0175, "num_input_tokens_seen": 655159296, "step": 10000, "train_runtime": 4459.4692, "train_tokens_per_second": 146914.188 }, { "epoch": 2.163565556036348, "eval_loss": 0.010019873268902302, "eval_runtime": 2.1457, "eval_samples_per_second": 14.914, "eval_steps_per_second": 0.466, "num_input_tokens_seen": 655159296, "step": 10000 }, { "epoch": 2.163565556036348, "eval_byte_accuracy": 0.9962908011869436, "eval_chrf": 97.60016273961939, "eval_sacrebleu": 96.29188240283864, "eval_word_accuracy": 0.9903846153846154, "num_input_tokens_seen": 655159296, "perplexity": 1.0100702402822026, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 655159296, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.632344094428365e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }