TatarTokenizers / training_summary.json
ArabovMK's picture
Upload training_summary.json with huggingface_hub
8885ab1 verified
{
"bpe": {
"v8000_mf2": {
"out_dir": "results\\bpe\\v8000_mf2",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 96.0113,
"avg_processing_time_ms": 0.19588143825531007,
"compression_ratio": 96.0113,
"total_tokens_evaluated": 1920226,
"unk_count": 0,
"train_time_s": 105.87230825424194,
"config": {
"vocab_size": 8000,
"min_frequency": 2,
"continuing_subword_prefix": "##"
}
}
},
"v8000_mf5": {
"out_dir": "results\\bpe\\v8000_mf5",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 96.0113,
"avg_processing_time_ms": 0.19508297443389894,
"compression_ratio": 96.0113,
"total_tokens_evaluated": 1920226,
"unk_count": 0,
"train_time_s": 115.85335993766785,
"config": {
"vocab_size": 8000,
"min_frequency": 5,
"continuing_subword_prefix": "##"
}
}
},
"v16000_mf2": {
"out_dir": "results\\bpe\\v16000_mf2",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 84.86375,
"avg_processing_time_ms": 0.1791509985923767,
"compression_ratio": 84.86375,
"total_tokens_evaluated": 1697275,
"unk_count": 0,
"train_time_s": 122.03794264793396,
"config": {
"vocab_size": 16000,
"min_frequency": 2,
"continuing_subword_prefix": "##"
}
}
},
"v16000_mf5": {
"out_dir": "results\\bpe\\v16000_mf5",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 84.86375,
"avg_processing_time_ms": 0.1843635559082031,
"compression_ratio": 84.86375,
"total_tokens_evaluated": 1697275,
"unk_count": 0,
"train_time_s": 119.14113140106201,
"config": {
"vocab_size": 16000,
"min_frequency": 5,
"continuing_subword_prefix": "##"
}
}
},
"v32000_mf2": {
"out_dir": "results\\bpe\\v32000_mf2",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 77.17065,
"avg_processing_time_ms": 0.18579285144805907,
"compression_ratio": 77.17065,
"total_tokens_evaluated": 1543413,
"unk_count": 0,
"train_time_s": 122.94540190696716,
"config": {
"vocab_size": 32000,
"min_frequency": 2,
"continuing_subword_prefix": "##"
}
}
},
"v32000_mf5": {
"out_dir": "results\\bpe\\v32000_mf5",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 77.17065,
"avg_processing_time_ms": 0.1811486840248108,
"compression_ratio": 77.17065,
"total_tokens_evaluated": 1543413,
"unk_count": 0,
"train_time_s": 122.62627506256104,
"config": {
"vocab_size": 32000,
"min_frequency": 5,
"continuing_subword_prefix": "##"
}
}
}
},
"wordpiece": {
"v8000_mf1": {
"out_dir": "results\\wordpiece\\v8000_mf1",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 95.39795,
"avg_processing_time_ms": 31.364226222038273,
"compression_ratio": 95.39795,
"total_tokens_evaluated": 1907959,
"unk_count": 0,
"train_time_s": 124.3489019870758,
"config": {
"vocab_size": 8000,
"min_frequency": 1
}
}
},
"v8000_mf2": {
"out_dir": "results\\wordpiece\\v8000_mf2",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 95.39795,
"avg_processing_time_ms": 0.22379395961761475,
"compression_ratio": 95.39795,
"total_tokens_evaluated": 1907959,
"unk_count": 0,
"train_time_s": 176.4660017490387,
"config": {
"vocab_size": 8000,
"min_frequency": 2
}
}
},
"v16000_mf1": {
"out_dir": "results\\wordpiece\\v16000_mf1",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 84.55695,
"avg_processing_time_ms": 0.2237707018852234,
"compression_ratio": 84.55695,
"total_tokens_evaluated": 1691139,
"unk_count": 0,
"train_time_s": 184.54623937606812,
"config": {
"vocab_size": 16000,
"min_frequency": 1
}
}
},
"v16000_mf2": {
"out_dir": "results\\wordpiece\\v16000_mf2",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 84.55695,
"avg_processing_time_ms": 0.2417303204536438,
"compression_ratio": 84.55695,
"total_tokens_evaluated": 1691139,
"unk_count": 0,
"train_time_s": 318.9338138103485,
"config": {
"vocab_size": 16000,
"min_frequency": 2
}
}
},
"v32000_mf1": {
"out_dir": "results\\wordpiece\\v32000_mf1",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 76.92375,
"avg_processing_time_ms": 0.2857889056205749,
"compression_ratio": 76.92375,
"total_tokens_evaluated": 1538475,
"unk_count": 0,
"train_time_s": 158.26075053215027,
"config": {
"vocab_size": 32000,
"min_frequency": 1
}
}
},
"v32000_mf2": {
"out_dir": "results\\wordpiece\\v32000_mf2",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 76.92375,
"avg_processing_time_ms": 0.518797504901886,
"compression_ratio": 76.92375,
"total_tokens_evaluated": 1538475,
"unk_count": 0,
"train_time_s": 157.1074833869934,
"config": {
"vocab_size": 32000,
"min_frequency": 2
}
}
}
},
"unigram": {
"v8000": {
"out_dir": "results\\unigram\\v8000",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 101.5805,
"avg_processing_time_ms": 0.3227068305015564,
"compression_ratio": 101.5805,
"total_tokens_evaluated": 2031610,
"unk_count": 0,
"train_time_s": 601.7949032783508,
"config": {
"vocab_size": 8000
}
}
},
"v16000": {
"out_dir": "results\\unigram\\v16000",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 90.8909,
"avg_processing_time_ms": 0.29166127443313594,
"compression_ratio": 90.8909,
"total_tokens_evaluated": 1817818,
"unk_count": 0,
"train_time_s": 614.1360929012299,
"config": {
"vocab_size": 16000
}
}
},
"v32000": {
"out_dir": "results\\unigram\\v32000",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 83.3668,
"avg_processing_time_ms": 0.32854799032211307,
"compression_ratio": 83.3668,
"total_tokens_evaluated": 1667336,
"unk_count": 0,
"train_time_s": 757.2155563831329,
"config": {
"vocab_size": 32000
}
}
}
},
"spm": {
"v8000": {
"out_dir": "results\\spm_unigram\\v8000",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 107.90535,
"avg_processing_time_ms": 0.11760829687118529,
"compression_ratio": 107.90535,
"total_tokens_evaluated": 2158107,
"unk_count": 0,
"unk_piece_used": "[UNK]",
"train_time_s": 343.80153012275696,
"config": {
"vocab_size": 8000
}
}
},
"v16000": {
"out_dir": "results\\spm_unigram\\v16000",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 95.67175,
"avg_processing_time_ms": 0.160364830493927,
"compression_ratio": 95.67175,
"total_tokens_evaluated": 1913435,
"unk_count": 0,
"unk_piece_used": "[UNK]",
"train_time_s": 477.8609836101532,
"config": {
"vocab_size": 16000
}
}
},
"v32000": {
"out_dir": "results\\spm_unigram\\v32000",
"metrics": {
"oov_rate": 0.0,
"avg_sequence_length": 86.6945,
"avg_processing_time_ms": 0.1026016116142273,
"compression_ratio": 86.6945,
"total_tokens_evaluated": 1733890,
"unk_count": 0,
"unk_piece_used": "[UNK]",
"train_time_s": 249.83488726615906,
"config": {
"vocab_size": 32000
}
}
}
},
"metadata": {
"corpus_path": "full_tatar_raw_corpus_clean.txt",
"vocab_sizes": [
8000,
16000,
32000
],
"sample_size": 20000,
"seed": 42,
"selected_models": [
"bpe",
"wordpiece",
"unigram",
"spm"
],
"timestamp": "2025-11-19 21:10:06"
}
}