| { | |
| "bpe": { | |
| "v8000_mf2": { | |
| "out_dir": "results\\bpe\\v8000_mf2", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 96.0113, | |
| "avg_processing_time_ms": 0.19588143825531007, | |
| "compression_ratio": 96.0113, | |
| "total_tokens_evaluated": 1920226, | |
| "unk_count": 0, | |
| "train_time_s": 105.87230825424194, | |
| "config": { | |
| "vocab_size": 8000, | |
| "min_frequency": 2, | |
| "continuing_subword_prefix": "##" | |
| } | |
| } | |
| }, | |
| "v8000_mf5": { | |
| "out_dir": "results\\bpe\\v8000_mf5", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 96.0113, | |
| "avg_processing_time_ms": 0.19508297443389894, | |
| "compression_ratio": 96.0113, | |
| "total_tokens_evaluated": 1920226, | |
| "unk_count": 0, | |
| "train_time_s": 115.85335993766785, | |
| "config": { | |
| "vocab_size": 8000, | |
| "min_frequency": 5, | |
| "continuing_subword_prefix": "##" | |
| } | |
| } | |
| }, | |
| "v16000_mf2": { | |
| "out_dir": "results\\bpe\\v16000_mf2", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 84.86375, | |
| "avg_processing_time_ms": 0.1791509985923767, | |
| "compression_ratio": 84.86375, | |
| "total_tokens_evaluated": 1697275, | |
| "unk_count": 0, | |
| "train_time_s": 122.03794264793396, | |
| "config": { | |
| "vocab_size": 16000, | |
| "min_frequency": 2, | |
| "continuing_subword_prefix": "##" | |
| } | |
| } | |
| }, | |
| "v16000_mf5": { | |
| "out_dir": "results\\bpe\\v16000_mf5", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 84.86375, | |
| "avg_processing_time_ms": 0.1843635559082031, | |
| "compression_ratio": 84.86375, | |
| "total_tokens_evaluated": 1697275, | |
| "unk_count": 0, | |
| "train_time_s": 119.14113140106201, | |
| "config": { | |
| "vocab_size": 16000, | |
| "min_frequency": 5, | |
| "continuing_subword_prefix": "##" | |
| } | |
| } | |
| }, | |
| "v32000_mf2": { | |
| "out_dir": "results\\bpe\\v32000_mf2", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 77.17065, | |
| "avg_processing_time_ms": 0.18579285144805907, | |
| "compression_ratio": 77.17065, | |
| "total_tokens_evaluated": 1543413, | |
| "unk_count": 0, | |
| "train_time_s": 122.94540190696716, | |
| "config": { | |
| "vocab_size": 32000, | |
| "min_frequency": 2, | |
| "continuing_subword_prefix": "##" | |
| } | |
| } | |
| }, | |
| "v32000_mf5": { | |
| "out_dir": "results\\bpe\\v32000_mf5", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 77.17065, | |
| "avg_processing_time_ms": 0.1811486840248108, | |
| "compression_ratio": 77.17065, | |
| "total_tokens_evaluated": 1543413, | |
| "unk_count": 0, | |
| "train_time_s": 122.62627506256104, | |
| "config": { | |
| "vocab_size": 32000, | |
| "min_frequency": 5, | |
| "continuing_subword_prefix": "##" | |
| } | |
| } | |
| } | |
| }, | |
| "wordpiece": { | |
| "v8000_mf1": { | |
| "out_dir": "results\\wordpiece\\v8000_mf1", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 95.39795, | |
| "avg_processing_time_ms": 31.364226222038273, | |
| "compression_ratio": 95.39795, | |
| "total_tokens_evaluated": 1907959, | |
| "unk_count": 0, | |
| "train_time_s": 124.3489019870758, | |
| "config": { | |
| "vocab_size": 8000, | |
| "min_frequency": 1 | |
| } | |
| } | |
| }, | |
| "v8000_mf2": { | |
| "out_dir": "results\\wordpiece\\v8000_mf2", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 95.39795, | |
| "avg_processing_time_ms": 0.22379395961761475, | |
| "compression_ratio": 95.39795, | |
| "total_tokens_evaluated": 1907959, | |
| "unk_count": 0, | |
| "train_time_s": 176.4660017490387, | |
| "config": { | |
| "vocab_size": 8000, | |
| "min_frequency": 2 | |
| } | |
| } | |
| }, | |
| "v16000_mf1": { | |
| "out_dir": "results\\wordpiece\\v16000_mf1", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 84.55695, | |
| "avg_processing_time_ms": 0.2237707018852234, | |
| "compression_ratio": 84.55695, | |
| "total_tokens_evaluated": 1691139, | |
| "unk_count": 0, | |
| "train_time_s": 184.54623937606812, | |
| "config": { | |
| "vocab_size": 16000, | |
| "min_frequency": 1 | |
| } | |
| } | |
| }, | |
| "v16000_mf2": { | |
| "out_dir": "results\\wordpiece\\v16000_mf2", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 84.55695, | |
| "avg_processing_time_ms": 0.2417303204536438, | |
| "compression_ratio": 84.55695, | |
| "total_tokens_evaluated": 1691139, | |
| "unk_count": 0, | |
| "train_time_s": 318.9338138103485, | |
| "config": { | |
| "vocab_size": 16000, | |
| "min_frequency": 2 | |
| } | |
| } | |
| }, | |
| "v32000_mf1": { | |
| "out_dir": "results\\wordpiece\\v32000_mf1", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 76.92375, | |
| "avg_processing_time_ms": 0.2857889056205749, | |
| "compression_ratio": 76.92375, | |
| "total_tokens_evaluated": 1538475, | |
| "unk_count": 0, | |
| "train_time_s": 158.26075053215027, | |
| "config": { | |
| "vocab_size": 32000, | |
| "min_frequency": 1 | |
| } | |
| } | |
| }, | |
| "v32000_mf2": { | |
| "out_dir": "results\\wordpiece\\v32000_mf2", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 76.92375, | |
| "avg_processing_time_ms": 0.518797504901886, | |
| "compression_ratio": 76.92375, | |
| "total_tokens_evaluated": 1538475, | |
| "unk_count": 0, | |
| "train_time_s": 157.1074833869934, | |
| "config": { | |
| "vocab_size": 32000, | |
| "min_frequency": 2 | |
| } | |
| } | |
| } | |
| }, | |
| "unigram": { | |
| "v8000": { | |
| "out_dir": "results\\unigram\\v8000", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 101.5805, | |
| "avg_processing_time_ms": 0.3227068305015564, | |
| "compression_ratio": 101.5805, | |
| "total_tokens_evaluated": 2031610, | |
| "unk_count": 0, | |
| "train_time_s": 601.7949032783508, | |
| "config": { | |
| "vocab_size": 8000 | |
| } | |
| } | |
| }, | |
| "v16000": { | |
| "out_dir": "results\\unigram\\v16000", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 90.8909, | |
| "avg_processing_time_ms": 0.29166127443313594, | |
| "compression_ratio": 90.8909, | |
| "total_tokens_evaluated": 1817818, | |
| "unk_count": 0, | |
| "train_time_s": 614.1360929012299, | |
| "config": { | |
| "vocab_size": 16000 | |
| } | |
| } | |
| }, | |
| "v32000": { | |
| "out_dir": "results\\unigram\\v32000", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 83.3668, | |
| "avg_processing_time_ms": 0.32854799032211307, | |
| "compression_ratio": 83.3668, | |
| "total_tokens_evaluated": 1667336, | |
| "unk_count": 0, | |
| "train_time_s": 757.2155563831329, | |
| "config": { | |
| "vocab_size": 32000 | |
| } | |
| } | |
| } | |
| }, | |
| "spm": { | |
| "v8000": { | |
| "out_dir": "results\\spm_unigram\\v8000", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 107.90535, | |
| "avg_processing_time_ms": 0.11760829687118529, | |
| "compression_ratio": 107.90535, | |
| "total_tokens_evaluated": 2158107, | |
| "unk_count": 0, | |
| "unk_piece_used": "[UNK]", | |
| "train_time_s": 343.80153012275696, | |
| "config": { | |
| "vocab_size": 8000 | |
| } | |
| } | |
| }, | |
| "v16000": { | |
| "out_dir": "results\\spm_unigram\\v16000", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 95.67175, | |
| "avg_processing_time_ms": 0.160364830493927, | |
| "compression_ratio": 95.67175, | |
| "total_tokens_evaluated": 1913435, | |
| "unk_count": 0, | |
| "unk_piece_used": "[UNK]", | |
| "train_time_s": 477.8609836101532, | |
| "config": { | |
| "vocab_size": 16000 | |
| } | |
| } | |
| }, | |
| "v32000": { | |
| "out_dir": "results\\spm_unigram\\v32000", | |
| "metrics": { | |
| "oov_rate": 0.0, | |
| "avg_sequence_length": 86.6945, | |
| "avg_processing_time_ms": 0.1026016116142273, | |
| "compression_ratio": 86.6945, | |
| "total_tokens_evaluated": 1733890, | |
| "unk_count": 0, | |
| "unk_piece_used": "[UNK]", | |
| "train_time_s": 249.83488726615906, | |
| "config": { | |
| "vocab_size": 32000 | |
| } | |
| } | |
| } | |
| }, | |
| "metadata": { | |
| "corpus_path": "full_tatar_raw_corpus_clean.txt", | |
| "vocab_sizes": [ | |
| 8000, | |
| 16000, | |
| 32000 | |
| ], | |
| "sample_size": 20000, | |
| "seed": 42, | |
| "selected_models": [ | |
| "bpe", | |
| "wordpiece", | |
| "unigram", | |
| "spm" | |
| ], | |
| "timestamp": "2025-11-19 21:10:06" | |
| } | |
| } |