| |
| """ |
| Balochi Tokenizer Training β Ablation Models |
| ============================================= |
| This script handles the training of BPE, WordPiece, and SentencePiece |
| tokenizers across multiple vocabulary sizes (32K, 64K, 80K, 128K). |
| |
| It concatenates the training corpora and runs the training loop. |
| """ |
|
|
| import os |
| import sys |
| import time |
| import glob |
| import string |
|
|
| |
| if sys.platform == "win32": |
| sys.stdout.reconfigure(encoding='utf-8', errors='replace') |
| sys.stderr.reconfigure(encoding='utf-8', errors='replace') |
|
|
| def install_if_missing(package, pip_name=None): |
| try: |
| __import__(package) |
| except ImportError: |
| import subprocess |
| subprocess.check_call([ |
| sys.executable, "-m", "pip", "install", pip_name or package, "-q" |
| ]) |
|
|
| install_if_missing("tokenizers") |
| install_if_missing("sentencepiece") |
|
|
| import sentencepiece as spm |
| from tokenizers import Tokenizer |
| from tokenizers.models import BPE, WordPiece |
| from tokenizers.trainers import BpeTrainer, WordPieceTrainer |
| from tokenizers.pre_tokenizers import Whitespace, Punctuation, Digits, Sequence as PretokSequence |
| from tokenizers.normalizers import NFKC |
|
|
| |
| |
| |
|
|
| SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) |
| BASE_DIR = SCRIPT_DIR |
| OUTPUT_DIR = os.path.join(SCRIPT_DIR, "Output", "Ablation") |
| MODELS_DIR = os.path.join(OUTPUT_DIR, "Models") |
|
|
| os.makedirs(OUTPUT_DIR, exist_ok=True) |
| os.makedirs(MODELS_DIR, exist_ok=True) |
|
|
| CONFIG = { |
| "CORPUS_FILES": [ |
| os.path.join(BASE_DIR, "..", "..", "Final Tokenizers", "balochi_clean_corpus_dictionary.txt"), |
| os.path.join(BASE_DIR, "..", "..", "Final Tokenizers", "balochi_dedup_corpus.txt"), |
| os.path.join(BASE_DIR, "..", "..", "Final Tokenizers", "english_corpus_2M.txt"), |
| ], |
| "VOCAB_SIZES": [32000, 47000, 64000, 80000, 128000], |
| "ALGORITHMS": ["bpe", "wordpiece", "sentencepiece"], |
| "SKIP_EXISTING": False, |
| "SP_MODEL_TYPE": "unigram", |
| "SP_CHARACTER_COVERAGE": 0.9995, |
| "WP_MIN_FREQUENCY": 2, |
| "BPE_MIN_FREQUENCY": 2, |
| "SPECIAL_TOKENS": ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"], |
| } |
|
|
| MANDATORY_ALPHABET = ( |
| list(string.ascii_lowercase) + |
| list(string.ascii_uppercase) + |
| list("0123456789") + |
| list("Ϋ°Ϋ±Ϋ²Ϋ³Ϋ΄Ϋ΅ΫΆΫ·ΫΈΫΉ") + |
| list("Ω Ω‘Ω’Ω£Ω€Ω₯Ω¦Ω§Ω¨Ω©") |
| ) |
|
|
| |
| |
| |
|
|
| def load_corpus_files(corpus_files): |
| existing_files = [] |
| for f in corpus_files: |
| if os.path.exists(f): |
| existing_files.append(f) |
| else: |
| print(f" β οΈ Warning: Corpus file not found: {f}") |
| return existing_files |
|
|
| def prepare_corpus_for_training(corpus_files, output_path): |
| from Tokenizers_Comparison import normalize_balochi |
| |
| |
| print(f" Concatenating and Normalizing {len(corpus_files)} corpus files...") |
| total_lines = 0 |
| with open(output_path, "w", encoding="utf-8") as out_f: |
| for fpath in corpus_files: |
| try: |
| with open(fpath, "r", encoding="utf-8") as in_f: |
| for line in in_f: |
| line = line.strip() |
| if len(line) >= 2: |
| norm_line = normalize_balochi(line, drop_diacritics=True, preserve_ye=True) |
| if len(norm_line) >= 2: |
| out_f.write(norm_line + "\n") |
| total_lines += 1 |
| except Exception as e: |
| print(f" β οΈ Skipped {fpath}: {e}") |
|
|
| size_mb = os.path.getsize(output_path) / (1024 * 1024) |
| print(f" β Wrote {total_lines:,} normalized lines ({size_mb:.1f} MB)") |
| return output_path |
|
|
| |
| |
| |
|
|
| def train_bpe_tokenizer(corpus_file, vocab_size, output_dir): |
| model_path = os.path.join(output_dir, f"bpe_{vocab_size}.json") |
| tokenizer = Tokenizer(BPE(unk_token="[UNK]")) |
| tokenizer.normalizer = NFKC() |
| tokenizer.pre_tokenizer = PretokSequence([ |
| Whitespace(), |
| Punctuation(), |
| Digits(individual_digits=True) |
| ]) |
| trainer = BpeTrainer( |
| vocab_size=vocab_size, |
| min_frequency=CONFIG["BPE_MIN_FREQUENCY"], |
| special_tokens=CONFIG["SPECIAL_TOKENS"], |
| initial_alphabet=MANDATORY_ALPHABET, |
| show_progress=True, |
| ) |
| print(f" Training BPE (vocab={vocab_size:,})...") |
| start = time.perf_counter() |
| tokenizer.train([corpus_file], trainer) |
| elapsed = time.perf_counter() - start |
| print(f" β Trained in {elapsed:.1f}s (actual vocab: {tokenizer.get_vocab_size():,})") |
| tokenizer.save(model_path) |
| return model_path, elapsed |
|
|
| def train_wordpiece_tokenizer(corpus_file, vocab_size, output_dir): |
| model_path = os.path.join(output_dir, f"wordpiece_{vocab_size}.json") |
| tokenizer = Tokenizer(WordPiece(unk_token="[UNK]")) |
| tokenizer.normalizer = NFKC() |
| tokenizer.pre_tokenizer = PretokSequence([ |
| Whitespace(), |
| Punctuation(), |
| Digits(individual_digits=True) |
| ]) |
| trainer = WordPieceTrainer( |
| vocab_size=vocab_size, |
| min_frequency=CONFIG["WP_MIN_FREQUENCY"], |
| special_tokens=CONFIG["SPECIAL_TOKENS"], |
| initial_alphabet=MANDATORY_ALPHABET, |
| show_progress=True, |
| ) |
| print(f" Training WordPiece (vocab={vocab_size:,})...") |
| start = time.perf_counter() |
| tokenizer.train([corpus_file], trainer) |
| elapsed = time.perf_counter() - start |
| print(f" β Trained in {elapsed:.1f}s (actual vocab: {tokenizer.get_vocab_size():,})") |
| tokenizer.save(model_path) |
| return model_path, elapsed |
|
|
| def train_sentencepiece_tokenizer(corpus_file, vocab_size, output_dir): |
| model_prefix = os.path.join(output_dir, f"sentencepiece_{vocab_size}") |
| model_path = model_prefix + ".model" |
| print(f" Training SentencePiece (vocab={vocab_size:,})...") |
| start = time.perf_counter() |
| spm.SentencePieceTrainer.train( |
| input=corpus_file, |
| model_prefix=model_prefix, |
| vocab_size=vocab_size, |
| model_type=CONFIG["SP_MODEL_TYPE"], |
| character_coverage=CONFIG["SP_CHARACTER_COVERAGE"], |
| bos_id=2, |
| eos_id=3, |
| pad_id=0, |
| unk_id=1, |
| pad_piece='<pad>', |
| unk_piece='<unk>', |
| bos_piece='<s>', |
| eos_piece='</s>', |
| byte_fallback=True, |
| user_defined_symbols=["[CLS]", "[SEP]", "[MASK]"], |
| train_extremely_large_corpus=True, |
| ) |
| elapsed = time.perf_counter() - start |
| sp = spm.SentencePieceProcessor(model_file=model_path) |
| print(f" β Trained in {elapsed:.1f}s (actual vocab: {sp.get_piece_size():,})") |
| return model_path, elapsed |
|
|
| |
| |
| |
|
|
| def main(): |
| print("=" * 80) |
| print(" π Vocabulary Size Ablation β Tokenizer Training") |
| print("=" * 80) |
|
|
| corpus_files = load_corpus_files(CONFIG["CORPUS_FILES"]) |
| if not corpus_files: |
| print("\n β Corpus files not found. Cannot train models. Exiting.") |
| return |
|
|
| print(f"\n β Found {len(corpus_files)} corpus files") |
| corpus_file = prepare_corpus_for_training(corpus_files, os.path.join(OUTPUT_DIR, "training_corpus_normalized.txt")) |
|
|
| print(f"\n Training {len(CONFIG['ALGORITHMS'])} algorithms Γ " |
| f"{len(CONFIG['VOCAB_SIZES'])} vocab sizes = " |
| f"{len(CONFIG['ALGORITHMS']) * len(CONFIG['VOCAB_SIZES'])} models\n") |
|
|
| for algo in CONFIG["ALGORITHMS"]: |
| for vs in CONFIG["VOCAB_SIZES"]: |
| print(f" [{algo.upper()} / {vs:,}]", end=" ") |
| ablation_model_dir = os.path.join(MODELS_DIR, f"{algo}_{vs}") |
| |
| if algo in ("bpe", "wordpiece"): |
| ablation_model_path = os.path.join(ablation_model_dir, f"{algo}_{vs}.json") |
| else: |
| ablation_model_path = os.path.join(ablation_model_dir, f"sentencepiece_{vs}.model") |
|
|
| if os.path.exists(ablation_model_path) and CONFIG["SKIP_EXISTING"]: |
| print(f"β Skipping (already trained)") |
| continue |
|
|
| os.makedirs(ablation_model_dir, exist_ok=True) |
| print(f"β Training new model...") |
| try: |
| if algo == "bpe": |
| train_bpe_tokenizer(corpus_file, vs, ablation_model_dir) |
| elif algo == "wordpiece": |
| train_wordpiece_tokenizer(corpus_file, vs, ablation_model_dir) |
| elif algo == "sentencepiece": |
| train_sentencepiece_tokenizer(corpus_file, vs, ablation_model_dir) |
| except Exception as e: |
| print(f" β Training failed: {e}") |
|
|
| print("\n β
Training Complete!") |
|
|
| if __name__ == "__main__": |
| main() |
|
|