| """ |
| Train a SentencePiece tokenizer from scratch using the prepared training data. |
| |
| OVERVIEW: |
| This script trains a SentencePiece tokenizer on the cleaned text data from the SQUAD dataset |
| or any other text corpus. SentencePiece is a subword tokenizer that works well for language |
| models and supports multiple languages without requiring pre-tokenization. |
| |
| FEATURES: |
| - Supports BPE (Byte Pair Encoding) and Unigram tokenization algorithms |
| - Configurable vocabulary size (recommended: 8k-64k for LLMs) |
| - Handles special tokens (BOS, EOS, UNK, PAD) |
| - Outputs tokenizer model files compatible with Hugging Face |
| - Comprehensive statistics and vocabulary analysis |
| |
| TOKENIZER OUTPUT: |
| - tokenizer.model: SentencePiece model file |
| - tokenizer.vocab: Human-readable vocabulary file |
| - tokenizer_config.json: Configuration for Hugging Face integration |
| |
| Usage: |
| python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000 |
| |
| Advanced usage: |
| python core/src/train_tokenizer.py \\ |
| --input data/clean/training_data.txt \\ |
| --vocab_size 32000 \\ |
| --model_type bpe \\ |
| --output_dir data/tokenizer/ \\ |
| --character_coverage 0.9995 |
| |
| Requirements: |
| pip install sentencepiece |
| |
| Example setup: |
| ```bash |
| # If not already in virtual environment |
| python -m venv venv |
| source venv/bin/activate # Linux/macOS |
| # .\venv\Scripts\Activate.ps1 # Windows PowerShell |
| |
| # Install SentencePiece |
| pip install sentencepiece |
| |
| # Train tokenizer |
| python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000 |
| ``` |
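
Using the trained tokenizer (a minimal sketch; assumes the default
--output_dir of data/tokenizer/ shown in the commands above):
```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/tokenizer/tokenizer.model")

ids = sp.encode("Hello, world!")                    # token IDs
pieces = sp.encode("Hello, world!", out_type=str)   # subword pieces
print(sp.decode(ids))                               # round-trip back to text
```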

"""


import argparse
import json
import os
import sys
import time
from typing import Any, Dict, List, Optional

try:
    import sentencepiece as spm
except ImportError:
    print("ERROR: SentencePiece not installed. Run: pip install sentencepiece")
    sys.exit(1)


def validate_input_file(input_path: str) -> None:
    """
    Validate that the input training file exists and is readable.

    Args:
        input_path (str): Path to the training text file

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If input file is empty or unreadable
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Training data file not found: {input_path}")

    file_size = os.path.getsize(input_path)
    if file_size == 0:
        raise ValueError(f"Training data file is empty: {input_path}")

    try:
        with open(input_path, "r", encoding="utf-8") as f:
            first_line = f.readline()
            if not first_line.strip():
                raise ValueError(
                    "Training data file appears to be empty or contains only whitespace"
                )
    except UnicodeDecodeError as e:
        raise ValueError(f"Cannot read training data file as UTF-8: {e}")

    print(f"✓ Input file validated: {input_path} ({file_size:,} bytes)")


def count_training_sentences(input_path: str) -> int:
    """
    Count the number of training sentences/lines in the input file.

    Args:
        input_path (str): Path to the training text file

    Returns:
        int: Number of non-empty lines in the file
    """
    print("Counting training sentences...")
    with open(input_path, "r", encoding="utf-8") as f:
        count = sum(1 for line in f if line.strip())
    print(f"✓ Found {count:,} training sentences")
    return count


def train_sentencepiece_tokenizer(
    input_path: str,
    output_dir: str,
    vocab_size: int = 32000,
    model_type: str = "bpe",
    character_coverage: float = 0.9995,
    max_sentence_length: int = 4192,
    input_sentence_size: int = 10000000,
    shuffle_input_sentence: bool = True,
) -> Dict[str, Any]:
    """
    Train a SentencePiece tokenizer with the specified parameters.

    Args:
        input_path (str): Path to training text file
        output_dir (str): Directory to save tokenizer files
        vocab_size (int): Target vocabulary size (recommended: 8k-64k)
        model_type (str): Algorithm type ('bpe' or 'unigram')
        character_coverage (float): Character coverage (1.0 for small character sets
            such as English; 0.9995 for rich character sets such as Japanese or Chinese)
        max_sentence_length (int): Maximum sentence length in characters
        input_sentence_size (int): Maximum number of sentences to use for training
        shuffle_input_sentence (bool): Whether to shuffle input sentences

    Returns:
        Dict[str, Any]: Training statistics and configuration
    """
    os.makedirs(output_dir, exist_ok=True)

    model_prefix = os.path.join(output_dir, "tokenizer")

    train_params = [
        f"--input={input_path}",
        f"--model_prefix={model_prefix}",
        f"--vocab_size={vocab_size}",
        f"--model_type={model_type}",
        f"--character_coverage={character_coverage}",
        f"--max_sentence_length={max_sentence_length}",
        f"--input_sentence_size={input_sentence_size}",
        f"--shuffle_input_sentence={str(shuffle_input_sentence).lower()}",
        # Special token IDs (must match the Hugging Face config written later)
        "--pad_id=0",
        "--unk_id=1",
        "--bos_id=2",
        "--eos_id=3",
        # Pre-tokenization and normalization behaviour
        "--split_by_unicode_script=true",
        "--split_by_whitespace=true",
        "--remove_extra_whitespaces=true",
        "--normalization_rule_name=identity",
    ]
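
    # Note: the flag list above is passed to SentencePieceTrainer.train() below as a
    # single space-joined string; recent sentencepiece releases also accept keyword
    # arguments, e.g. (illustrative sketch, not executed here):
    #   spm.SentencePieceTrainer.train(
    #       input=input_path, model_prefix=model_prefix,
    #       vocab_size=vocab_size, model_type=model_type,
    #   )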

    print("\nTraining SentencePiece tokenizer...")
    print(f"  Algorithm: {model_type.upper()}")
    print(f"  Vocabulary size: {vocab_size:,}")
    print(f"  Character coverage: {character_coverage}")
    print(f"  Output directory: {output_dir}")
    print(f"  Model files: {model_prefix}.model, {model_prefix}.vocab")

    start_time = time.time()

    try:
        spm.SentencePieceTrainer.train(" ".join(train_params))
        training_time = time.time() - start_time
        print(f"✓ Tokenizer training completed in {training_time:.1f} seconds")
    except Exception as e:
        raise RuntimeError(f"SentencePiece training failed: {e}") from e

    model_file = f"{model_prefix}.model"
    vocab_file = f"{model_prefix}.vocab"

    if not os.path.exists(model_file):
        raise RuntimeError(f"Expected model file not created: {model_file}")
    if not os.path.exists(vocab_file):
        raise RuntimeError(f"Expected vocab file not created: {vocab_file}")

    print(f"✓ Model file created: {model_file} ({os.path.getsize(model_file):,} bytes)")
    print(f"✓ Vocab file created: {vocab_file} ({os.path.getsize(vocab_file):,} bytes)")

    config = {
        "model_type": model_type,
        "vocab_size": vocab_size,
        "character_coverage": character_coverage,
        "max_sentence_length": max_sentence_length,
        "training_time_seconds": training_time,
        "input_file": input_path,
        "output_directory": output_dir,
        "model_file": model_file,
        "vocab_file": vocab_file,
    }

    return config


def test_tokenizer(model_path: str, test_sentences: Optional[List[str]] = None) -> None:
    """
    Test the trained tokenizer on sample sentences to verify it works correctly.

    Args:
        model_path (str): Path to the trained .model file
        test_sentences (Optional[List[str]]): Optional list of test sentences
    """
    print("\nTesting trained tokenizer...")

    sp = spm.SentencePieceProcessor()
    sp.load(model_path)

    if test_sentences is None:
        test_sentences = [
            "Hello, world! This is a test sentence.",
            "The quick brown fox jumps over the lazy dog.",
            "Machine learning and artificial intelligence are transforming technology.",
            "SentencePiece tokenization works well for language models.",
        ]

    print(f"Vocabulary size: {sp.vocab_size():,}")
    print(
        f"Special tokens: PAD={sp.pad_id()}, UNK={sp.unk_id()}, BOS={sp.bos_id()}, EOS={sp.eos_id()}"
    )

    print("\nTokenization examples:")
    for i, sentence in enumerate(test_sentences, 1):
        token_ids = sp.encode(sentence)
        token_pieces = sp.encode(sentence, out_type=str)

        print(f"\n{i}. Input: {sentence}")
        print(f"   Tokens ({len(token_pieces)}): {token_pieces}")
        print(f"   IDs: {token_ids[:10]}{'...' if len(token_ids) > 10 else ''}")

        decoded = sp.decode(token_ids)
        print(f"   Decoded: {decoded}")

        if decoded.strip() != sentence.strip():
            print("   ⚠️ Warning: Decode mismatch!")

    print("✓ Tokenizer testing completed")


def save_huggingface_config(output_dir: str, config: Dict[str, Any]) -> None:
    """
    Save a Hugging Face compatible tokenizer configuration file.

    Args:
        output_dir (str): Directory containing the tokenizer files
        config (Dict[str, Any]): Tokenizer configuration
    """
    hf_config = {
        "tokenizer_class": "SentencePieceTokenizer",
        "model_type": config["model_type"],
        "vocab_size": config["vocab_size"],
        "model_file": "tokenizer.model",
        "special_tokens": {
            "pad_token": "<pad>",
            "unk_token": "<unk>",
            "bos_token": "<s>",
            "eos_token": "</s>",
        },
        "special_token_ids": {
            "pad_token_id": 0,
            "unk_token_id": 1,
            "bos_token_id": 2,
            "eos_token_id": 3,
        },
    }

    config_path = os.path.join(output_dir, "tokenizer_config.json")
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(hf_config, f, indent=2, ensure_ascii=False)

    print(f"✓ Hugging Face config saved: {config_path}")


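# Illustrative sketch (not executed by this script): downstream code can pair the
# generated tokenizer_config.json with tokenizer.model to recover the special-token
# IDs defined during training. The paths below are examples:
#
#   import json
#   import sentencepiece as spm
#
#   with open("data/tokenizer/tokenizer_config.json", encoding="utf-8") as f:
#       cfg = json.load(f)
#   sp = spm.SentencePieceProcessor()
#   sp.load("data/tokenizer/" + cfg["model_file"])
#   pad_id = cfg["special_token_ids"]["pad_token_id"]   # 0, matches --pad_id
#   eos_id = cfg["special_token_ids"]["eos_token_id"]   # 3, matches --eos_id

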
def main():
    """Main function to handle command line arguments and orchestrate tokenizer training."""
    parser = argparse.ArgumentParser(
        description="Train a SentencePiece tokenizer for language model training",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage with SQuAD data
  python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000

  # Advanced configuration
  python core/src/train_tokenizer.py \\
      --input data/clean/training_data.txt \\
      --vocab_size 32000 \\
      --model_type bpe \\
      --output_dir data/tokenizer/ \\
      --character_coverage 0.9995
        """,
    )

    parser.add_argument(
        "--input",
        required=True,
        help="Path to training text file (e.g., data/clean/training_data.txt)",
    )

    parser.add_argument(
        "--vocab_size",
        type=int,
        default=32000,
        help="Vocabulary size (default: 32000, recommended: 8k-64k)",
    )

    parser.add_argument(
        "--model_type",
        choices=["bpe", "unigram"],
        default="bpe",
        help="Tokenization algorithm (default: bpe)",
    )

    parser.add_argument(
        "--output_dir",
        default="data/tokenizer/",
        help="Output directory for tokenizer files (default: data/tokenizer/)",
    )

    parser.add_argument(
        "--character_coverage",
        type=float,
        default=0.9995,
        help="Character coverage (default: 0.9995; use 1.0 for small-alphabet languages such as English)",
    )

    parser.add_argument(
        "--max_sentence_length",
        type=int,
        default=4192,
        help="Maximum sentence length in characters (default: 4192)",
    )

    parser.add_argument(
        "--no_test", action="store_true", help="Skip tokenizer testing after training"
    )

    args = parser.parse_args()

    print("🤖 SentencePiece Tokenizer Training")
    print("=" * 50)

    try:
        # Validate the input corpus before spending time on training
        validate_input_file(args.input)

        sentence_count = count_training_sentences(args.input)

        # Train the tokenizer with the requested configuration
        config = train_sentencepiece_tokenizer(
            input_path=args.input,
            output_dir=args.output_dir,
            vocab_size=args.vocab_size,
            model_type=args.model_type,
            character_coverage=args.character_coverage,
            max_sentence_length=args.max_sentence_length,
        )

        # Write the Hugging Face compatible configuration alongside the model
        save_huggingface_config(args.output_dir, config)

        # Optionally sanity-check the trained tokenizer on sample sentences
        if not args.no_test:
            model_path = os.path.join(args.output_dir, "tokenizer.model")
            test_tokenizer(model_path)

        print("\n🎉 Tokenizer training completed successfully!")
        print(f"📁 Output directory: {args.output_dir}")
        print(f"📊 Vocabulary size: {config['vocab_size']:,}")
        print(f"⏱️ Training time: {config['training_time_seconds']:.1f}s")
        print(f"📝 Training sentences: {sentence_count:,}")

        print("\nFiles created:")
        print(f"  • {config['model_file']} - SentencePiece model")
        print(f"  • {config['vocab_file']} - Vocabulary file")
        print(f"  • {os.path.join(args.output_dir, 'tokenizer_config.json')} - Hugging Face config")

        print("\nTo use this tokenizer in your language model:")
        print("  import sentencepiece as spm")
        print("  sp = spm.SentencePieceProcessor()")
        print(f"  sp.load('{config['model_file']}')")

    except Exception as e:
        print(f"\n❌ Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()