"""
|
|
|
Train a SentencePiece tokenizer from scratch using the prepared training data.
|
|
|
|
|
|
OVERVIEW:
|
|
|
This script trains a SentencePiece tokenizer on the cleaned text data from the SQUAD dataset
|
|
|
or any other text corpus. SentencePiece is a subword tokenizer that works well for language
|
|
|
models and supports multiple languages without requiring pre-tokenization.
|
|
|
|
|
|
FEATURES:
|
|
|
- Supports BPE (Byte Pair Encoding) and Unigram tokenization algorithms
|
|
|
- Configurable vocabulary size (recommended: 8k-64k for LLMs)
|
|
|
- Handles special tokens (BOS, EOS, UNK, PAD)
|
|
|
- Outputs tokenizer model files compatible with Hugging Face
|
|
|
- Comprehensive statistics and vocabulary analysis
|
|
|
|
|
|
TOKENIZER OUTPUT:
|
|
|
- tokenizer.model: SentencePiece model file
|
|
|
- tokenizer.vocab: Human-readable vocabulary file
|
|
|
- tokenizer_config.json: Configuration for Hugging Face integration
|
|
|
|
|
|
Usage:
|
|
|
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
|
|
|
|
|
|
Advanced usage:
|
|
|
python core/src/train_tokenizer.py \\
|
|
|
--input data/clean/training_data.txt \\
|
|
|
--vocab_size 32000 \\
|
|
|
--model_type bpe \\
|
|
|
--output_dir data/tokenizer/ \\
|
|
|
--character_coverage 0.9995
|
|
|
|
|
|
Requirements:
|
|
|
pip install sentencepiece
|
|
|
|
|
|
Example setup:
|
|
|
```bash
|
|
|
# If not already in virtual environment
|
|
|
python -m venv venv
|
|
|
source venv/bin/activate # Linux/macOS
|
|
|
# .\venv\Scripts\Activate.ps1 # Windows PowerShell
|
|
|
|
|
|
# Install SentencePiece
|
|
|
pip install sentencepiece
|
|
|
|
|
|
# Train tokenizer
|
|
|
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
|
|
|
```
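
Quick sanity check after training (a minimal sketch; the path assumes the default
--output_dir of data/tokenizer/):
```python
import sentencepiece as spm

# Load the freshly trained model and round-trip a sentence
sp = spm.SentencePieceProcessor()
sp.load("data/tokenizer/tokenizer.model")

ids = sp.encode("Hello, world!")
print(sp.encode("Hello, world!", out_type=str))  # subword pieces
print(sp.decode(ids))                            # should reproduce the input text
```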
"""

import argparse
import json
import os
import sys
import time
from typing import Any, Dict

try:
    import sentencepiece as spm
except ImportError:
    print("ERROR: SentencePiece not installed. Run: pip install sentencepiece")
    sys.exit(1)


def validate_input_file(input_path: str) -> None:
    """
    Validate that the input training file exists and is readable.

    Args:
        input_path (str): Path to the training text file

    Raises:
        FileNotFoundError: If the input file doesn't exist
        ValueError: If the input file is empty or unreadable
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Training data file not found: {input_path}")

    file_size = os.path.getsize(input_path)
    if file_size == 0:
        raise ValueError(f"Training data file is empty: {input_path}")

    try:
        with open(input_path, "r", encoding="utf-8") as f:
            first_line = f.readline()
            if not first_line.strip():
                raise ValueError(
                    "Training data file appears to be empty or contains only whitespace"
                )
    except UnicodeDecodeError as e:
        raise ValueError(f"Cannot read training data file as UTF-8: {e}")

    print(f"✓ Input file validated: {input_path} ({file_size:,} bytes)")


def count_training_sentences(input_path: str) -> int:
    """
    Count the number of non-empty training sentences/lines in the input file.

    Args:
        input_path (str): Path to the training text file

    Returns:
        int: Number of non-empty lines in the file
    """
    print("Counting training sentences...")
    with open(input_path, "r", encoding="utf-8") as f:
        count = sum(1 for line in f if line.strip())
    print(f"✓ Found {count:,} training sentences")
    return count


def train_sentencepiece_tokenizer(
    input_path: str,
    output_dir: str,
    vocab_size: int = 32000,
    model_type: str = "bpe",
    character_coverage: float = 0.9995,
    max_sentence_length: int = 4192,
    input_sentence_size: int = 10000000,
    shuffle_input_sentence: bool = True,
) -> Dict[str, Any]:
    """
    Train a SentencePiece tokenizer with the specified parameters.

    Args:
        input_path (str): Path to training text file
        output_dir (str): Directory to save tokenizer files
        vocab_size (int): Target vocabulary size (recommended: 8k-64k)
        model_type (str): Algorithm type ('bpe' or 'unigram')
        character_coverage (float): Character coverage (0.9995 for English, 1.0 for Japanese)
        max_sentence_length (int): Maximum sentence length in characters
        input_sentence_size (int): Maximum number of sentences to use for training
        shuffle_input_sentence (bool): Whether to shuffle input sentences

    Returns:
        Dict[str, Any]: Training statistics and configuration
    """
    os.makedirs(output_dir, exist_ok=True)

    model_prefix = os.path.join(output_dir, "tokenizer")

    train_params = [
        f"--input={input_path}",
        f"--model_prefix={model_prefix}",
        f"--vocab_size={vocab_size}",
        f"--model_type={model_type}",
        f"--character_coverage={character_coverage}",
        f"--max_sentence_length={max_sentence_length}",
        f"--input_sentence_size={input_sentence_size}",
        # Lowercased so the flag value matches SentencePiece's true/false convention
        f"--shuffle_input_sentence={str(shuffle_input_sentence).lower()}",
        # Special token IDs (must match the Hugging Face config written later)
        "--pad_id=0",
        "--unk_id=1",
        "--bos_id=2",
        "--eos_id=3",
        # Segmentation and normalization behaviour
        "--split_by_unicode_script=true",
        "--split_by_whitespace=true",
        "--remove_extra_whitespaces=true",
        "--normalization_rule_name=identity",
    ]
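    # Note: spm.SentencePieceTrainer.train() also accepts keyword arguments
    # (e.g. train(input=input_path, model_prefix=model_prefix, vocab_size=vocab_size, ...)),
    # which sidesteps the space-joined flag string below breaking on paths that contain
    # spaces. The flag-string form is kept here to mirror the SentencePiece CLI.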

    print("\nTraining SentencePiece tokenizer...")
    print(f"  Algorithm: {model_type.upper()}")
    print(f"  Vocabulary size: {vocab_size:,}")
    print(f"  Character coverage: {character_coverage}")
    print(f"  Output directory: {output_dir}")
    print(f"  Model files: {model_prefix}.model, {model_prefix}.vocab")

    start_time = time.time()

    try:
        spm.SentencePieceTrainer.train(" ".join(train_params))
        training_time = time.time() - start_time
        print(f"✓ Tokenizer training completed in {training_time:.1f} seconds")
    except Exception as e:
        raise RuntimeError(f"SentencePiece training failed: {e}")

    model_file = f"{model_prefix}.model"
    vocab_file = f"{model_prefix}.vocab"

    if not os.path.exists(model_file):
        raise RuntimeError(f"Expected model file not created: {model_file}")
    if not os.path.exists(vocab_file):
        raise RuntimeError(f"Expected vocab file not created: {vocab_file}")

    print(f"✓ Model file created: {model_file} ({os.path.getsize(model_file):,} bytes)")
    print(f"✓ Vocab file created: {vocab_file} ({os.path.getsize(vocab_file):,} bytes)")

    config = {
        "model_type": model_type,
        "vocab_size": vocab_size,
        "character_coverage": character_coverage,
        "max_sentence_length": max_sentence_length,
        "training_time_seconds": training_time,
        "input_file": input_path,
        "output_directory": output_dir,
        "model_file": model_file,
        "vocab_file": vocab_file,
    }

    return config


def test_tokenizer(model_path: str, test_sentences: list = None) -> None:
    """
    Test the trained tokenizer on sample sentences to verify it works correctly.

    Args:
        model_path (str): Path to the trained .model file
        test_sentences (list): Optional list of test sentences
    """
    print("\nTesting trained tokenizer...")

    sp = spm.SentencePieceProcessor()
    sp.load(model_path)

    if test_sentences is None:
        test_sentences = [
            "Hello, world! This is a test sentence.",
            "The quick brown fox jumps over the lazy dog.",
            "Machine learning and artificial intelligence are transforming technology.",
            "SentencePiece tokenization works well for language models.",
        ]

    print(f"Vocabulary size: {sp.vocab_size():,}")
    print(
        f"Special tokens: PAD={sp.pad_id()}, UNK={sp.unk_id()}, BOS={sp.bos_id()}, EOS={sp.eos_id()}"
    )

    print("\nTokenization examples:")
    for i, sentence in enumerate(test_sentences, 1):
        token_ids = sp.encode(sentence)
        token_pieces = sp.encode(sentence, out_type=str)

        print(f"\n{i}. Input: {sentence}")
        print(f"   Tokens ({len(token_pieces)}): {token_pieces}")
        print(f"   IDs: {token_ids[:10]}{'...' if len(token_ids) > 10 else ''}")

        decoded = sp.decode(token_ids)
        print(f"   Decoded: {decoded}")

        if decoded.strip() != sentence.strip():
            print("   ⚠️ Warning: Decode mismatch!")

    print("✓ Tokenizer testing completed")


def save_huggingface_config(output_dir: str, config: Dict[str, Any]) -> None:
    """
    Save a Hugging Face compatible tokenizer configuration file.

    Args:
        output_dir (str): Directory containing the tokenizer files
        config (Dict[str, Any]): Tokenizer configuration
    """
    hf_config = {
        "tokenizer_class": "SentencePieceTokenizer",
        "model_type": config["model_type"],
        "vocab_size": config["vocab_size"],
        "model_file": "tokenizer.model",
        "special_tokens": {
            "pad_token": "<pad>",
            "unk_token": "<unk>",
            "bos_token": "<s>",
            "eos_token": "</s>",
        },
        "special_token_ids": {
            "pad_token_id": 0,
            "unk_token_id": 1,
            "bos_token_id": 2,
            "eos_token_id": 3,
        },
    }

    config_path = os.path.join(output_dir, "tokenizer_config.json")
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(hf_config, f, indent=2, ensure_ascii=False)

    print(f"✓ Hugging Face config saved: {config_path}")


def main():
    """Main function to handle command line arguments and orchestrate tokenizer training."""
    parser = argparse.ArgumentParser(
        description="Train a SentencePiece tokenizer for language model training",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage with SQuAD data
  python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000

  # Advanced configuration
  python core/src/train_tokenizer.py \\
      --input data/clean/training_data.txt \\
      --vocab_size 32000 \\
      --model_type bpe \\
      --output_dir data/tokenizer/ \\
      --character_coverage 0.9995
        """,
    )

    parser.add_argument(
        "--input",
        required=True,
        help="Path to training text file (e.g., data/clean/training_data.txt)",
    )

    parser.add_argument(
        "--vocab_size",
        type=int,
        default=32000,
        help="Vocabulary size (default: 32000, recommended: 8k-64k)",
    )

    parser.add_argument(
        "--model_type",
        choices=["bpe", "unigram"],
        default="bpe",
        help="Tokenization algorithm (default: bpe)",
    )

    parser.add_argument(
        "--output_dir",
        default="data/tokenizer/",
        help="Output directory for tokenizer files (default: data/tokenizer/)",
    )

    parser.add_argument(
        "--character_coverage",
        type=float,
        default=0.9995,
        help="Character coverage (default: 0.9995 for English)",
    )

    parser.add_argument(
        "--max_sentence_length",
        type=int,
        default=4192,
        help="Maximum sentence length in characters (default: 4192)",
    )

    parser.add_argument(
        "--no_test", action="store_true", help="Skip tokenizer testing after training"
    )

    args = parser.parse_args()

    print("🤖 SentencePiece Tokenizer Training")
    print("=" * 50)

    try:
        validate_input_file(args.input)

        sentence_count = count_training_sentences(args.input)

        config = train_sentencepiece_tokenizer(
            input_path=args.input,
            output_dir=args.output_dir,
            vocab_size=args.vocab_size,
            model_type=args.model_type,
            character_coverage=args.character_coverage,
            max_sentence_length=args.max_sentence_length,
        )

        save_huggingface_config(args.output_dir, config)

        if not args.no_test:
            model_path = os.path.join(args.output_dir, "tokenizer.model")
            test_tokenizer(model_path)

        print("\n🎉 Tokenizer training completed successfully!")
        print(f"📁 Output directory: {args.output_dir}")
        print(f"📊 Vocabulary size: {config['vocab_size']:,}")
        print(f"⏱️ Training time: {config['training_time_seconds']:.1f}s")
        print(f"📝 Training sentences: {sentence_count:,}")

        print("\nFiles created:")
        print(f"  • {config['model_file']} - SentencePiece model")
        print(f"  • {config['vocab_file']} - Vocabulary file")
        print(f"  • {os.path.join(args.output_dir, 'tokenizer_config.json')} - Hugging Face config")

        print("\nTo use this tokenizer in your language model:")
        print("  import sentencepiece as spm")
        print("  sp = spm.SentencePieceProcessor()")
        print(f"  sp.load('{config['model_file']}')")

    except Exception as e:
        print(f"\n❌ Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()