#!/usr/bin/env python3
# Copyright (C) 2024 Louis Chua Bean Chong
#
# This file is part of OpenLLM.
#
# OpenLLM is dual-licensed:
# 1. For open source use: GNU General Public License v3.0
# 2. For commercial use: Commercial License (contact for details)
#
# See LICENSE and docs/LICENSES.md for full license information.
"""
Train a SentencePiece tokenizer from scratch using the prepared training data.

OVERVIEW:
This script trains a SentencePiece tokenizer on the cleaned text data from the SQuAD dataset
or any other text corpus. SentencePiece is a subword tokenizer that works well for language
models and supports multiple languages without requiring pre-tokenization.

FEATURES:
- Supports BPE (Byte Pair Encoding) and Unigram tokenization algorithms
- Configurable vocabulary size (recommended: 8k-64k for LLMs)
- Handles special tokens (BOS, EOS, UNK, PAD)
- Outputs tokenizer model files compatible with Hugging Face
- Comprehensive statistics and vocabulary analysis

TOKENIZER OUTPUT:
- tokenizer.model: SentencePiece model file
- tokenizer.vocab: Human-readable vocabulary file
- tokenizer_config.json: Configuration for Hugging Face integration

Usage:
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000

Advanced usage:
python core/src/train_tokenizer.py \\
--input data/clean/training_data.txt \\
--vocab_size 32000 \\
--model_type bpe \\
--output_dir data/tokenizer/ \\
--character_coverage 0.9995

Requirements:
pip install sentencepiece

Example setup:
```bash
# If not already in virtual environment
python -m venv venv
source venv/bin/activate # Linux/macOS
# .\\venv\\Scripts\\Activate.ps1  # Windows PowerShell
# Install SentencePiece
pip install sentencepiece
# Train tokenizer
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
```
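Loading the trained tokenizer (a minimal sketch; the path assumes the default
--output_dir of data/tokenizer/):
```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/tokenizer/tokenizer.model")
print(sp.encode("Hello, world!", out_type=str))  # subword pieces
print(sp.encode("Hello, world!"))                # corresponding token IDs
```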
"""
import argparse
import json
import os
import sys
import time
from typing import Any, Dict, List, Optional
try:
import sentencepiece as spm
except ImportError:
print("ERROR: SentencePiece not installed. Run: pip install sentencepiece")
exit(1)
def validate_input_file(input_path: str) -> None:
"""
Validate that the input training file exists and is readable.
Args:
input_path (str): Path to the training text file
Raises:
FileNotFoundError: If input file doesn't exist
ValueError: If input file is empty or unreadable
"""
if not os.path.exists(input_path):
raise FileNotFoundError(f"Training data file not found: {input_path}")
# Check file size and readability
file_size = os.path.getsize(input_path)
if file_size == 0:
raise ValueError(f"Training data file is empty: {input_path}")
# Test that we can read the file
try:
with open(input_path, "r", encoding="utf-8") as f:
first_line = f.readline()
if not first_line.strip():
raise ValueError(
"Training data file appears to be empty or contains only whitespace"
)
except UnicodeDecodeError as e:
raise ValueError(f"Cannot read training data file as UTF-8: {e}")
print(f"β Input file validated: {input_path} ({file_size:,} bytes)")
def count_training_sentences(input_path: str) -> int:
"""
Count the number of training sentences/lines in the input file.
Args:
input_path (str): Path to the training text file
Returns:
int: Number of lines in the file
"""
print("Counting training sentences...")
with open(input_path, "r", encoding="utf-8") as f:
count = sum(1 for line in f if line.strip())
print(f"β Found {count:,} training sentences")
return count
def train_sentencepiece_tokenizer(
input_path: str,
output_dir: str,
vocab_size: int = 32000,
model_type: str = "bpe",
character_coverage: float = 0.9995,
max_sentence_length: int = 4192,
input_sentence_size: int = 10000000,
shuffle_input_sentence: bool = True,
) -> Dict[str, Any]:
"""
Train a SentencePiece tokenizer with the specified parameters.
Args:
input_path (str): Path to training text file
output_dir (str): Directory to save tokenizer files
vocab_size (int): Target vocabulary size (recommended: 8k-64k)
model_type (str): Algorithm type ('bpe' or 'unigram')
character_coverage (float): Character coverage (0.9995 for English, 1.0 for Japanese)
max_sentence_length (int): Maximum sentence length in characters
input_sentence_size (int): Maximum number of sentences to use for training
shuffle_input_sentence (bool): Whether to shuffle input sentences
Returns:
Dict[str, Any]: Training statistics and configuration
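Example (a minimal sketch; the vocab_size and model_type shown here are
illustrative values, not project requirements):
    config = train_sentencepiece_tokenizer(
        input_path="data/clean/training_data.txt",
        output_dir="data/tokenizer/",
        vocab_size=16000,
        model_type="unigram",
    )
    print(config["model_file"])  # data/tokenizer/tokenizer.model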
"""
# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)
# Define output paths
model_prefix = os.path.join(output_dir, "tokenizer")
# SentencePiece training parameters
train_params = [
f"--input={input_path}",
f"--model_prefix={model_prefix}",
f"--vocab_size={vocab_size}",
f"--model_type={model_type}",
f"--character_coverage={character_coverage}",
f"--max_sentence_length={max_sentence_length}",
f"--input_sentence_size={input_sentence_size}",
f"--shuffle_input_sentence={shuffle_input_sentence}",
# Special tokens for language modeling
"--pad_id=0", # Padding token
"--unk_id=1", # Unknown token
"--bos_id=2", # Beginning of sequence
"--eos_id=3", # End of sequence
# Additional useful parameters
"--split_by_unicode_script=true", # Better handling of mixed scripts
"--split_by_whitespace=true", # Split on whitespace
"--remove_extra_whitespaces=true", # Clean up whitespace
"--normalization_rule_name=identity", # Keep original text as-is
]
print("\nTraining SentencePiece tokenizer...")
print(f" Algorithm: {model_type.upper()}")
print(f" Vocabulary size: {vocab_size:,}")
print(f" Character coverage: {character_coverage}")
print(f" Output directory: {output_dir}")
print(f" Model files: {model_prefix}.model, {model_prefix}.vocab")
# Record training start time
start_time = time.time()
# Train the tokenizer
try:
spm.SentencePieceTrainer.train(" ".join(train_params))
training_time = time.time() - start_time
print(f"β Tokenizer training completed in {training_time:.1f} seconds")
except Exception as e:
raise RuntimeError(f"SentencePiece training failed: {e}")
# Verify output files were created
model_file = f"{model_prefix}.model"
vocab_file = f"{model_prefix}.vocab"
if not os.path.exists(model_file):
raise RuntimeError(f"Expected model file not created: {model_file}")
if not os.path.exists(vocab_file):
raise RuntimeError(f"Expected vocab file not created: {vocab_file}")
print(f"β Model file created: {model_file} ({os.path.getsize(model_file):,} bytes)")
print(f"β Vocab file created: {vocab_file} ({os.path.getsize(vocab_file):,} bytes)")
# Return training configuration and statistics
config = {
"model_type": model_type,
"vocab_size": vocab_size,
"character_coverage": character_coverage,
"max_sentence_length": max_sentence_length,
"training_time_seconds": training_time,
"input_file": input_path,
"output_directory": output_dir,
"model_file": model_file,
"vocab_file": vocab_file,
}
return config
def test_tokenizer(model_path: str, test_sentences: Optional[List[str]] = None) -> None:
"""
Test the trained tokenizer on sample sentences to verify it works correctly.
Args:
model_path (str): Path to the trained .model file
test_sentences (list): Optional list of test sentences
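Example (a sketch; assumes a model has already been trained into the
default output directory):
    test_tokenizer("data/tokenizer/tokenizer.model")
    test_tokenizer("data/tokenizer/tokenizer.model", ["A custom test sentence."])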
"""
print("\nTesting trained tokenizer...")
# Load the trained tokenizer
sp = spm.SentencePieceProcessor()
sp.load(model_path)
# Default test sentences if none provided
if test_sentences is None:
test_sentences = [
"Hello, world! This is a test sentence.",
"The quick brown fox jumps over the lazy dog.",
"Machine learning and artificial intelligence are transforming technology.",
"SentencePiece tokenization works well for language models.",
]
print(f"Vocabulary size: {sp.vocab_size():,}")
print(
f"Special tokens: PAD={sp.pad_id()}, UNK={sp.unk_id()}, BOS={sp.bos_id()}, EOS={sp.eos_id()}"
)
print("\nTokenization examples:")
for i, sentence in enumerate(test_sentences, 1):
# Encode to token IDs and pieces
token_ids = sp.encode(sentence)
token_pieces = sp.encode(sentence, out_type=str)
print(f"\n{i}. Input: {sentence}")
print(f" Tokens ({len(token_pieces)}): {token_pieces}")
print(f" IDs: {token_ids[:10]}{'...' if len(token_ids) > 10 else ''}")
# Test decoding
decoded = sp.decode(token_ids)
print(f" Decoded: {decoded}")
# Verify round-trip encoding/decoding
if decoded.strip() != sentence.strip():
print(" β οΈ Warning: Decode mismatch!")
print("β Tokenizer testing completed")
def save_huggingface_config(output_dir: str, config: Dict[str, Any]) -> None:
"""
Save a Hugging Face compatible tokenizer configuration file.
Args:
output_dir (str): Directory containing the tokenizer files
config (Dict[str, Any]): Tokenizer configuration
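Example (a sketch; reads back the emitted JSON, assuming the default
output directory and a config dict from train_sentencepiece_tokenizer):
    save_huggingface_config("data/tokenizer/", config)
    with open("data/tokenizer/tokenizer_config.json", encoding="utf-8") as f:
        print(json.load(f)["special_token_ids"]["bos_token_id"])  # prints 2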
"""
# Create Hugging Face tokenizer config
hf_config = {
"tokenizer_class": "SentencePieceTokenizer",
"model_type": config["model_type"],
"vocab_size": config["vocab_size"],
"model_file": "tokenizer.model",
"special_tokens": {
"pad_token": "<pad>",
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
},
"special_token_ids": {
"pad_token_id": 0,
"unk_token_id": 1,
"bos_token_id": 2,
"eos_token_id": 3,
},
}
config_path = os.path.join(output_dir, "tokenizer_config.json")
with open(config_path, "w", encoding="utf-8") as f:
json.dump(hf_config, f, indent=2, ensure_ascii=False)
print(f"β Hugging Face config saved: {config_path}")
def main():
"""Main function to handle command line arguments and orchestrate tokenizer training."""
parser = argparse.ArgumentParser(
description="Train a SentencePiece tokenizer for language model training",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Basic usage with SQuAD data
python core/src/train_tokenizer.py --input data/clean/training_data.txt --vocab_size 32000
# Advanced configuration
python core/src/train_tokenizer.py \\
--input data/clean/training_data.txt \\
--vocab_size 32000 \\
--model_type bpe \\
--output_dir data/tokenizer/ \\
--character_coverage 0.9995
""",
)
# Required arguments
parser.add_argument(
"--input",
required=True,
help="Path to training text file (e.g., data/clean/training_data.txt)",
)
# Optional arguments with sensible defaults
parser.add_argument(
"--vocab_size",
type=int,
default=32000,
help="Vocabulary size (default: 32000, recommended: 8k-64k)",
)
parser.add_argument(
"--model_type",
choices=["bpe", "unigram"],
default="bpe",
help="Tokenization algorithm (default: bpe)",
)
parser.add_argument(
"--output_dir",
default="data/tokenizer/",
help="Output directory for tokenizer files (default: data/tokenizer/)",
)
parser.add_argument(
"--character_coverage",
type=float,
default=0.9995,
help="Character coverage (default: 0.9995 for English)",
)
parser.add_argument(
"--max_sentence_length",
type=int,
default=4192,
help="Maximum sentence length in characters (default: 4192)",
)
parser.add_argument(
"--no_test", action="store_true", help="Skip tokenizer testing after training"
)
args = parser.parse_args()
print("π€ SentencePiece Tokenizer Training")
print("=" * 50)
try:
# Step 1: Validate input file
validate_input_file(args.input)
# Step 2: Count training data
sentence_count = count_training_sentences(args.input)
# Step 3: Train tokenizer
config = train_sentencepiece_tokenizer(
input_path=args.input,
output_dir=args.output_dir,
vocab_size=args.vocab_size,
model_type=args.model_type,
character_coverage=args.character_coverage,
max_sentence_length=args.max_sentence_length,
)
# Step 4: Save Hugging Face compatible config
save_huggingface_config(args.output_dir, config)
# Step 5: Test tokenizer (unless skipped)
if not args.no_test:
model_path = os.path.join(args.output_dir, "tokenizer.model")
test_tokenizer(model_path)
# Step 6: Print summary
print("\nπ Tokenizer training completed successfully!")
print(f"π Output directory: {args.output_dir}")
print(f"π Vocabulary size: {config['vocab_size']:,}")
print(f"β±οΈ Training time: {config['training_time_seconds']:.1f}s")
print(f"π Training sentences: {sentence_count:,}")
print("\nFiles created:")
print(f" β’ {config['model_file']} - SentencePiece model")
print(f" β’ {config['vocab_file']} - Vocabulary file")
print(f" β’ {os.path.join(args.output_dir, 'tokenizer_config.json')} - Hugging Face config")
print("\nTo use this tokenizer in your language model:")
print(" import sentencepiece as spm")
print(" sp = spm.SentencePieceProcessor()")
print(f" sp.load('{config['model_file']}')")
except Exception as e:
print(f"\nβ Error: {e}")
exit(1)
if __name__ == "__main__":
main()