File size: 6,048 Bytes
cce70aa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
import json
from pathlib import Path
import sentencepiece as spm
import logging
from typing import List, Dict
import shutil
# Module-wide logging setup: emit INFO and above, prefixing every record with
# its timestamp and severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger named after this module; used by the code defined below.
logger = logging.getLogger(__name__)
class TokenizerTrainer:
    """Train a SentencePiece tokenizer for mixed Bengali text and code.

    The pipeline reads ``processed_data.json`` from ``data_dir``, writes a
    temporary one-sentence-per-line corpus into ``output_dir``, trains a
    SentencePiece model there and finally writes HuggingFace-style JSON
    configuration files next to the model.
    """

    def __init__(
        self,
        data_dir="data/raw",
        output_dir="outputs/tokenizer",
        vocab_size: int = 32000,
        character_coverage: float = 0.9999,
        model_type: str = "unigram",
    ):
        """Set up paths and tokenizer hyperparameters.

        Args:
            data_dir: Directory (``str`` or ``Path``) that contains
                ``processed_data.json``.
            output_dir: Directory (``str`` or ``Path``) that receives the
                trained model and config files. Created if it is missing.
            vocab_size: Target vocabulary size passed to SentencePiece.
            character_coverage: Character coverage passed to SentencePiece.
            model_type: SentencePiece model type (e.g. ``"unigram"``).
        """
        self.data_dir = Path(data_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Tokenizer configuration
        self.vocab_size = vocab_size
        self.character_coverage = character_coverage
        self.model_type = model_type
        self.special_tokens = [
            "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
            "<s>", "</s>", "<pad>", "<unk>", "<mask>",
            "২০", "১০", "৫০", "১৫", "২৫",  # Common Bengali numbers
            "def", "class", "return", "if", "else", "for", "while",  # Code keywords
            "print", "input", "import", "from", "try", "except",
            "#", "//", "/*", "*/", "'''", '"""',  # Code comments
        ]

    def prepare_training_data(self) -> str:
        """Write the training corpus and return its path.

        Loads ``processed_data.json`` from ``self.data_dir``, splits the
        ``'text'`` field of every item on the Bengali full stop and writes
        each non-empty, stripped piece on its own line of
        ``<output_dir>/train.txt``.

        Returns:
            The path of the written corpus file, as a string.

        Raises:
            FileNotFoundError: If ``processed_data.json`` does not exist.
        """
        logger.info("Preparing training data for tokenizer")

        # Load processed data
        try:
            with open(self.data_dir / 'processed_data.json', 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            logger.error("Processed data file not found. Run data collection first.")
            raise

        # Create temporary file for training
        train_file = self.output_dir / 'train.txt'
        with open(train_file, 'w', encoding='utf-8') as f:
            for item in data:
                # Write one sentence per line, splitting on the Bengali full stop
                for sentence in item['text'].split('।'):
                    sentence = sentence.strip()
                    if sentence:  # Skip empty sentences
                        f.write(sentence + '\n')

        logger.info("Training data prepared successfully")
        return str(train_file)

    def _build_training_options(self, train_file, model_prefix) -> Dict[str, object]:
        """Return the keyword options handed to ``SentencePieceTrainer.train``."""
        # SentencePiece reserves the unknown piece ("<unk>", placed at unk_id)
        # and refuses to train when it is also listed as a user-defined
        # symbol, so it is left out here. It stays in the vocabulary anyway.
        user_symbols = [tok for tok in self.special_tokens if tok != "<unk>"]
        return {
            "input": str(train_file),
            "model_prefix": str(model_prefix),
            "vocab_size": self.vocab_size,
            "character_coverage": self.character_coverage,
            "model_type": self.model_type,
            "pad_id": 0,
            "unk_id": 1,
            "bos_id": 2,
            "eos_id": 3,
            # Passed as a list so the Python wrapper does the CSV escaping
            # (one of the symbols is a run of double quotes).
            "user_defined_symbols": user_symbols,
            "max_sentence_length": 4192,
            "input_sentence_size": 5000000,
            "shuffle_input_sentence": "true",
            "normalization_rule_name": "identity",  # Preserve original text
        }

    def train_tokenizer(self, train_file: str):
        """Train the SentencePiece tokenizer on ``train_file``.

        The model is written with the prefix ``<output_dir>/bengali_code``;
        after a successful run the HuggingFace config files are created too.

        Args:
            train_file: Path of the one-sentence-per-line training corpus.

        Raises:
            Exception: Any error raised by training or by writing the config
                files is logged and re-raised unchanged.
        """
        logger.info("Starting tokenizer training")

        model_prefix = self.output_dir / "bengali_code"
        # Keyword options avoid the space-separated flag string, which could
        # not be built at all while some option values were integers.
        options = self._build_training_options(train_file, model_prefix)

        try:
            # Train the tokenizer
            spm.SentencePieceTrainer.train(**options)
            logger.info("Tokenizer training completed successfully")

            # Create config files for HuggingFace compatibility
            self.create_huggingface_files(model_prefix)
        except Exception as e:
            logger.error(f"Failed to train tokenizer: {str(e)}")
            raise

    def create_huggingface_files(self, model_prefix: Path):
        """Write ``tokenizer_config.json`` and ``special_tokens_map.json``.

        Both files are written into ``self.output_dir``.

        Args:
            model_prefix: Prefix of the trained model. Currently unused; kept
                so existing callers continue to work.
        """
        logger.info("Creating HuggingFace compatibility files")

        # Special tokens shared by both files
        special_tokens_map = {
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "mask_token": "<mask>",
        }

        # Create tokenizer config
        tokenizer_config = {
            "model_max_length": 2048,
            "padding_side": "right",
            "truncation_side": "right",
            **special_tokens_map,
            "model_type": self.model_type,
            "vocab_size": self.vocab_size,
        }
        with open(self.output_dir / "tokenizer_config.json", 'w', encoding='utf-8') as f:
            json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)

        # Create special tokens map
        with open(self.output_dir / "special_tokens_map.json", 'w', encoding='utf-8') as f:
            json.dump(special_tokens_map, f, ensure_ascii=False, indent=2)

        logger.info("HuggingFace compatibility files created successfully")

    def train(self):
        """Run the whole pipeline: prepare data, train, then clean up.

        Raises:
            Exception: Any failure of a pipeline step is logged and re-raised.
        """
        train_file = None
        try:
            # Prepare training data
            train_file = self.prepare_training_data()

            # Train tokenizer
            self.train_tokenizer(train_file)

            logger.info("Tokenizer training pipeline completed successfully")
        except Exception as e:
            logger.error(f"Tokenizer training pipeline failed: {str(e)}")
            raise
        finally:
            # Remove the temporary corpus whether or not training succeeded,
            # so a failed run does not leave it behind.
            if train_file is not None and Path(train_file).exists():
                Path(train_file).unlink()
if __name__ == "__main__":
    # Script entry point: build a trainer with its default settings and run
    # the full tokenizer training pipeline.
    TokenizerTrainer().train()
|