File size: 8,127 Bytes
7f974df | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 | from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast
import json
import os
# ------------------------------------------------------------------ #
# CONSTANTS
# ------------------------------------------------------------------ #
import os
# Directory that contains this script; both paths below are anchored to it
# so the script behaves the same regardless of the current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Output folder for the wrapped tokenizer.
SAVE_DIR = os.path.join(SCRIPT_DIR, "fineweb_edu_tokenizer")
# Input file: the trained tokenizer JSON.
TOKENIZER_PATH = os.path.join(SCRIPT_DIR, "fineweb_edu_tokenizer.json")

# Padding side ("right" is the causal LM standard).
PADDING_SIDE = "right"
# Context length, in tokens.
MODEL_MAX_LENGTH = 1024
# ------------------------------------------------------------------ #
# WRAP
# ------------------------------------------------------------------ #
def wrap_tokenizer(
    tokenizer_path: str = TOKENIZER_PATH,
    save_dir: str = SAVE_DIR,
) -> PreTrainedTokenizerFast:
    """
    Wrap a trained HuggingFace Tokenizer as a PreTrainedTokenizerFast and
    save the result to disk.

    This gives us:
      - datasets.map() compatibility for bulk tokenization
      - HuggingFace Trainer + DataCollator compatibility
      - Automatic padding, truncation, attention masks
      - from_pretrained() loading support
      - return_tensors="pt" for PyTorch tensors

    Args:
        tokenizer_path : path to trained tokenizer .json file
        save_dir       : folder to save the wrapped tokenizer
                         (created if it does not exist)

    Returns:
        PreTrainedTokenizerFast ready for training

    Errors raised by Tokenizer.from_file() (e.g. for a missing or invalid
    file) and by the filesystem calls are not caught here; they propagate
    to the caller.
    """
    print(f"Loading trained tokenizer from: {tokenizer_path}")
    base_tokenizer = Tokenizer.from_file(tokenizer_path)

    # ---- Wrap --------------------------------------------------------
    # We map <|endoftext|> to all three roles:
    #
    #   eos_token - end of sequence marker, used during generation
    #               to know when to stop
    #
    #   bos_token - beginning of sequence, GPT-2 style uses eos
    #               for both since there is no separate BOS token
    #
    #   pad_token - safe to reuse eos here because we are packing
    #               sequences and will never actually pad during
    #               pretraining. Defined so HuggingFace doesn't
    #               complain about missing pad token
    #
    #   unk_token - None because byte-level means no unknowns ever
    #
    # The mapping is built once and reused for the constructor, for
    # add_special_tokens() and for the JSON file, so the three can never
    # disagree with each other.
    end_of_text = "<|endoftext|>"
    special_tokens_map = {
        "bos_token": end_of_text,
        "eos_token": end_of_text,
        "pad_token": end_of_text,
    }

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=base_tokenizer,
        # Special token mappings
        unk_token=None,
        # Context length
        model_max_length=MODEL_MAX_LENGTH,
        # Padding behavior
        padding_side=PADDING_SIDE,
        # Truncation side - truncate from the right
        # (keep the beginning of the sequence, drop the end)
        truncation_side="right",
        **special_tokens_map,
    )
    # Pass a copy so the dict that gets written to disk below stays untouched.
    tokenizer.add_special_tokens(dict(special_tokens_map))

    os.makedirs(save_dir, exist_ok=True)
    # Explicit encoding so the file's bytes do not depend on the platform's
    # locale default.
    # NOTE(review): save_pretrained() below may rewrite this same file,
    # depending on the installed transformers version - confirm.
    map_path = os.path.join(save_dir, "special_tokens_map.json")
    with open(map_path, "w", encoding="utf-8") as f:
        json.dump(special_tokens_map, f, indent=2)
    print("special_tokens_map.json written manually")

    # ---- Save --------------------------------------------------------
    # Saves three files to save_dir/:
    #   tokenizer.json          - the trained BPE tokenizer
    #   tokenizer_config.json   - max length, pad token, special tokens
    #   special_tokens_map.json - maps eos/bos/pad to actual tokens
    tokenizer.save_pretrained(save_dir)
    print(f"Tokenizer saved to: {save_dir}/")
    print(" tokenizer.json")
    print(" tokenizer_config.json")
    print(" special_tokens_map.json")
    return tokenizer
# ------------------------------------------------------------------ #
# VERIFICATION
# ------------------------------------------------------------------ #
def verify_wrapped_tokenizer(
    tokenizer: PreTrainedTokenizerFast,
    save_dir: str = SAVE_DIR,
) -> None:
    """
    Print a diagnostic report for a wrapped tokenizer.

    Exercises config access, encoding, decoding, batch padding, truncation
    and a reload from disk. Nothing is asserted: every result is printed
    for manual inspection.

    Args:
        tokenizer : the wrapped tokenizer to inspect
        save_dir  : folder the tokenizer was saved to; the reload check
                    loads from here. Defaults to SAVE_DIR, so existing
                    single-argument calls behave as before. Pass the same
                    folder that was given to wrap_tokenizer() when a
                    non-default one was used.
    """
    print("\n" + "=" * 60)
    print(" WRAPPED TOKENIZER VERIFICATION")
    print("=" * 60 + "\n")
    eot_id = tokenizer.eos_token_id

    # ---- 1. Basic config -----------------------------------------
    print("Config:")
    print(f" vocab size : {tokenizer.vocab_size:,}")
    print(f" model_max_length : {tokenizer.model_max_length}")
    print(f" padding_side : {tokenizer.padding_side}")
    print(f" eos_token : {tokenizer.eos_token!r} (ID: {eot_id})")
    print(f" bos_token : {tokenizer.bos_token!r}")
    print(f" pad_token : {tokenizer.pad_token!r} (ID: {tokenizer.pad_token_id})")
    print(f" unk_token : {tokenizer.unk_token!r}")
    print()

    # ---- 2. Basic encode/decode ----------------------------------
    text = "The mitochondria is the powerhouse of the cell."
    encoded = tokenizer(text)
    decoded = tokenizer.decode(encoded["input_ids"])
    print("Basic encode/decode:")
    print(f" input : {repr(text)}")
    print(f" input_ids: {encoded['input_ids']}")
    print(f" decoded : {repr(decoded)}")
    print()

    # ---- 3. Padding ----------------------------------------------
    # Batch of two sequences with different lengths
    # shorter one should be right-padded to match the longer
    batch = [
        "Short sentence.",
        "This is a much longer sentence that has more tokens in it.",
    ]
    encoded_batch = tokenizer(
        batch,
        padding=True,          # pad to longest in batch
        return_tensors="pt",   # return PyTorch tensors
    )
    print("Batch padding (right padding):")
    print(f" input_ids shape : {encoded_batch['input_ids'].shape}")
    print(f" attention_mask shape : {encoded_batch['attention_mask'].shape}")
    print(f" input_ids[0] : {encoded_batch['input_ids'][0].tolist()}")
    print(f" input_ids[1] : {encoded_batch['input_ids'][1].tolist()}")
    print(f" attention_mask[0] : {encoded_batch['attention_mask'][0].tolist()}")
    print()

    # ---- 4. Truncation -------------------------------------------
    # Sequence longer than model_max_length should be truncated
    long_text = "word " * 2000  # 2000 words >> 1024 tokens
    encoded_long = tokenizer(
        long_text,
        truncation=True,
        max_length=MODEL_MAX_LENGTH,
    )
    print("Truncation:")
    print(f" input length : {len(long_text.split())} words")
    print(f" token count : {len(encoded_long['input_ids'])} (max: {MODEL_MAX_LENGTH})")
    print(f" truncated : {len(encoded_long['input_ids']) <= MODEL_MAX_LENGTH}")
    print()

    # ---- 5. Load from disk and verify ----------------------------
    # Reload from the folder the caller names rather than always from the
    # module default, so a tokenizer saved elsewhere is checked against
    # its own files.
    print("Loading from disk:")
    reloaded = PreTrainedTokenizerFast.from_pretrained(save_dir)
    reloaded_ids = reloaded(text)["input_ids"]
    original_ids = encoded["input_ids"]
    match = reloaded_ids == original_ids
    print(" from_pretrained() : OK")
    print(f" IDs match original: {match}")
# ------------------------------------------------------------------ #
# ENTRY POINT
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    # Build and save the wrapped tokenizer, then print its verification report.
    tokenizer = wrap_tokenizer()
    verify_wrapped_tokenizer(tokenizer)

    # Section banner: blank line, rule, title, rule.
    print("\n" + "=" * 60, " USAGE EXAMPLES", "=" * 60, sep="\n")

    # Copy-pasteable cheat sheet for using the saved tokenizer elsewhere.
    print("""
# Load anywhere with one line
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast.from_pretrained("fineweb_edu_tokenizer")
# Single encode
ids = tokenizer("Hello world")["input_ids"]
# Batch encode with padding and tensors
batch = tokenizer(
["sentence one", "sentence two"],
padding=True,
truncation=True,
max_length=1024,
return_tensors="pt",
)
# Decode
text = tokenizer.decode(ids, skip_special_tokens=True)
# Get eos token id (use as document separator when packing)
eot_id = tokenizer.eos_token_id
""")
|