| from tokenizers import Tokenizer |
| from transformers import PreTrainedTokenizerFast |
| import json |
| import os |
|
|
| |
| |
| |
|
|
| import os |
# Directory containing this script. Every path below is anchored here so the
# script resolves the same files regardless of the current working directory.
SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]

# Input: the trained tokenizer .json file that wrap_tokenizer() loads.
TOKENIZER_PATH = os.path.join(SCRIPT_DIR, "fineweb_edu_tokenizer.json")

# Output: folder the wrapped tokenizer is saved to and later reloaded from.
SAVE_DIR = os.path.join(SCRIPT_DIR, "fineweb_edu_tokenizer")

# Passed as model_max_length to the wrapper and as max_length in the
# truncation check.
MODEL_MAX_LENGTH = 1024

# Passed as padding_side to the wrapper.
PADDING_SIDE = "right"
|
|
|
|
| |
| |
| |
|
|
def wrap_tokenizer(
    tokenizer_path: str = TOKENIZER_PATH,
    save_dir: str = SAVE_DIR,
) -> PreTrainedTokenizerFast:
    """
    Load a trained tokenizer file and persist it as a PreTrainedTokenizerFast.

    The wrapper is what makes the tokenizer usable with the rest of the
    HuggingFace stack:
      - datasets.map() for bulk tokenization
      - Trainer / DataCollator
      - padding, truncation and attention masks
      - from_pretrained() loading
      - return_tensors="pt" for PyTorch tensors

    Args:
        tokenizer_path : path to the trained tokenizer .json file
        save_dir       : folder the wrapped tokenizer is written to
                         (created if it does not exist)

    Returns:
        The wrapped PreTrainedTokenizerFast, already saved to ``save_dir``.
    """
    print(f"Loading trained tokenizer from: {tokenizer_path}")
    backend = Tokenizer.from_file(tokenizer_path)

    # One token fills the EOS, BOS and PAD roles; no UNK token is configured.
    eot = "<|endoftext|>"

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=backend,
        eos_token=eot,
        bos_token=eot,
        pad_token=eot,
        unk_token=None,
        model_max_length=MODEL_MAX_LENGTH,
        padding_side=PADDING_SIDE,
        truncation_side="right",
    )

    # Register the same roles again explicitly on the wrapped tokenizer.
    tokenizer.add_special_tokens(
        dict.fromkeys(("eos_token", "bos_token", "pad_token"), eot)
    )

    os.makedirs(save_dir, exist_ok=True)

    # Hand-written special_tokens_map.json containing plain strings only.
    # NOTE(review): this is written *before* save_pretrained(), which may
    # write its own copy of the same file - confirm which one should win.
    special_tokens_map = dict.fromkeys(
        ("bos_token", "eos_token", "pad_token"), eot
    )
    map_path = os.path.join(save_dir, "special_tokens_map.json")
    with open(map_path, "w") as handle:
        json.dump(special_tokens_map, handle, indent=2)
    print("special_tokens_map.json written manually")

    tokenizer.save_pretrained(save_dir)
    print(f"Tokenizer saved to: {save_dir}/")
    for artifact in (
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
    ):
        print(f" {artifact}")

    return tokenizer
|
|
|
|
| |
| |
| |
|
|
def verify_wrapped_tokenizer(
    tokenizer: PreTrainedTokenizerFast,
    save_dir: str = SAVE_DIR,
) -> None:
    """
    Print a smoke-test report for a wrapped tokenizer.

    Exercises, in order: the configured special tokens, a single
    encode/decode round trip, batch encoding with padding, truncation of an
    over-long input, and reloading the saved tokenizer from disk. Results
    are printed only; nothing is asserted or returned.

    Args:
        tokenizer : the wrapped tokenizer to check
        save_dir  : folder the tokenizer was saved to; it is reloaded from
                    here for the round-trip check. Defaults to SAVE_DIR, so
                    existing one-argument calls behave as before. Pass the
                    same value that was given to wrap_tokenizer() when a
                    non-default folder was used.
    """
    print("\n" + "="*60)
    print(" WRAPPED TOKENIZER VERIFICATION")
    print("="*60 + "\n")

    eot_id = tokenizer.eos_token_id

    # --- Configuration as seen by the wrapper ---
    print("Config:")
    print(f" vocab size : {tokenizer.vocab_size:,}")
    print(f" model_max_length : {tokenizer.model_max_length}")
    print(f" padding_side : {tokenizer.padding_side}")
    print(f" eos_token : {tokenizer.eos_token!r} (ID: {eot_id})")
    print(f" bos_token : {tokenizer.bos_token!r}")
    print(f" pad_token : {tokenizer.pad_token!r} (ID: {tokenizer.pad_token_id})")
    print(f" unk_token : {tokenizer.unk_token!r}")
    print()

    # --- Single-text round trip ---
    text = "The mitochondria is the powerhouse of the cell."
    encoded = tokenizer(text)
    decoded = tokenizer.decode(encoded["input_ids"])

    print("Basic encode/decode:")
    print(f" input : {repr(text)}")
    print(f" input_ids: {encoded['input_ids']}")
    print(f" decoded : {repr(decoded)}")
    print()

    # --- Batch of unequal lengths, padded and returned as "pt" tensors ---
    batch = [
        "Short sentence.",
        "This is a much longer sentence that has more tokens in it.",
    ]

    encoded_batch = tokenizer(
        batch,
        padding=True,
        return_tensors="pt",
    )

    print("Batch padding (right padding):")
    print(f" input_ids shape : {encoded_batch['input_ids'].shape}")
    print(f" attention_mask shape : {encoded_batch['attention_mask'].shape}")
    print(f" input_ids[0] : {encoded_batch['input_ids'][0].tolist()}")
    print(f" input_ids[1] : {encoded_batch['input_ids'][1].tolist()}")
    print(f" attention_mask[0] : {encoded_batch['attention_mask'][0].tolist()}")
    print()

    # --- Truncation of an input of 2000 repeated words ---
    long_text = "word " * 2000
    encoded_long = tokenizer(
        long_text,
        truncation=True,
        max_length=MODEL_MAX_LENGTH,
    )
    token_count = len(encoded_long["input_ids"])

    print("Truncation:")
    print(f" input length : {len(long_text.split())} words")
    print(f" token count : {token_count} (max: {MODEL_MAX_LENGTH})")
    print(f" truncated : {token_count <= MODEL_MAX_LENGTH}")
    print()

    # --- Reload from disk and compare IDs with the in-memory tokenizer ---
    # Reload from the directory the caller actually saved to, rather than
    # unconditionally from the module-level default.
    print("Loading from disk:")
    reloaded = PreTrainedTokenizerFast.from_pretrained(save_dir)
    reloaded_ids = reloaded(text)["input_ids"]
    original_ids = encoded["input_ids"]
    match = reloaded_ids == original_ids

    print(" from_pretrained() : OK")
    print(f" IDs match original: {match}")
|
|
|
|
| |
| |
| |
|
|
if __name__ == "__main__":
    # Wrap and save the trained tokenizer, then print the verification report.
    tokenizer = wrap_tokenizer()
    verify_wrapped_tokenizer(tokenizer)

    # Banner, then a copy-paste cheat sheet for using the saved tokenizer.
    rule = "=" * 60
    print("", rule, " USAGE EXAMPLES", rule, sep="\n")

    usage_text = """
# Load anywhere with one line
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast.from_pretrained("fineweb_edu_tokenizer")

# Single encode
ids = tokenizer("Hello world")["input_ids"]

# Batch encode with padding and tensors
batch = tokenizer(
["sentence one", "sentence two"],
padding=True,
truncation=True,
max_length=1024,
return_tensors="pt",
)

# Decode
text = tokenizer.decode(ids, skip_special_tokens=True)

# Get eos token id (use as document separator when packing)
eot_id = tokenizer.eos_token_id
"""
    print(usage_text)
|
|