sllm / tokenizer /wrap_tokenizer.py

Initial commit

7f974df verified 4 days ago

8.13 kB

	from tokenizers import Tokenizer
	from transformers import PreTrainedTokenizerFast
	import json
	import os

	# ------------------------------------------------------------------ #
	# CONSTANTS
	# ------------------------------------------------------------------ #

	# All paths are resolved relative to this script's own directory so the
	# script behaves the same regardless of the current working directory.
	# (`os` comes from the import block at the top of the file.)
	SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
	TOKENIZER_PATH = os.path.join(SCRIPT_DIR, "fineweb_edu_tokenizer.json")  # trained tokenizer input
	SAVE_DIR = os.path.join(SCRIPT_DIR, "fineweb_edu_tokenizer")  # output folder
	MODEL_MAX_LENGTH = 1024  # context length
	PADDING_SIDE = "right"  # causal LM standard


	# ------------------------------------------------------------------ #
	# WRAP
	# ------------------------------------------------------------------ #

	def wrap_tokenizer(
	    tokenizer_path: str = TOKENIZER_PATH,
	    save_dir: str = SAVE_DIR,
	) -> PreTrainedTokenizerFast:
	    """
	    Wrap a trained HuggingFace Tokenizer as a PreTrainedTokenizerFast and save it.

	    This gives us:
	    - datasets.map() compatibility for bulk tokenization
	    - HuggingFace Trainer + DataCollator compatibility
	    - Automatic padding, truncation, attention masks
	    - from_pretrained() loading support
	    - return_tensors="pt" for PyTorch tensors

	    Args:
	        tokenizer_path: path to the trained tokenizer .json file
	        save_dir: folder to save the wrapped tokenizer into (created if missing)

	    Returns:
	        The wrapped PreTrainedTokenizerFast, configured with
	        MODEL_MAX_LENGTH and PADDING_SIDE.

	    Raises:
	        FileNotFoundError: if tokenizer_path does not point to an existing file.
	    """
	    # The token is spelled WITHOUT backslashes. Writing "<\|endoftext\|>" in a
	    # normal string literal keeps the backslashes (invalid escape sequence),
	    # which would register a different token than the one the comments below
	    # describe.
	    eot_token = "<|endoftext|>"

	    # Fail early with the offending path in the message instead of relying on
	    # the loader's own error for a missing file.
	    if not os.path.isfile(tokenizer_path):
	        raise FileNotFoundError(f"Trained tokenizer file not found: {tokenizer_path}")

	    print(f"Loading trained tokenizer from: {tokenizer_path}")
	    base_tokenizer = Tokenizer.from_file(tokenizer_path)

	    # If the token is absent from the trained vocabulary, registering it as a
	    # special token adds it as a new entry - surface that instead of letting
	    # the vocabulary grow silently.
	    if base_tokenizer.token_to_id(eot_token) is None:
	        print(
	            f"WARNING: {eot_token!r} is not in the trained vocabulary; "
	            "registering it will add a new token"
	        )

	    # ---- Wrap --------------------------------------------------------
	    # We map <|endoftext|> to all three roles:
	    #
	    #   eos_token - end of sequence marker, used during generation
	    #               to know when to stop
	    #
	    #   bos_token - beginning of sequence, GPT-2 style uses eos
	    #               for both since there is no separate BOS token
	    #
	    #   pad_token - safe to reuse eos here because we are packing
	    #               sequences and will never actually pad during
	    #               pretraining. Defined so HuggingFace doesn't
	    #               complain about missing pad token
	    #
	    #   unk_token - None because byte-level means no unknowns ever
	    special_tokens_map = {
	        "bos_token": eot_token,
	        "eos_token": eot_token,
	        "pad_token": eot_token,
	    }

	    tokenizer = PreTrainedTokenizerFast(
	        tokenizer_object=base_tokenizer,
	        # Special token mappings
	        unk_token=None,
	        # Context length
	        model_max_length=MODEL_MAX_LENGTH,
	        # Padding behavior
	        padding_side=PADDING_SIDE,
	        # Truncation side - truncate from the right
	        # (keep the beginning of the sequence, drop the end)
	        truncation_side="right",
	        **special_tokens_map,
	    )

	    # Register the same mapping explicitly as well (kept from the original flow).
	    tokenizer.add_special_tokens(dict(special_tokens_map))

	    os.makedirs(save_dir, exist_ok=True)

	    # Written BEFORE save_pretrained() on purpose: if save_pretrained() emits
	    # its own special_tokens_map.json it replaces this one; otherwise this
	    # hand-written copy is what remains on disk.
	    map_path = os.path.join(save_dir, "special_tokens_map.json")
	    with open(map_path, "w", encoding="utf-8") as f:
	        json.dump(special_tokens_map, f, indent=2)

	    print("special_tokens_map.json written manually")

	    # ---- Save --------------------------------------------------------
	    # Saves three files to save_dir/:
	    #   tokenizer.json          - the trained BPE tokenizer
	    #   tokenizer_config.json   - max length, pad token, special tokens
	    #   special_tokens_map.json - maps eos/bos/pad to actual tokens
	    tokenizer.save_pretrained(save_dir)
	    print(f"Tokenizer saved to: {save_dir}/")
	    print(" tokenizer.json")
	    print(" tokenizer_config.json")
	    print(" special_tokens_map.json")

	    return tokenizer


	# ------------------------------------------------------------------ #
	# VERIFICATION
	# ------------------------------------------------------------------ #

	def verify_wrapped_tokenizer(
	    tokenizer: PreTrainedTokenizerFast,
	    save_dir: str = SAVE_DIR,
	) -> None:
	    """
	    Print a verification report for a wrapped tokenizer.

	    Exercises, in order: config inspection, single encode/decode, batch
	    padding (PyTorch tensors), truncation to MODEL_MAX_LENGTH, and a reload
	    from disk whose token IDs are compared against the in-memory tokenizer.
	    Results are printed only; nothing is asserted or returned.

	    Args:
	        tokenizer: the wrapped tokenizer to exercise
	        save_dir: folder the tokenizer was saved to; used for the reload
	            check. Defaults to SAVE_DIR, pass the same folder that was
	            given to wrap_tokenizer() when a custom one was used.
	    """

	    print("\n" + "="*60)
	    print(" WRAPPED TOKENIZER VERIFICATION")
	    print("="*60 + "\n")

	    eot_id = tokenizer.eos_token_id

	    # ---- 1. Basic config -----------------------------------------
	    print("Config:")
	    print(f" vocab size : {tokenizer.vocab_size:,}")
	    print(f" model_max_length : {tokenizer.model_max_length}")
	    print(f" padding_side : {tokenizer.padding_side}")
	    print(f" eos_token : {tokenizer.eos_token!r} (ID: {eot_id})")
	    print(f" bos_token : {tokenizer.bos_token!r}")
	    print(f" pad_token : {tokenizer.pad_token!r} (ID: {tokenizer.pad_token_id})")
	    print(f" unk_token : {tokenizer.unk_token!r}")
	    print()

	    # ---- 2. Basic encode/decode ----------------------------------
	    text = "The mitochondria is the powerhouse of the cell."
	    encoded = tokenizer(text)
	    decoded = tokenizer.decode(encoded["input_ids"])

	    print("Basic encode/decode:")
	    print(f" input : {repr(text)}")
	    print(f" input_ids: {encoded['input_ids']}")
	    print(f" decoded : {repr(decoded)}")
	    print()

	    # ---- 3. Padding ----------------------------------------------
	    # Batch of two sequences with different lengths
	    # shorter one should be right-padded to match the longer
	    batch = [
	        "Short sentence.",
	        "This is a much longer sentence that has more tokens in it.",
	    ]

	    encoded_batch = tokenizer(
	        batch,
	        padding=True,         # pad to longest in batch
	        return_tensors="pt",  # return PyTorch tensors
	    )

	    print("Batch padding (right padding):")
	    print(f" input_ids shape : {encoded_batch['input_ids'].shape}")
	    print(f" attention_mask shape : {encoded_batch['attention_mask'].shape}")
	    print(f" input_ids[0] : {encoded_batch['input_ids'][0].tolist()}")
	    print(f" input_ids[1] : {encoded_batch['input_ids'][1].tolist()}")
	    print(f" attention_mask[0] : {encoded_batch['attention_mask'][0].tolist()}")
	    print()

	    # ---- 4. Truncation -------------------------------------------
	    # Sequence longer than model_max_length should be truncated
	    long_text = "word " * 2000  # 2000 words >> 1024 tokens
	    encoded_long = tokenizer(
	        long_text,
	        truncation=True,
	        max_length=MODEL_MAX_LENGTH,
	    )
	    long_token_count = len(encoded_long["input_ids"])

	    print("Truncation:")
	    print(f" input length : {len(long_text.split())} words")
	    print(f" token count : {long_token_count} (max: {MODEL_MAX_LENGTH})")
	    print(f" truncated : {long_token_count <= MODEL_MAX_LENGTH}")
	    print()

	    # ---- 5. Load from disk and verify ----------------------------
	    # Reload from the folder the caller saved to (not unconditionally the
	    # module default) so a custom save_dir is checked against its own files.
	    print("Loading from disk:")
	    reloaded = PreTrainedTokenizerFast.from_pretrained(save_dir)
	    reloaded_ids = reloaded(text)["input_ids"]
	    original_ids = encoded["input_ids"]
	    match = reloaded_ids == original_ids

	    # Reaching this line means from_pretrained() did not raise.
	    print(" from_pretrained() : OK")
	    print(f" IDs match original: {match}")


	# ------------------------------------------------------------------ #
	# ENTRY POINT
	# ------------------------------------------------------------------ #

	if __name__ == "__main__":
	    # Build + save the wrapped tokenizer with the module defaults, then run
	    # the printed verification report against it.
	    wrapped = wrap_tokenizer()
	    verify_wrapped_tokenizer(wrapped)

	    # Copy-pasteable usage snippet shown at the end of a script run.
	    usage_text = """
	# Load anywhere with one line
	from transformers import PreTrainedTokenizerFast
	tokenizer = PreTrainedTokenizerFast.from_pretrained("fineweb_edu_tokenizer")

	# Single encode
	ids = tokenizer("Hello world")["input_ids"]

	# Batch encode with padding and tensors
	batch = tokenizer(
	["sentence one", "sentence two"],
	padding=True,
	truncation=True,
	max_length=1024,
	return_tensors="pt",
	)

	# Decode
	text = tokenizer.decode(ids, skip_special_tokens=True)

	# Get eos token id (use as document separator when packing)
	eot_id = tokenizer.eos_token_id
	"""

	    rule = "=" * 60
	    print("\n" + rule)
	    print(" USAGE EXAMPLES")
	    print(rule)
	    print(usage_text)