pycraft-1 / tokenizer_src /tokenizer_utils.py

Upload tokenizer_src/tokenizer_utils.py with huggingface_hub

e7e69ed verified 7 days ago

3.2 kB

	# tokenizer/tokenizer_utils.py
	#
	# Loads the trained BPE tokenizer and provides a clean interface
	# for encoding, decoding, and accessing special token IDs.
	# Used by the data pipeline and training loop.

	from pathlib import Path
	from tokenizers import Tokenizer


	# Default location of the trained tokenizer file. The path is relative, so it
	# is resolved against the current working directory when it is checked/opened.
	TOKENIZER_PATH = Path("tokenizer/vocab/tokenizer.json")

	# Special token string constants — single source of truth.
	# These are looked up verbatim in the vocabulary (via token_to_id), so they
	# must be plain "<|...|>" strings: a backslash before the pipe is an invalid
	# escape sequence that Python keeps as a literal backslash, which would make
	# the lookup fail.
	TOK_EOT = "<|endoftext|>"
	TOK_PREFIX = "<|fim_prefix|>"
	TOK_SUFFIX = "<|fim_suffix|>"
	TOK_MIDDLE = "<|fim_middle|>"
	TOK_PAD = "<|pad|>"


	class PyCraftTokenizer:
	    """
	    Thin wrapper around the HuggingFace tokenizers BPE tokenizer.

	    Exposes encode/decode and the special token IDs used by the data pipeline.

	    Attributes:
	        eot_id: Vocabulary ID of ``TOK_EOT``.
	        prefix_id: Vocabulary ID of ``TOK_PREFIX``.
	        suffix_id: Vocabulary ID of ``TOK_SUFFIX``.
	        middle_id: Vocabulary ID of ``TOK_MIDDLE``.
	        pad_id: Vocabulary ID of ``TOK_PAD``.
	    """

	    def __init__(self, path: str | Path = TOKENIZER_PATH):
	        """Load the tokenizer file at *path* and cache the special token IDs.

	        Args:
	            path: Location of the serialized tokenizer JSON file.

	        Raises:
	            FileNotFoundError: If *path* does not exist.
	            AssertionError: If any special token is missing from the vocab.
	        """
	        path = Path(path)
	        if not path.exists():
	            raise FileNotFoundError(
	                f"Tokenizer not found at {path}.\n"
	                f"Run: python -m tokenizer.train_tokenizer"
	            )
	        self._tok = Tokenizer.from_file(str(path))

	        # Cache special token IDs
	        self.eot_id = self._id(TOK_EOT)
	        self.prefix_id = self._id(TOK_PREFIX)
	        self.suffix_id = self._id(TOK_SUFFIX)
	        self.middle_id = self._id(TOK_MIDDLE)
	        self.pad_id = self._id(TOK_PAD)

	        # Disable truncation/padding at tokenizer level
	        # (handled manually in data pipeline)
	        self._tok.no_truncation()
	        self._tok.no_padding()

	    def _id(self, token: str) -> int:
	        """Return the vocabulary ID of *token*.

	        Raises:
	            AssertionError: If *token* is not in the vocabulary.
	        """
	        tok_id = self._tok.token_to_id(token)
	        if tok_id is None:
	            # Raised explicitly instead of via `assert` so the check still
	            # runs under `python -O`; the exception type is kept as
	            # AssertionError for backward compatibility.
	            raise AssertionError(f"Special token {token!r} not in vocab!")
	        return tok_id

	    @property
	    def vocab_size(self) -> int:
	        """Vocabulary size as reported by the underlying tokenizer."""
	        return self._tok.get_vocab_size()

	    def encode(self, text: str) -> list[int]:
	        """Encode a string to a list of token IDs."""
	        return self._tok.encode(text).ids

	    def decode(self, ids: list[int], skip_special_tokens: bool = True) -> str:
	        """Decode a list of token IDs back to a string.

	        Args:
	            ids: Token IDs to decode.
	            skip_special_tokens: Forwarded unchanged to the underlying
	                tokenizer's ``decode``.
	        """
	        return self._tok.decode(ids, skip_special_tokens=skip_special_tokens)

	    def __repr__(self) -> str:
	        """Return a debug summary with the vocab size and special token IDs."""
	        return (
	            f"PyCraftTokenizer(vocab_size={self.vocab_size}, "
	            f"eot={self.eot_id}, prefix={self.prefix_id}, "
	            f"suffix={self.suffix_id}, middle={self.middle_id}, "
	            f"pad={self.pad_id})"
	        )


	# ------------------------------------------------------------------ #
	# Quick self-test (only runs after tokenizer is trained)
	# ------------------------------------------------------------------ #
	if __name__ == "__main__":
	    # Smoke test: load the tokenizer from its default path, print its
	    # summary, then encode and decode a few Python snippets and print the
	    # first 50 characters of each side for visual comparison.
	    print("Loading tokenizer...")
	    tokenizer = PyCraftTokenizer()
	    print(tokenizer)
	    print()

	    snippets = (
	        "def hello_world():\n print('Hello, world!')\n",
	        "import numpy as np\nx = np.zeros((3, 3))\n",
	        "# PyCraft-1 tokenizer test\nfor i in range(10):\n pass\n",
	    )

	    for snippet in snippets:
	        token_ids = tokenizer.encode(snippet)
	        round_trip = tokenizer.decode(token_ids)
	        print(f" Original : {snippet[:50]!r}")
	        print(f" Tokens : {len(token_ids)}")
	        print(f" Decoded : {round_trip[:50]!r}")
	        print()