pycraft-1 / tokenizer_src /tokenizer_utils.py
imshadow0's picture
Upload tokenizer_src/tokenizer_utils.py with huggingface_hub
e7e69ed verified
Raw
History Blame Contribute Delete
3.2 kB
# tokenizer/tokenizer_utils.py
#
# Loads the trained BPE tokenizer and provides a clean interface
# for encoding, decoding, and accessing special token IDs.
# Used by the data pipeline and training loop.
from pathlib import Path
from tokenizers import Tokenizer
# Default location of the trained tokenizer file. PyCraftTokenizer loads it
# when no explicit path is passed. The path is relative, so it is resolved
# against the current working directory of the running process.
TOKENIZER_PATH = Path("tokenizer/vocab/tokenizer.json")
# Special token string constants — single source of truth.
# PyCraftTokenizer looks each of these up in the loaded vocab at construction
# time and caches the resulting integer IDs.
# NOTE(review): the "fim_*" names presumably stand for fill-in-the-middle
# prefix/suffix/middle markers — confirm against the data pipeline.
TOK_EOT = "<|endoftext|>"
TOK_PREFIX = "<|fim_prefix|>"
TOK_SUFFIX = "<|fim_suffix|>"
TOK_MIDDLE = "<|fim_middle|>"
TOK_PAD = "<|pad|>"
class PyCraftTokenizer:
    """
    Thin wrapper around the HuggingFace tokenizers BPE tokenizer.

    Exposes encode/decode and the special token IDs used by the data pipeline.

    Attributes:
        eot_id: Vocab ID of ``TOK_EOT``.
        prefix_id: Vocab ID of ``TOK_PREFIX``.
        suffix_id: Vocab ID of ``TOK_SUFFIX``.
        middle_id: Vocab ID of ``TOK_MIDDLE``.
        pad_id: Vocab ID of ``TOK_PAD``.
    """

    def __init__(self, path: str | Path = TOKENIZER_PATH):
        """Load the tokenizer file and cache the special token IDs.

        Args:
            path: Location of the serialized tokenizer file.

        Raises:
            FileNotFoundError: If ``path`` does not point to an existing file.
            ValueError: If any special token is missing from the vocab.
        """
        path = Path(path)
        # is_file() rather than exists(): a directory at this location is not
        # a usable tokenizer and should produce the same actionable message.
        if not path.is_file():
            raise FileNotFoundError(
                f"Tokenizer not found at {path}.\n"
                f"Run: python -m tokenizer.train_tokenizer"
            )
        self._tok = Tokenizer.from_file(str(path))

        # Cache special token IDs
        self.eot_id = self._id(TOK_EOT)
        self.prefix_id = self._id(TOK_PREFIX)
        self.suffix_id = self._id(TOK_SUFFIX)
        self.middle_id = self._id(TOK_MIDDLE)
        self.pad_id = self._id(TOK_PAD)

        # Disable truncation/padding at tokenizer level
        # (handled manually in data pipeline)
        self._tok.no_truncation()
        self._tok.no_padding()

    def _id(self, token: str) -> int:
        """Return the vocab ID of ``token``.

        Raises:
            ValueError: If ``token`` is not present in the vocab.
        """
        tok_id = self._tok.token_to_id(token)
        # Explicit raise instead of `assert`: assertions are stripped under
        # `python -O`, which would let a None ID be cached silently.
        if tok_id is None:
            raise ValueError(f"Special token {token!r} not in vocab!")
        return tok_id

    @property
    def vocab_size(self) -> int:
        """Size of the vocabulary as reported by the underlying tokenizer."""
        return self._tok.get_vocab_size()

    def encode(self, text: str) -> list[int]:
        """Encode a string to a list of token IDs."""
        return self._tok.encode(text).ids

    def decode(self, ids: list[int], skip_special_tokens: bool = True) -> str:
        """Decode a list of token IDs back to a string.

        Args:
            ids: Token IDs to decode.
            skip_special_tokens: Forwarded unchanged to the underlying
                tokenizer's ``decode``.
        """
        return self._tok.decode(ids, skip_special_tokens=skip_special_tokens)

    def __repr__(self) -> str:
        """Return a debug string with the vocab size and all cached special IDs."""
        return (
            f"PyCraftTokenizer(vocab_size={self.vocab_size}, "
            f"eot={self.eot_id}, prefix={self.prefix_id}, "
            f"suffix={self.suffix_id}, middle={self.middle_id}, "
            f"pad={self.pad_id})"
        )
# ------------------------------------------------------------------ #
# Quick self-test (only runs after tokenizer is trained)
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    # Smoke run: load the trained tokenizer, show its summary, then push a
    # few code snippets through encode -> decode and print the results.
    print("Loading tokenizer...")
    tokenizer = PyCraftTokenizer()
    # Summary line followed by one blank line.
    print(tokenizer, end="\n\n")

    snippets = (
        "def hello_world():\n print('Hello, world!')\n",
        "import numpy as np\nx = np.zeros((3, 3))\n",
        "# PyCraft-1 tokenizer test\nfor i in range(10):\n pass\n",
    )
    for snippet in snippets:
        token_ids = tokenizer.encode(snippet)
        round_trip = tokenizer.decode(token_ids)
        # Only the first 50 characters are shown to keep the report compact;
        # the trailing empty string yields the blank separator line.
        print(
            f" Original : {snippet[:50]!r}",
            f" Tokens : {len(token_ids)}",
            f" Decoded : {round_trip[:50]!r}",
            "",
            sep="\n",
        )