# File size: 3,202 Bytes | e7e69ed
# tokenizer/tokenizer_utils.py
#
# Loads the trained BPE tokenizer and provides a clean interface
# for encoding, decoding, and accessing special token IDs.
# Used by the data pipeline and training loop.
from pathlib import Path
from tokenizers import Tokenizer
# Default tokenizer file used by PyCraftTokenizer when no path is given.
# This is a relative path, so it resolves against the current working directory.
TOKENIZER_PATH = Path("tokenizer/vocab/tokenizer.json")
# Special token string constants — single source of truth
# PyCraftTokenizer.__init__ looks up the integer ID of each of these and
# expects every one of them to be present in the loaded vocabulary.
TOK_EOT = "<|endoftext|>"
# NOTE(review): prefix/suffix/middle presumably mark fill-in-the-middle (FIM)
# segments — confirm against the data pipeline.
TOK_PREFIX = "<|fim_prefix|>"
TOK_SUFFIX = "<|fim_suffix|>"
TOK_MIDDLE = "<|fim_middle|>"
TOK_PAD = "<|pad|>"
class PyCraftTokenizer:
    """
    Thin wrapper around the HuggingFace tokenizers BPE tokenizer.

    Loads a serialized tokenizer from disk, exposes encode/decode, and caches
    the integer IDs of the special tokens used by the data pipeline.

    Attributes:
        eot_id: Vocabulary ID of ``TOK_EOT``.
        prefix_id: Vocabulary ID of ``TOK_PREFIX``.
        suffix_id: Vocabulary ID of ``TOK_SUFFIX``.
        middle_id: Vocabulary ID of ``TOK_MIDDLE``.
        pad_id: Vocabulary ID of ``TOK_PAD``.
    """

    def __init__(self, path: str | Path = TOKENIZER_PATH):
        """Load the tokenizer file and cache the special token IDs.

        Args:
            path: Location of the serialized tokenizer file. Defaults to
                ``TOKENIZER_PATH``.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            ValueError: If any special token is missing from the vocabulary.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(
                f"Tokenizer not found at {path}.\n"
                f"Run: python -m tokenizer.train_tokenizer"
            )
        self._tok = Tokenizer.from_file(str(path))

        # Cache special token IDs
        self.eot_id = self._id(TOK_EOT)
        self.prefix_id = self._id(TOK_PREFIX)
        self.suffix_id = self._id(TOK_SUFFIX)
        self.middle_id = self._id(TOK_MIDDLE)
        self.pad_id = self._id(TOK_PAD)

        # Disable truncation/padding at tokenizer level
        # (handled manually in data pipeline)
        self._tok.no_truncation()
        self._tok.no_padding()

    def _id(self, token: str) -> int:
        """Return the vocabulary ID of ``token``.

        Args:
            token: Special token string to look up.

        Returns:
            The integer ID assigned to ``token`` by the loaded tokenizer.

        Raises:
            ValueError: If ``token`` is not in the vocabulary.
        """
        tok_id = self._tok.token_to_id(token)
        # Raise explicitly rather than ``assert``: assertions are removed under
        # ``python -O``, which would let a missing token be cached as ``None``.
        if tok_id is None:
            raise ValueError(f"Special token {token!r} not in vocab!")
        return tok_id

    @property
    def vocab_size(self) -> int:
        """Vocabulary size as reported by the underlying tokenizer."""
        return self._tok.get_vocab_size()

    def encode(self, text: str) -> list[int]:
        """Encode a string to a list of token IDs."""
        return self._tok.encode(text).ids

    def decode(self, ids: list[int], skip_special_tokens: bool = True) -> str:
        """Decode a list of token IDs back to a string.

        Args:
            ids: Token IDs to decode.
            skip_special_tokens: Forwarded unchanged to the underlying
                tokenizer's ``decode`` call.
        """
        return self._tok.decode(ids, skip_special_tokens=skip_special_tokens)

    def __repr__(self) -> str:
        """Return a summary with the vocab size and eot/prefix/suffix/middle IDs."""
        return (
            f"PyCraftTokenizer(vocab_size={self.vocab_size}, "
            f"eot={self.eot_id}, prefix={self.prefix_id}, "
            f"suffix={self.suffix_id}, middle={self.middle_id})"
        )
# ------------------------------------------------------------------ #
# Quick self-test (only runs after tokenizer is trained)
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    # Smoke test: load the tokenizer from its default path, then encode and
    # decode a few Python snippets, printing a short summary of each.
    print("Loading tokenizer...")
    tokenizer = PyCraftTokenizer()
    print(tokenizer)
    print()

    snippets = (
        "def hello_world():\n print('Hello, world!')\n",
        "import numpy as np\nx = np.zeros((3, 3))\n",
        "# PyCraft-1 tokenizer test\nfor i in range(10):\n pass\n",
    )
    for snippet in snippets:
        token_ids = tokenizer.encode(snippet)
        round_trip = tokenizer.decode(token_ids)
        # Only the first 50 characters are shown to keep the output compact.
        print(f" Original : {snippet[:50]!r}")
        print(f" Tokens : {len(token_ids)}")
        print(f" Decoded : {round_trip[:50]!r}")
        print()
|