|
|
|
|
|
|
|
|
|
|
|
|
| from pathlib import Path
|
| from tokenizers import Tokenizer
|
|
|
|
|
# Default location of the serialized tokenizer file. The path is relative,
# so it is resolved against the current working directory when used.
TOKENIZER_PATH = Path("tokenizer/vocab/tokenizer.json")

# Special-token strings. PyCraftTokenizer looks each of these up in the
# loaded vocabulary and exposes the resulting integer ID as an attribute.
TOK_EOT = "<|endoftext|>"
TOK_PAD = "<|pad|>"

# The three `fim_*` markers (prefix / suffix / middle).
TOK_PREFIX = "<|fim_prefix|>"
TOK_SUFFIX = "<|fim_suffix|>"
TOK_MIDDLE = "<|fim_middle|>"
|
|
|
|
|
class PyCraftTokenizer:
    """Thin wrapper around a HuggingFace ``tokenizers.Tokenizer``.

    Loads a serialized tokenizer from disk (described by the original author
    as a BPE tokenizer) and exposes ``encode``/``decode`` together with the
    integer IDs of the special tokens used by the data pipeline.

    Attributes:
        eot_id: Vocabulary ID of ``TOK_EOT``.
        prefix_id: Vocabulary ID of ``TOK_PREFIX``.
        suffix_id: Vocabulary ID of ``TOK_SUFFIX``.
        middle_id: Vocabulary ID of ``TOK_MIDDLE``.
        pad_id: Vocabulary ID of ``TOK_PAD``.
    """

    def __init__(self, path: str | Path = TOKENIZER_PATH):
        """Load the tokenizer file and resolve every special-token ID.

        Args:
            path: Location of the serialized tokenizer file.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            AssertionError: If any special token is missing from the vocab.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(
                f"Tokenizer not found at {path}.\n"
                "Run: python -m tokenizer.train_tokenizer"
            )
        self._tok = Tokenizer.from_file(str(path))

        # Resolve the special tokens once, up front, so a tokenizer file that
        # lacks any of them fails at construction time rather than later.
        self.eot_id: int = self._id(TOK_EOT)
        self.prefix_id: int = self._id(TOK_PREFIX)
        self.suffix_id: int = self._id(TOK_SUFFIX)
        self.middle_id: int = self._id(TOK_MIDDLE)
        self.pad_id: int = self._id(TOK_PAD)

        # Switch off any truncation/padding settings on the loaded tokenizer
        # so that encode() yields the complete, unpadded ID sequence.
        self._tok.no_truncation()
        self._tok.no_padding()

    def _id(self, token: str) -> int:
        """Return the vocabulary ID of ``token``.

        Raises:
            AssertionError: If the tokenizer has no entry for ``token``.
        """
        tok_id = self._tok.token_to_id(token)
        if tok_id is None:
            # Raised explicitly instead of via an ``assert`` statement:
            # asserts are removed under ``python -O``, which would let a
            # missing token slip through as ``None``. The exception type and
            # message are kept so existing callers observe no difference.
            raise AssertionError(f"Special token {token!r} not in vocab!")
        return tok_id

    @property
    def vocab_size(self) -> int:
        """Vocabulary size as reported by the underlying tokenizer."""
        return self._tok.get_vocab_size()

    def encode(self, text: str) -> list[int]:
        """Encode a string to a list of token IDs."""
        return self._tok.encode(text).ids

    def decode(self, ids: list[int], skip_special_tokens: bool = True) -> str:
        """Decode a list of token IDs back to a string.

        Args:
            ids: Token IDs to decode.
            skip_special_tokens: Forwarded unchanged to the underlying
                tokenizer's ``decode``.
        """
        return self._tok.decode(ids, skip_special_tokens=skip_special_tokens)

    def __repr__(self) -> str:
        """Summarize the vocab size and the EOT / FIM special-token IDs."""
        return (
            f"PyCraftTokenizer(vocab_size={self.vocab_size}, "
            f"eot={self.eot_id}, prefix={self.prefix_id}, "
            f"suffix={self.suffix_id}, middle={self.middle_id})"
        )
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: load the tokenizer from its default path, then
    # encode and decode a few short Python snippets and print the results.
    print("Loading tokenizer...")
    tokenizer = PyCraftTokenizer()
    print(tokenizer)
    print()

    snippets = (
        "def hello_world():\n print('Hello, world!')\n",
        "import numpy as np\nx = np.zeros((3, 3))\n",
        "# PyCraft-1 tokenizer test\nfor i in range(10):\n pass\n",
    )

    for snippet in snippets:
        token_ids = tokenizer.encode(snippet)
        round_trip = tokenizer.decode(token_ids)
        # Only the first 50 characters are shown to keep the output compact.
        print(f" Original : {snippet[:50]!r}")
        print(f" Tokens : {len(token_ids)}")
        print(f" Decoded : {round_trip[:50]!r}")
        print()
|
|
|