File size: 3,202 Bytes

e7e69ed

# tokenizer/tokenizer_utils.py
#
# Loads the trained BPE tokenizer and provides a clean interface
# for encoding, decoding, and accessing special token IDs.
# Used by the data pipeline and training loop.

from pathlib import Path
from tokenizers import Tokenizer


# Relative location of the serialized tokenizer file.
TOKENIZER_PATH = Path("tokenizer") / "vocab" / "tokenizer.json"

# Literal text of every special token. Other code should reference these
# names instead of repeating the strings, so each is defined exactly once.
TOK_EOT = "<|endoftext|>"      # end-of-text marker
TOK_PREFIX = "<|fim_prefix|>"  # FIM prefix marker
TOK_SUFFIX = "<|fim_suffix|>"  # FIM suffix marker
TOK_MIDDLE = "<|fim_middle|>"  # FIM middle marker
TOK_PAD = "<|pad|>"            # padding marker


class PyCraftTokenizer:
    """Thin wrapper around the HuggingFace ``tokenizers`` BPE tokenizer.

    Loads a serialized tokenizer from disk, caches the IDs of the special
    tokens, and exposes ``encode`` / ``decode`` helpers.

    Attributes:
        eot_id: Vocabulary ID of ``TOK_EOT``.
        prefix_id: Vocabulary ID of ``TOK_PREFIX``.
        suffix_id: Vocabulary ID of ``TOK_SUFFIX``.
        middle_id: Vocabulary ID of ``TOK_MIDDLE``.
        pad_id: Vocabulary ID of ``TOK_PAD``.
    """

    def __init__(self, path: str | Path = TOKENIZER_PATH):
        """Load the tokenizer file and cache the special token IDs.

        Args:
            path: Location of the serialized tokenizer file.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            AssertionError: If any special token is missing from the vocab.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(
                f"Tokenizer not found at {path}.\n"
                f"Run: python -m tokenizer.train_tokenizer"
            )
        self._tok = Tokenizer.from_file(str(path))

        # Cache special token IDs so callers never look them up by string.
        self.eot_id = self._id(TOK_EOT)
        self.prefix_id = self._id(TOK_PREFIX)
        self.suffix_id = self._id(TOK_SUFFIX)
        self.middle_id = self._id(TOK_MIDDLE)
        self.pad_id = self._id(TOK_PAD)

        # Disable truncation/padding at tokenizer level
        # (handled manually in data pipeline)
        self._tok.no_truncation()
        self._tok.no_padding()

    def _id(self, token: str) -> int:
        """Return the vocabulary ID of ``token``.

        Raises:
            AssertionError: If ``token`` is not present in the vocabulary.
        """
        tok_id = self._tok.token_to_id(token)
        if tok_id is None:
            # Raised explicitly instead of via ``assert`` so the check is not
            # stripped under ``python -O``; the exception type and message
            # are unchanged, so existing callers see the same error.
            raise AssertionError(f"Special token {token!r} not in vocab!")
        return tok_id

    @property
    def vocab_size(self) -> int:
        """Vocabulary size as reported by the underlying tokenizer."""
        return self._tok.get_vocab_size()

    def encode(self, text: str) -> list[int]:
        """Encode a string to a list of token IDs."""
        return self._tok.encode(text).ids

    def decode(self, ids: list[int], skip_special_tokens: bool = True) -> str:
        """Decode a list of token IDs back to a string.

        Args:
            ids: Token IDs to decode.
            skip_special_tokens: Forwarded unchanged to the underlying
                tokenizer's ``decode``.
        """
        return self._tok.decode(ids, skip_special_tokens=skip_special_tokens)

    def __repr__(self) -> str:
        """Return a summary with the vocab size and the EOT/FIM token IDs."""
        return (
            f"PyCraftTokenizer(vocab_size={self.vocab_size}, "
            f"eot={self.eot_id}, prefix={self.prefix_id}, "
            f"suffix={self.suffix_id}, middle={self.middle_id})"
        )


# ------------------------------------------------------------------ #
# Quick self-test (only runs after tokenizer is trained)
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    # Smoke test: load the tokenizer from its default path, print its
    # summary, then show an encode/decode round trip for a few snippets.
    print("Loading tokenizer...")
    tokenizer = PyCraftTokenizer()
    print(tokenizer, end="\n\n")

    snippets = (
        "def hello_world():\n    print('Hello, world!')\n",
        "import numpy as np\nx = np.zeros((3, 3))\n",
        "# PyCraft-1 tokenizer test\nfor i in range(10):\n    pass\n",
    )

    for snippet in snippets:
        token_ids = tokenizer.encode(snippet)
        round_trip = tokenizer.decode(token_ids)
        # Previews are capped at 50 characters; the trailing empty string
        # produces the blank separator line between reports.
        print(
            f"  Original : {snippet[:50]!r}",
            f"  Tokens   : {len(token_ids)}",
            f"  Decoded  : {round_trip[:50]!r}",
            "",
            sep="\n",
        )