| """ |
| Component 2: Custom code tokenizer for Python and JavaScript. |
| |
| This tokenizer is code-aware: |
| - It preserves indentation structure using explicit tokens. |
| - It keeps newline boundaries using a newline token. |
| - It treats code operators and brackets as separate units. |
| - It supports prompt+code style training samples. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| from dataclasses import asdict, dataclass |
| from pathlib import Path |
| from typing import Dict, Iterable, List, Optional |
|
|
from tokenizers import Regex, Tokenizer
from tokenizers.decoders import BPEDecoder, Metaspace as MetaspaceDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC, Sequence as NormalizerSequence
from tokenizers.pre_tokenizers import Metaspace, Sequence as PreTokenizerSequence, Split
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer
|
|
|
|
@dataclass
class CodeTokenizerConfig:
    """Configuration for CodeTokenizer training and runtime behavior."""

    # Target BPE vocabulary size (includes the special tokens below).
    vocab_size: int = 50_000

    # Minimum pair frequency for a BPE merge to be learned during training.
    min_frequency: int = 2

    # Maximum sequence length downstream consumers should assume.
    model_max_length: int = 2048

    # Number of spaces one tab character expands to when structuring code.
    indent_width: int = 4

    # Special tokens registered with the trainer. None selects the defaults
    # assigned in __post_init__. Annotated Optional because None is a valid
    # (and the default) value; the previous `List[str] = None` annotation
    # contradicted itself.
    special_tokens: Optional[List[str]] = None

    def __post_init__(self) -> None:
        # Build the default list here, per instance, rather than as a
        # mutable class-level default that instances would share.
        if self.special_tokens is None:
            self.special_tokens = [
                "<PAD>",
                "<UNK>",
                "<BOS>",
                "<EOS>",
                "<NL>",
                "<INDENT>",
                "<DEDENT>",
                "<PROMPT>",
                "<CODE>",
                "<PYTHON>",
                "<JAVASCRIPT>",
            ]
|
|
|
|
class CodeTokenizer:
    """BPE tokenizer specialized for Python and JavaScript source code.

    Wraps a HuggingFace ``tokenizers`` BPE model with code-aware
    pre-tokenization (multi-character operators, punctuation, whitespace)
    and converts raw code into flat text carrying explicit ``<INDENT>`` /
    ``<DEDENT>`` / ``<NL>`` structure markers.
    """

    def __init__(self, config: Optional[CodeTokenizerConfig] = None) -> None:
        """Initializes the tokenizer shell; call train() or load() before use.

        Args:
            config: Tokenizer settings. Defaults to CodeTokenizerConfig().
        """
        self.config = config or CodeTokenizerConfig()
        # Populated by train() or load(); None until then.
        self.tokenizer: Optional[Tokenizer] = None
        # Maps each special token string to its vocabulary ID.
        self.special_token_ids: Dict[str, int] = {}

    def _build_base_tokenizer(self) -> Tokenizer:
        """
        Creates a BPE tokenizer with code-oriented pre-tokenization rules.
        """
        tokenizer = Tokenizer(BPE(unk_token="<UNK>"))
        tokenizer.normalizer = NormalizerSequence([NFKC()])

        # Multi-character operators are isolated before single-character
        # punctuation so that e.g. "==" is not split into two "=" pieces.
        multi_op = Regex(
            r"(==|!=|<=|>=|:=|->|=>|\+\+|--|\+=|-=|\*=|/=|//=|%=|\*\*|&&|\|\||<<|>>)"
        )

        punct = Regex(r"([()\[\]{}.,:;])")

        # FIX: the whitespace marker must be a character that cannot appear
        # in code. The previous replacement "_" collided with snake_case
        # identifiers, making tokenization ambiguous and decoding lossy.
        # U+2581 "▁" is the conventional Metaspace marker.
        tokenizer.pre_tokenizer = PreTokenizerSequence(
            [
                Split(multi_op, behavior="isolated"),
                Split(punct, behavior="isolated"),
                Metaspace(replacement="▁", prepend_scheme="always", split=True),
            ]
        )
        # FIX: the decoder must mirror the Metaspace pre-tokenizer.
        # BPEDecoder strips an end-of-word suffix this model never emits,
        # so it would leave replacement characters in decoded text.
        tokenizer.decoder = MetaspaceDecoder(
            replacement="▁", prepend_scheme="always", split=True
        )
        return tokenizer

    def train(self, text_iterator: Iterable[str]) -> None:
        """
        Trains the tokenizer from a stream of preformatted text samples.

        Args:
            text_iterator: Yields training strings (typically produced by
                format_training_sample).

        Raises:
            RuntimeError: If BOS/EOS special tokens were not registered.
        """
        tokenizer = self._build_base_tokenizer()
        trainer = BpeTrainer(
            vocab_size=self.config.vocab_size,
            min_frequency=self.config.min_frequency,
            special_tokens=self.config.special_tokens,
            show_progress=True,
        )
        tokenizer.train_from_iterator(text_iterator, trainer=trainer, length=None)

        # Every encoded sample is framed as <BOS> ... <EOS> by the
        # post-processor, so both IDs must exist in the trained vocab.
        bos_id = tokenizer.token_to_id("<BOS>")
        eos_id = tokenizer.token_to_id("<EOS>")
        if bos_id is None or eos_id is None:
            raise RuntimeError("Tokenizer training failed to register BOS/EOS tokens.")
        tokenizer.post_processor = TemplateProcessing(
            single="<BOS> $A <EOS>",
            special_tokens=[("<BOS>", bos_id), ("<EOS>", eos_id)],
        )

        self.tokenizer = tokenizer
        self.special_token_ids = {
            token: tokenizer.token_to_id(token) for token in self.config.special_tokens
        }

    def save(self, output_dir: str) -> None:
        """
        Saves tokenizer JSON and config so all other components can reuse it.

        Args:
            output_dir: Directory to write tokenizer.json and
                tokenizer_config.json into (created if missing).

        Raises:
            RuntimeError: If the tokenizer has not been trained or loaded.
        """
        if self.tokenizer is None:
            raise RuntimeError("Cannot save tokenizer before training or loading it.")
        out = Path(output_dir)
        out.mkdir(parents=True, exist_ok=True)
        self.tokenizer.save(str(out / "tokenizer.json"))
        with (out / "tokenizer_config.json").open("w", encoding="utf-8") as f:
            json.dump(asdict(self.config), f, indent=2)

    @classmethod
    def load(cls, tokenizer_dir: str) -> "CodeTokenizer":
        """
        Loads tokenizer from disk.

        Args:
            tokenizer_dir: Directory previously written by save().

        Returns:
            A ready-to-use CodeTokenizer.

        Raises:
            FileNotFoundError: If either expected file is missing.
        """
        base = Path(tokenizer_dir)
        cfg_path = base / "tokenizer_config.json"
        tok_path = base / "tokenizer.json"
        if not cfg_path.exists() or not tok_path.exists():
            raise FileNotFoundError(
                f"Missing tokenizer files in {tokenizer_dir}. "
                "Expected tokenizer.json and tokenizer_config.json."
            )
        with cfg_path.open("r", encoding="utf-8") as f:
            cfg_data = json.load(f)
        config = CodeTokenizerConfig(**cfg_data)
        obj = cls(config=config)
        obj.tokenizer = Tokenizer.from_file(str(tok_path))
        obj.special_token_ids = {
            token: obj.tokenizer.token_to_id(token) for token in obj.config.special_tokens
        }
        return obj

    def encode(self, text: str) -> List[int]:
        """
        Encodes one preformatted text sample to token IDs.

        Raises:
            RuntimeError: If the tokenizer has not been trained or loaded.
        """
        if self.tokenizer is None:
            raise RuntimeError("Tokenizer is not ready. Train or load it first.")
        return self.tokenizer.encode(text).ids

    def decode(self, token_ids: List[int]) -> str:
        """
        Decodes token IDs to text. Special tokens are kept so structure
        markers (<NL>, <INDENT>, ...) survive round-trips.

        Raises:
            RuntimeError: If the tokenizer has not been trained or loaded.
        """
        if self.tokenizer is None:
            raise RuntimeError("Tokenizer is not ready. Train or load it first.")
        return self.tokenizer.decode(token_ids, skip_special_tokens=False)

    def format_training_sample(self, prompt: str, code: str, language: str) -> str:
        """
        Converts prompt + code into one structured training text sequence.

        Args:
            prompt: Natural-language task description.
            code: Raw source code.
            language: Case-insensitive language name; "python" selects
                <PYTHON>, anything else falls back to <JAVASCRIPT>.

        Returns:
            "<PROMPT> <LANG> {prompt} <CODE> {structured code}".
        """
        lang_token = "<PYTHON>" if language.lower() == "python" else "<JAVASCRIPT>"
        prompt_text = self._normalize_text(prompt)
        code_text = self._code_to_structure_tokens(code)
        return f"<PROMPT> {lang_token} {prompt_text} <CODE> {code_text}"

    def _normalize_text(self, text: str) -> str:
        """
        Normalizes regular text: unifies CRLF/CR to LF and trims whitespace.
        """
        return text.replace("\r\n", "\n").replace("\r", "\n").strip()

    def _code_to_structure_tokens(self, code: str) -> str:
        """
        Converts raw code into a string with explicit indentation and newline markers.

        Tabs expand to config.indent_width spaces. Blank lines emit a bare
        <NL>. Indentation changes emit <INDENT>/<DEDENT> tokens, and all
        open levels are closed at end of input. NOTE: a dedent to a column
        never seen on the stack (inconsistent indentation) is tolerated and
        re-opened as a fresh <INDENT> rather than rejected.
        """
        code = code.replace("\r\n", "\n").replace("\r", "\n").replace("\t", " " * self.config.indent_width)
        lines = code.split("\n")
        indent_stack: List[int] = [0]
        out_tokens: List[str] = []

        for raw_line in lines:
            # Blank lines carry no indentation information.
            if raw_line.strip() == "":
                out_tokens.append("<NL>")
                continue

            current_indent = len(raw_line) - len(raw_line.lstrip(" "))
            line_content = raw_line.lstrip(" ")

            # Close deeper levels until the stack matches (or undercuts)
            # the current column.
            while current_indent < indent_stack[-1]:
                indent_stack.pop()
                out_tokens.append("<DEDENT>")

            while current_indent > indent_stack[-1]:
                indent_stack.append(current_indent)
                out_tokens.append("<INDENT>")

            out_tokens.append(line_content)
            out_tokens.append("<NL>")

        # Close any indentation levels still open at end of input.
        while len(indent_stack) > 1:
            indent_stack.pop()
            out_tokens.append("<DEDENT>")

        return " ".join(out_tokens).strip()
|
|