# build_shift_char_tokenizer.py
from pathlib import Path
from typing import List

from tokenizers import Tokenizer, Regex, decoders
from tokenizers.models import WordLevel
from tokenizers.normalizers import Sequence, Replace, NFKC
from tokenizers.pre_tokenizers import Split
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast


def build_shift_char_tokenizer(
    out_dir: str,
    base_tokens: List[str],
    *,
    shift_token: str = "↨",
    special_tokens: List[str] = ("<unk>", "<s>", "</s>", "<pad>"),
    include_specials_in_128: bool = True,
):
    """
    Create a HF-compatible char tokenizer with SHIFT+lowercase behavior.

    - base_tokens: your full 128-token alphabet if include_specials_in_128=True,
      otherwise your 128 data tokens, and we'll append specials
      (vocab will be >128).
    - shift_token must be present in base_tokens.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Validate vocab sizing
    deduped = list(dict.fromkeys(base_tokens))  # keep order, dedupe
    if deduped != base_tokens:
        raise ValueError(
            f"base_tokens has duplicates; order must define ids (after dedupe: {deduped})."
        )
    if shift_token not in base_tokens:
        raise ValueError(f"'{shift_token}' must be in base_tokens.")

    if include_specials_in_128:
        # specials must already be present in base_tokens
        missing = [t for t in special_tokens if t not in base_tokens]
        if missing:
            raise ValueError(f"special tokens missing from base_tokens: {missing}")
        if len(base_tokens) != 128:
            raise ValueError(
                f"base_tokens must be exactly 128 when include_specials_in_128=True "
                f"(got {len(base_tokens)})."
            )
        vocab_tokens = base_tokens
    else:
        # append specials; vocab_size will exceed 128
        vocab_tokens = base_tokens + [t for t in special_tokens if t not in base_tokens]

    # Build vocab mapping
    token_to_id = {tok: i for i, tok in enumerate(vocab_tokens)}
    unk_token = "<unk>" if "<unk>" in token_to_id else None

    # Model: fixed WordLevel
    model = WordLevel(vocab=token_to_id, unk_token=unk_token)

    # Normalizer: NFKC, then rewrite each ASCII uppercase letter as
    # shift_token + lowercase. Explicit per-letter mapping avoids backref issues.
    uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    normalizer_steps = [NFKC()]
    for u in uppercase:
        normalizer_steps.append(Replace(Regex(u), shift_token + u.lower()))
    normalizer = Sequence(normalizer_steps)

    # Pre-tokenizer: isolate every grapheme cluster, including newlines
    # (a codepoint-level alternative: Split(Regex(r"(?s)."), behavior="isolated"))
    pre_tok = Split(Regex(r"\X"), behavior="isolated")

    tok = Tokenizer(model)
    tok.normalizer = normalizer
    tok.pre_tokenizer = pre_tok
    tok.decoder = decoders.Sequence([])  # concatenate tokens verbatim

    # Optional: tidy BOS/EOS on encode if you want them
    # (kept minimal; models often add these themselves)
    if "<s>" in token_to_id and "</s>" in token_to_id:
        tok.post_processor = TemplateProcessing(
            single="$0",
            pair="$A $B",
            special_tokens=[
                # add e.g. ("<s>", token_to_id["<s>"]), ("</s>", token_to_id["</s>"])
                # here if you want automatic wrapping
            ],
        )

    # Wrap in a HF fast tokenizer and save. model_max_length is passed to the
    # constructor so it lands in tokenizer_config.json; writing that file by hand
    # before save_pretrained() would just get overwritten. Adjust for your use case.
    hf_tok = PreTrainedTokenizerFast(
        tokenizer_object=tok,
        bos_token="<s>" if "<s>" in token_to_id else None,
        eos_token="</s>" if "</s>" in token_to_id else None,
        unk_token=unk_token,
        pad_token="<pad>" if "<pad>" in token_to_id else None,
        model_max_length=1024,
    )
    hf_tok.save_pretrained(out_dir)

    print(f"Saved tokenizer to: {out_dir}")
    print(f"Vocab size: {len(vocab_tokens)} (include_specials_in_128={include_specials_in_128})")
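
# Decoding note: the SHIFT rewriting happens in the normalizer, which is
# one-way, so decode() returns text in SHIFT form (e.g. "↨hello"). Below is a
# minimal sketch of the inverse mapping; the helper name `unshift` is
# illustrative and not part of the tokenizers/transformers APIs used above.
def unshift(text: str, shift_token: str = "↨") -> str:
    """Fold shift_token + letter back into an uppercase letter."""
    out = []
    i = 0
    while i < len(text):
        if text[i] == shift_token and i + 1 < len(text):
            out.append(text[i + 1].upper())
            i += 2
        else:
            out.append(text[i])
            i += 1
    return "".join(out)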
("", id), ("", id) here if you want automatic wrapping ], ) # Wrap in HF fast tokenizer and save hf_tok = PreTrainedTokenizerFast( tokenizer_object=tok, bos_token="" if "" in token_to_id else None, eos_token="" if "" in token_to_id else None, unk_token=unk_token, pad_token="" if "" in token_to_id else None, ) # metadata for HF tokenizer_config = { "model_max_length": 1024, # adjust for your use case } (Path(out_dir) / "tokenizer_config.json").write_text(json.dumps(tokenizer_config, indent=2), encoding="utf-8") hf_tok.save_pretrained(out_dir) print(f"Saved tokenizer to: {out_dir}") print(f"Vocab size: {len(vocab_tokens)} (include_specials_in_128={include_specials_in_128})") if __name__ == "__main__": # Example: define your exact 128 tokens including specials and SHIFT. # Keep ordering stable; ids are index positions. # Below is a sane template to edit. Make sure length == 128. SHIFT = "↨" specials = ["", "", "", ""] # Base character set (edit this list to be exactly 124 non-specials + 4 specials = 128) chars = list("\n\t ") # newline, tab, space chars += list("0123456789") chars += list("abcdefghijklmnopqrstuvwxyz") # Include punctuation/symbols you need. Keep only what you’ll actually see. chars += list("\"!$&'#,/+=-<>*@.:;[]{}()^_?") # from your sample chars += list("èé") # sample diacritics you mentioned # Add SHIFT token # Ensure NO uppercase letters are in the vocab (they’re represented via SHIFT+lowercase) base_tokens_wo_specials = [SHIFT] + chars # If you want exactly 128 including specials, adjust to 124 data tokens + 4 specials # Add or remove symbols to hit 124 before specials: # Pad with rarely-used placeholders if needed: while len(base_tokens_wo_specials) < 124: base_tokens_wo_specials.append(f"¤{len(base_tokens_wo_specials)}") # harmless placeholders if len(base_tokens_wo_specials) != 124: raise SystemExit(f"Currently have {len(base_tokens_wo_specials)} data tokens; adjust to 124 before specials.") base_tokens_including_specials = specials + base_tokens_wo_specials # specials first is fine build_shift_char_tokenizer( out_dir="char128_shift_tokenizer", base_tokens=base_tokens_including_specials, shift_token=SHIFT, special_tokens=specials, include_specials_in_128=True, )