| | |
| | """ |
| | Export bpe.model (YouTokenToMe) to tokenizer.json for Rust/ONNX inference. |
| | Extracts vocab and merges via yttm vocab --verbose. Compatible with Hugging Face tokenizers. |
| | """ |
| | import json |
| | import argparse |
| | import subprocess |
| | import sys |
| | import re |
| |
|
| | |
# Special-token ids fixed by the yttm training convention: <PAD>=0, <UNK>=1, <BOS>=2, <EOS>=3.
PAD_ID, UNK_ID, BOS_ID, EOS_ID = 0, 1, 2, 3
# Sequence length of the downstream text encoder (CLIP-style 77). Not referenced
# in this file — presumably consumed by the Rust/ONNX side; TODO confirm.
CONTEXT_LENGTH = 77
| |
|
| |
|
def _get_merges_from_yttm_verbose(model_path: str) -> list[tuple[str, str]]:
    """Extract the ordered BPE merge list from a YouTokenToMe model.

    Runs ``yttm vocab --verbose`` as a subprocess and parses its stdout.
    Qualifying lines look like ``<id>\\t<token>=<left>+<right> ...``; the
    (left, right) halves of each merge are collected in output order.

    Raises:
        RuntimeError: If the yttm CLI exits with a non-zero status.
    """
    proc = subprocess.run(
        ["yttm", "vocab", "--model", model_path, "--verbose"],
        capture_output=True,
        text=True,
        timeout=300,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"yttm vocab failed: {proc.stderr}")

    line_pattern = re.compile(r"^(\d+)\t(.+)$")
    pairs: list[tuple[str, str]] = []
    for raw in proc.stdout.strip().split("\n"):
        raw = raw.rstrip()
        if not raw:
            continue
        match = line_pattern.match(raw)
        if match is None:
            continue
        payload = match.group(2)
        if "=" not in payload:
            # Plain vocab entry (base character or special token) — no merge rule.
            continue
        _token, merge_spec = payload.split("=", 1)
        fields = merge_spec.split()
        first_field = fields[0] if fields else ""
        if "+" in first_field:
            left, right = first_field.split("+", 1)
            pairs.append((left, right))
    return pairs
| |
|
| |
|
def export_tokenizer_json(bpe_model_path: str, output_path: str) -> None:
    """Convert a YouTokenToMe BPE model into a Hugging Face tokenizer.json.

    Loads the vocab via the youtokentome Python bindings, recovers the merge
    table via the yttm CLI (see _get_merges_from_yttm_verbose), and writes a
    `tokenizers`-compatible JSON file. If merges cannot be recovered, falls
    back to ``ignore_merges=True`` so the tokenizer still loads.

    Args:
        bpe_model_path: Path to the trained yttm ``bpe.model`` file.
        output_path: Destination path for the generated ``tokenizer.json``.

    Raises:
        SystemExit: If youtokentome is not installed.
        ValueError: If the model vocab is too small to contain the four
            special tokens (pad/unk/bos/eos) addressed by fixed index.
    """
    try:
        import youtokentome as yttm
    except ImportError:
        print("Install: pip install youtokentome", file=sys.stderr)
        sys.exit(1)

    bpe_yttm = yttm.BPE(bpe_model_path)
    vocab_list = bpe_yttm.vocab()
    # Guard against a truncated/corrupt model: the special tokens below are
    # addressed by fixed index and would otherwise raise a bare IndexError.
    if len(vocab_list) <= EOS_ID:
        raise ValueError(
            f"vocab has only {len(vocab_list)} tokens; "
            f"expected at least {EOS_ID + 1} (pad/unk/bos/eos)"
        )
    vocab = {tok: i for i, tok in enumerate(vocab_list)}

    print("Extracting merges via yttm vocab --verbose...")
    try:
        merges = _get_merges_from_yttm_verbose(bpe_model_path)
        # Drop merges whose halves are not vocab entries; HF tokenizers reject them.
        valid = [(x, y) for x, y in merges if x in vocab and y in vocab]
        n_skipped = len(merges) - len(valid)
        if n_skipped:
            print(f" Skipped {n_skipped} merges (token not in vocab)")
        if valid:
            merges_hf = [f"{x} {y}" for x, y in valid]
            ignore_merges = False
        else:
            merges_hf = []
            ignore_merges = True
    except Exception as e:
        # Deliberate best-effort fallback: with ignore_merges=True the BPE model
        # still loads and tokenizes via greedy vocab matching.
        print(f"Could not extract merges ({e}), using ignore_merges=True", file=sys.stderr)
        merges_hf = []
        ignore_merges = True

    def _special(tok_id: int) -> dict:
        # One added_tokens entry for a fixed-id special token.
        return {
            "id": tok_id,
            "special": True,
            "content": vocab_list[tok_id],
            "single_word": False,
            "lstrip": False,
            "rstrip": False,
            "normalized": False,
        }

    obj = {
        "version": "1.0",
        "truncation": None,
        "padding": None,
        "added_tokens": [_special(i) for i in (PAD_ID, UNK_ID, BOS_ID, EOS_ID)],
        "normalizer": {"type": "Lowercase"},
        "pre_tokenizer": {"type": "Metaspace", "replacement": "\u2581", "prepend_scheme": "first"},
        "post_processor": None,
        "decoder": {"type": "Metaspace", "replacement": "\u2581", "prepend_scheme": "first"},
        "model": {
            "type": "BPE",
            "dropout": None,
            "unk_token": vocab_list[UNK_ID],
            "continuing_subword_prefix": "",
            "end_of_word_suffix": "",
            "fuse_unk": False,
            "byte_fallback": False,
            "ignore_merges": ignore_merges,
            "vocab": vocab,
            "merges": merges_hf,
        },
    }

    # ensure_ascii=True keeps the file pure ASCII for the Rust consumer;
    # explicit encoding avoids locale-dependent defaults and newline="\n"
    # keeps output byte-stable across platforms.
    with open(output_path, "w", encoding="utf-8", newline="\n") as f:
        f.write(json.dumps(obj, ensure_ascii=True, indent=2))

    print(f"Saved {output_path} (vocab_size={len(vocab)}, merges={len(merges_hf)})")
    print(" Special: pad=0 unk=1 bos=2 eos=3")
| |
|
| |
|
def main():
    """CLI entry point: parse command-line options and run the export."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="bpe.model", help="Path to bpe.model")
    parser.add_argument("--output", default="tokenizer.json", help="Output path")
    opts = parser.parse_args()
    export_tokenizer_json(opts.model, opts.output)
| |
|
| |
|
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
| |
|