#!/usr/bin/env python3
"""
Export bpe.model (YouTokenToMe) to tokenizer.json for Rust/ONNX inference.
Extracts vocab and merges via yttm vocab --verbose.
Compatible with Hugging Face tokenizers.
"""
import json
import argparse
import subprocess
import sys
import re
from typing import Optional

# RuCLIP special token IDs (YouTokenToMe defaults)
PAD_ID, UNK_ID, BOS_ID, EOS_ID = 0, 1, 2, 3
CONTEXT_LENGTH = 77

# One line of `yttm vocab --verbose` output: "<id>\t<rest>".
_VOCAB_LINE_RE = re.compile(r"^(\d+)\t(.+)$")
# Optional trailing "<left_id>+<right_id>" field of a merge line. Only ASCII
# space/tab count as the separator so that tokens made of other Unicode
# whitespace (e.g. NBSP) are not eaten.
_MERGE_IDS_RE = re.compile(r"^(.*?)[ \t]+(\d+)\+(\d+)$")


def _candidate_splits(spec: str) -> list[tuple[str, str]]:
    """Return every (left, right) with spec == f"{left}{right}={left}+{right}".

    A merged BPE token is the concatenation of its two parts, so the token
    occupies exactly the first ``(len(spec) - 2) // 2`` characters. That pins
    the position of the "=" delimiter even when the token itself contains "="
    or "+". More than one pair is returned only when the "+" delimiter is
    genuinely ambiguous (e.g. "+++" is "+" + "++" or "++" + "+").
    """
    body_len = len(spec) - 2  # everything except the "=" and "+" delimiters
    if body_len < 4 or body_len % 2:
        return []
    token_len = body_len // 2
    if spec[token_len] != "=":
        return []
    token, pair = spec[:token_len], spec[token_len + 1:]
    # `pair` is token_len + 1 characters long; both halves must be non-empty.
    return [
        (pair[:pos], pair[pos + 1:])
        for pos in range(1, token_len)
        if pair[pos] == "+" and pair[:pos] + pair[pos + 1:] == token
    ]


def _legacy_split(rest: str) -> Optional[tuple[str, str]]:
    """Split on the first "=" and then the first "+" (the original heuristic).

    Kept as a fallback for lines that do not satisfy the concatenation
    invariant, so such lines are handled exactly as before.
    """
    if "=" not in rest:
        return None
    _, merge_part = rest.split("=", 1)
    fields = merge_part.split()
    head = fields[0] if fields else ""
    if "+" not in head:
        return None
    left, right = head.split("+", 1)
    return left, right


def _split_merge(rest: str, id_to_token: dict[int, str]) -> Optional[tuple[str, str]]:
    """Extract (left, right) from the text after "<id>\\t", or None for a plain token.

    Resolution order:
      1. the pair named by the trailing "left_id+right_id" field, when both
         ids are already known and the pair is consistent with the line;
      2. the unique split satisfying token == left + right;
      3. the original first-"=" / first-"+" heuristic.
    """
    spec, ids = rest, None
    ids_match = _MERGE_IDS_RE.match(rest)
    if ids_match:
        spec = ids_match.group(1)
        ids = (int(ids_match.group(2)), int(ids_match.group(3)))

    candidates = _candidate_splits(spec)
    if ids is not None:
        by_id = (id_to_token.get(ids[0]), id_to_token.get(ids[1]))
        if by_id in candidates:
            return by_id
    if len(candidates) == 1:
        return candidates[0]
    return _legacy_split(rest)


def parse_yttm_verbose_vocab(output: str) -> list[tuple[str, str]]:
    """Parse the text printed by ``yttm vocab --verbose`` into ordered merges.

    Expected line format: ``id\\ttoken`` for plain tokens and
    ``id\\ttoken=left+right left_id+right_id`` for merged tokens. Lines that do
    not match are ignored.

    Args:
        output: Full stdout of the command.

    Returns:
        (left, right) token pairs in the order the lines appear.
    """
    merges: list[tuple[str, str]] = []
    id_to_token: dict[int, str] = {}
    for raw in output.split("\n"):
        # Strip only ASCII padding/CR; str.rstrip() would also strip tokens
        # that consist of other Unicode whitespace.
        line_match = _VOCAB_LINE_RE.match(raw.rstrip(" \t\r"))
        if not line_match:
            continue
        token_id, rest = int(line_match.group(1)), line_match.group(2)
        pair = _split_merge(rest, id_to_token)
        if pair is None:
            id_to_token[token_id] = rest
        else:
            merges.append(pair)
            # Used only for id lookups, which are re-validated against the line.
            id_to_token[token_id] = pair[0] + pair[1]
    return merges


def _get_merges_from_yttm_verbose(model_path: str) -> list[tuple[str, str]]:
    """Run yttm vocab --verbose and parse merges.

    Args:
        model_path: Path to the YouTokenToMe model file.

    Returns:
        List of (left, right) token pairs in order.

    Raises:
        RuntimeError: If the yttm command exits with a non-zero status.
    """
    result = subprocess.run(
        ["yttm", "vocab", "--model", model_path, "--verbose"],
        capture_output=True,
        text=True,
        # Decode as UTF-8 instead of the locale encoding so non-ASCII tokens
        # (e.g. the U+2581 marker) are read the same way on every platform.
        # NOTE(review): assumes yttm writes UTF-8 - confirm on Windows.
        encoding="utf-8",
        timeout=300,
    )
    if result.returncode != 0:
        raise RuntimeError(f"yttm vocab failed: {result.stderr}")
    return parse_yttm_verbose_vocab(result.stdout)


def export_tokenizer_json(bpe_model_path: str, output_path: str) -> None:
    """Write a Hugging Face style tokenizer.json for a YouTokenToMe model.

    The vocabulary comes from ``youtokentome.BPE(...).vocab()``; merges come
    from ``yttm vocab --verbose``. If merges cannot be extracted (or none
    survive the vocab check) the file is written with an empty merge list and
    ``ignore_merges=True``.

    Args:
        bpe_model_path: Path to the bpe.model file.
        output_path: Destination path of the JSON file.

    Exits the process with status 1 if youtokentome is not installed.
    """
    try:
        import youtokentome as yttm
    except ImportError:
        print("Install: pip install youtokentome", file=sys.stderr)
        sys.exit(1)

    bpe_yttm = yttm.BPE(bpe_model_path)
    vocab_list = bpe_yttm.vocab()
    vocab = {tok: i for i, tok in enumerate(vocab_list)}

    # Extract merges from yttm vocab --verbose
    print("Extracting merges via yttm vocab --verbose...")
    try:
        merges = _get_merges_from_yttm_verbose(bpe_model_path)
        # Only include merges where both tokens are in vocab (guards against
        # any line the parser could not split correctly).
        valid = [(x, y) for x, y in merges if x in vocab and y in vocab]
        n_skipped = len(merges) - len(valid)
        if n_skipped:
            print(f" Skipped {n_skipped} merges (token not in vocab)")
        if valid:
            merges_hf = [f"{x} {y}" for x, y in valid]
            ignore_merges = False
        else:
            merges_hf = []
            ignore_merges = True
    except Exception as e:
        # Deliberate best-effort: a missing yttm binary, a timeout or a
        # decoding problem must not prevent exporting the vocabulary.
        print(f"Could not extract merges ({e}), using ignore_merges=True", file=sys.stderr)
        merges_hf = []
        ignore_merges = True

    added_tokens = [
        {
            "id": token_id,
            "special": True,
            "content": vocab_list[token_id],
            "single_word": False,
            "lstrip": False,
            "rstrip": False,
            "normalized": False,
        }
        for token_id in (PAD_ID, UNK_ID, BOS_ID, EOS_ID)
    ]

    obj = {
        "version": "1.0",
        "truncation": None,
        "padding": None,
        "added_tokens": added_tokens,
        "normalizer": {"type": "Lowercase"},
        "pre_tokenizer": {"type": "Metaspace", "replacement": "\u2581", "prepend_scheme": "first"},
        "post_processor": None,
        "decoder": {"type": "Metaspace", "replacement": "\u2581", "prepend_scheme": "first"},
        "model": {
            "type": "BPE",
            "dropout": None,
            "unk_token": vocab_list[UNK_ID],
            "continuing_subword_prefix": "",
            "end_of_word_suffix": "",
            "fuse_unk": False,
            "byte_fallback": False,
            "ignore_merges": ignore_merges,
            "vocab": vocab,
            "merges": merges_hf,
        },
    }

    # ensure_ascii=True keeps the payload pure ASCII; the explicit encoding
    # just makes the result independent of the platform default.
    with open(output_path, "w", encoding="utf-8", newline="\n") as f:
        f.write(json.dumps(obj, ensure_ascii=True, indent=2))
    print(f"Saved {output_path} (vocab_size={len(vocab)}, merges={len(merges_hf)})")
    print(" Special: pad=0 unk=1 bos=2 eos=3")


def main():
    """Parse --model / --output from the command line and run the export."""
    p = argparse.ArgumentParser()
    p.add_argument("--model", default="bpe.model", help="Path to bpe.model")
    p.add_argument("--output", default="tokenizer.json", help="Output path")
    args = p.parse_args()
    export_tokenizer_json(args.model, args.output)


if __name__ == "__main__":
    main()