# ruclip-vit-large-patch14-336-onnx / export_tokenizer_json.py
# ttkacheff's picture
# Upload folder using huggingface_hub
# de7a6b5 verified
#!/usr/bin/env python3
"""
Export bpe.model (YouTokenToMe) to tokenizer.json for Rust/ONNX inference.
Extracts vocab and merges via yttm vocab --verbose. Compatible with Hugging Face tokenizers.
"""
import json
import argparse
import subprocess
import sys
import re
# RuCLIP special token IDs (YouTokenToMe defaults)
PAD_ID, UNK_ID, BOS_ID, EOS_ID = 0, 1, 2, 3
# RuCLIP text context length. NOTE(review): not referenced anywhere in this
# script's visible code — confirm whether it is consumed by a downstream
# importer before removing.
CONTEXT_LENGTH = 77
def _get_merges_from_yttm_verbose(model_path: str) -> list[tuple[str, str]]:
    """Run ``yttm vocab --verbose`` and parse BPE merge rules from its output.

    Args:
        model_path: Path to the YouTokenToMe ``bpe.model`` file.

    Returns:
        List of ``(left, right)`` token pairs in the order yttm emits them.

    Raises:
        RuntimeError: If the ``yttm`` CLI exits with a non-zero status.
        subprocess.TimeoutExpired: If the CLI takes longer than 5 minutes.
    """
    result = subprocess.run(
        ["yttm", "vocab", "--model", model_path, "--verbose"],
        capture_output=True,
        text=True,
        timeout=300,
    )
    if result.returncode != 0:
        raise RuntimeError(f"yttm vocab failed: {result.stderr}")
    merges: list[tuple[str, str]] = []
    # Each verbose line is either "id\ttoken" (base token) or
    # "id\ttoken=left+right left_id+right_id" (a merged token).
    simple_re = re.compile(r"^(\d+)\t(.+)$")
    # splitlines() also copes with empty stdout (no spurious "" element).
    for line in result.stdout.splitlines():
        line = line.rstrip()
        if not line:
            continue
        m = simple_re.match(line)
        if not m:
            continue
        rest = m.group(2)
        if "=" not in rest:
            continue  # base token: carries no merge rule
        # Discard the merged-token text before "="; only the pair matters.
        _, merge_part = rest.split("=", 1)
        parts = merge_part.split()
        tok_part = parts[0] if parts else ""
        if "+" in tok_part:
            left, right = tok_part.split("+", 1)
            merges.append((left, right))
    return merges
def export_tokenizer_json(bpe_model_path: str, output_path: str) -> None:
    """Convert a YouTokenToMe BPE model into a Hugging Face ``tokenizer.json``.

    Loads the vocab via the ``youtokentome`` Python package, extracts merge
    rules via the ``yttm`` CLI, and writes a ``tokenizers``-compatible JSON
    file with Lowercase normalization and Metaspace pre-tokenization.

    Args:
        bpe_model_path: Path to the YouTokenToMe ``bpe.model`` file.
        output_path: Destination path for ``tokenizer.json``.

    Side effects:
        Writes ``output_path``; prints progress to stdout/stderr; exits the
        process with status 1 if ``youtokentome`` is not installed.
    """
    try:
        import youtokentome as yttm
    except ImportError:
        print("Install: pip install youtokentome", file=sys.stderr)
        sys.exit(1)
    bpe_yttm = yttm.BPE(bpe_model_path)
    vocab_list = bpe_yttm.vocab()
    # Token -> id mapping; yttm's vocab order defines the ids.
    vocab = {tok: i for i, tok in enumerate(vocab_list)}
    # Extract merges from yttm vocab --verbose
    print("Extracting merges via yttm vocab --verbose...")
    try:
        merges = _get_merges_from_yttm_verbose(bpe_model_path)
        # Only include merges where both tokens are in vocab (avoid parser artifacts like "=▁")
        valid = [(x, y) for x, y in merges if x in vocab and y in vocab]
        n_skipped = len(merges) - len(valid)
        if n_skipped:
            print(f"  Skipped {n_skipped} merges (token not in vocab)")
        if valid:
            merges_hf = [f"{x} {y}" for x, y in valid]
            ignore_merges = False
        else:
            # No usable merges: tell the BPE model to rely on vocab lookups only.
            merges_hf = []
            ignore_merges = True
    except Exception as e:
        # Best-effort: a missing/broken yttm CLI degrades to ignore_merges mode.
        print(f"Could not extract merges ({e}), using ignore_merges=True", file=sys.stderr)
        merges_hf = []
        ignore_merges = True
    obj = {
        "version": "1.0",
        "truncation": None,
        "padding": None,
        # Special tokens registered by their fixed yttm ids (pad/unk/bos/eos).
        "added_tokens": [
            {"id": PAD_ID, "special": True, "content": vocab_list[PAD_ID], "single_word": False, "lstrip": False, "rstrip": False, "normalized": False},
            {"id": UNK_ID, "special": True, "content": vocab_list[UNK_ID], "single_word": False, "lstrip": False, "rstrip": False, "normalized": False},
            {"id": BOS_ID, "special": True, "content": vocab_list[BOS_ID], "single_word": False, "lstrip": False, "rstrip": False, "normalized": False},
            {"id": EOS_ID, "special": True, "content": vocab_list[EOS_ID], "single_word": False, "lstrip": False, "rstrip": False, "normalized": False},
        ],
        "normalizer": {"type": "Lowercase"},
        # Metaspace uses U+2581 (▁) as the word-boundary marker, matching yttm.
        "pre_tokenizer": {"type": "Metaspace", "replacement": "\u2581", "prepend_scheme": "first"},
        "post_processor": None,
        "decoder": {"type": "Metaspace", "replacement": "\u2581", "prepend_scheme": "first"},
        "model": {
            "type": "BPE",
            "dropout": None,
            "unk_token": vocab_list[UNK_ID],
            "continuing_subword_prefix": "",
            "end_of_word_suffix": "",
            "fuse_unk": False,
            "byte_fallback": False,
            "ignore_merges": ignore_merges,
            "vocab": vocab,
            "merges": merges_hf,
        },
    }
    # ensure_ascii=True keeps the file pure ASCII; explicit utf-8 encoding and
    # "\n" newlines make the output byte-identical across platforms.
    with open(output_path, "w", encoding="utf-8", newline="\n") as f:
        s = json.dumps(obj, ensure_ascii=True, indent=2)
        f.write(s)
    print(f"Saved {output_path} (vocab_size={len(vocab)}, merges={len(merges_hf)})")
    print("  Special: pad=0 unk=1 bos=2 eos=3")
def main():
    """Parse CLI arguments and run the tokenizer.json export."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="bpe.model", help="Path to bpe.model")
    parser.add_argument("--output", default="tokenizer.json", help="Output path")
    ns = parser.parse_args()
    export_tokenizer_json(ns.model, ns.output)


if __name__ == "__main__":
    main()