# ruclip-vit-large-patch14-336-onnx / export_tokenizer_json.py
# ttkacheff's picture
# Upload folder using huggingface_hub
# de7a6b5 verified
#!/usr/bin/env python3
"""
Export bpe.model (YouTokenToMe) to tokenizer.json for Rust/ONNX inference.
Extracts vocab and merges via yttm vocab --verbose. Compatible with Hugging Face tokenizers.
"""
import json
import argparse
import subprocess
import sys
import re
# RuCLIP special token IDs (YouTokenToMe defaults)
PAD_ID, UNK_ID, BOS_ID, EOS_ID = 0, 1, 2, 3
# RuCLIP text context length. NOTE(review): not referenced anywhere in this
# script's visible code — confirm whether it is consumed by a downstream
# importer before removing.
CONTEXT_LENGTH = 77
def _get_merges_from_yttm_verbose(model_path: str) -> list[tuple[str, str]]:
    """Run ``yttm vocab --verbose`` and parse BPE merge rules from its output.

    Args:
        model_path: Path to the YouTokenToMe ``bpe.model`` file.

    Returns:
        List of ``(left, right)`` token pairs in the order yttm emits them.

    Raises:
        RuntimeError: If the ``yttm`` CLI exits with a non-zero status.
        subprocess.TimeoutExpired: If the CLI takes longer than 5 minutes.
    """
    result = subprocess.run(
        ["yttm", "vocab", "--model", model_path, "--verbose"],
        capture_output=True,
        text=True,
        timeout=300,
    )
    if result.returncode != 0:
        raise RuntimeError(f"yttm vocab failed: {result.stderr}")
    merges: list[tuple[str, str]] = []
    # Each verbose line is either "id\ttoken" (base token) or
    # "id\ttoken=left+right left_id+right_id" (a merged token).
    simple_re = re.compile(r"^(\d+)\t(.+)$")
    # splitlines() also copes with empty stdout (no spurious "" element).
    for line in result.stdout.splitlines():
        line = line.rstrip()
        if not line:
            continue
        m = simple_re.match(line)
        if not m:
            continue
        rest = m.group(2)
        if "=" not in rest:
            continue  # base token: carries no merge rule
        # Discard the merged-token text before "="; only the pair matters.
        _, merge_part = rest.split("=", 1)
        parts = merge_part.split()
        tok_part = parts[0] if parts else ""
        if "+" in tok_part:
            left, right = tok_part.split("+", 1)
            merges.append((left, right))
    return merges
def export_tokenizer_json(bpe_model_path: str, output_path: str) -> None:
    """Convert a YouTokenToMe BPE model into a Hugging Face ``tokenizer.json``.

    Loads the vocab via the ``youtokentome`` Python package, extracts merge
    rules via the ``yttm`` CLI, and writes a ``tokenizers``-compatible JSON
    file with Lowercase normalization and Metaspace pre-tokenization.

    Args:
        bpe_model_path: Path to the YouTokenToMe ``bpe.model`` file.
        output_path: Destination path for ``tokenizer.json``.

    Side effects:
        Writes ``output_path``; prints progress to stdout/stderr; exits the
        process with status 1 if ``youtokentome`` is not installed.
    """
    try:
        import youtokentome as yttm
    except ImportError:
        print("Install: pip install youtokentome", file=sys.stderr)
        sys.exit(1)
    bpe_yttm = yttm.BPE(bpe_model_path)
    vocab_list = bpe_yttm.vocab()
    # Token -> id mapping; yttm's vocab order defines the ids.
    vocab = {tok: i for i, tok in enumerate(vocab_list)}
    # Extract merges from yttm vocab --verbose
    print("Extracting merges via yttm vocab --verbose...")
    try:
        merges = _get_merges_from_yttm_verbose(bpe_model_path)
        # Only include merges where both tokens are in vocab (avoid parser artifacts like "=▁")
        valid = [(x, y) for x, y in merges if x in vocab and y in vocab]
        n_skipped = len(merges) - len(valid)
        if n_skipped:
            print(f"  Skipped {n_skipped} merges (token not in vocab)")
        if valid:
            merges_hf = [f"{x} {y}" for x, y in valid]
            ignore_merges = False
        else:
            # No usable merges: tell the BPE model to rely on vocab lookups only.
            merges_hf = []
            ignore_merges = True
    except Exception as e:
        # Best-effort: a missing/broken yttm CLI degrades to ignore_merges mode.
        print(f"Could not extract merges ({e}), using ignore_merges=True", file=sys.stderr)
        merges_hf = []
        ignore_merges = True
    obj = {
        "version": "1.0",
        "truncation": None,
        "padding": None,
        # Special tokens registered by their fixed yttm ids (pad/unk/bos/eos).
        "added_tokens": [
            {"id": PAD_ID, "special": True, "content": vocab_list[PAD_ID], "single_word": False, "lstrip": False, "rstrip": False, "normalized": False},
            {"id": UNK_ID, "special": True, "content": vocab_list[UNK_ID], "single_word": False, "lstrip": False, "rstrip": False, "normalized": False},
            {"id": BOS_ID, "special": True, "content": vocab_list[BOS_ID], "single_word": False, "lstrip": False, "rstrip": False, "normalized": False},
            {"id": EOS_ID, "special": True, "content": vocab_list[EOS_ID], "single_word": False, "lstrip": False, "rstrip": False, "normalized": False},
        ],
        "normalizer": {"type": "Lowercase"},
        # Metaspace uses U+2581 (▁) as the word-boundary marker, matching yttm.
        "pre_tokenizer": {"type": "Metaspace", "replacement": "\u2581", "prepend_scheme": "first"},
        "post_processor": None,
        "decoder": {"type": "Metaspace", "replacement": "\u2581", "prepend_scheme": "first"},
        "model": {
            "type": "BPE",
            "dropout": None,
            "unk_token": vocab_list[UNK_ID],
            "continuing_subword_prefix": "",
            "end_of_word_suffix": "",
            "fuse_unk": False,
            "byte_fallback": False,
            "ignore_merges": ignore_merges,
            "vocab": vocab,
            "merges": merges_hf,
        },
    }
    # ensure_ascii=True keeps the file pure ASCII; explicit utf-8 encoding and
    # "\n" newlines make the output byte-identical across platforms.
    with open(output_path, "w", encoding="utf-8", newline="\n") as f:
        s = json.dumps(obj, ensure_ascii=True, indent=2)
        f.write(s)
    print(f"Saved {output_path} (vocab_size={len(vocab)}, merges={len(merges_hf)})")
    print("  Special: pad=0 unk=1 bos=2 eos=3")
def main():
    """Parse CLI arguments and run the tokenizer.json export."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="bpe.model", help="Path to bpe.model")
    parser.add_argument("--output", default="tokenizer.json", help="Output path")
    ns = parser.parse_args()
    export_tokenizer_json(ns.model, ns.output)


if __name__ == "__main__":
    main()