File size: 4,720 Bytes
de7a6b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python3
"""
Export bpe.model (YouTokenToMe) to tokenizer.json for Rust/ONNX inference.
Extracts vocab and merges via yttm vocab --verbose. Compatible with Hugging Face tokenizers.
"""
import json
import argparse
import subprocess
import sys
import re

# RuCLIP special token IDs (YouTokenToMe defaults)
PAD_ID, UNK_ID, BOS_ID, EOS_ID = 0, 1, 2, 3
CONTEXT_LENGTH = 77


def _get_merges_from_yttm_verbose(model_path: str) -> list[tuple[str, str]]:
    """Return the BPE merge rules of *model_path* as ordered (left, right) pairs.

    Shells out to ``yttm vocab --model <path> --verbose`` and parses its
    stdout; merge order in the output is the merge priority order.

    Raises:
        RuntimeError: if the yttm CLI exits with a non-zero status.
        subprocess.TimeoutExpired: if the CLI runs longer than 5 minutes.
        FileNotFoundError: if the ``yttm`` executable is not on PATH.
    """
    result = subprocess.run(
        ["yttm", "vocab", "--model", model_path, "--verbose"],
        capture_output=True,
        text=True,
        timeout=300,
    )
    if result.returncode != 0:
        raise RuntimeError(f"yttm vocab failed: {result.stderr}")
    return _parse_yttm_verbose_merges(result.stdout)


def _parse_yttm_verbose_merges(verbose_text: str) -> list[tuple[str, str]]:
    """Parse merges from ``yttm vocab --verbose`` output.

    Each line is either ``id\\ttoken`` (a base token, which carries no merge
    rule) or ``id\\ttoken=left+right    left_id+right_id`` (a merged token).
    Only the latter contribute (left, right) pairs, in file order.
    """
    merges: list[tuple[str, str]] = []
    line_re = re.compile(r"^\d+\t(.+)$")
    for raw in verbose_text.strip().split("\n"):
        line = raw.rstrip()
        if not line:
            continue
        m = line_re.match(line)
        if not m:
            continue
        rest = m.group(1)
        if "=" not in rest:
            continue  # base token: no merge rule on this line
        # Drop the "token=" prefix; keep only the "left+right" description.
        _, merge_part = rest.split("=", 1)
        parts = merge_part.split()
        pair_part = parts[0] if parts else ""
        if "+" in pair_part:
            left, right = pair_part.split("+", 1)
            merges.append((left, right))
    return merges


def export_tokenizer_json(bpe_model_path: str, output_path: str) -> None:
    """Convert a YouTokenToMe BPE model to a Hugging Face ``tokenizer.json``.

    Loads the vocab via the ``youtokentome`` Python bindings, extracts merge
    rules via the ``yttm`` CLI, and writes a tokenizers-compatible JSON file.

    Args:
        bpe_model_path: Path to the YouTokenToMe ``bpe.model`` file.
        output_path: Destination path for the generated ``tokenizer.json``.

    Side effects:
        Writes *output_path*; prints progress to stdout/stderr; exits the
        process with status 1 if ``youtokentome`` is not installed.
    """
    try:
        import youtokentome as yttm
    except ImportError:
        print("Install: pip install youtokentome", file=sys.stderr)
        sys.exit(1)

    bpe_yttm = yttm.BPE(bpe_model_path)
    vocab_list = bpe_yttm.vocab()
    # Token -> id mapping; yttm's list index IS the token id.
    vocab = {tok: i for i, tok in enumerate(vocab_list)}

    # Extract merges from yttm vocab --verbose
    print("Extracting merges via yttm vocab --verbose...")
    try:
        merges = _get_merges_from_yttm_verbose(bpe_model_path)
        # Only include merges where both tokens are in vocab (avoid parser artifacts like "=▁")
        valid = [(x, y) for x, y in merges if x in vocab and y in vocab]
        n_skipped = len(merges) - len(valid)
        if n_skipped:
            print(f"  Skipped {n_skipped} merges (token not in vocab)")
        if valid:
            merges_hf = [f"{x} {y}" for x, y in valid]
            ignore_merges = False
        else:
            # No usable merges: tell tokenizers to match whole vocab entries.
            merges_hf = []
            ignore_merges = True
    except Exception as e:
        # Best-effort: a tokenizer without merges still works via ignore_merges.
        print(f"Could not extract merges ({e}), using ignore_merges=True", file=sys.stderr)
        merges_hf = []
        ignore_merges = True

    obj = {
        "version": "1.0",
        "truncation": None,
        "padding": None,
        # The four yttm special tokens occupy fixed ids 0..3.
        "added_tokens": [
            {"id": PAD_ID, "special": True, "content": vocab_list[PAD_ID], "single_word": False, "lstrip": False, "rstrip": False, "normalized": False},
            {"id": UNK_ID, "special": True, "content": vocab_list[UNK_ID], "single_word": False, "lstrip": False, "rstrip": False, "normalized": False},
            {"id": BOS_ID, "special": True, "content": vocab_list[BOS_ID], "single_word": False, "lstrip": False, "rstrip": False, "normalized": False},
            {"id": EOS_ID, "special": True, "content": vocab_list[EOS_ID], "single_word": False, "lstrip": False, "rstrip": False, "normalized": False},
        ],
        "normalizer": {"type": "Lowercase"},
        # \u2581 (LOWER ONE EIGHTH BLOCK) is yttm's word-boundary marker.
        "pre_tokenizer": {"type": "Metaspace", "replacement": "\u2581", "prepend_scheme": "first"},
        "post_processor": None,
        "decoder": {"type": "Metaspace", "replacement": "\u2581", "prepend_scheme": "first"},
        "model": {
            "type": "BPE",
            "dropout": None,
            "unk_token": vocab_list[UNK_ID],
            "continuing_subword_prefix": "",
            "end_of_word_suffix": "",
            "fuse_unk": False,
            "byte_fallback": False,
            "ignore_merges": ignore_merges,
            "vocab": vocab,
            "merges": merges_hf,
        },
    }

    # ensure_ascii=True keeps the file 7-bit clean; explicit utf-8 + "\n"
    # newlines make the output byte-identical across platforms/locales.
    with open(output_path, "w", encoding="utf-8", newline="\n") as f:
        s = json.dumps(obj, ensure_ascii=True, indent=2)
        f.write(s)

    print(f"Saved {output_path} (vocab_size={len(vocab)}, merges={len(merges_hf)})")
    print("  Special: pad=0 unk=1 bos=2 eos=3")

def main() -> None:
    """CLI entry point: parse arguments and run the export."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="bpe.model", help="Path to bpe.model")
    parser.add_argument("--output", default="tokenizer.json", help="Output path")
    opts = parser.parse_args()
    export_tokenizer_json(opts.model, opts.output)


if __name__ == "__main__":
    main()