"""
Tokenizer training script: trains a byte-level BPE tokenizer on SMILES data
and uploads it to the Hugging Face Hub.
"""
import json
import os

from datasets import DatasetDict
from huggingface_hub import HfApi, create_repo
from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers

from config import CACHE_DIR, TOKENIZER_NAME, SPECIAL_TOKENS, VOCAB_SIZE, MIN_FREQUENCY
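
# The config module is expected to expose the constants imported above. The
# values below are purely illustrative placeholders, not the project's actual
# settings:
#
#     CACHE_DIR = Path("data/cache")
#     TOKENIZER_NAME = "your-username/smiles-bpe-tokenizer"
#     SPECIAL_TOKENS = {"unk_token": "<unk>", "bos_token": "<s>",
#                       "eos_token": "</s>", "pad_token": "<pad>"}
#     VOCAB_SIZE = 1024
#     MIN_FREQUENCY = 2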


def iter_text(ds: DatasetDict):
    """Iterator over source and target text from dataset."""
    for split in ds:
        for row in ds[split]:
            yield row["source"]
            yield row["target"]


def train_and_upload_tokenizer():
    """Train BPE tokenizer and upload to Hugging Face Hub."""
    print("=" * 60)
    print("Training Tokenizer")
    print("=" * 60)
    
    # Load dataset
    print(f"Loading forward dataset from {CACHE_DIR / 'forward'}...")
    forward = DatasetDict.load_from_disk(str(CACHE_DIR / "forward"))
    
    # Create tokenizer
    print("Creating BPE tokenizer...")
    tokenizer = Tokenizer(models.BPE(unk_token=SPECIAL_TOKENS["unk_token"]))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    # Pair the byte-level pre-tokenizer with a byte-level decoder so that
    # decode() restores the original characters instead of raw byte tokens.
    tokenizer.decoder = decoders.ByteLevel()
    
    trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        min_frequency=MIN_FREQUENCY,
        special_tokens=list(SPECIAL_TOKENS.values()),
    )
    
    # Train tokenizer
    print("Training tokenizer on dataset...")
    # length only drives the progress bar; iter_text yields two texts per row across all splits
    total_texts = 2 * sum(len(forward[split]) for split in forward)
    tokenizer.train_from_iterator(iter_text(forward), trainer=trainer, length=total_texts)
    
    # Save locally
    local_path = "tokenizer.json"
    tokenizer.save(local_path)
    print(f"Saved tokenizer to {local_path}")
    
    # Create the repo if it doesn't already exist
    print(f"Creating/accessing tokenizer repo: {TOKENIZER_NAME}")
    try:
        create_repo(
            TOKENIZER_NAME,
            repo_type="model",
            exist_ok=True,
            private=False,
            token=os.environ.get("HF_TOKEN")
        )
    except Exception as e:
        print(f"Note: {e}")
    
    # Upload tokenizer
    print(f"Uploading tokenizer to {TOKENIZER_NAME}...")
    api = HfApi(token=os.environ.get("HF_TOKEN"))
    
    try:
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo="tokenizer.json",
            repo_id=TOKENIZER_NAME,
            repo_type="model",
        )
        print("Tokenizer JSON uploaded successfully!")
    except Exception as e:
        print(f"Error uploading tokenizer.json: {e}")
    
    # Create and upload tokenizer_config.json
    config = {
        "tokenizer_class": "ByteLevelBPETokenizer",
        "unk_token": SPECIAL_TOKENS["unk_token"],
        "bos_token": SPECIAL_TOKENS["bos_token"],
        "eos_token": SPECIAL_TOKENS["eos_token"],
        "pad_token": SPECIAL_TOKENS["pad_token"],
    }
    
    config_path = "tokenizer_config.json"
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)
    
    try:
        api.upload_file(
            path_or_fileobj=config_path,
            path_in_repo="tokenizer_config.json",
            repo_id=TOKENIZER_NAME,
            repo_type="model",
        )
        print("Tokenizer config uploaded successfully!")
    except Exception as e:
        print(f"Error uploading tokenizer_config.json: {e}")
    
    # Create special_tokens_map.json
    special_tokens_map = {
        "unk_token": {"content": SPECIAL_TOKENS["unk_token"], "lstrip": False, "normalized": True, "rstrip": False},
        "bos_token": {"content": SPECIAL_TOKENS["bos_token"], "lstrip": False, "normalized": True, "rstrip": False},
        "eos_token": {"content": SPECIAL_TOKENS["eos_token"], "lstrip": False, "normalized": True, "rstrip": False},
        "pad_token": {"content": SPECIAL_TOKENS["pad_token"], "lstrip": False, "normalized": True, "rstrip": False},
    }
    
    special_tokens_path = "special_tokens_map.json"
    with open(special_tokens_path, "w") as f:
        json.dump(special_tokens_map, f, indent=2)
    
    try:
        api.upload_file(
            path_or_fileobj=special_tokens_path,
            path_in_repo="special_tokens_map.json",
            repo_id=TOKENIZER_NAME,
            repo_type="model",
        )
        print("Special tokens map uploaded successfully!")
    except Exception as e:
        print(f"Error uploading special_tokens_map.json: {e}")
    
    print(f"\nTokenizer training complete!")
    print(f"Access your tokenizer at: https://huggingface.co/{TOKENIZER_NAME}")


if __name__ == "__main__":
    train_and_upload_tokenizer()
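
# Illustrative downstream usage (a sketch, not part of this script): once the
# files above are on the Hub, the tokenizer can usually be loaded back either
# from the local tokenizer.json or from the uploaded repo, for example:
#
#     from tokenizers import Tokenizer
#     tok = Tokenizer.from_file("tokenizer.json")
#
#     from transformers import PreTrainedTokenizerFast
#     tok = PreTrainedTokenizerFast.from_pretrained(TOKENIZER_NAME)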