"""
Tokenizer training script - trains BPE tokenizer on SMILES data and uploads to HF Hub.
"""
import json
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from datasets import DatasetDict
from config import CACHE_DIR, TOKENIZER_NAME, SPECIAL_TOKENS, VOCAB_SIZE, MIN_FREQUENCY
from huggingface_hub import HfApi, create_repo
import os
def iter_text(ds: DatasetDict):
    """Yield every training string in the dataset: for each row of each
    split, the "source" text followed by the "target" text."""
    for split_name in ds:
        for example in ds[split_name]:
            for field in ("source", "target"):
                yield example[field]
def _upload_to_hub(api, local_path: str, repo_path: str, success_msg: str) -> None:
    """Upload one file to the TOKENIZER_NAME model repo, printing the outcome
    instead of raising so one failed artifact doesn't abort the rest."""
    try:
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=repo_path,
            repo_id=TOKENIZER_NAME,
            repo_type="model",
        )
        print(success_msg)
    except Exception as e:
        print(f"Error uploading {repo_path}: {e}")


def train_and_upload_tokenizer():
    """Train BPE tokenizer and upload to Hugging Face Hub.

    Loads the cached "forward" DatasetDict from CACHE_DIR, trains a
    byte-level BPE tokenizer on all source/target strings, saves it locally,
    then pushes tokenizer.json, tokenizer_config.json and
    special_tokens_map.json to the TOKENIZER_NAME model repo.
    Authentication uses the HF_TOKEN environment variable (may be unset for
    anonymous/read-only setups; uploads will then report an error).
    """
    print("=" * 60)
    print("Training Tokenizer")
    print("=" * 60)

    # Load the preprocessed dataset from the on-disk cache.
    print(f"Loading forward dataset from {CACHE_DIR / 'forward'}...")
    forward = DatasetDict.load_from_disk(str(CACHE_DIR / "forward"))

    # Build a byte-level BPE tokenizer so any SMILES byte sequence is coverable.
    print("Creating BPE tokenizer...")
    tokenizer = Tokenizer(models.BPE(unk_token=SPECIAL_TOKENS["unk_token"]))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        min_frequency=MIN_FREQUENCY,
        special_tokens=list(SPECIAL_TOKENS.values()),
    )

    # `length` is only a progress-bar hint, but it should match the number of
    # yielded strings: iter_text yields TWO strings (source and target) per
    # row of EVERY split. The previous count used rows (not strings) and
    # ignored any split other than train/validation.
    print("Training tokenizer on dataset...")
    total_texts = 2 * sum(len(forward[split]) for split in forward)
    tokenizer.train_from_iterator(iter_text(forward), trainer=trainer, length=total_texts)

    # Persist the trained tokenizer locally before uploading.
    local_path = "tokenizer.json"
    tokenizer.save(local_path)
    print(f"Saved tokenizer to {local_path}")

    # Create the repo if it doesn't exist; exist_ok makes this idempotent.
    print(f"Creating/accessing tokenizer repo: {TOKENIZER_NAME}")
    try:
        create_repo(
            TOKENIZER_NAME,
            repo_type="model",
            exist_ok=True,
            private=False,
            token=os.environ.get("HF_TOKEN"),
        )
    except Exception as e:
        # Best-effort: a real auth/permission problem will surface again below.
        print(f"Note: {e}")

    api = HfApi(token=os.environ.get("HF_TOKEN"))

    # Upload the trained tokenizer itself.
    print(f"Uploading tokenizer to {TOKENIZER_NAME}...")
    _upload_to_hub(api, local_path, "tokenizer.json",
                   "Tokenizer JSON uploaded successfully!")

    # transformers-style config so loaders can resolve the special tokens.
    config = {
        "tokenizer_class": "ByteLevelBPETokenizer",
        "unk_token": SPECIAL_TOKENS["unk_token"],
        "bos_token": SPECIAL_TOKENS["bos_token"],
        "eos_token": SPECIAL_TOKENS["eos_token"],
        "pad_token": SPECIAL_TOKENS["pad_token"],
    }
    config_path = "tokenizer_config.json"
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)
    _upload_to_hub(api, config_path, "tokenizer_config.json",
                   "Tokenizer config uploaded successfully!")

    # Special-token metadata in the format transformers expects.
    special_tokens_map = {
        name: {"content": SPECIAL_TOKENS[name], "lstrip": False,
               "normalized": True, "rstrip": False}
        for name in ("unk_token", "bos_token", "eos_token", "pad_token")
    }
    special_tokens_path = "special_tokens_map.json"
    with open(special_tokens_path, "w") as f:
        json.dump(special_tokens_map, f, indent=2)
    _upload_to_hub(api, special_tokens_path, "special_tokens_map.json",
                   "Special tokens map uploaded successfully!")

    print("\nTokenizer training complete!")
    print(f"Access your tokenizer at: https://huggingface.co/{TOKENIZER_NAME}")


if __name__ == "__main__":
    train_and_upload_tokenizer()