"""Tokenizer training script: trains a byte-level BPE tokenizer on SMILES data
and uploads it to the Hugging Face Hub.

Expects an HF_TOKEN environment variable with write access for the Hub uploads.
"""
|
|
import json
import os

from datasets import DatasetDict
from huggingface_hub import HfApi, create_repo
from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers

from config import CACHE_DIR, MIN_FREQUENCY, SPECIAL_TOKENS, TOKENIZER_NAME, VOCAB_SIZE
|
|
def iter_text(ds: DatasetDict):
    """Iterate over source and target text from every split of the dataset."""
    for split in ds:
        for row in ds[split]:
            yield row["source"]
            yield row["target"]
|
|
def train_and_upload_tokenizer():
    """Train a BPE tokenizer and upload it to the Hugging Face Hub."""
    print("=" * 60)
    print("Training Tokenizer")
    print("=" * 60)

    print(f"Loading forward dataset from {CACHE_DIR / 'forward'}...")
    forward = DatasetDict.load_from_disk(str(CACHE_DIR / "forward"))
|
    print("Creating byte-level BPE tokenizer...")
    tokenizer = Tokenizer(models.BPE(unk_token=SPECIAL_TOKENS["unk_token"]))
    # SMILES strings have no word boundaries, so skip the prefix space that
    # ByteLevel would otherwise prepend.
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    # Pair the byte-level pre-tokenizer with its decoder so decoded text
    # round-trips cleanly instead of surfacing byte-level markers.
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        min_frequency=MIN_FREQUENCY,
        special_tokens=list(SPECIAL_TOKENS.values()),
        # Seed with the full byte-level alphabet so every byte stays encodable.
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )
|
    print("Training tokenizer on dataset...")
    # iter_text yields two strings (source and target) per row in every split,
    # so the progress length must count both, not just the train/validation rows.
    total_texts = 2 * sum(len(forward[split]) for split in forward)
    tokenizer.train_from_iterator(iter_text(forward), trainer=trainer, length=total_texts)
|
    local_path = "tokenizer.json"
    tokenizer.save(local_path)
    print(f"Saved tokenizer to {local_path}")
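
    # Quick sanity check before uploading (illustrative addition: "CCO",
    # ethanol, is an arbitrary SMILES string, not taken from the dataset).
    sample = "CCO"
    print(f"Sanity check: {sample!r} -> {tokenizer.encode(sample).tokens}")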
|
    print(f"Creating/accessing tokenizer repo: {TOKENIZER_NAME}")
    try:
        create_repo(
            TOKENIZER_NAME,
            repo_type="model",
            exist_ok=True,
            private=False,
            token=os.environ.get("HF_TOKEN"),
        )
    except Exception as e:
        print(f"Note: {e}")
|
    # Authenticated client used for all uploads below.
    api = HfApi(token=os.environ.get("HF_TOKEN"))
|
    # Minimal config that transformers expects alongside tokenizer.json; all
    # three files are written first and uploaded together below.
    config = {
        # "PreTrainedTokenizerFast" is resolvable by transformers' AutoTokenizer;
        # the tokenizers-library name "ByteLevelBPETokenizer" is not.
        "tokenizer_class": "PreTrainedTokenizerFast",
        "unk_token": SPECIAL_TOKENS["unk_token"],
        "bos_token": SPECIAL_TOKENS["bos_token"],
        "eos_token": SPECIAL_TOKENS["eos_token"],
        "pad_token": SPECIAL_TOKENS["pad_token"],
    }
|
    config_path = "tokenizer_config.json"
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)
|
    # Mark special tokens as not normalized so they are matched verbatim.
    special_tokens_map = {
        name: {"content": SPECIAL_TOKENS[name], "lstrip": False, "normalized": False, "rstrip": False}
        for name in ("unk_token", "bos_token", "eos_token", "pad_token")
    }
|
    special_tokens_path = "special_tokens_map.json"
    with open(special_tokens_path, "w") as f:
        json.dump(special_tokens_map, f, indent=2)
|
    print(f"Uploading tokenizer files to {TOKENIZER_NAME}...")
    # The local filenames match the in-repo filenames, so one loop replaces
    # the three near-identical upload blocks.
    for path in (local_path, config_path, special_tokens_path):
        try:
            api.upload_file(
                path_or_fileobj=path,
                path_in_repo=path,
                repo_id=TOKENIZER_NAME,
                repo_type="model",
            )
            print(f"{path} uploaded successfully!")
        except Exception as e:
            print(f"Error uploading {path}: {e}")
|
    print("\nTokenizer training complete!")
    print(f"Access your tokenizer at: https://huggingface.co/{TOKENIZER_NAME}")
|
|
if __name__ == "__main__":
    train_and_upload_tokenizer()
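
# Loading sketch (illustrative, not part of the original script; assumes the
# upload succeeded, and "CCO" is an arbitrary SMILES string):
#
#   from tokenizers import Tokenizer
#   tok = Tokenizer.from_pretrained(TOKENIZER_NAME)
#   print(tok.encode("CCO").tokens)
#
# or, through transformers, using the tokenizer_config.json written above:
#
#   from transformers import PreTrainedTokenizerFast
#   tok = PreTrainedTokenizerFast.from_pretrained(TOKENIZER_NAME)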