# LaughLM/scripts/train_tokenizer.py
# Duplicated from Dhiraj45/LaughLM by dignity045 (commit 9639af0).
from LaughLM.data.tokenizer_train import train_tokenizer
import shutil
import os
import time
# Train a BPE tokenizer on fast local /tmp storage, then persist the
# resulting tokenizer.json to Google Drive so it survives the session.
start = time.time()

local_path = "/tmp/tokenizer.json"
drive_path = "/content/drive/MyDrive/LaughLM/tokenizer/tokenizer.json"

train_tokenizer(
    dataset_name="dignity045/tokenizer_dataset_v1",
    vocab_size=32000,
    max_samples=1_000_000,
    num_workers=16,
    work_dir="/tmp/tok_work",  # scratch space for intermediate files
    output_path=local_path,
)

# Copy to Google Drive after training; create the target directory if
# it does not exist yet (idempotent thanks to exist_ok=True).
os.makedirs(os.path.dirname(drive_path), exist_ok=True)
shutil.copy(local_path, drive_path)
print("✓ Tokenizer copied to Drive")

# Report wall-clock time in minutes, formatted to two decimals instead
# of dumping the raw float.
elapsed_min = (time.time() - start) / 60
print(f"Total time: {elapsed_min:.2f} minutes")