import os
import shutil
import time

from LaughLM.data.tokenizer_train import train_tokenizer

start = time.time()
local_path = "/tmp/tokenizer.json"
drive_path = "/content/drive/MyDrive/LaughLM/tokenizer/tokenizer.json"
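# Note: the Drive destination assumes this runs in a Colab session with Google
# Drive already mounted, e.g.:
#   from google.colab import drive
#   drive.mount("/content/drive")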
# Train the tokenizer (32k vocab, up to 1M samples), writing to local disk first
train_tokenizer(
    dataset_name="dignity045/tokenizer_dataset_v1",
    vocab_size=32000,
    max_samples=1_000_000,
    num_workers=16,
    work_dir="/tmp/tok_work",
    output_path=local_path,
)
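# Optional sanity check (assuming train_tokenizer writes a Hugging Face
# `tokenizers`-compatible JSON file at output_path):
#   from tokenizers import Tokenizer
#   tok = Tokenizer.from_file(local_path)
#   print(tok.encode("hello world").tokens)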
# Copy to Google Drive after training
os.makedirs(os.path.dirname(drive_path), exist_ok=True)
shutil.copy(local_path, drive_path)
print("✓ Tokenizer copied to Drive")
print("Total time:", (time.time()-start)/60, "minutes")