import os
import shutil
import time

from LaughLM.data.tokenizer_train import train_tokenizer

start = time.time()

# Train to fast local disk first: /tmp is wiped when the Colab VM recycles,
# while files copied to Drive persist across sessions.
local_path = "/tmp/tokenizer.json"
drive_path = "/content/drive/MyDrive/LaughLM/tokenizer/tokenizer.json"

# Train the tokenizer (32k vocab, at most 1M samples from the dataset).
train_tokenizer(
    dataset_name="dignity045/tokenizer_dataset_v1",
    vocab_size=32000,
    max_samples=1_000_000,
    num_workers=16,
    work_dir="/tmp/tok_work",
    output_path=local_path,
)

# Copy the finished tokenizer to Google Drive (assumes Drive is already
# mounted at /content/drive, e.g. via google.colab's drive.mount).
os.makedirs(os.path.dirname(drive_path), exist_ok=True)
shutil.copy(local_path, drive_path)

print("✓ Tokenizer copied to Drive")
print("Total time:", (time.time()-start)/60, "minutes")