import os
import shutil
import time

from LaughLM.data.tokenizer_train import train_tokenizer
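# Assumption: this runs in Google Colab and Drive may not yet be mounted in
# this session; the copy step below needs /content/drive to exist. drive.mount
# is idempotent, so this is safe even if Drive was mounted in an earlier cell.
from google.colab import drive

drive.mount("/content/drive")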
start = time.time()

# Train to fast local disk first; direct writes to a mounted Drive are slow.
local_path = "/tmp/tokenizer.json"
drive_path = "/content/drive/MyDrive/LaughLM/tokenizer/tokenizer.json"
# Train a 32k-vocab tokenizer on up to 1M samples of the HF dataset.
train_tokenizer(
    dataset_name="dignity045/tokenizer_dataset_v1",
    vocab_size=32000,
    max_samples=1_000_000,
    num_workers=16,
    work_dir="/tmp/tok_work",
    output_path=local_path,
)
# Copy the finished tokenizer to Google Drive so it persists after the Colab
# VM is recycled.
os.makedirs(os.path.dirname(drive_path), exist_ok=True)
shutil.copy(local_path, drive_path)
print(f"✓ Tokenizer copied to {drive_path}")
print(f"Total time: {(time.time() - start) / 60:.1f} minutes")