"""Train a byte-level BPE tokenizer on a text corpus and save its model files.

Looks for the training corpus at ``../data/input.txt`` relative to the
current working directory, falling back to ``data/input.txt``, then writes
the trained vocab/merges files into this script's own directory.
"""
from tokenizers import ByteLevelBPETokenizer
import os


def main() -> None:
    """Locate the corpus, train the tokenizer, and save the model files.

    Raises:
        FileNotFoundError: if the training corpus is not found at either
            candidate location.
    """
    # Support running either from a subdirectory (../data) or the repo
    # root (data) — same two candidates the original script probed.
    input_path = os.path.join("..", "data", "input.txt")
    if not os.path.exists(input_path):
        input_path = os.path.join("data", "input.txt")
    if not os.path.exists(input_path):
        # Fail fast with a clear message instead of an opaque error from
        # tokenizer.train() on a missing file.
        raise FileNotFoundError(
            "Training corpus not found at ../data/input.txt or data/input.txt"
        )

    tokenizer = ByteLevelBPETokenizer()
    # train() accepts a single path string or a list of paths.
    tokenizer.train(files=input_path, vocab_size=1000, min_frequency=2)

    # os.path.dirname(__file__) is "" when the script is run by bare
    # filename from its own directory, which would hand save_model an
    # invalid directory; abspath() guarantees a real directory path.
    out_dir = os.path.dirname(os.path.abspath(__file__))
    tokenizer.save_model(out_dir)
    print("Tokenizer trained and saved.")


if __name__ == "__main__":
    main()