Invalid JSON:
Unexpected token 'r', "from tokeni"... is not valid JSON
# Train a byte-level BPE tokenizer on a plain-text corpus and write the
# result to "tokenizer.json" in the current working directory.
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

# Create a BPE tokenizer. The ByteLevel pre-tokenizer and the ByteLevel
# decoder are configured as a pair so decoded output maps back to text.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

# Train on your text data.
trainer = trainers.BpeTrainer(
    vocab_size=30000,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    # Seed the vocabulary with the full byte-level alphabet. Without this,
    # byte symbols that never occur in the training file are left out of the
    # vocabulary, and the BPE model above has no unk_token to fall back on.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

# Replace 'train.txt' with your text file containing all training data.
tokenizer.train(files=["train.txt"], trainer=trainer)

# Save the trained tokenizer as a single JSON file.
tokenizer.save("tokenizer.json")
print("tokenizer.json is ready!")