from pathlib import Path

from tokenizers import Tokenizer
from tokenizers.models import BPE
import tokenizers.pre_tokenizers as pre_tokenizers
import tokenizers.processors as processors
import tokenizers.decoders as decoders
from tokenizers.trainers import BpeTrainer

if __name__ == '__main__':
    # Directory where the trained tokenizer will be saved.
    tokenizer_path = Path('tokenizer/')
    tokenizer_path.mkdir(exist_ok=True)

    # Byte-level BPE tokenizer (the GPT-2 recipe): the pre-tokenizer maps raw
    # bytes to printable symbols, and the matching post-processor and decoder
    # invert that mapping so decoding round-trips the input text exactly.
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
    tokenizer.decoder = decoders.ByteLevel()

    # Learn BPE merges from the corpus; only pairs seen at least twice are
    # merged, and '<|endoftext|>' is reserved as a special token.
    trainer = BpeTrainer(special_tokens=['<|endoftext|>'], min_frequency=2)
    tokenizer.train(['data/harry_potter_data'], trainer)

    tokenizer.save(str(tokenizer_path / 'potter.json'))
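
Once training finishes, the serialized tokenizer can be loaded back from the JSON file and sanity-checked. A minimal sketch, assuming the script above has been run so that tokenizer/potter.json exists (the sample sentence is arbitrary):

from tokenizers import Tokenizer

# Load the tokenizer trained and saved by the script above.
tokenizer = Tokenizer.from_file('tokenizer/potter.json')

# Encode a sample sentence and inspect the learned subword pieces.
encoding = tokenizer.encode('Harry looked at the Marauders Map.')
print(encoding.tokens)  # learned BPE pieces; 'Ġ' marks a leading space
print(encoding.ids)     # corresponding vocabulary ids

# Byte-level decoding reconstructs the original text exactly.
assert tokenizer.decode(encoding.ids) == 'Harry looked at the Marauders Map.'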