# Spaces: Sleeping
# Train a byte-level BPE tokenizer on training_data.txt and save it to
# aoban_tokenizer.json.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.processors import TemplateProcessing

# BPE model with an explicit unknown token; the pre-tokenizer and decoder are
# both byte-level so encoded text can be decoded back through the same mapping.
tokenizer = Tokenizer(BPE(unk_token="<UNK>"))
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(
    vocab_size=5000,
    min_frequency=1,  # Lowered to combine words more easily
    special_tokens=["<PAD>", "<UNK>", "<|endoftext|>"],
    # Seed the vocabulary with every byte-level symbol. Without this, a byte
    # that never occurs in the training file has no token of its own and is
    # encoded as <UNK>, which cannot be decoded back to the original text.
    initial_alphabet=ByteLevel.alphabet(),
)

tokenizer.train(
    files=["training_data.txt"],
    trainer=trainer,
)

# Append <|endoftext|> to every single-sequence encoding. The id is looked up
# from the freshly trained vocabulary rather than hard-coded.
tokenizer.post_processor = TemplateProcessing(
    single="$A <|endoftext|>",
    special_tokens=[
        ("<|endoftext|>", tokenizer.token_to_id("<|endoftext|>")),
    ],
)

tokenizer.save("aoban_tokenizer.json")
print("Tokenizer fixed and saved.")