```bash
# For autoregressive models trained with SentencePiece using byte_fallback.
# The SentencePiece model was trained with (Python-API equivalent below):
# ./spm_train --vocab_size 32000 --character_coverage 1.0 --hard_vocab_limit --model_type bpe --pad_id 3 --shuffle_input_sentence true --model_prefix ./sentencepiece --byte_fallback=true --input text.txt --input_sentence_size=100000 --num_threads 8

# Extract the vocabulary and merge rules from the trained model
wget -O sentencepiece_extractor.py https://raw.githubusercontent.com/huggingface/tokenizers/master/bindings/python/scripts/sentencepiece_extractor.py
python sentencepiece_extractor.py --provider sentencepiece --model sentencepiece.model --merges-output-path ./merges.txt --vocab-output-path ./vocab.json
```
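For reference, the commented `spm_train` invocation above maps roughly onto the `sentencepiece` Python API; this is a sketch, assuming the `sentencepiece` package is installed and `text.txt` is the training corpus:

```python
import sentencepiece as spm

# Rough Python-API mirror of the spm_train flags above (a sketch, not the
# command actually used; the CLI flag names carry over one-to-one as kwargs)
spm.SentencePieceTrainer.train(
    input="text.txt",
    model_prefix="./sentencepiece",  # writes ./sentencepiece.model and ./sentencepiece.vocab
    vocab_size=32000,
    character_coverage=1.0,
    hard_vocab_limit=True,
    model_type="bpe",
    pad_id=3,
    shuffle_input_sentence=True,
    byte_fallback=True,
    input_sentence_size=100000,
    num_threads=8,
)
```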
```bash
python <<EOF
from transformers import AutoTokenizer
from tokenizers import SentencePieceBPETokenizer

# Rebuild a fast BPE tokenizer from the extracted vocab and merges
tokenizer = SentencePieceBPETokenizer.from_file("./vocab.json", "./merges.txt")

# Mirror the SentencePiece behaviour: fall back to byte pieces for
# out-of-vocabulary characters and fuse consecutive unknown tokens
tokenizer.model.byte_fallback = True
tokenizer.model.fuse_unk = True
tokenizer.save("./tokenizer.json")

# Reload through transformers (assumes ./ already contains the model's
# config so AutoTokenizer can resolve the tokenizer class) and save
htok = AutoTokenizer.from_pretrained("./")
htok.padding_side = "right"
htok.save_pretrained("./")
EOF
```
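To sanity-check the conversion, compare the pieces produced by the original SentencePiece model with those from the converted tokenizer; a minimal sketch, assuming the `sentencepiece` package is installed and both `sentencepiece.model` and the converted files sit in the current directory:

```python
import sentencepiece as spm
from transformers import AutoTokenizer

sp = spm.SentencePieceProcessor(model_file="sentencepiece.model")
htok = AutoTokenizer.from_pretrained("./")

text = "Hello world! Ünïcode byte-fallback test 😀"
print(sp.encode(text, out_type=str))  # pieces from the original model
print(htok.tokenize(text))            # pieces from the converted tokenizer
```

The two piece sequences should line up if the extraction preserved the vocabulary and merges; byte-fallback pieces such as `<0xF0>` mark characters outside the learned vocabulary.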