---
license: mit
library_name: transformers
tags:
- cl100k_base
- tiktoken
---
# cl100k_base as `transformers` GPT2 tokenizer

The `cl100k_base` vocab, converted from `tiktoken` to the Hugging Face `tokenizers` format via [this code](https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee) by Xenova.
```py
from transformers import AutoTokenizer, GPT2TokenizerFast

tokenizer = AutoTokenizer.from_pretrained("BEE-spoke-data/cl100k_base")
# if AutoTokenizer gives you trouble, load the class directly:
# tokenizer = GPT2TokenizerFast.from_pretrained("BEE-spoke-data/cl100k_base")
```
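To confirm the conversion round-trips cleanly, you can compare token IDs against the reference `tiktoken` encoding. A minimal sketch, assuming `tiktoken` is installed; `add_special_tokens=False` keeps the comparison to the raw BPE output:

```py
import tiktoken
from transformers import AutoTokenizer

hf_tok = AutoTokenizer.from_pretrained("BEE-spoke-data/cl100k_base")
ref_enc = tiktoken.get_encoding("cl100k_base")

text = "The quick brown fox jumps over the lazy dog."
hf_ids = hf_tok.encode(text, add_special_tokens=False)  # converted tokenizer
ref_ids = ref_enc.encode(text)                          # original tiktoken encoding
assert hf_ids == ref_ids, (hf_ids, ref_ids)
print(len(hf_ids), "tokens match")
```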
## details
```py
GPT2TokenizerFast(
    name_or_path="BEE-spoke-data/cl100k_base",
    vocab_size=100261,
    model_max_length=8192,
    is_fast=True,
    padding_side="right",
    truncation_side="right",
    special_tokens={
        "bos_token": "<|endoftext|>",
        "eos_token": "<|endoftext|>",
        "unk_token": "<|endoftext|>",
    },
    clean_up_tokenization_spaces=True,
    added_tokens_decoder={
        "100257": AddedToken(
            "<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True
        ),
        "100258": AddedToken(
            "<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True
        ),
        "100259": AddedToken(
            "<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True
        ),
        "100260": AddedToken(
            "<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True
        ),
        "100276": AddedToken(
            "<|endofprompt|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True
        ),
    },
)
```
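The `added_tokens_decoder` above exposes the `cl100k_base` FIM special tokens. A minimal sketch of padding setup and of assembling a fill-in-the-middle prompt string; the prefix/suffix/middle ordering shown here is an assumed convention (it depends on the downstream model), and since no pad token is defined, `<|endoftext|>` is reused for padding:

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("BEE-spoke-data/cl100k_base")

# No dedicated pad token is defined, so reuse <|endoftext|> when padding is needed.
tokenizer.pad_token = tokenizer.eos_token

# Assemble a FIM prompt (prefix-suffix-middle ordering is an assumption, not enforced by the tokenizer).
prefix = "def add(a, b):\n    "
suffix = "\n    return result\n"
fim_prompt = f"<|fim_prefix|>{prefix}<|fim_suffix|>{suffix}<|fim_middle|>"

ids = tokenizer(fim_prompt, add_special_tokens=False).input_ids
print(tokenizer.convert_ids_to_tokens(ids))  # the FIM markers stay single tokens
```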