File size: 2,697 Bytes
14a2165 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
{
"gpt2": {
"data_gym_to_mergeable_bpe_ranks": {
"vocab_bpe_file": "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
"encoder_json_file": "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json"
},
"explicit_n_vocab": 50257,
"pat_str": "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
"special_tokens": {
"<|endoftext|>": 50256
}
},
"r50k_base": {
"load_tiktoken_bpe": "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
"explicit_n_vocab": 50257,
"pat_str": "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
"special_tokens": {
"<|endoftext|>": 50256
}
},
"p50k_base": {
"load_tiktoken_bpe": "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
"explicit_n_vocab": 50281,
"pat_str": "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+",
"special_tokens": {
"<|endoftext|>": 50256
}
},
"p50k_edit": {
"load_tiktoken_bpe": "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
"special_tokens": {
"<|endoftext|>": 50256,
"<|fim_prefix|>": 50281,
"<|fim_middle|>": 50282,
"<|fim_suffix|>": 50283
},
"pat_str": "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+"
},
"cl100k_base": {
"load_tiktoken_bpe": "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
"special_tokens": {
"<|endoftext|>": 100257,
"<|fim_prefix|>": 100258,
"<|fim_middle|>": 100259,
"<|fim_suffix|>": 100260,
"<|endofprompt|>": 100276
},
"pat_str": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
},
"o200k_base": {
"load_tiktoken_bpe": "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
"special_tokens": {
"<|endoftext|>": 199999,
"<|endofprompt|>": 200018
},
"pat_str": "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
}
}
|