Use tiktoken for gpt-4o
Browse files
app.py
CHANGED
|
@@ -50,7 +50,7 @@ test_phrase_set_long_3 = load_test_phrases('multilingualphrases03.txt')
|
|
| 50 |
hf_tokenizers = {}
|
| 51 |
for model in models:
|
| 52 |
print("Loading tokenizer for", model)
|
| 53 |
-
if model
|
| 54 |
hf_tokenizers[model] = AutoTokenizer.from_pretrained(model)
|
| 55 |
else:
|
| 56 |
hf_tokenizers[model] = tiktoken.encoding_for_model(model)
|
|
@@ -60,7 +60,7 @@ def generate_tokens_as_table(text):
|
|
| 60 |
table = []
|
| 61 |
for model in models:
|
| 62 |
tokenizer = hf_tokenizers[model]
|
| 63 |
-
if model
|
| 64 |
tokens = tokenizer.encode(text, add_special_tokens=False)
|
| 65 |
else:
|
| 66 |
tokens = tokenizer.encode(text)
|
|
@@ -87,7 +87,7 @@ def generate_tokenizer_table(text):
|
|
| 87 |
|
| 88 |
for model in models:
|
| 89 |
tokenizer = hf_tokenizers[model]
|
| 90 |
-
if model
|
| 91 |
vocab_size[model] = tokenizer.vocab_size
|
| 92 |
token_counts[model] = len(tokenizer.encode(text, add_special_tokens=True))
|
| 93 |
else:
|
|
|
|
| 50 |
hf_tokenizers = {}
|
| 51 |
for model in models:
|
| 52 |
print("Loading tokenizer for", model)
|
| 53 |
+
if model != 'gpt-4o':
|
| 54 |
hf_tokenizers[model] = AutoTokenizer.from_pretrained(model)
|
| 55 |
else:
|
| 56 |
hf_tokenizers[model] = tiktoken.encoding_for_model(model)
|
|
|
|
| 60 |
table = []
|
| 61 |
for model in models:
|
| 62 |
tokenizer = hf_tokenizers[model]
|
| 63 |
+
if model != 'gpt-4o':
|
| 64 |
tokens = tokenizer.encode(text, add_special_tokens=False)
|
| 65 |
else:
|
| 66 |
tokens = tokenizer.encode(text)
|
|
|
|
| 87 |
|
| 88 |
for model in models:
|
| 89 |
tokenizer = hf_tokenizers[model]
|
| 90 |
+
if model != 'gpt-4o':
|
| 91 |
vocab_size[model] = tokenizer.vocab_size
|
| 92 |
token_counts[model] = len(tokenizer.encode(text, add_special_tokens=True))
|
| 93 |
else:
|