Add Qwen3-4B-Base tokenizer
- app.py +4 -4
- tests/test_utils.py +3 -1
- utils.py +4 -2
app.py
CHANGED
@@ -37,7 +37,7 @@ def read_svg_file(name: str) -> str:
 def get_model_icon(name: str) -> str:
     """Get the HTML for the model icon."""
     # Skip icons for collapsed models
-    if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
+    if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b", "Qwen/Qwen3-4B-Base"]:
         return ""
 
     svg_content = read_svg_file(name)
@@ -92,7 +92,7 @@ with gr.Blocks(title="Tokens matter.", theme=theme, css="""
     with gr.Row():
         with gr.Column():
             for name in all_tokenizer_names[:2]:
-                if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
+                if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b", "Qwen/Qwen3-4B-Base"]:
                     continue
                 display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                 with gr.Group(elem_classes="tokenizer-panel"):
@@ -101,7 +101,7 @@ with gr.Blocks(title="Tokens matter.", theme=theme, css="""
                     main_output_boxes.append(box)
         with gr.Column():
             for name in all_tokenizer_names[2:4]:
-                if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
+                if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b", "Qwen/Qwen3-4B-Base"]:
                     continue
                 display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                 with gr.Group(elem_classes="tokenizer-panel"):
@@ -113,7 +113,7 @@ with gr.Blocks(title="Tokens matter.", theme=theme, css="""
     more_models = gr.Accordion("More Models", open=False, visible=False)
     with more_models:
         for name in all_tokenizer_names:
-            if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
+            if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b", "Qwen/Qwen3-4B-Base"]:
                 display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                 with gr.Group(elem_classes="tokenizer-panel"):
                     gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
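The same exclusion list is now repeated verbatim in four places in app.py. Not part of this commit, but a minimal sketch of keeping it in one module-level constant so the call sites cannot drift apart (COLLAPSED_MODELS and is_collapsed are hypothetical names, not in the diff):

# Hypothetical refactor: one shared constant for the models hidden from the
# main grid and shown under the "More Models" accordion.
COLLAPSED_MODELS = {
    "google/gemma-3-27b-it",
    "mistralai/Mistral-Nemo-Instruct-2407",
    "CohereLabs/aya-expanse-8b",
    "Qwen/Qwen3-4B-Base",
}

def is_collapsed(name: str) -> bool:
    # Stand-in for the repeated inline list in get_model_icon and the layout loops.
    return name in COLLAPSED_MODELS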
tests/test_utils.py
CHANGED
@@ -19,7 +19,7 @@ class TestTokenizerLoading(unittest.TestCase):
         tokenizers = load_tokenizers()
 
         # Check that we have the expected number of tokenizers
-        expected_count =
+        expected_count = 8  # 7 HF + 1 OpenAI
         self.assertEqual(len(tokenizers), expected_count)
 
         # Check that all expected tokenizers are present
@@ -29,6 +29,8 @@ class TestTokenizerLoading(unittest.TestCase):
             "ZurichNLP/swissbert",
             "mistralai/Mistral-Nemo-Instruct-2407",
             "google/gemma-3-27b-it",
+            "CohereLabs/aya-expanse-8b",
+            "Qwen/Qwen3-4B-Base",
             "gpt-4o"
         }
         self.assertEqual(set(tokenizers.keys()), expected_names)
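The updated test pins load_tokenizers() at 8 entries (7 Hugging Face tokenizers plus gpt-4o) and adds the two missing repo ids to expected_names. A quick sanity check along the same lines, assuming utils.py is importable from the repository root:

from utils import load_tokenizers

tokenizers = load_tokenizers()
assert len(tokenizers) == 8                 # 7 HF tokenizers + gpt-4o
assert "Qwen/Qwen3-4B-Base" in tokenizers   # the entry added by this commit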
utils.py
CHANGED
@@ -96,6 +96,7 @@ def load_tokenizers() -> Dict[str, object]:
         "google/gemma-3-27b-it",
         "mistralai/Mistral-Nemo-Instruct-2407",
         "CohereLabs/aya-expanse-8b",
+        "Qwen/Qwen3-4B-Base",
     ]
     for name in hf_names:
         tokenizer_name, tokenizer = load_hf_tokenizer(name)
@@ -111,7 +112,8 @@ MODEL_DISPLAY_NAMES = {
     "mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
     "google/gemma-3-27b-it": "Gemma 3",
     "gpt-4o": "ChatGPT (gpt-4o)",
-    "CohereLabs/aya-expanse-8b": "Aya Expanse"
+    "CohereLabs/aya-expanse-8b": "Aya Expanse",
+    "Qwen/Qwen3-4B-Base": "Qwen3 4B Base",
 }
 
 def tokenize(s: str, tokenizer) -> List[str]:
@@ -156,7 +158,7 @@ def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
     """
     if hasattr(tokenizer, "tokenize"):
         encoded = tokenizer.encode(s, add_special_tokens=False)
-        if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]):
+        if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya", "qwen"]):
             tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
         else:
             tokens = tokenizer.convert_ids_to_tokens(encoded)
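Adding "qwen" to the name check in _tokenize_with_tokenizer routes the new tokenizer through the per-id decode branch. Like the Llama, Mistral, and Aya tokenizers already listed, Qwen3 uses a byte-level BPE vocabulary, so convert_ids_to_tokens returns raw vocabulary pieces with byte-level markers (Ġ for a leading space), while decoding each id separately yields readable text. A minimal illustration, assuming transformers is installed and the Qwen/Qwen3-4B-Base tokenizer files can be downloaded:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Base")
ids = tok.encode("Hello world", add_special_tokens=False)

# Raw vocabulary pieces keep the byte-level space marker, e.g. ['Hello', 'Ġworld'].
print(tok.convert_ids_to_tokens(ids))
# Per-id decoding yields readable text, e.g. ['Hello', ' world'].
print([tok.decode([i], skip_special_tokens=False) for i in ids])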