jvamvas committed on
Commit
ff68509
·
1 Parent(s): 35e7f94

Add Qwen3-4B-Base tokenizer

Browse files
Files changed (3) hide show
  1. app.py +4 -4
  2. tests/test_utils.py +3 -1
  3. utils.py +4 -2
app.py CHANGED
@@ -37,7 +37,7 @@ def read_svg_file(name: str) -> str:
37
  def get_model_icon(name: str) -> str:
38
  """Get the HTML for the model icon."""
39
  # Skip icons for collapsed models
40
- if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
41
  return ""
42
 
43
  svg_content = read_svg_file(name)
@@ -92,7 +92,7 @@ with gr.Blocks(title="Tokens matter.", theme=theme, css="""
92
  with gr.Row():
93
  with gr.Column():
94
  for name in all_tokenizer_names[:2]:
95
- if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
96
  continue
97
  display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
98
  with gr.Group(elem_classes="tokenizer-panel"):
@@ -101,7 +101,7 @@ with gr.Blocks(title="Tokens matter.", theme=theme, css="""
101
  main_output_boxes.append(box)
102
  with gr.Column():
103
  for name in all_tokenizer_names[2:4]:
104
- if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
105
  continue
106
  display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
107
  with gr.Group(elem_classes="tokenizer-panel"):
@@ -113,7 +113,7 @@ with gr.Blocks(title="Tokens matter.", theme=theme, css="""
113
  more_models = gr.Accordion("More Models", open=False, visible=False)
114
  with more_models:
115
  for name in all_tokenizer_names:
116
- if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
117
  display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
118
  with gr.Group(elem_classes="tokenizer-panel"):
119
  gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
 
37
  def get_model_icon(name: str) -> str:
38
  """Get the HTML for the model icon."""
39
  # Skip icons for collapsed models
40
+ if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b", "Qwen/Qwen3-4B-Base"]:
41
  return ""
42
 
43
  svg_content = read_svg_file(name)
 
92
  with gr.Row():
93
  with gr.Column():
94
  for name in all_tokenizer_names[:2]:
95
+ if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b", "Qwen/Qwen3-4B-Base"]:
96
  continue
97
  display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
98
  with gr.Group(elem_classes="tokenizer-panel"):
 
101
  main_output_boxes.append(box)
102
  with gr.Column():
103
  for name in all_tokenizer_names[2:4]:
104
+ if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b", "Qwen/Qwen3-4B-Base"]:
105
  continue
106
  display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
107
  with gr.Group(elem_classes="tokenizer-panel"):
 
113
  more_models = gr.Accordion("More Models", open=False, visible=False)
114
  with more_models:
115
  for name in all_tokenizer_names:
116
+ if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b", "Qwen/Qwen3-4B-Base"]:
117
  display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
118
  with gr.Group(elem_classes="tokenizer-panel"):
119
  gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
tests/test_utils.py CHANGED
@@ -19,7 +19,7 @@ class TestTokenizerLoading(unittest.TestCase):
19
  tokenizers = load_tokenizers()
20
 
21
  # Check that we have the expected number of tokenizers
22
- expected_count = 6 # 5 HF + 1 OpenAI
23
  self.assertEqual(len(tokenizers), expected_count)
24
 
25
  # Check that all expected tokenizers are present
@@ -29,6 +29,8 @@ class TestTokenizerLoading(unittest.TestCase):
29
  "ZurichNLP/swissbert",
30
  "mistralai/Mistral-Nemo-Instruct-2407",
31
  "google/gemma-3-27b-it",
 
 
32
  "gpt-4o"
33
  }
34
  self.assertEqual(set(tokenizers.keys()), expected_names)
 
19
  tokenizers = load_tokenizers()
20
 
21
  # Check that we have the expected number of tokenizers
22
+ expected_count = 8 # 7 HF + 1 OpenAI
23
  self.assertEqual(len(tokenizers), expected_count)
24
 
25
  # Check that all expected tokenizers are present
 
29
  "ZurichNLP/swissbert",
30
  "mistralai/Mistral-Nemo-Instruct-2407",
31
  "google/gemma-3-27b-it",
32
+ "CohereLabs/aya-expanse-8b",
33
+ "Qwen/Qwen3-4B-Base",
34
  "gpt-4o"
35
  }
36
  self.assertEqual(set(tokenizers.keys()), expected_names)
utils.py CHANGED
@@ -96,6 +96,7 @@ def load_tokenizers() -> Dict[str, object]:
96
  "google/gemma-3-27b-it",
97
  "mistralai/Mistral-Nemo-Instruct-2407",
98
  "CohereLabs/aya-expanse-8b",
 
99
  ]
100
  for name in hf_names:
101
  tokenizer_name, tokenizer = load_hf_tokenizer(name)
@@ -111,7 +112,8 @@ MODEL_DISPLAY_NAMES = {
111
  "mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
112
  "google/gemma-3-27b-it": "Gemma 3",
113
  "gpt-4o": "ChatGPT (gpt-4o)",
114
- "CohereLabs/aya-expanse-8b": "Aya Expanse"
 
115
  }
116
 
117
  def tokenize(s: str, tokenizer) -> List[str]:
@@ -156,7 +158,7 @@ def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
156
  """
157
  if hasattr(tokenizer, "tokenize"):
158
  encoded = tokenizer.encode(s, add_special_tokens=False)
159
- if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]):
160
  tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
161
  else:
162
  tokens = tokenizer.convert_ids_to_tokens(encoded)
 
96
  "google/gemma-3-27b-it",
97
  "mistralai/Mistral-Nemo-Instruct-2407",
98
  "CohereLabs/aya-expanse-8b",
99
+ "Qwen/Qwen3-4B-Base",
100
  ]
101
  for name in hf_names:
102
  tokenizer_name, tokenizer = load_hf_tokenizer(name)
 
112
  "mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
113
  "google/gemma-3-27b-it": "Gemma 3",
114
  "gpt-4o": "ChatGPT (gpt-4o)",
115
+ "CohereLabs/aya-expanse-8b": "Aya Expanse",
116
+ "Qwen/Qwen3-4B-Base": "Qwen3 4B Base",
117
  }
118
 
119
  def tokenize(s: str, tokenizer) -> List[str]:
 
158
  """
159
  if hasattr(tokenizer, "tokenize"):
160
  encoded = tokenizer.encode(s, add_special_tokens=False)
161
+ if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya", "qwen"]):
162
  tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
163
  else:
164
  tokens = tokenizer.convert_ids_to_tokens(encoded)