chyams Claude Opus 4.5 committed on
Commit
01852d9
·
1 Parent(s): 522ab9a

Separate GPT-2 tokenizer for demo; add Qwen2.5-7B fp16

Browse files

- Tokenizer tab now uses GPT-2's tokenizer (more interesting subword splits)
- Generation still uses admin-selected model (Qwen default)
- Added Qwen2.5-7B fp16 option for L4's 24GB VRAM
- Removed fictional GPT-OSS-20B models

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +10 -8
  2. models.py +37 -10
app.py CHANGED
@@ -14,7 +14,7 @@ from datetime import datetime, timezone, timedelta
14
 
15
  import gradio as gr
16
 
17
- from models import AVAILABLE_MODELS, manager
18
 
19
  # ---------------------------------------------------------------------------
20
  # Admin password -- set via env var on HF Spaces, or fall back to default
@@ -498,11 +498,12 @@ def on_show_steps_change(show_steps):
498
  # ---------------------------------------------------------------------------
499
 
500
  def tokenize_text(text):
501
- """Tokenize input and return formatted HTML."""
502
- if not manager.is_ready():
503
- return f"<p style='color:red;'>{manager.status_message()}</p>"
504
 
505
- tokens = manager.tokenize(text)
 
 
 
506
  return _render_tokens_html(tokens)
507
 
508
 
@@ -680,13 +681,14 @@ def create_app():
680
  with gr.Tab("Tokenizer"):
681
  gr.Markdown("### Token Visualization")
682
  gr.Markdown(
683
- "Enter any text to see how the model's tokenizer splits it into tokens. "
684
- "Hover over each token to see its numeric ID."
 
685
  )
686
 
687
  t3_input = gr.Textbox(
688
  label="Text",
689
- value="Huston-Tillotson University is an HBCU in Austin, Texas",
690
  lines=3,
691
  )
692
  t3_btn = gr.Button("Tokenize", variant="primary")
 
14
 
15
  import gradio as gr
16
 
17
+ from models import AVAILABLE_MODELS, manager, demo_tokenizer
18
 
19
  # ---------------------------------------------------------------------------
20
  # Admin password -- set via env var on HF Spaces, or fall back to default
 
498
  # ---------------------------------------------------------------------------
499
 
500
  def tokenize_text(text):
501
+ """Tokenize input and return formatted HTML.
 
 
502
 
503
+ Uses GPT-2's tokenizer (not the generation model's tokenizer) because
504
+ GPT-2's smaller vocabulary produces more interesting subword splits.
505
+ """
506
+ tokens = demo_tokenizer.tokenize(text)
507
  return _render_tokens_html(tokens)
508
 
509
 
 
681
  with gr.Tab("Tokenizer"):
682
  gr.Markdown("### Token Visualization")
683
  gr.Markdown(
684
+ "See how text is split into tokens before the model processes it. "
685
+ "Hover over each token to see its numeric ID. "
686
+ "Uses GPT-2's tokenizer, which splits words into interesting subword pieces."
687
  )
688
 
689
  t3_input = gr.Textbox(
690
  label="Text",
691
+ value="Huston-Tillotson University is an HBCU in Austin, Texas.",
692
  lines=3,
693
  )
694
  t3_btn = gr.Button("Tokenize", variant="primary")
models.py CHANGED
@@ -23,6 +23,11 @@ AVAILABLE_MODELS = {
23
  "dtype": "float16",
24
  "description": "Fast, good quality (default)",
25
  },
 
 
 
 
 
26
  "Qwen2.5-7B (4-bit)": {
27
  "id": "Qwen/Qwen2.5-7B",
28
  "quantize": "4bit",
@@ -38,16 +43,6 @@ AVAILABLE_MODELS = {
38
  "quantize": "4bit",
39
  "description": "Best quality, quantized",
40
  },
41
- "GPT-OSS-20B": {
42
- "id": "openai/gpt-oss-20b",
43
- "dtype": "auto",
44
- "description": "OpenAI 20B, full precision (local/large GPU only)",
45
- },
46
- "GPT-OSS-20B (4-bit)": {
47
- "id": "openai/gpt-oss-20b",
48
- "quantize": "4bit",
49
- "description": "OpenAI 20B, quantized to fit T4 (~10-12GB)",
50
- },
51
  }
52
 
53
  DEFAULT_MODEL = "Qwen2.5-3B"
@@ -318,6 +313,38 @@ class ModelManager:
318
  ids = self.tokenizer.encode(text)
319
  return [(self.tokenizer.decode([tid]), tid) for tid in ids]
320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  # ------------------------------------------------------------------
322
  # Config helpers
323
  # ------------------------------------------------------------------
 
23
  "dtype": "float16",
24
  "description": "Fast, good quality (default)",
25
  },
26
+ "Qwen2.5-7B": {
27
+ "id": "Qwen/Qwen2.5-7B",
28
+ "dtype": "float16",
29
+ "description": "Higher quality, needs 24GB+ VRAM (L4/A10)",
30
+ },
31
  "Qwen2.5-7B (4-bit)": {
32
  "id": "Qwen/Qwen2.5-7B",
33
  "quantize": "4bit",
 
43
  "quantize": "4bit",
44
  "description": "Best quality, quantized",
45
  },
 
 
 
 
 
 
 
 
 
 
46
  }
47
 
48
  DEFAULT_MODEL = "Qwen2.5-3B"
 
313
  ids = self.tokenizer.encode(text)
314
  return [(self.tokenizer.decode([tid]), tid) for tid in ids]
315
 
316
+
317
+ # ---------------------------------------------------------------------------
318
+ # Separate tokenizer for demo purposes (GPT-2 shows more interesting splits)
319
+ # ---------------------------------------------------------------------------
320
+
321
+ class DemoTokenizer:
322
+ """Lightweight tokenizer for the Tokenizer tab.
323
+
324
+ Uses GPT-2's BPE tokenizer which has a smaller vocabulary and produces
325
+ more interesting subword splits than modern tokenizers like Qwen's.
326
+ """
327
+
328
+ def __init__(self):
329
+ self.tokenizer = None
330
+ self._loaded = False
331
+
332
+ def ensure_loaded(self):
333
+ """Load tokenizer on first use (lazy loading)."""
334
+ if not self._loaded:
335
+ self.tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
336
+ self._loaded = True
337
+
338
+ def tokenize(self, text: str) -> list[tuple[str, int]]:
339
+ """Tokenize text and return list of (token_str, token_id)."""
340
+ self.ensure_loaded()
341
+ ids = self.tokenizer.encode(text)
342
+ return [(self.tokenizer.decode([tid]), tid) for tid in ids]
343
+
344
+
345
+ # Module-level singleton for demo tokenizer
346
+ demo_tokenizer = DemoTokenizer()
347
+
348
  # ------------------------------------------------------------------
349
  # Config helpers
350
  # ------------------------------------------------------------------