chyams Claude Opus 4.5 committed on
Commit
a3fbacc
·
1 Parent(s): 376d344

LLM Explorer: UI overhaul, remove Generator tab, add purple theme

Browse files

- Remove Generator tab (Probability Explorer with show-steps-off covers it)
- Add Top-K sampling slider to Probability Explorer
- Lower Temperature slider minimum to 0 (temp=0 selects the greedy argmax token)
- Token display: remove quotes, trim whitespace, show a visible-space symbol (␣) for whitespace-only tokens
- Step highlighting: only bold latest token, not all generated text
- Apply course palette (#63348d / #ded9f4), Merriweather headings
- Create justfile with push, push-hf, setup-hf targets
- Update CLAUDE.md files with session decisions

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +98 -116
  2. models.py +16 -38
app.py CHANGED
@@ -1,13 +1,13 @@
1
- """LLM Explorer — Interactive tools for understanding how LLMs work.
2
 
3
- Gradio app with three tabs:
4
- 1. Step-by-Step Probability Explorer
5
- 2. Interactive Generator
6
- 3. Tokenizer
7
 
8
  Plus a password-protected Admin panel for runtime configuration.
9
  """
10
 
 
11
  import os
12
 
13
  import gradio as gr
@@ -15,10 +15,36 @@ import gradio as gr
15
  from models import AVAILABLE_MODELS, manager
16
 
17
  # ---------------------------------------------------------------------------
18
- # Admin password — set via env var on HF Spaces, or fall back to config/default
19
  # ---------------------------------------------------------------------------
20
  ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD", "admin")
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # ---------------------------------------------------------------------------
23
  # HTML rendering helpers
24
  # ---------------------------------------------------------------------------
@@ -34,20 +60,37 @@ TOKEN_COLORS = [
34
  ]
35
 
36
 
37
- def _render_step_html(step_data: dict, prompt: str) -> str:
38
- """Render one generation step as styled HTML."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  s = step_data
40
- generated = s["text"][len(prompt):]
41
 
42
  # Build probability bar chart
43
  rows_html = ""
44
  for token_str, prob, tid in s["top_tokens"]:
45
  bar_width = max(1, int(prob * 300))
46
  is_selected = tid == s["token_id"]
47
- bg = "#2563eb" if is_selected else "#94a3b8"
48
  label_style = "font-weight:700;" if is_selected else ""
49
  arrow = " \u2190" if is_selected else ""
50
- token_display = repr(token_str)
51
  rows_html += f"""
52
  <div style="display:flex;align-items:center;gap:8px;margin:2px 0;font-family:monospace;font-size:13px;">
53
  <span style="width:140px;text-align:right;color:#1e293b;{label_style}">{token_display}</span>
@@ -58,23 +101,23 @@ def _render_step_html(step_data: dict, prompt: str) -> str:
58
  return f"""
59
  <div style="border:1px solid #e2e8f0;border-radius:8px;padding:12px;margin:8px 0;background:#fff;">
60
  <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:8px;">
61
- <span style="font-weight:600;color:#1e293b;">Step {s['step']}</span>
62
  <span style="color:#64748b;font-size:12px;">Entropy: {s['entropy']:.2f} bits</span>
63
  </div>
64
  <div style="font-family:monospace;font-size:14px;padding:8px;background:#f8fafc;border-radius:4px;margin-bottom:8px;word-wrap:break-word;">
65
- <span style="color:#64748b;">{prompt}</span><span style="color:#1e293b;font-weight:600;">{generated}</span>
66
  </div>
67
  {rows_html}
68
  </div>"""
69
 
70
 
71
  def _render_final_text_html(prompt: str, generated_text: str) -> str:
72
- """Render just the final generated text."""
73
  generated = generated_text[len(prompt):]
74
  return f"""
75
  <div style="border:1px solid #e2e8f0;border-radius:8px;padding:16px;background:#fff;">
76
  <div style="font-family:monospace;font-size:16px;line-height:1.6;word-wrap:break-word;">
77
- <span style="color:#94a3b8;">{prompt}</span><span style="color:#1e293b;font-weight:600;">{generated}</span>
78
  </div>
79
  </div>"""
80
 
@@ -84,8 +127,7 @@ def _render_tokens_html(tokens: list[tuple[str, int]]) -> str:
84
  chips = ""
85
  for i, (token_str, tid) in enumerate(tokens):
86
  bg, fg = TOKEN_COLORS[i % len(TOKEN_COLORS)]
87
- # Escape HTML
88
- display = token_str.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
89
  # Show spaces explicitly
90
  if display.strip() == "":
91
  display = repr(token_str).strip("'")
@@ -111,10 +153,10 @@ def _render_tokens_html(tokens: list[tuple[str, int]]) -> str:
111
 
112
 
113
  # ---------------------------------------------------------------------------
114
- # Tab 1: Step-by-Step Probability Explorer
115
  # ---------------------------------------------------------------------------
116
 
117
- def explore_probabilities(prompt, temperature, top_n, steps, show_steps, seed):
118
  """Generate tokens step by step and return formatted HTML."""
119
  if not manager.is_ready():
120
  return f"<p style='color:red;'>{manager.status_message()}</p>"
@@ -125,6 +167,7 @@ def explore_probabilities(prompt, temperature, top_n, steps, show_steps, seed):
125
  steps=int(steps),
126
  temperature=temperature,
127
  top_n=int(top_n),
 
128
  seed=seed,
129
  show_steps=show_steps,
130
  )
@@ -133,7 +176,10 @@ def explore_probabilities(prompt, temperature, top_n, steps, show_steps, seed):
133
  return "<p>No results generated.</p>"
134
 
135
  if show_steps:
136
- html_parts = [_render_step_html(r, prompt) for r in results]
 
 
 
137
  return "\n".join(html_parts)
138
  else:
139
  final_text = results[-1]["text"]
@@ -147,27 +193,7 @@ def on_show_steps_change(show_steps):
147
 
148
 
149
  # ---------------------------------------------------------------------------
150
- # Tab 2: Interactive Generator
151
- # ---------------------------------------------------------------------------
152
-
153
- def generate_interactive(prompt, num_tokens, temperature, top_k, seed):
154
- """Generate text and return it."""
155
- if not manager.is_ready():
156
- return f"*{manager.status_message()}*"
157
-
158
- seed = int(seed)
159
- text = manager.generate_text(
160
- prompt=prompt,
161
- num_tokens=int(num_tokens),
162
- temperature=temperature,
163
- top_k=int(top_k),
164
- seed=seed,
165
- )
166
- return text
167
-
168
-
169
- # ---------------------------------------------------------------------------
170
- # Tab 3: Tokenizer
171
  # ---------------------------------------------------------------------------
172
 
173
  def tokenize_text(text):
@@ -209,9 +235,6 @@ def admin_save_defaults(prompt, temperature, top_n, steps, seed):
209
  return "Defaults saved."
210
 
211
 
212
- import json
213
-
214
-
215
  # ---------------------------------------------------------------------------
216
  # Build the Gradio app
217
  # ---------------------------------------------------------------------------
@@ -219,9 +242,29 @@ import json
219
  def create_app():
220
  cfg = manager.get_config()
221
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  with gr.Blocks(
223
  title="LLM Explorer",
224
- theme=gr.themes.Soft(),
 
225
  ) as demo:
226
  gr.Markdown("# LLM Explorer\n*Interactive tools for understanding how LLMs work*")
227
 
@@ -254,14 +297,20 @@ def create_app():
254
  with gr.Row():
255
  t1_temperature = gr.Slider(
256
  label="Temperature",
257
- minimum=0.1, maximum=2.5, step=0.1,
258
  value=cfg.get("default_temperature", 0.8),
259
  )
260
  t1_top_n = gr.Slider(
261
- label="Top-N tokens",
262
  minimum=5, maximum=30, step=1,
263
  value=cfg.get("default_top_n", 10),
264
  )
 
 
 
 
 
 
265
  t1_steps = gr.Slider(
266
  label="Steps",
267
  minimum=1, maximum=20, step=1,
@@ -287,79 +336,12 @@ def create_app():
287
 
288
  t1_generate_btn.click(
289
  fn=explore_probabilities,
290
- inputs=[t1_prompt, t1_temperature, t1_top_n, t1_steps, t1_show_steps, t1_seed],
291
  outputs=[t1_output],
292
  )
293
 
294
  # ==================================================================
295
- # Tab 2: Interactive Generator
296
- # ==================================================================
297
- with gr.Tab("Generator"):
298
- gr.Markdown("### Interactive Text Generator")
299
- gr.Markdown(
300
- "Generate text from a prompt. Adjust temperature and top-k, "
301
- "then release the slider to regenerate with the same seed."
302
- )
303
-
304
- t2_prompt = gr.Textbox(
305
- label="Prompt",
306
- value=cfg.get("default_prompt", "The best thing about Huston-Tillotson University is"),
307
- lines=2,
308
- )
309
-
310
- with gr.Row():
311
- t2_num_tokens = gr.Slider(
312
- label="Tokens to generate",
313
- minimum=5, maximum=100, step=1,
314
- value=30,
315
- )
316
- t2_temperature = gr.Slider(
317
- label="Temperature",
318
- minimum=0.1, maximum=2.5, step=0.1,
319
- value=cfg.get("default_temperature", 0.8),
320
- )
321
- t2_top_k = gr.Slider(
322
- label="Top-K",
323
- minimum=1, maximum=100, step=1,
324
- value=40,
325
- )
326
-
327
- with gr.Accordion(f"Seed: {cfg.get('default_seed', 42)}", open=False):
328
- t2_seed = gr.Number(
329
- label="Random seed",
330
- value=cfg.get("default_seed", 42),
331
- precision=0,
332
- )
333
-
334
- t2_generate_btn = gr.Button("Generate", variant="primary")
335
- t2_output = gr.Textbox(
336
- label="Generated text",
337
- lines=8,
338
- interactive=False,
339
- )
340
-
341
- gen_inputs = [t2_prompt, t2_num_tokens, t2_temperature, t2_top_k, t2_seed]
342
-
343
- t2_generate_btn.click(
344
- fn=generate_interactive,
345
- inputs=gen_inputs,
346
- outputs=[t2_output],
347
- )
348
-
349
- # Slider release events trigger regeneration
350
- t2_temperature.release(
351
- fn=generate_interactive,
352
- inputs=gen_inputs,
353
- outputs=[t2_output],
354
- )
355
- t2_top_k.release(
356
- fn=generate_interactive,
357
- inputs=gen_inputs,
358
- outputs=[t2_output],
359
- )
360
-
361
- # ==================================================================
362
- # Tab 3: Tokenizer
363
  # ==================================================================
364
  with gr.Tab("Tokenizer"):
365
  gr.Markdown("### Token Visualization")
 
1
+ """LLM Explorer -- Interactive tools for understanding how LLMs work.
2
 
3
+ Gradio app with two tabs:
4
+ 1. Probability Explorer (step-by-step or bulk generation)
5
+ 2. Tokenizer
 
6
 
7
  Plus a password-protected Admin panel for runtime configuration.
8
  """
9
 
10
+ import json
11
  import os
12
 
13
  import gradio as gr
 
15
  from models import AVAILABLE_MODELS, manager
16
 
17
  # ---------------------------------------------------------------------------
18
+ # Admin password -- set via env var on HF Spaces, or fall back to default
19
  # ---------------------------------------------------------------------------
20
  ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD", "admin")
21
 
22
+ # ---------------------------------------------------------------------------
23
+ # Custom CSS
24
+ # ---------------------------------------------------------------------------
25
+ CUSTOM_CSS = """
26
+ @import url('https://fonts.googleapis.com/css2?family=Merriweather:wght@300;400;700;900&display=swap');
27
+
28
+ .gradio-container {
29
+ max-width: 960px !important;
30
+ }
31
+
32
+ h1, h2, h3 {
33
+ font-family: 'Merriweather', serif !important;
34
+ color: #63348d !important;
35
+ }
36
+
37
+ .primary.svelte-1ee7cit, button.primary {
38
+ background: #63348d !important;
39
+ border-color: #63348d !important;
40
+ }
41
+
42
+ .primary.svelte-1ee7cit:hover, button.primary:hover {
43
+ background: #4e2870 !important;
44
+ border-color: #4e2870 !important;
45
+ }
46
+ """
47
+
48
  # ---------------------------------------------------------------------------
49
  # HTML rendering helpers
50
  # ---------------------------------------------------------------------------
 
60
  ]
61
 
62
 
63
+ def _esc(text: str) -> str:
64
+ """Escape HTML special characters."""
65
+ return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
66
+
67
+
68
+ def _token_label(token_str: str) -> str:
69
+ """Format a token for display in probability table (no quotes, trimmed)."""
70
+ display = _esc(token_str).strip()
71
+ if not display:
72
+ return "\u2423" # visible space symbol
73
+ return display
74
+
75
+
76
+ def _render_step_html(step_data: dict, prompt: str, prev_generated: str) -> str:
77
+ """Render one generation step as styled HTML.
78
+
79
+ prev_generated is the text generated in all prior steps (between prompt
80
+ and the latest token), so only the newest token gets highlighted.
81
+ """
82
  s = step_data
83
+ latest_token = s["token"]
84
 
85
  # Build probability bar chart
86
  rows_html = ""
87
  for token_str, prob, tid in s["top_tokens"]:
88
  bar_width = max(1, int(prob * 300))
89
  is_selected = tid == s["token_id"]
90
+ bg = "#63348d" if is_selected else "#c4b5d4"
91
  label_style = "font-weight:700;" if is_selected else ""
92
  arrow = " \u2190" if is_selected else ""
93
+ token_display = _token_label(token_str)
94
  rows_html += f"""
95
  <div style="display:flex;align-items:center;gap:8px;margin:2px 0;font-family:monospace;font-size:13px;">
96
  <span style="width:140px;text-align:right;color:#1e293b;{label_style}">{token_display}</span>
 
101
  return f"""
102
  <div style="border:1px solid #e2e8f0;border-radius:8px;padding:12px;margin:8px 0;background:#fff;">
103
  <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:8px;">
104
+ <span style="font-family:'Merriweather',serif;font-weight:600;color:#63348d;">Step {s['step']}</span>
105
  <span style="color:#64748b;font-size:12px;">Entropy: {s['entropy']:.2f} bits</span>
106
  </div>
107
  <div style="font-family:monospace;font-size:14px;padding:8px;background:#f8fafc;border-radius:4px;margin-bottom:8px;word-wrap:break-word;">
108
+ <span style="color:#94a3b8;">{_esc(prompt)}</span><span style="color:#1e293b;">{_esc(prev_generated)}</span><span style="background:#ded9f4;color:#63348d;font-weight:700;padding:1px 4px;border-radius:3px;">{_esc(latest_token)}</span>
109
  </div>
110
  {rows_html}
111
  </div>"""
112
 
113
 
114
  def _render_final_text_html(prompt: str, generated_text: str) -> str:
115
+ """Render just the final generated text (show-steps OFF mode)."""
116
  generated = generated_text[len(prompt):]
117
  return f"""
118
  <div style="border:1px solid #e2e8f0;border-radius:8px;padding:16px;background:#fff;">
119
  <div style="font-family:monospace;font-size:16px;line-height:1.6;word-wrap:break-word;">
120
+ <span style="color:#94a3b8;">{_esc(prompt)}</span><span style="color:#1e293b;font-weight:600;">{_esc(generated)}</span>
121
  </div>
122
  </div>"""
123
 
 
127
  chips = ""
128
  for i, (token_str, tid) in enumerate(tokens):
129
  bg, fg = TOKEN_COLORS[i % len(TOKEN_COLORS)]
130
+ display = _esc(token_str)
 
131
  # Show spaces explicitly
132
  if display.strip() == "":
133
  display = repr(token_str).strip("'")
 
153
 
154
 
155
  # ---------------------------------------------------------------------------
156
+ # Tab 1: Probability Explorer
157
  # ---------------------------------------------------------------------------
158
 
159
+ def explore_probabilities(prompt, temperature, top_n, top_k, steps, show_steps, seed):
160
  """Generate tokens step by step and return formatted HTML."""
161
  if not manager.is_ready():
162
  return f"<p style='color:red;'>{manager.status_message()}</p>"
 
167
  steps=int(steps),
168
  temperature=temperature,
169
  top_n=int(top_n),
170
+ top_k=int(top_k),
171
  seed=seed,
172
  show_steps=show_steps,
173
  )
 
176
  return "<p>No results generated.</p>"
177
 
178
  if show_steps:
179
+ html_parts = []
180
+ for i, r in enumerate(results):
181
+ prev_generated = results[i - 1]["text"][len(prompt):] if i > 0 else ""
182
+ html_parts.append(_render_step_html(r, prompt, prev_generated))
183
  return "\n".join(html_parts)
184
  else:
185
  final_text = results[-1]["text"]
 
193
 
194
 
195
  # ---------------------------------------------------------------------------
196
+ # Tab 2: Tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  # ---------------------------------------------------------------------------
198
 
199
  def tokenize_text(text):
 
235
  return "Defaults saved."
236
 
237
 
 
 
 
238
  # ---------------------------------------------------------------------------
239
  # Build the Gradio app
240
  # ---------------------------------------------------------------------------
 
242
  def create_app():
243
  cfg = manager.get_config()
244
 
245
+ theme = gr.themes.Soft(
246
+ primary_hue=gr.themes.Color(
247
+ c50="#faf8fc",
248
+ c100="#f3f0f7",
249
+ c200="#ded9f4",
250
+ c300="#c4b5e0",
251
+ c400="#a78bcc",
252
+ c500="#8b5fb8",
253
+ c600="#7c3aad",
254
+ c700="#63348d",
255
+ c800="#52296f",
256
+ c900="#421f59",
257
+ c950="#2a1339",
258
+ ),
259
+ neutral_hue="slate",
260
+ font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
261
+ font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
262
+ )
263
+
264
  with gr.Blocks(
265
  title="LLM Explorer",
266
+ theme=theme,
267
+ css=CUSTOM_CSS,
268
  ) as demo:
269
  gr.Markdown("# LLM Explorer\n*Interactive tools for understanding how LLMs work*")
270
 
 
297
  with gr.Row():
298
  t1_temperature = gr.Slider(
299
  label="Temperature",
300
+ minimum=0, maximum=2.5, step=0.1,
301
  value=cfg.get("default_temperature", 0.8),
302
  )
303
  t1_top_n = gr.Slider(
304
+ label="Top-N display",
305
  minimum=5, maximum=30, step=1,
306
  value=cfg.get("default_top_n", 10),
307
  )
308
+ t1_top_k = gr.Slider(
309
+ label="Top-K sampling",
310
+ minimum=0, maximum=100, step=1,
311
+ value=0,
312
+ info="0 = sample from full vocabulary",
313
+ )
314
  t1_steps = gr.Slider(
315
  label="Steps",
316
  minimum=1, maximum=20, step=1,
 
336
 
337
  t1_generate_btn.click(
338
  fn=explore_probabilities,
339
+ inputs=[t1_prompt, t1_temperature, t1_top_n, t1_top_k, t1_steps, t1_show_steps, t1_seed],
340
  outputs=[t1_output],
341
  )
342
 
343
  # ==================================================================
344
+ # Tab 2: Tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  # ==================================================================
346
  with gr.Tab("Tokenizer"):
347
  gr.Markdown("### Token Visualization")
models.py CHANGED
@@ -6,7 +6,6 @@ Provides inference methods for next-token probabilities and step-by-step generat
6
 
7
  import gc
8
  import json
9
- import math
10
  import os
11
  import threading
12
  from pathlib import Path
@@ -243,6 +242,7 @@ class ModelManager:
243
  steps: int = 8,
244
  temperature: float = 0.8,
245
  top_n: int = 10,
 
246
  seed: int = 42,
247
  show_steps: bool = True,
248
  ) -> list[dict]:
@@ -265,13 +265,25 @@ class ModelManager:
265
 
266
  for i in range(steps):
267
  logits = self._get_logits(text)
 
 
 
 
 
 
 
 
268
  probs = self.apply_temperature(logits, temperature)
269
  entropy = self.entropy_bits(probs)
270
  top_tokens = self.top_k_table(probs, k=top_n) if show_steps else []
271
 
272
- # Sample with deterministic seed per step
273
- rng.manual_seed(seed + i)
274
- idx = torch.multinomial(probs.cpu(), num_samples=1, generator=rng).item()
 
 
 
 
275
  token_str = self.tokenizer.decode([idx])
276
  text += token_str
277
 
@@ -286,40 +298,6 @@ class ModelManager:
286
 
287
  return results
288
 
289
- def generate_text(
290
- self,
291
- prompt: str,
292
- num_tokens: int = 30,
293
- temperature: float = 0.8,
294
- top_k: int = 40,
295
- seed: int = 42,
296
- ) -> str:
297
- """Generate text with top-k sampling. Returns prompt + generated text."""
298
- if not self.is_ready():
299
- return prompt
300
-
301
- text = prompt
302
- rng = torch.Generator()
303
-
304
- for i in range(num_tokens):
305
- logits = self._get_logits(text)
306
-
307
- # Apply top-k filtering
308
- if top_k > 0:
309
- top_k_vals, top_k_idxs = torch.topk(logits, k=min(top_k, logits.shape[0]))
310
- mask = torch.full_like(logits, float("-inf"))
311
- mask.scatter_(0, top_k_idxs, top_k_vals)
312
- logits = mask
313
-
314
- probs = self.apply_temperature(logits, temperature)
315
-
316
- rng.manual_seed(seed + i)
317
- idx = torch.multinomial(probs.cpu(), num_samples=1, generator=rng).item()
318
- token_str = self.tokenizer.decode([idx])
319
- text += token_str
320
-
321
- return text
322
-
323
  def tokenize(self, text: str) -> list[tuple[str, int]]:
324
  """Tokenize text and return list of (token_str, token_id)."""
325
  if self.tokenizer is None:
 
6
 
7
  import gc
8
  import json
 
9
  import os
10
  import threading
11
  from pathlib import Path
 
242
  steps: int = 8,
243
  temperature: float = 0.8,
244
  top_n: int = 10,
245
+ top_k: int = 0,
246
  seed: int = 42,
247
  show_steps: bool = True,
248
  ) -> list[dict]:
 
265
 
266
  for i in range(steps):
267
  logits = self._get_logits(text)
268
+
269
+ # Apply top-k filtering before temperature
270
+ if top_k > 0:
271
+ top_k_vals, top_k_idxs = torch.topk(logits, k=min(top_k, logits.shape[0]))
272
+ mask = torch.full_like(logits, float("-inf"))
273
+ mask.scatter_(0, top_k_idxs, top_k_vals)
274
+ logits = mask
275
+
276
  probs = self.apply_temperature(logits, temperature)
277
  entropy = self.entropy_bits(probs)
278
  top_tokens = self.top_k_table(probs, k=top_n) if show_steps else []
279
 
280
+ # Temperature 0 = greedy (always pick highest probability)
281
+ if temperature == 0:
282
+ idx = torch.argmax(probs).item()
283
+ else:
284
+ rng.manual_seed(seed + i)
285
+ idx = torch.multinomial(probs.cpu(), num_samples=1, generator=rng).item()
286
+
287
  token_str = self.tokenizer.decode([idx])
288
  text += token_str
289
 
 
298
 
299
  return results
300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  def tokenize(self, text: str) -> list[tuple[str, int]]:
302
  """Tokenize text and return list of (token_str, token_id)."""
303
  if self.tokenizer is None: