santhosh committed on
Commit
3b5c1dc
·
verified ·
1 Parent(s): 4ce299d

Deploy demo @ 04347af

Browse files
Files changed (6) hide show
  1. README.md +15 -4
  2. index.html +45 -18
  3. js/app.js +426 -0
  4. js/tokenizers.js +18 -0
  5. js/worker.js +69 -0
  6. style.css +402 -18
README.md CHANGED
@@ -1,10 +1,21 @@
1
  ---
2
  title: Malayalam Tokenizer Comparison
3
- emoji: 💻
4
- colorFrom: indigo
5
- colorTo: red
6
  sdk: static
7
  pinned: false
 
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
1
  ---
2
  title: Malayalam Tokenizer Comparison
3
+ emoji: 🔤
4
+ colorFrom: green
5
+ colorTo: blue
6
  sdk: static
7
  pinned: false
8
+ license: mit
9
+ tags:
10
+ - malayalam
11
+ - tokenizer
12
+ - nlp
13
  ---
14
 
15
+ # Malayalam Tokenizer Comparison
16
+
17
+ Visualize how different tokenizers split Malayalam text into tokens.
18
+ Compare Malayalam-specific tokenizers (BPE and Unigram) side-by-side with
19
+ popular models like GPT-4, LLaMA, Mistral, and others.
20
+
21
+ Built from [smc/malayalam-tokenizer](https://github.com/smc/malayalam-tokenizer).
index.html CHANGED
@@ -1,19 +1,46 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Malayalam Tokenizer Comparison</title>
7
+ <link rel="stylesheet" href="style.css">
8
+ </head>
9
+ <body>
10
+ <header>
11
+ <h1>Malayalam Tokenizer Comparison</h1>
12
+ <p>Visualize how different tokenizers split Malayalam text into tokens.</p>
13
+ </header>
14
+
15
+ <main>
16
+ <section class="input-section">
17
+ <label for="text-input">Input text</label>
18
+ <textarea
19
+ id="text-input"
20
+ rows="4"
21
+ placeholder="Type or paste Malayalam text here…"
22
+ spellcheck="false"
23
+ autocomplete="off"
24
+ >കേരളം ദക്ഷിണേന്ത്യയിലെ ഒരു സംസ്ഥാനമാണ്.</textarea>
25
+ </section>
26
+
27
+ <section class="add-tokenizer-section">
28
+ <select id="tokenizer-select">
29
+ <option value="">— choose a tokenizer —</option>
30
+ </select>
31
+ <button id="add-btn" type="button" disabled>Add tokenizer</button>
32
+ </section>
33
+
34
+ <div id="panels" role="list" aria-label="Tokenizer panels"></div>
35
+
36
+ <p id="empty-state" class="empty-state" aria-live="polite">
37
+ No tokenizers added yet. Choose one from the dropdown above.
38
+ </p>
39
+ </main>
40
+
41
+ <!-- Token-color highlight styles injected by app.js -->
42
+ <style id="highlight-styles"></style>
43
+
44
+ <script type="module" src="js/app.js"></script>
45
+ </body>
46
  </html>
js/app.js ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { TOKENIZER_OPTIONS } from "./tokenizers.js";
2
+
3
+ // ─── Token color palette ──────────────────────────────────────────────────────
4
+ // Reference CSS custom properties so dark-mode overrides apply automatically.
5
+ // The index wraps modulo 12; values are defined in style.css as --tok-N.
6
+
7
// Size of the --tok-N palette declared in style.css.
const NUM_TOKEN_COLORS = 12;

/**
 * Map a token index onto one of the palette's CSS custom properties.
 *
 * @param {number} index Zero-based token index.
 * @returns {string} A `var(--tok-N)` reference, with N wrapping around the palette size.
 */
function tokenColorVar(index) {
  const slot = index % NUM_TOKEN_COLORS;
  return `var(--tok-${slot})`;
}
12
+
13
+ const DEBOUNCE_MS = 350;
14
+
15
+ // ─── State ────────────────────────────────────────────────────────────────────
16
+
17
+ /** @type {Map<number, PanelState>} */
18
+ const panels = new Map();
19
+ let panelCounter = 0;
20
+
21
+ /**
22
+ * @typedef {{
23
+ * modelId: string,
24
+ * name: string,
25
+ * worker: Worker,
26
+ * panel: HTMLElement,
27
+ * textMirror: HTMLElement,
28
+ * statsEl: HTMLElement,
29
+ * tbody: HTMLElement,
30
+ * tokenCount: number,
31
+ * }} PanelState
32
+ */
33
+
34
+ // ─── DOM refs ─────────────────────────────────────────────────────────────────
35
+
36
+ const textInput = /** @type {HTMLTextAreaElement} */ (
37
+ document.getElementById("text-input")
38
+ );
39
+ const selectEl = /** @type {HTMLSelectElement} */ (
40
+ document.getElementById("tokenizer-select")
41
+ );
42
+ const addBtn = /** @type {HTMLButtonElement} */ (
43
+ document.getElementById("add-btn")
44
+ );
45
+ const panelsEl = /** @type {HTMLElement} */ (
46
+ document.getElementById("panels")
47
+ );
48
+ const emptyState = /** @type {HTMLElement} */ (
49
+ document.getElementById("empty-state")
50
+ );
51
+ const highlightStyleEl = /** @type {HTMLStyleElement} */ (
52
+ document.getElementById("highlight-styles")
53
+ );
54
+
55
+ // ─── Populate dropdown ────────────────────────────────────────────────────────
56
+
57
+ for (const [modelId, name] of Object.entries(TOKENIZER_OPTIONS)) {
58
+ if (!modelId) continue;
59
+ const opt = document.createElement("option");
60
+ opt.value = modelId;
61
+ opt.textContent = name;
62
+ selectEl.appendChild(opt);
63
+ }
64
+
65
+ selectEl.addEventListener("change", () => {
66
+ addBtn.disabled = !selectEl.value;
67
+ });
68
+
69
+ addBtn.addEventListener("click", () => {
70
+ const modelId = selectEl.value;
71
+ if (!modelId) return;
72
+ const name = TOKENIZER_OPTIONS[modelId] ?? modelId;
73
+ addPanel(modelId, name);
74
+ selectEl.value = "";
75
+ addBtn.disabled = true;
76
+ });
77
+
78
+ // ─── Text input ───────────────────────────────────────────────────────────────
79
+
80
+ let debounceTimer = null;
81
+
82
+ textInput.addEventListener("input", () => {
83
+ clearTimeout(debounceTimer);
84
+ debounceTimer = setTimeout(retokenizeAll, DEBOUNCE_MS);
85
+ });
86
+
87
+ function retokenizeAll() {
88
+ const text = textInput.value;
89
+ for (const id of panels.keys()) {
90
+ runTokenizer(id, text);
91
+ }
92
+ }
93
+
94
+ // ─── Panel lifecycle ──────────────────────────────────────────────────────────
95
+
96
+ function addPanel(modelId, name) {
97
+ const id = panelCounter++;
98
+
99
+ const panel = document.createElement("div");
100
+ panel.className = "panel loading";
101
+ panel.setAttribute("role", "listitem");
102
+ panel.dataset.panelId = String(id);
103
+ panel.innerHTML = panelTemplate(name, modelId);
104
+
105
+ panel
106
+ .querySelector(".remove-btn")
107
+ .addEventListener("click", () => removePanel(id));
108
+
109
+ panelsEl.appendChild(panel);
110
+ updateEmptyState();
111
+
112
+ const worker = new Worker("js/worker.js", { type: "module" });
113
+ worker.addEventListener("message", (ev) => onWorkerMessage(id, ev.data));
114
+ worker.addEventListener("error", (ev) =>
115
+ onWorkerError(id, ev.message ?? "Worker error"),
116
+ );
117
+
118
+ panels.set(id, {
119
+ modelId,
120
+ name,
121
+ worker,
122
+ panel,
123
+ textMirror: panel.querySelector(".panel-text"),
124
+ statsEl: panel.querySelector(".panel-stats"),
125
+ tbody: panel.querySelector("tbody"),
126
+ tokenCount: 0,
127
+ });
128
+
129
+ runTokenizer(id, textInput.value);
130
+ }
131
+
132
+ function removePanel(id) {
133
+ const state = panels.get(id);
134
+ if (!state) return;
135
+ clearPanelHighlights(id, state.tokenCount);
136
+ state.worker.terminate();
137
+ state.panel.remove();
138
+ panels.delete(id);
139
+ updateEmptyState();
140
+ }
141
+
142
+ function updateEmptyState() {
143
+ emptyState.classList.toggle("hidden", panels.size > 0);
144
+ }
145
+
146
+ function panelTemplate(name, modelId) {
147
+ return `
148
+ <div class="panel-header">
149
+ <a class="panel-title" href="https://huggingface.co/${esc(modelId)}" target="_blank" rel="noopener noreferrer" title="${esc(modelId)}">${esc(name)}</a>
150
+ <button class="remove-btn" type="button" aria-label="Remove ${esc(name)}">Remove</button>
151
+ </div>
152
+ <div class="panel-loading">
153
+ <span class="spinner" aria-hidden="true"></span>
154
+ Loading tokenizer…
155
+ </div>
156
+ <div class="panel-text" aria-label="Tokenized text"></div>
157
+ <div class="panel-stats"></div>
158
+ <div class="panel-table-wrap">
159
+ <table aria-label="Token list">
160
+ <thead>
161
+ <tr>
162
+ <th></th>
163
+ <th>#</th>
164
+ <th>Token</th>
165
+ <th>ID</th>
166
+ </tr>
167
+ </thead>
168
+ <tbody></tbody>
169
+ </table>
170
+ </div>`;
171
+ }
172
+
173
+ // ─── Worker communication ─────────────────────────────────────────────────────
174
+
175
+ function runTokenizer(id, text) {
176
+ const state = panels.get(id);
177
+ if (!state) return;
178
+ state.panel.classList.add("loading");
179
+ state.worker.postMessage({ model_id: state.modelId, text });
180
+ }
181
+
182
/**
 * Render a tokenization result delivered by a panel's worker.
 *
 * Clears the panel's previous highlights, mirrors the input text, paints one
 * highlight per token, and refreshes the stats bar and token table.
 *
 * The text is read from the textarea when the message is delivered; the
 * worker message itself carries only token ids, decoded pieces and margins.
 *
 * @param {number} id Panel id the worker belongs to.
 * @param {{ token_ids: number[], decoded: string[], margins: number[] }} data
 */
function onWorkerMessage(id, data) {
  const state = panels.get(id);
  if (!state) return;

  state.panel.classList.remove("loading");

  const { token_ids, decoded, margins } = data;
  const text = textInput.value;

  // Clear stale highlights before the token count is overwritten.
  clearPanelHighlights(id, state.tokenCount);
  state.tokenCount = token_ids.length;

  // Sync mirror text (single text node — required for Range offsets)
  state.textMirror.textContent = text;

  // Compute per-token character offsets in the original string
  const offsets = computeOffsets(text, decoded, margins ?? []);

  // Paint highlights via CSS Custom Highlight API
  paintHighlights(id, state.textMirror, offsets);

  // Stats bar. The ratio divides by the token count, so that count must be
  // non-zero too; otherwise the UI would display "Infinity".
  const hasRatio = text.length > 0 && token_ids.length > 0;
  const ratio = hasRatio ? (text.length / token_ids.length).toFixed(2) : "—";
  state.statsEl.innerHTML =
    `<span>Tokens: <strong>${token_ids.length}</strong></span>` +
    `<span>Chars/token: <strong>${ratio}</strong></span>` +
    `<span>Characters: <strong>${text.length}</strong></span>`;

  // Table
  renderTable(id, state, token_ids, decoded, offsets);
}
219
+
220
/**
 * Show a worker failure in the panel and discard the previous result.
 *
 * The old token table and highlights describe text that is no longer in the
 * mirror element. Leaving them in place would let row hover handlers build
 * Ranges with offsets that exceed the (short) error text.
 *
 * @param {number} id Panel id the worker belongs to.
 * @param {string} msg Human-readable error message.
 */
function onWorkerError(id, msg) {
  const state = panels.get(id);
  if (!state) return;
  state.panel.classList.remove("loading");

  // Drop everything derived from the last successful tokenization.
  clearPanelHighlights(id, state.tokenCount);
  state.tokenCount = 0;
  state.tbody.innerHTML = "";
  state.statsEl.textContent = "";

  state.textMirror.textContent = `! ${msg}`;
}
226
+
227
+ // ─── Offset computation ───────────────────────────────────────────────────────
228
+
229
/**
 * Locate each decoded token inside `text`, scanning left to right.
 *
 * For every token the search starts at the end of the previously matched
 * token. Tokens that are empty, or that cannot be found from that position
 * onwards (e.g. special tokens such as <s>), get `null` instead of a range.
 *
 * @param {string} text Original input text.
 * @param {string[]} decoded Per-token decoded strings, in order.
 * @param {number[]} margins When margins[i] > 0 and the next character is a
 *   space, that space is skipped before searching for token i.
 * @returns {Array<{start:number,end:number}|null>} One entry per token.
 */
function computeOffsets(text, decoded, margins) {
  let pos = 0;

  return Array.from(decoded, (piece, i) => {
    const tok = piece ?? "";
    if (tok.length === 0) return null;

    // Step over the word-boundary space that precedes this token, if any.
    const skipSpace = margins[i] > 0 && text[pos] === " ";
    if (skipSpace) pos += 1;

    const found = text.indexOf(tok, pos);
    if (found === -1) return null;

    pos = found + tok.length;
    return { start: found, end: pos };
  });
}
273
+
274
+ // ─── CSS Custom Highlight API ─────────────────────────────────────────────────
275
+
276
+ // Track which ::highlight() rules have been injected to avoid duplication
277
+ const injectedRules = new Set();
278
+
279
+ /**
280
+ * Ensure ::highlight(name) rule exists in the shared <style> element.
281
+ * @param {string} name
282
+ * @param {string} bgColor
283
+ */
284
+ function ensureRule(name, bgColor) {
285
+ if (injectedRules.has(name)) return;
286
+ injectedRules.add(name);
287
+ const sheet = highlightStyleEl.sheet;
288
+ sheet.insertRule(
289
+ `::highlight(${name}) { background-color: ${bgColor}; color: var(--tok-text); }`,
290
+ sheet.cssRules.length,
291
+ );
292
+ }
293
+
294
+ /**
295
+ * Ensure the hover ::highlight() rule exists for a panel.
296
+ * @param {number} panelId
297
+ */
298
+ function ensureHoverRule(panelId) {
299
+ const name = `p${panelId}-hover`;
300
+ if (injectedRules.has(name)) return;
301
+ injectedRules.add(name);
302
+ const sheet = highlightStyleEl.sheet;
303
+ sheet.insertRule(
304
+ `::highlight(${name}) { outline: 2px solid var(--tok-hover); border-radius: 2px; }`,
305
+ sheet.cssRules.length,
306
+ );
307
+ }
308
+
309
+ /**
310
+ * Register one CSS Highlight per token offset.
311
+ * Name: `p{panelId}-t{tokenIndex}`
312
+ * Color: var(--tok-N) from CSS
313
+ */
314
+ function paintHighlights(panelId, mirrorEl, offsets) {
315
+ if (!CSS.highlights) return;
316
+
317
+ const textNode = mirrorEl.firstChild;
318
+ if (!textNode) return;
319
+
320
+ for (let i = 0; i < offsets.length; i++) {
321
+ const off = offsets[i];
322
+ if (!off) continue;
323
+
324
+ const name = `p${panelId}-t${i}`;
325
+
326
+ ensureRule(name, tokenColorVar(i));
327
+
328
+ const range = new Range();
329
+ range.setStart(textNode, off.start);
330
+ range.setEnd(textNode, off.end);
331
+ CSS.highlights.set(name, new Highlight(range));
332
+ }
333
+ }
334
+
335
+ /**
336
+ * Remove all named highlights for a panel.
337
+ */
338
+ function clearPanelHighlights(panelId, tokenCount) {
339
+ if (!CSS.highlights) return;
340
+ for (let i = 0; i < tokenCount; i++) {
341
+ CSS.highlights.delete(`p${panelId}-t${i}`);
342
+ }
343
+ CSS.highlights.delete(`p${panelId}-hover`);
344
+ }
345
+
346
+ // ─── Token table ──────────────────────────────────────────────────────────────
347
+
348
+ /**
349
+ * @param {number} panelId
350
+ * @param {PanelState} state
351
+ * @param {number[]} token_ids
352
+ * @param {string[]} decoded
353
+ * @param {Array<{start:number,end:number}|null>} offsets
354
+ */
355
+ function renderTable(panelId, state, token_ids, decoded, offsets) {
356
+ const { tbody, textMirror } = state;
357
+ tbody.innerHTML = "";
358
+
359
+ ensureHoverRule(panelId);
360
+
361
+ const fragment = document.createDocumentFragment();
362
+
363
+ for (let i = 0; i < token_ids.length; i++) {
364
+ const color = tokenColorVar(i);
365
+ const off = offsets[i];
366
+
367
+ const tr = document.createElement("tr");
368
+ tr.style.setProperty("--row-color", color);
369
+ tr.dataset.tokenIndex = String(i);
370
+
371
+ // Color swatch
372
+ const tdSwatch = document.createElement("td");
373
+ tdSwatch.className = "color-swatch";
374
+ const swatch = document.createElement("span");
375
+ swatch.style.background = color;
376
+ tdSwatch.appendChild(swatch);
377
+
378
+ // Index
379
+ const tdIdx = document.createElement("td");
380
+ tdIdx.textContent = String(i + 1);
381
+
382
+ // Token text
383
+ const tdTok = document.createElement("td");
384
+ tdTok.className = "tok-cell";
385
+ tdTok.textContent = decoded[i] ?? "";
386
+
387
+ // Token ID
388
+ const tdId = document.createElement("td");
389
+ tdId.className = "id-cell";
390
+ tdId.textContent = String(token_ids[i]);
391
+
392
+ tr.append(tdSwatch, tdIdx, tdTok, tdId);
393
+
394
+ // Hover: show distinct outline highlight on the mirrored text
395
+ if (off) {
396
+ tr.addEventListener("mouseenter", () => {
397
+ tr.classList.add("active");
398
+ if (!CSS.highlights) return;
399
+ const node = textMirror.firstChild;
400
+ if (!node) return;
401
+ const r = new Range();
402
+ r.setStart(node, off.start);
403
+ r.setEnd(node, off.end);
404
+ CSS.highlights.set(`p${panelId}-hover`, new Highlight(r));
405
+ });
406
+ tr.addEventListener("mouseleave", () => {
407
+ tr.classList.remove("active");
408
+ CSS.highlights?.delete(`p${panelId}-hover`);
409
+ });
410
+ }
411
+
412
+ fragment.appendChild(tr);
413
+ }
414
+
415
+ tbody.appendChild(fragment);
416
+ }
417
+
418
+ // ─── Utility ──────────────────────────────────────────────────────────────────
419
+
420
/**
 * Escape a value for safe interpolation into HTML text or attribute values.
 *
 * Escapes `&`, `<`, `>`, `"` and `'`, so the result is safe inside both
 * double- and single-quoted attributes. Non-string input is converted with
 * String() instead of throwing.
 *
 * @param {unknown} str Value to escape.
 * @returns {string} The escaped string.
 */
function esc(str) {
  return String(str)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
js/tokenizers.js ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Tokenizers offered in the dropdown: Hugging Face model id -> human-readable name.
// Insertion order is the order in which the options are listed.
export const TOKENIZER_OPTIONS = Object.freeze({
  "smcproject/malayalam-bpe-tokenizer": "Malayalam BPE tokenizer",
  "smcproject/malayalam-unigram-tokenizer": "Malayalam Unigram tokenizer",
  "Xenova/gpt-4": "gpt-4 / gpt-3.5-turbo / text-embedding-ada-002",
  "Xenova/text-davinci-003": "text-davinci-003 / text-davinci-002",
  "Xenova/gpt-3": "gpt-3",
  "Xenova/grok-1-tokenizer": "Grok-1",
  "Xenova/claude-tokenizer": "Claude",
  "Xenova/mistral-tokenizer-v3": "Mistral v3",
  "Xenova/mistral-tokenizer-v1": "Mistral v1",
  "Xenova/gemma-tokenizer": "Gemma",
  "Xenova/llama-3-tokenizer": "Llama 3",
  "Xenova/llama-tokenizer": "LLaMA / Llama 2",
  "Xenova/c4ai-command-r-v01-tokenizer": "Cohere Command-R",
  "Xenova/t5-small": "T5",
  "Xenova/bert-base-cased": "bert-base-cased",
});
js/worker.js ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Although not strictly necessary, we delegate the tokenization to a worker thread to avoid
2
+ // any potential issues with the tokenizer blocking the main thread (especially for large inputs).
3
+
4
+ import { AutoTokenizer } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.2.1";
5
+
6
+ // This is a map of all the tokenizer instances that we have loaded.
7
+ // model_id -> promise that resolves to tokenizer
8
+ const TOKENIZER_MAPPINGS = new Map();
9
+
10
/**
 * Load a tokenizer and adjust its decoder so that decoding one token at a
 * time keeps the whitespace needed for visualization.
 *
 * @param {string} model_id Hugging Face model id.
 * @returns {Promise<object>} Resolves to the patched tokenizer.
 */
async function loadTokenizer(model_id) {
  const tokenizer = await AutoTokenizer.from_pretrained(model_id);

  const tokenizer_class = (
    tokenizer._tokenizer_config?.tokenizer_class ?? ""
  ).replace(/Fast$/, "");

  switch (tokenizer_class) {
    case "LlamaTokenizer":
    case "Grok1Tokenizer":
      // NOTE: We just remove the StripDecoder from the llama tokenizer
      // tokenizer.decoder.decoders.at(-1).constructor.name === 'StripDecoder'
      tokenizer.decoder.decoders.pop();
      break;
    case "T5Tokenizer":
      tokenizer.decoder.addPrefixSpace = false;
      break;
  }
  return tokenizer;
}

/**
 * Tokenizer-specific clean-up of per-token decoded strings.
 *
 * @param {string} tokenizerName `tokenizer.constructor.name`.
 * @param {string[]} decoded Per-token decoded strings.
 * @returns {{ decoded: string[], margins: number[] }}
 */
function postProcessDecoded(tokenizerName, decoded) {
  switch (tokenizerName) {
    case "BertTokenizer":
      return {
        // A non-zero margin marks a token that starts a new word.
        margins: decoded.map((piece, i) =>
          i === 0 || piece.startsWith("##") ? 0 : 8,
        ),
        // Strip only the leading continuation marker, not "##" found elsewhere.
        decoded: decoded.map((piece) => piece.replace(/^##/, "")),
      };
    case "T5Tokenizer": {
      const result = [...decoded];
      // Drop the space prefixed to the first token, unless that token is
      // nothing but a single space.
      if (result.length > 0 && result[0] !== " ") {
        result[0] = result[0].replace(/^ /, "");
      }
      return { decoded: result, margins: [] };
    }
    default:
      return { decoded, margins: [] };
  }
}

// Listen for messages from the main thread
self.addEventListener("message", async (event) => {
  const { model_id, text } = event.data;

  try {
    // Only load the tokenizer if it hasn't been loaded yet
    let tokenizerPromise = TOKENIZER_MAPPINGS.get(model_id);
    if (!tokenizerPromise) {
      tokenizerPromise = loadTokenizer(model_id);
      TOKENIZER_MAPPINGS.set(model_id, tokenizerPromise);
      // Do not keep a rejected promise cached: a later message should retry.
      tokenizerPromise.catch(() => TOKENIZER_MAPPINGS.delete(model_id));
    }

    const tokenizer = await tokenizerPromise;

    // Tokenize the input text
    const token_ids = tokenizer.encode(text);

    // Decode each token id on its own so tokens can be shown individually
    const rawDecoded = token_ids.map((x) => tokenizer.decode([x]));

    // Minor post-processing for visualization purposes
    const { decoded, margins } = postProcessDecoded(
      tokenizer.constructor.name,
      rawDecoded,
    );

    // Send the output back to the main thread
    self.postMessage({
      token_ids,
      decoded,
      margins,
    });
  } catch (err) {
    // A rejection inside an async listener is otherwise only an unhandled
    // rejection in the worker. Reporting it raises an `error` event that the
    // owning page can observe on its Worker object.
    self.reportError(err);
  }
});
style.css CHANGED
@@ -1,28 +1,412 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  body {
2
- padding: 2rem;
3
- font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  }
5
 
6
- h1 {
7
- font-size: 16px;
8
- margin-top: 0;
 
 
 
 
 
9
  }
10
 
11
- p {
12
- color: rgb(107, 114, 128);
13
- font-size: 15px;
14
- margin-bottom: 10px;
15
- margin-top: 5px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  }
17
 
18
- .card {
19
- max-width: 620px;
20
- margin: 0 auto;
21
- padding: 16px;
22
- border: 1px solid lightgray;
23
- border-radius: 16px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
25
 
26
- .card p:last-child {
27
- margin-bottom: 0;
 
 
28
  }
 
1
+ /* ─── Design tokens ─────────────────────────────────────────────────────── */
2
+
3
+ :root {
4
+ color-scheme: light dark;
5
+ --radius: 8px;
6
+ --radius-sm: 4px;
7
+ --font-sans: system-ui, -apple-system, sans-serif;
8
+ --font-mono: ui-monospace, "Cascadia Code", "Fira Code", monospace;
9
+ --shadow:
10
+ 0 1px 4px color-mix(in srgb, CanvasText 8%, transparent), 0 2px 12px
11
+ color-mix(in srgb, CanvasText 5%, transparent);
12
+
13
+ /* Foreground color rendered over token highlight backgrounds */
14
+ --tok-text: CanvasText;
15
+ /* Hover outline color for highlighted token ranges */
16
+ --tok-hover: AccentColor;
17
+
18
+ /* Token color palette — 12 distinct hues in oklch, light-mode values.
19
+ Dark-mode overrides follow in the @media block below. */
20
+ --tok-0: oklch(80% 0.13 30);
21
+ --tok-1: oklch(82% 0.13 80);
22
+ --tok-2: oklch(80% 0.13 140);
23
+ --tok-3: oklch(80% 0.13 195);
24
+ --tok-4: oklch(80% 0.13 250);
25
+ --tok-5: oklch(80% 0.13 300);
26
+ --tok-6: oklch(80% 0.13 340);
27
+ --tok-7: oklch(82% 0.13 55);
28
+ --tok-8: oklch(80% 0.13 165);
29
+ --tok-9: oklch(80% 0.13 220);
30
+ --tok-10: oklch(80% 0.13 270);
31
+ --tok-11: oklch(80% 0.13 320);
32
+ }
33
+
34
+ @media (prefers-color-scheme: dark) {
35
+ :root {
36
+ /* Lower lightness and higher chroma so highlights read on dark Canvas */
37
+ --tok-0: oklch(68% 0.16 30);
38
+ --tok-1: oklch(70% 0.16 80);
39
+ --tok-2: oklch(68% 0.16 140);
40
+ --tok-3: oklch(68% 0.16 195);
41
+ --tok-4: oklch(68% 0.16 250);
42
+ --tok-5: oklch(68% 0.16 300);
43
+ --tok-6: oklch(68% 0.16 340);
44
+ --tok-7: oklch(70% 0.16 55);
45
+ --tok-8: oklch(68% 0.16 165);
46
+ --tok-9: oklch(68% 0.16 220);
47
+ --tok-10: oklch(68% 0.16 270);
48
+ --tok-11: oklch(68% 0.16 320);
49
+ }
50
+ }
51
+
52
+ /* ─── Reset ─────────────────────────────────────────────────────────────── */
53
+
54
+ *,
55
+ *::before,
56
+ *::after {
57
+ box-sizing: border-box;
58
+ margin: 0;
59
+ padding: 0;
60
+ }
61
+
62
+ /* ─── Base ──────────────────────────────────────────────────────────────── */
63
+
64
  body {
65
+ font-family: var(--font-sans);
66
+ background: Canvas;
67
+ color: CanvasText;
68
+ line-height: 1.6;
69
+ min-height: 100dvh;
70
+ padding-block-end: 4rem;
71
+ }
72
+
73
+ /* ─── Header ────────────────────────────────────────────────────────────── */
74
+
75
+ header {
76
+ background: Canvas;
77
+ padding: 1.25rem 2rem;
78
+ max-width: 1400px;
79
+ margin-inline: auto;
80
+ h1 {
81
+ font-size: 1.35rem;
82
+ font-weight: 700;
83
+ letter-spacing: -0.01em;
84
+ }
85
+
86
+ p {
87
+ font-size: 0.875rem;
88
+ color: GrayText;
89
+ margin-block-start: 0.2rem;
90
+ }
91
  }
92
 
93
+ /* ─── Main layout ───────────────────────────────────────────────────────── */
94
+
95
+ main {
96
+ max-width: 1400px;
97
+ margin-inline: auto;
98
+ padding: 1.5rem 2rem;
99
+ display: grid;
100
+ gap: 1.25rem;
101
  }
102
 
103
+ /* ─── Input section ─────────────────────────────────────────────────────── */
104
+
105
+ .input-section {
106
+ display: grid;
107
+ gap: 0.5rem;
108
+
109
+ label {
110
+ font-size: 0.8rem;
111
+ font-weight: 600;
112
+ text-transform: uppercase;
113
+ letter-spacing: 0.05em;
114
+ color: GrayText;
115
+ }
116
+
117
+ textarea {
118
+ font-family: var(--font-mono);
119
+ font-size: 1.1rem;
120
+ line-height: 1.7;
121
+ background: Field;
122
+ color: FieldText;
123
+ border: 1px solid ButtonBorder;
124
+ border-radius: var(--radius);
125
+ padding: 0.75rem 1rem;
126
+ resize: vertical;
127
+ width: 100%;
128
+ transition: border-color 0.15s;
129
+
130
+ &:focus {
131
+ outline: none;
132
+ border-color: AccentColor;
133
+ box-shadow: 0 0 0 3px color-mix(in srgb, AccentColor 20%, transparent);
134
+ }
135
+ }
136
  }
137
 
138
+ /* ─── Add tokenizer row ─────────────────────────────────────────────────── */
139
+
140
+ .add-tokenizer-section {
141
+ display: flex;
142
+ gap: 0.5rem;
143
+ align-items: center;
144
+ flex-wrap: wrap;
145
+
146
+ select {
147
+ font-family: var(--font-sans);
148
+ font-size: 0.9rem;
149
+ background: Field;
150
+ color: FieldText;
151
+ border: 1px solid ButtonBorder;
152
+ border-radius: var(--radius-sm);
153
+ padding: 0.45rem 0.75rem;
154
+ flex: 1 1 280px;
155
+ max-width: 480px;
156
+ cursor: pointer;
157
+ transition: border-color 0.15s;
158
+
159
+ &:focus {
160
+ outline: none;
161
+ border-color: AccentColor;
162
+ }
163
+ }
164
+
165
+ button {
166
+ font-family: var(--font-sans);
167
+ font-size: 0.9rem;
168
+ font-weight: 600;
169
+ background: AccentColor;
170
+ color: AccentColorText;
171
+ border: none;
172
+ border-radius: var(--radius-sm);
173
+ padding: 0.45rem 1.1rem;
174
+ cursor: pointer;
175
+ transition: opacity 0.15s;
176
+ white-space: nowrap;
177
+
178
+ &:hover:not(:disabled) {
179
+ opacity: 0.85;
180
+ }
181
+
182
+ &:disabled {
183
+ opacity: 0.4;
184
+ cursor: not-allowed;
185
+ }
186
+ }
187
+ }
188
+
189
+ /* ─── Empty state ───────────────────────────────────────────────────────── */
190
+
191
+ .empty-state {
192
+ text-align: center;
193
+ color: GrayText;
194
+ font-size: 0.9rem;
195
+ padding: 2.5rem 1rem;
196
+ border: 1.5px dashed ButtonBorder;
197
+ border-radius: var(--radius);
198
+ background: Canvas;
199
+
200
+ &.hidden {
201
+ display: none;
202
+ }
203
+ }
204
+
205
+ /* ─── Panels grid ───────────────────────────────────────────────────────── */
206
+
207
+ #panels {
208
+ display: grid;
209
+ grid-template-columns: repeat(auto-fit, minmax(340px, 1fr));
210
+ gap: 1rem;
211
+ align-items: start;
212
+ }
213
+
214
+ /* ─── Individual tokenizer panel ────────────────────────────────────────── */
215
+
216
+ .panel {
217
+ background: Canvas;
218
+ border: 1px solid ButtonBorder;
219
+ border-radius: var(--radius);
220
+ box-shadow: var(--shadow);
221
+ display: grid;
222
+ grid-template-rows: auto auto auto 1fr;
223
+ overflow: hidden;
224
+
225
+ /* Panel header bar */
226
+ .panel-header {
227
+ display: flex;
228
+ align-items: center;
229
+ justify-content: space-between;
230
+ gap: 0.5rem;
231
+ padding: 0.65rem 1rem;
232
+ background: ButtonFace;
233
+ border-block-end: 1px solid ButtonBorder;
234
+
235
+ .panel-title {
236
+ font-size: 0.85rem;
237
+ font-weight: 600;
238
+ color: ButtonText;
239
+ text-decoration: none;
240
+ overflow: hidden;
241
+ text-overflow: ellipsis;
242
+ white-space: nowrap;
243
+ }
244
+
245
+ .remove-btn {
246
+ font-size: 0.75rem;
247
+ font-weight: 600;
248
+ color: GrayText;
249
+ background: none;
250
+ border: 1px solid ButtonBorder;
251
+ border-radius: var(--radius-sm);
252
+ padding: 0.2rem 0.5rem;
253
+ cursor: pointer;
254
+ flex-shrink: 0;
255
+ transition:
256
+ color 0.12s,
257
+ border-color 0.12s;
258
+
259
+ &:hover {
260
+ color: LinkText;
261
+ border-color: LinkText;
262
+ }
263
+ }
264
+ }
265
+
266
+ /* Text mirror with highlights */
267
+ .panel-text {
268
+ font-family: var(--font-mono);
269
+ font-size: 1.05rem;
270
+ line-height: 1.8;
271
+ padding: 0.85rem 1rem;
272
+ border-block-end: 1px solid ButtonBorder;
273
+ white-space: pre-wrap;
274
+ word-break: break-all;
275
+ min-height: 3.5rem;
276
+ }
277
+
278
+ /* Stats bar */
279
+ .panel-stats {
280
+ display: flex;
281
+ gap: 1.5rem;
282
+ padding: 0.45rem 1rem;
283
+ background: ButtonFace;
284
+ border-block-end: 1px solid ButtonBorder;
285
+ font-size: 0.78rem;
286
+ color: GrayText;
287
+ flex-wrap: wrap;
288
+
289
+ strong {
290
+ color: CanvasText;
291
+ font-weight: 600;
292
+ }
293
+ }
294
+
295
+ /* Token table */
296
+ .panel-table-wrap {
297
+ overflow-y: auto;
298
+ max-height: 360px;
299
+ }
300
+
301
+ table {
302
+ width: 100%;
303
+ border-collapse: collapse;
304
+ font-size: 0.82rem;
305
+
306
+ thead {
307
+ position: sticky;
308
+ top: 0;
309
+ background: ButtonFace;
310
+ z-index: 1;
311
+
312
+ th {
313
+ text-align: left;
314
+ padding: 0.4rem 0.75rem;
315
+ font-weight: 600;
316
+ font-size: 0.72rem;
317
+ text-transform: uppercase;
318
+ letter-spacing: 0.05em;
319
+ color: GrayText;
320
+ border-block-end: 1px solid ButtonBorder;
321
+ }
322
+ }
323
+
324
+ tbody tr {
325
+ cursor: default;
326
+ transition: background 0.08s;
327
+
328
+ &:hover {
329
+ background: color-mix(in srgb, ButtonFace 60%, Canvas);
330
+ }
331
+
332
+ &.active {
333
+ background: color-mix(
334
+ in srgb,
335
+ var(--row-color, AccentColor) 18%,
336
+ Canvas
337
+ );
338
+ }
339
+
340
+ td {
341
+ padding: 0.3rem 0.75rem;
342
+ border-block-end: 1px solid
343
+ color-mix(in srgb, ButtonBorder 60%, transparent);
344
+ vertical-align: middle;
345
+
346
+ &.tok-cell {
347
+ font-family: var(--font-mono);
348
+ font-size: 0.88rem;
349
+ }
350
+
351
+ &.id-cell {
352
+ font-family: var(--font-mono);
353
+ color: GrayText;
354
+ font-size: 0.8rem;
355
+ text-align: right;
356
+ }
357
+
358
+ &.color-swatch {
359
+ width: 18px;
360
+ padding-inline: 0.5rem;
361
+
362
+ span {
363
+ display: block;
364
+ width: 10px;
365
+ height: 10px;
366
+ border-radius: 50%;
367
+ background: var(--row-color, ButtonBorder);
368
+ }
369
+ }
370
+ }
371
+ }
372
+ }
373
+
374
+ /* Loading state */
375
+ &.loading {
376
+ .panel-text,
377
+ .panel-stats,
378
+ .panel-table-wrap {
379
+ opacity: 0.4;
380
+ pointer-events: none;
381
+ }
382
+ }
383
+
384
+ .panel-loading {
385
+ display: none;
386
+ align-items: center;
387
+ gap: 0.5rem;
388
+ padding: 0.6rem 1rem;
389
+ font-size: 0.8rem;
390
+ color: GrayText;
391
+
392
+ .spinner {
393
+ width: 14px;
394
+ height: 14px;
395
+ border: 2px solid ButtonBorder;
396
+ border-top-color: AccentColor;
397
+ border-radius: 50%;
398
+ animation: spin 0.7s linear infinite;
399
+ flex-shrink: 0;
400
+ }
401
+ }
402
+
403
+ &.loading .panel-loading {
404
+ display: flex;
405
+ }
406
  }
407
 
408
+ @keyframes spin {
409
+ to {
410
+ transform: rotate(360deg);
411
+ }
412
  }