Tokenizer-Visualizer / index.html
quickgrid's picture
Update index.html
cc8e1dc verified
raw
history blame
53.3 kB
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>TokenViz - Universal Tokenizer Visualizer</title>
<script src="https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.0"></script>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500;600&display=swap" rel="stylesheet">
<style>
:root {
--bg-primary: #0a0a0f;
--bg-secondary: #12121a;
--bg-tertiary: #1a1a25;
--bg-card: #16161f;
--bg-hover: #1e1e2e;
--border-color: #2a2a3a;
--border-light: #3a3a4a;
--text-primary: #e8e8f0;
--text-secondary: #a0a0b8;
--text-muted: #6b6b80;
--accent: #6366f1;
--accent-hover: #818cf8;
--accent-glow: rgba(99, 102, 241, 0.15);
--success: #22c55e;
--warning: #f59e0b;
--error: #ef4444;
--token-colors: #ef4444, #f97316, #f59e0b, #84cc16, #22c55e, #14b8a6, #06b6d4, #3b82f6, #6366f1, #8b5cf6, #a855f7, #d946ef, #ec4899, #f43f5e;
}
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
background: var(--bg-primary);
color: var(--text-primary);
min-height: 100vh;
overflow-x: hidden;
}
/* Animated background mesh: a fixed, click-through decorative layer that sits
   behind the page content (z-index 0, while .container uses z-index 1). */
.bg-mesh {
  position: fixed;
  top: 0;
  left: 0;
  width: 100%;
  height: 100%;
  z-index: 0;
  pointer-events: none;
  opacity: 0.4;
}
/* The gradients live on an oversized pseudo-element so the drift animation
   never exposes an edge of the layer. */
.bg-mesh::before {
  content: '';
  position: absolute;
  top: -50%;
  left: -50%;
  width: 200%;
  height: 200%;
  background:
    radial-gradient(circle at 20% 80%, rgba(99, 102, 241, 0.08) 0%, transparent 50%),
    radial-gradient(circle at 80% 20%, rgba(139, 92, 246, 0.06) 0%, transparent 50%),
    radial-gradient(circle at 50% 50%, rgba(236, 72, 153, 0.04) 0%, transparent 50%);
  animation: meshFloat 20s ease-in-out infinite;
}
@keyframes meshFloat {
  0%, 100% { transform: translate(0, 0) rotate(0deg); }
  33% { transform: translate(30px, -30px) rotate(1deg); }
  66% { transform: translate(-20px, 20px) rotate(-1deg); }
}
/* Honour the OS-level "reduce motion" preference: keep the gradients, stop the drift. */
@media (prefers-reduced-motion: reduce) {
  .bg-mesh::before {
    animation: none;
  }
}
.container {
position: relative;
z-index: 1;
max-width: 1400px;
margin: 0 auto;
padding: 2rem;
}
/* Header */
header {
text-align: center;
margin-bottom: 3rem;
padding-top: 1rem;
}
.logo {
display: inline-flex;
align-items: center;
gap: 0.75rem;
margin-bottom: 1rem;
}
.logo-icon {
width: 40px;
height: 40px;
background: linear-gradient(135deg, var(--accent), #8b5cf6);
border-radius: 12px;
display: flex;
align-items: center;
justify-content: center;
font-size: 1.25rem;
box-shadow: 0 4px 20px rgba(99, 102, 241, 0.3);
}
.logo h1 {
font-size: 1.75rem;
font-weight: 700;
background: linear-gradient(135deg, var(--text-primary), var(--accent));
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
.subtitle {
color: var(--text-secondary);
font-size: 0.95rem;
max-width: 600px;
margin: 0 auto;
line-height: 1.6;
}
/* Model Selector */
.model-section {
background: var(--bg-card);
border: 1px solid var(--border-color);
border-radius: 16px;
padding: 1.5rem;
margin-bottom: 1.5rem;
backdrop-filter: blur(10px);
}
.section-title {
font-size: 0.875rem;
font-weight: 600;
color: var(--text-secondary);
text-transform: uppercase;
letter-spacing: 0.05em;
margin-bottom: 1rem;
display: flex;
align-items: center;
gap: 0.5rem;
}
.model-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
gap: 0.75rem;
}
.model-card {
background: var(--bg-secondary);
border: 1px solid var(--border-color);
border-radius: 12px;
padding: 1rem;
cursor: pointer;
transition: all 0.3s ease;
position: relative;
overflow: hidden;
}
.model-card::before {
content: '';
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 2px;
background: linear-gradient(90deg, var(--accent), #8b5cf6);
transform: scaleX(0);
transform-origin: left;
transition: transform 0.3s ease;
}
.model-card:hover {
border-color: var(--border-light);
transform: translateY(-2px);
box-shadow: 0 8px 30px rgba(0, 0, 0, 0.3);
}
.model-card:hover::before {
transform: scaleX(1);
}
.model-card.active {
border-color: var(--accent);
background: var(--accent-glow);
box-shadow: 0 0 30px var(--accent-glow);
}
.model-card.active::before {
transform: scaleX(1);
}
.model-name {
font-weight: 600;
font-size: 0.9rem;
color: var(--text-primary);
margin-bottom: 0.25rem;
}
.model-org {
font-size: 0.75rem;
color: var(--text-muted);
}
.model-badge {
display: inline-block;
font-size: 0.65rem;
padding: 0.15rem 0.4rem;
border-radius: 4px;
background: rgba(99, 102, 241, 0.15);
color: var(--accent);
margin-top: 0.5rem;
font-weight: 500;
}
/* Custom Model Input */
.custom-model {
margin-top: 1rem;
display: flex;
gap: 0.75rem;
align-items: stretch;
}
.custom-model input {
flex: 1;
background: var(--bg-primary);
border: 1px solid var(--border-color);
border-radius: 10px;
padding: 0.75rem 1rem;
color: var(--text-primary);
font-family: 'JetBrains Mono', monospace;
font-size: 0.85rem;
outline: none;
transition: all 0.3s ease;
}
.custom-model input:focus {
border-color: var(--accent);
box-shadow: 0 0 0 3px var(--accent-glow);
}
.custom-model input::placeholder {
color: var(--text-muted);
}
.btn {
background: linear-gradient(135deg, var(--accent), #8b5cf6);
color: white;
border: none;
border-radius: 10px;
padding: 0.75rem 1.5rem;
font-weight: 600;
font-size: 0.875rem;
cursor: pointer;
transition: all 0.3s ease;
display: inline-flex;
align-items: center;
gap: 0.5rem;
white-space: nowrap;
}
.btn:hover {
transform: translateY(-1px);
box-shadow: 0 4px 20px rgba(99, 102, 241, 0.4);
}
.btn:active {
transform: translateY(0);
}
.btn:disabled {
opacity: 0.5;
cursor: not-allowed;
transform: none;
}
.btn-secondary {
background: var(--bg-tertiary);
border: 1px solid var(--border-color);
color: var(--text-secondary);
}
.btn-secondary:hover {
background: var(--bg-hover);
border-color: var(--border-light);
box-shadow: none;
}
/* Input Section */
.input-section {
background: var(--bg-card);
border: 1px solid var(--border-color);
border-radius: 16px;
padding: 1.5rem;
margin-bottom: 1.5rem;
}
.input-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 1rem;
}
.input-stats {
display: flex;
gap: 1.5rem;
font-size: 0.8rem;
color: var(--text-muted);
}
.stat-item {
display: flex;
align-items: center;
gap: 0.35rem;
}
.stat-value {
color: var(--accent);
font-weight: 600;
}
textarea {
width: 100%;
background: var(--bg-primary);
border: 1px solid var(--border-color);
border-radius: 12px;
padding: 1rem;
color: var(--text-primary);
font-family: 'Inter', sans-serif;
font-size: 1rem;
line-height: 1.6;
resize: vertical;
min-height: 120px;
outline: none;
transition: all 0.3s ease;
}
textarea:focus {
border-color: var(--accent);
box-shadow: 0 0 0 3px var(--accent-glow);
}
textarea::placeholder {
color: var(--text-muted);
}
.input-actions {
display: flex;
gap: 0.75rem;
margin-top: 1rem;
flex-wrap: wrap;
}
/* Visualization Section */
.viz-section {
background: var(--bg-card);
border: 1px solid var(--border-color);
border-radius: 16px;
padding: 1.5rem;
margin-bottom: 1.5rem;
min-height: 200px;
}
.viz-tabs {
display: flex;
gap: 0.5rem;
margin-bottom: 1.5rem;
border-bottom: 1px solid var(--border-color);
padding-bottom: 0.75rem;
}
.viz-tab {
background: none;
border: none;
color: var(--text-muted);
font-size: 0.875rem;
font-weight: 500;
padding: 0.5rem 1rem;
cursor: pointer;
border-radius: 8px;
transition: all 0.3s ease;
position: relative;
}
.viz-tab:hover {
color: var(--text-secondary);
background: var(--bg-hover);
}
.viz-tab.active {
color: var(--accent);
background: var(--accent-glow);
}
/* Token Display */
.tokens-container {
display: flex;
flex-wrap: wrap;
gap: 0.5rem;
align-items: flex-start;
font-family: 'JetBrains Mono', monospace;
line-height: 2;
}
.token {
display: inline-flex;
flex-direction: column;
align-items: center;
position: relative;
cursor: pointer;
transition: transform 0.2s ease;
}
.token:hover {
transform: translateY(-2px);
z-index: 10;
}
.token-box {
padding: 0.35rem 0.6rem;
border-radius: 8px;
font-size: 0.85rem;
font-weight: 500;
border: 1px solid transparent;
transition: all 0.2s ease;
position: relative;
min-width: 2rem;
text-align: center;
}
.token:hover .token-box {
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
}
.token-id {
font-size: 0.65rem;
color: var(--text-muted);
margin-top: 0.2rem;
font-family: 'JetBrains Mono', monospace;
}
.token-tooltip {
position: absolute;
bottom: calc(100% + 8px);
left: 50%;
transform: translateX(-50%) scale(0.9);
background: var(--bg-tertiary);
border: 1px solid var(--border-light);
border-radius: 10px;
padding: 0.75rem;
font-size: 0.8rem;
white-space: nowrap;
opacity: 0;
pointer-events: none;
transition: all 0.2s ease;
z-index: 100;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4);
}
.token:hover .token-tooltip {
opacity: 1;
transform: translateX(-50%) scale(1);
}
.token-tooltip::after {
content: '';
position: absolute;
top: 100%;
left: 50%;
transform: translateX(-50%);
border: 6px solid transparent;
border-top-color: var(--border-light);
}
.tooltip-row {
display: flex;
gap: 0.5rem;
align-items: center;
}
.tooltip-label {
color: var(--text-muted);
font-size: 0.75rem;
}
.tooltip-value {
color: var(--text-primary);
font-weight: 500;
}
/* Byte-level view */
.byte-view {
font-family: 'JetBrains Mono', monospace;
font-size: 0.85rem;
line-height: 1.8;
}
.byte-row {
display: flex;
gap: 0.25rem;
margin-bottom: 0.25rem;
align-items: center;
}
.byte-char {
width: 2rem;
text-align: center;
color: var(--text-secondary);
}
.byte-hex {
width: 2.5rem;
text-align: center;
color: var(--accent);
font-size: 0.8rem;
}
.byte-token {
padding: 0.15rem 0.4rem;
border-radius: 4px;
font-size: 0.8rem;
margin-left: 0.5rem;
}
/* ID List View */
.id-list {
display: flex;
flex-wrap: wrap;
gap: 0.5rem;
font-family: 'JetBrains Mono', monospace;
}
.id-chip {
background: var(--bg-secondary);
border: 1px solid var(--border-color);
border-radius: 8px;
padding: 0.4rem 0.75rem;
font-size: 0.85rem;
color: var(--text-secondary);
transition: all 0.2s ease;
}
.id-chip:hover {
border-color: var(--accent);
color: var(--accent);
}
/* Comparison View */
.comparison-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
gap: 1rem;
}
.comparison-card {
background: var(--bg-secondary);
border: 1px solid var(--border-color);
border-radius: 12px;
padding: 1rem;
}
.comparison-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 0.75rem;
padding-bottom: 0.75rem;
border-bottom: 1px solid var(--border-color);
}
.comparison-name {
font-weight: 600;
font-size: 0.9rem;
}
.comparison-count {
font-size: 0.8rem;
color: var(--text-muted);
background: var(--bg-primary);
padding: 0.2rem 0.5rem;
border-radius: 6px;
}
/* Stats Grid */
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 1rem;
margin-bottom: 1.5rem;
}
.stat-card {
background: var(--bg-card);
border: 1px solid var(--border-color);
border-radius: 12px;
padding: 1.25rem;
transition: all 0.3s ease;
}
.stat-card:hover {
border-color: var(--border-light);
transform: translateY(-2px);
}
.stat-icon {
width: 36px;
height: 36px;
border-radius: 10px;
display: flex;
align-items: center;
justify-content: center;
font-size: 1.1rem;
margin-bottom: 0.75rem;
}
.stat-card:nth-child(1) .stat-icon { background: rgba(99, 102, 241, 0.15); }
.stat-card:nth-child(2) .stat-icon { background: rgba(34, 197, 94, 0.15); }
.stat-card:nth-child(3) .stat-icon { background: rgba(245, 158, 11, 0.15); }
.stat-card:nth-child(4) .stat-icon { background: rgba(236, 72, 153, 0.15); }
.stat-label {
font-size: 0.8rem;
color: var(--text-muted);
margin-bottom: 0.25rem;
}
.stat-number {
font-size: 1.5rem;
font-weight: 700;
color: var(--text-primary);
}
/* Loading */
.loading-overlay {
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
background: rgba(10, 10, 15, 0.9);
backdrop-filter: blur(8px);
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
z-index: 1000;
transition: opacity 0.3s ease;
}
.loading-overlay.hidden {
opacity: 0;
pointer-events: none;
}
.spinner {
width: 48px;
height: 48px;
border: 3px solid var(--border-color);
border-top-color: var(--accent);
border-radius: 50%;
animation: spin 0.8s linear infinite;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
.loading-text {
margin-top: 1rem;
color: var(--text-secondary);
font-size: 0.9rem;
}
.loading-subtext {
margin-top: 0.5rem;
color: var(--text-muted);
font-size: 0.8rem;
}
/* Error toast */
.toast {
position: fixed;
bottom: 2rem;
right: 2rem;
background: var(--bg-tertiary);
border: 1px solid var(--error);
border-radius: 12px;
padding: 1rem 1.5rem;
display: flex;
align-items: center;
gap: 0.75rem;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4);
z-index: 1001;
transform: translateY(100px);
opacity: 0;
transition: all 0.3s ease;
}
.toast.show {
transform: translateY(0);
opacity: 1;
}
.toast-icon {
color: var(--error);
font-size: 1.25rem;
}
.toast-message {
color: var(--text-primary);
font-size: 0.9rem;
}
/* Responsive */
@media (max-width: 768px) {
.container {
padding: 1rem;
}
.model-grid {
grid-template-columns: 1fr;
}
.stats-grid {
grid-template-columns: repeat(2, 1fr);
}
.custom-model {
flex-direction: column;
}
.input-header {
flex-direction: column;
gap: 0.75rem;
align-items: flex-start;
}
}
/* Scrollbar */
::-webkit-scrollbar {
width: 8px;
height: 8px;
}
::-webkit-scrollbar-track {
background: var(--bg-primary);
}
::-webkit-scrollbar-thumb {
background: var(--border-color);
border-radius: 4px;
}
::-webkit-scrollbar-thumb:hover {
background: var(--border-light);
}
/* Empty state */
.empty-state {
text-align: center;
padding: 3rem 1rem;
color: var(--text-muted);
}
.empty-state-icon {
font-size: 3rem;
margin-bottom: 1rem;
opacity: 0.5;
}
.empty-state-text {
font-size: 1rem;
margin-bottom: 0.5rem;
}
.empty-state-hint {
font-size: 0.85rem;
opacity: 0.7;
}
/* Token color classes */
.tc-0 { background: rgba(239, 68, 68, 0.15); border-color: rgba(239, 68, 68, 0.3); color: #fca5a5; }
.tc-1 { background: rgba(249, 115, 22, 0.15); border-color: rgba(249, 115, 22, 0.3); color: #fdba74; }
.tc-2 { background: rgba(245, 158, 11, 0.15); border-color: rgba(245, 158, 11, 0.3); color: #fcd34d; }
.tc-3 { background: rgba(132, 204, 22, 0.15); border-color: rgba(132, 204, 22, 0.3); color: #bef264; }
.tc-4 { background: rgba(34, 197, 94, 0.15); border-color: rgba(34, 197, 94, 0.3); color: #86efac; }
.tc-5 { background: rgba(20, 184, 166, 0.15); border-color: rgba(20, 184, 166, 0.3); color: #5eead4; }
.tc-6 { background: rgba(6, 182, 212, 0.15); border-color: rgba(6, 182, 212, 0.3); color: #67e8f9; }
.tc-7 { background: rgba(59, 130, 246, 0.15); border-color: rgba(59, 130, 246, 0.3); color: #93c5fd; }
.tc-8 { background: rgba(99, 102, 241, 0.15); border-color: rgba(99, 102, 241, 0.3); color: #a5b4fc; }
.tc-9 { background: rgba(139, 92, 246, 0.15); border-color: rgba(139, 92, 246, 0.3); color: #c4b5fd; }
.tc-10 { background: rgba(168, 85, 247, 0.15); border-color: rgba(168, 85, 247, 0.3); color: #d8b4fe; }
.tc-11 { background: rgba(217, 70, 239, 0.15); border-color: rgba(217, 70, 239, 0.3); color: #e9d5ff; }
.tc-12 { background: rgba(236, 72, 153, 0.15); border-color: rgba(236, 72, 153, 0.3); color: #f9a8d4; }
.tc-13 { background: rgba(244, 63, 94, 0.15); border-color: rgba(244, 63, 94, 0.3); color: #fda4af; }
.special-token .token-box {
background: rgba(99, 102, 241, 0.2) !important;
border-color: var(--accent) !important;
color: var(--accent-hover) !important;
font-style: italic;
}
/* Info panel */
.info-panel {
background: var(--bg-secondary);
border-radius: 10px;
padding: 1rem;
margin-top: 1rem;
font-size: 0.85rem;
color: var(--text-secondary);
border: 1px solid var(--border-color);
}
.info-panel code {
background: var(--bg-primary);
padding: 0.15rem 0.35rem;
border-radius: 4px;
font-family: 'JetBrains Mono', monospace;
font-size: 0.8rem;
color: var(--accent);
}
/* Copy button */
.copy-btn {
background: none;
border: 1px solid var(--border-color);
color: var(--text-muted);
border-radius: 6px;
padding: 0.4rem 0.75rem;
font-size: 0.8rem;
cursor: pointer;
transition: all 0.2s ease;
display: inline-flex;
align-items: center;
gap: 0.35rem;
}
.copy-btn:hover {
border-color: var(--accent);
color: var(--accent);
}
.copy-btn.copied {
border-color: var(--success);
color: var(--success);
}
</style>
<base target="_blank">
</head>
<body>
<div class="bg-mesh"></div>
<div class="loading-overlay" id="loadingOverlay">
<div class="spinner"></div>
<div class="loading-text">Loading tokenizer...</div>
<div class="loading-subtext" id="loadingSubtext">This may take a moment for large vocabularies</div>
</div>
<!-- Toast: always present in the DOM so it can act as a live region; the
     message text is filled in by script when a notification is shown. -->
<div class="toast" id="toast" role="status" aria-live="polite">
  <span class="toast-icon" aria-hidden="true">⚠️</span>
  <span class="toast-message" id="toastMessage">Error message</span>
</div>
<div class="container">
<header>
  <div class="logo">
    <!-- Decorative glyph; the product name is carried by the h1. -->
    <div class="logo-icon" aria-hidden="true">🔤</div>
    <h1>TokenViz</h1>
  </div>
  <p class="subtitle">
    Universal tokenizer visualization for any HuggingFace model.
    See exactly how LLMs break down text into tokens, IDs, and bytes — all in your browser with zero GPU required.
  </p>
</header>
<!-- Model Selection -->
<div class="model-section">
  <div class="section-title">
    <span aria-hidden="true">🤖</span> Select Model
  </div>
  <div class="model-grid" id="modelGrid">
    <!-- Models will be populated by JS -->
  </div>
  <div class="custom-model">
    <!-- The placeholder is only a hint; aria-label gives the field its accessible name. -->
    <input type="text" id="customModelInput" placeholder="custom-org/model-name (e.g., meta-llama/Llama-3.1-8B)" aria-label="Custom Hugging Face model ID" autocomplete="off" spellcheck="false">
    <button class="btn" id="loadCustomBtn" type="button" onclick="loadCustomModel()">
      <span aria-hidden="true">📥</span> Load
    </button>
  </div>
</div>
<!-- Stats -->
<!-- Hidden until a tokenizer has been loaded; script switches display to "grid". -->
<div class="stats-grid" id="statsGrid" style="display: none;">
  <div class="stat-card">
    <div class="stat-icon" aria-hidden="true">📊</div>
    <div class="stat-label">Vocabulary Size</div>
    <div class="stat-number" id="vocabSize">-</div>
  </div>
  <div class="stat-card">
    <div class="stat-icon" aria-hidden="true">🔢</div>
    <div class="stat-label">Token Count</div>
    <div class="stat-number" id="tokenCount">-</div>
  </div>
  <div class="stat-card">
    <!-- NOTE(review): the original glyph was corrupted beyond recovery; a ruler is assumed here. -->
    <div class="stat-icon" aria-hidden="true">📏</div>
    <div class="stat-label">Char / Token Ratio</div>
    <div class="stat-number" id="ratio">-</div>
  </div>
  <div class="stat-card">
    <div class="stat-icon" aria-hidden="true">⚡</div>
    <div class="stat-label">Model Type</div>
    <div class="stat-number" id="modelType">-</div>
  </div>
</div>
<!-- Input -->
<div class="input-section">
  <div class="input-header">
    <div class="section-title" style="margin: 0;">
      <span aria-hidden="true">✏️</span> Input Text
    </div>
    <div class="input-stats">
      <div class="stat-item">
        <span>Chars:</span>
        <span class="stat-value" id="charCount">0</span>
      </div>
      <div class="stat-item">
        <span>Words:</span>
        <span class="stat-value" id="wordCount">0</span>
      </div>
    </div>
  </div>
  <!-- The placeholder is only a hint; aria-label gives the field its accessible name. -->
  <textarea id="inputText" placeholder="Type or paste text here to see how the tokenizer breaks it down...
Try: 'Hello world! 🌍 The quick brown fox jumps over 13 lazy dogs.'" aria-label="Text to tokenize" oninput="handleInput()"></textarea>
  <div class="input-actions">
    <button class="btn btn-secondary" type="button" onclick="loadExample('simple')">Simple</button>
    <button class="btn btn-secondary" type="button" onclick="loadExample('code')">Code</button>
    <button class="btn btn-secondary" type="button" onclick="loadExample('multilingual')">Multilingual</button>
    <button class="btn btn-secondary" type="button" onclick="loadExample('math')">Math</button>
    <button class="btn btn-secondary" type="button" onclick="loadExample('emoji')">Emoji</button>
    <button class="btn btn-secondary" type="button" onclick="clearText()">Clear</button>
    <button class="btn" type="button" onclick="copyTokenIds()" id="copyBtn" style="margin-left: auto;">
      <span aria-hidden="true">📋</span> Copy IDs
    </button>
  </div>
</div>
<!-- Visualization -->
<div class="viz-section">
  <!-- Tab labels are kept as plain text (no child elements) so that a click's
       event target is always the button itself. -->
  <div class="viz-tabs">
    <button class="viz-tab active" type="button" onclick="switchTab('tokens')">🔤 Tokens</button>
    <button class="viz-tab" type="button" onclick="switchTab('bytes')">💾 Bytes</button>
    <button class="viz-tab" type="button" onclick="switchTab('ids')">🔢 IDs</button>
    <button class="viz-tab" type="button" onclick="switchTab('compare')">⚖️ Compare</button>
  </div>
  <div id="vizContent">
    <div class="empty-state">
      <div class="empty-state-icon" aria-hidden="true">🔍</div>
      <div class="empty-state-text">Enter text above to visualize tokenization</div>
      <div class="empty-state-hint">Select a model and start typing to see the magic happen</div>
    </div>
  </div>
</div>
<!-- Info -->
<div class="info-panel">
<strong>💡 How it works:</strong> This app uses <code>@huggingface/transformers</code> (v3.5.0) to load tokenizer files directly from the HuggingFace Hub in your browser.
It downloads <code>tokenizer.json</code> and <code>tokenizer_config.json</code> and runs tokenization entirely client-side with WebAssembly — no GPU or server required.
Works with BPE, WordPiece, Unigram, and SentencePiece tokenizers from any model.
</div>
</div>
<script>
// ============================================
// CONFIGURATION & STATE
// ============================================
// Tokenizers offered as one-click cards in the model grid.
//   id   - repository id passed to AutoTokenizer.from_pretrained()
//   name - label shown on the card
//   org  - organisation line shown under the name
//   type - text of the badge on the card
// loadCustomModel() appends further entries to this array at runtime.
// NOTE(review): several ids below have three path segments
// (e.g. 'Xenova/meta-llama/Llama-3.1-8B'); Hub repository ids are normally
// 'owner/name' — confirm these actually resolve.
const PRESET_MODELS = [
{ id: 'Xenova/gpt2', name: 'GPT-2', org: 'OpenAI', type: 'BPE' },
{ id: 'Xenova/bert-base-uncased', name: 'BERT Base', org: 'Google', type: 'WordPiece' },
{ id: 'Xenova/meta-llama/Llama-3.1-8B', name: 'Llama 3.1', org: 'Meta', type: 'BPE' },
{ id: 'Xenova/mistralai/Mistral-7B-v0.1', name: 'Mistral 7B', org: 'Mistral AI', type: 'BPE' },
{ id: 'Xenova/t5-small', name: 'T5 Small', org: 'Google', type: 'SentencePiece' },
{ id: 'Xenova/deepseek-ai/DeepSeek-V3', name: 'DeepSeek V3', org: 'DeepSeek', type: 'BPE' },
{ id: 'Xenova/Qwen/Qwen2.5-7B-Instruct', name: 'Qwen 2.5', org: 'Alibaba', type: 'BPE' },
{ id: 'Xenova/microsoft/Phi-3-mini-4k-instruct', name: 'Phi-3 Mini', org: 'Microsoft', type: 'BPE' },
{ id: 'Xenova/HuggingFaceTB/SmolLM2-360M-Instruct', name: 'SmolLM2', org: 'HuggingFace', type: 'BPE' },
{ id: 'Xenova/google/gemma-2-2b-it', name: 'Gemma 2', org: 'Google', type: 'BPE' },
{ id: 'Xenova/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO', name: 'Nous Hermes', org: 'Nous', type: 'BPE' },
{ id: 'Xenova/stabilityai/stablelm-2-1_6b', name: 'StableLM 2', org: 'Stability AI', type: 'BPE' },
];
// Id of the model whose tokenizer is active; null until a load succeeds.
let currentModel = null;
// Tokenizer instance returned by AutoTokenizer.from_pretrained() for currentModel.
let currentTokenizer = null;
// Records ({ id, text, isSpecial, index }) produced by the most recent tokenize() call.
let currentTokens = [];
// Visualization tab being shown: 'tokens', 'bytes', 'ids' or 'compare'.
let activeTab = 'tokens';
// NOTE(review): not referenced anywhere in the visible part of the file — possibly unused.
let compareMode = false;
// Tokenizers loaded for the Compare tab, keyed by model id, so each is fetched once.
let compareTokenizers = {};
// ============================================
// INITIALIZATION
// ============================================
/**
 * Page bootstrap: draws the preset model cards and then starts loading the
 * first preset so a tokenizer becomes available without user interaction.
 * The load is fire-and-forget; selectModel reports its own failures.
 */
function init() {
  renderModelGrid();
  // The first preset doubles as the default model
  const [defaultModel] = PRESET_MODELS;
  selectModel(defaultModel.id);
}
/**
 * Rebuilds the model card grid from PRESET_MODELS.
 *
 * Cards are created with DOM APIs instead of an HTML string because the list
 * can contain ids typed by the user (loadCustomModel appends them):
 * interpolating such values into markup and into an inline onclick attribute
 * lets a quote or angle bracket break the handler or inject markup.
 * Each card is also made focusable and operable with Enter/Space.
 */
function renderModelGrid() {
  const grid = document.getElementById('modelGrid');

  // Small factory for the three text rows of a card.
  const textNode = (tag, className, value) => {
    const el = document.createElement(tag);
    el.className = className;
    el.textContent = value;
    return el;
  };

  const cards = PRESET_MODELS.map(model => {
    const card = document.createElement('div');
    card.className = currentModel === model.id ? 'model-card active' : 'model-card';
    // selectModel() finds the active card through data-model.
    card.dataset.model = model.id;
    card.setAttribute('role', 'button');
    card.tabIndex = 0;

    card.addEventListener('click', () => selectModel(model.id));
    card.addEventListener('keydown', evt => {
      if (evt.key === 'Enter' || evt.key === ' ') {
        evt.preventDefault(); // keep Space from scrolling the page
        selectModel(model.id);
      }
    });

    card.append(
      textNode('div', 'model-name', model.name),
      textNode('div', 'model-org', model.org),
      textNode('span', 'model-badge', model.type)
    );
    return card;
  });

  grid.textContent = '';
  grid.append(...cards);
}
// ============================================
// MODEL LOADING
// ============================================
/**
 * Loads the tokenizer for `modelId`, makes it the active one, and refreshes
 * the card highlight, the model stats and (if there is input text) the
 * tokenization. Failures are logged and reported through a toast; the
 * function itself never rejects.
 *
 * @param {string} modelId Hub repository id of the tokenizer to load.
 * @returns {Promise<void>}
 */
async function selectModel(modelId) {
  // Already active: nothing to do.
  if (currentModel === modelId && currentTokenizer) return;

  showLoading(true, `Loading ${modelId}...`);
  try {
    // The CDN build of Transformers.js is published as an ES module, so a
    // classic <script src> does not necessarily create `window.transformers`.
    // Use the global when it exists, otherwise import the module on demand
    // (the browser caches the module, so this costs one fetch at most).
    const { AutoTokenizer } = window.transformers ||
      await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.0');

    // Files are fetched from the Hub's main branch.
    const tokenizer = await AutoTokenizer.from_pretrained(modelId, {
      revision: 'main',
    });

    currentModel = modelId;
    currentTokenizer = tokenizer;

    // Highlight the matching card
    document.querySelectorAll('.model-card').forEach(card => {
      card.classList.toggle('active', card.dataset.model === modelId);
    });

    updateModelStats();

    // Re-tokenize whatever is already typed with the new tokenizer
    const text = document.getElementById('inputText').value;
    if (text) {
      await tokenize(text);
    }
    showToast(`Loaded ${getModelName(modelId)} successfully`, 'success');
  } catch (err) {
    console.error('Failed to load tokenizer:', err);
    showToast(`Failed to load ${modelId}: ${err.message}`, 'error');
  } finally {
    showLoading(false);
  }
}
/**
 * Loads the tokenizer whose id is typed into #customModelInput.
 *
 * If the id names an existing card (exactly, or by repository name without
 * the owner prefix) that card is selected. Otherwise the tokenizer is first
 * loaded as a probe; only when that succeeds is a new card added and selected.
 *
 * @returns {Promise<void>}
 */
async function loadCustomModel() {
  const input = document.getElementById('customModelInput');
  const modelId = input.value.trim();
  if (!modelId) {
    showToast('Please enter a model ID', 'error');
    return;
  }

  // Match on the full id or on a whole trailing path segment. A bare
  // endsWith(modelId) would treat any suffix as a hit (e.g. "2" -> "Xenova/gpt2").
  const exists = PRESET_MODELS.find(m => m.id === modelId || m.id.endsWith(`/${modelId}`));
  if (exists) {
    await selectModel(exists.id);
    return;
  }

  showLoading(true, `Loading ${modelId}...`);
  try {
    // See selectModel(): the CDN build is an ES module, so fall back to a
    // dynamic import when no `window.transformers` global is present.
    const { AutoTokenizer } = window.transformers ||
      await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.0');

    // Probe load: throws (and skips everything below) if the id cannot be loaded,
    // so invalid ids never get a card.
    await AutoTokenizer.from_pretrained(modelId);

    const segments = modelId.split('/');
    const newModel = {
      id: modelId,
      name: segments[segments.length - 1],
      // An id without an owner part has no organisation to show.
      org: segments.length > 1 ? (segments[0] || 'Custom') : 'Custom',
      type: 'Unknown'
    };
    PRESET_MODELS.push(newModel);
    renderModelGrid();

    await selectModel(modelId);

    // selectModel() reports its own failures and does not throw, so confirm
    // the switch really happened before announcing success.
    if (currentModel === modelId) {
      input.value = '';
      showToast(`Loaded custom model ${modelId}`, 'success');
    }
  } catch (err) {
    showToast(`Failed to load ${modelId}: ${err.message}`, 'error');
  } finally {
    showLoading(false);
  }
}
/**
 * Fills the "Vocabulary Size" and "Model Type" stat cards from the active
 * tokenizer and reveals the stats grid. Does nothing when no tokenizer is loaded.
 */
function updateModelStats() {
  if (!currentTokenizer) return;

  const vocab = currentTokenizer.vocab || {};
  // NOTE(review): Transformers.js appears to keep the vocabulary on
  // `tokenizer.model.vocab` (an array) and the algorithm name on
  // `tokenizer.model.config.type` rather than on the tokenizer itself —
  // confirm against the library version in use. The original top-level
  // lookups are still tried first.
  const model = currentTokenizer.model || {};
  const vocabSize = Object.keys(vocab).length
    || (Array.isArray(model.vocab) ? model.vocab.length : 0)
    || currentTokenizer.vocab_size
    || '?';
  const modelType = currentTokenizer.model_type
    || (model.config && model.config.type)
    || 'Unknown';

  document.getElementById('vocabSize').textContent = vocabSize.toLocaleString();
  document.getElementById('modelType').textContent = modelType;
  document.getElementById('statsGrid').style.display = 'grid';
}
// ============================================
// TOKENIZATION
// ============================================
/**
 * oninput handler of the text area: refreshes the character and word
 * counters, then tokenizes the current text. When no tokenizer is loaded
 * yet it shows an error toast instead of tokenizing.
 *
 * @returns {Promise<void>}
 */
async function handleInput() {
  const inputValue = document.getElementById('inputText').value;

  // Words are whitespace-separated runs; blank input counts as zero words
  const trimmed = inputValue.trim();
  let words = 0;
  if (trimmed) {
    words = trimmed.split(/\s+/).length;
  }

  document.getElementById('charCount').textContent = inputValue.length;
  document.getElementById('wordCount').textContent = words;

  if (currentTokenizer) {
    await tokenize(inputValue);
    return;
  }
  showToast('Please select a model first', 'error');
}
/**
 * Encodes `text` with the active tokenizer, stores one record per token in
 * `currentTokens`, updates the token-count and ratio stats, and re-renders
 * the active visualization. Errors are logged and shown as a toast.
 *
 * Empty text clears the previous result, so emptying the input no longer
 * leaves stale tokens and stats on screen.
 *
 * @param {string} text Text to tokenize.
 * @returns {Promise<void>}
 */
async function tokenize(text) {
  if (!currentTokenizer) return;

  if (!text) {
    currentTokens = [];
    document.getElementById('tokenCount').textContent = '-';
    document.getElementById('ratio').textContent = '-';
    renderVisualization();
    return;
  }

  try {
    const encoding = currentTokenizer.encode(text, { return_offsets_mapping: true });
    // encode() may hand back either an object carrying input_ids or the id list itself.
    const tokenIds = encoding.input_ids || encoding;

    const specialTokens = currentTokenizer.special_tokens || [];
    // id -> token string map used when decode() fails; built at most once per call.
    let reverseVocab = null;

    const tokens = [];
    for (let i = 0; i < tokenIds.length; i++) {
      const id = tokenIds[i];
      let tokenText = '';
      try {
        // Decode the id on its own to get that token's text
        tokenText = currentTokenizer.decode([id], { skip_special_tokens: false });
      } catch (e) {
        if (reverseVocab === null) {
          const vocab = currentTokenizer.vocab || {};
          reverseVocab = Object.fromEntries(
            Object.entries(vocab).map(([k, v]) => [v, k])
          );
        }
        tokenText = reverseVocab[id] || `[${id}]`;
      }

      // Special when the text contains a declared special token or the id is
      // one of the tokenizer's bos/eos/pad/unk ids.
      const isSpecial = specialTokens.some(st => tokenText.includes(st)) ||
        id === currentTokenizer.bos_token_id ||
        id === currentTokenizer.eos_token_id ||
        id === currentTokenizer.pad_token_id ||
        id === currentTokenizer.unk_token_id;

      tokens.push({
        id: id,
        text: tokenText,
        isSpecial: isSpecial,
        index: i
      });
    }
    currentTokens = tokens;

    document.getElementById('tokenCount').textContent = tokens.length;
    // Guard the division: an encoder may return no tokens for some inputs.
    document.getElementById('ratio').textContent = tokens.length > 0
      ? (text.length / tokens.length).toFixed(2)
      : '-';

    renderVisualization();
  } catch (err) {
    console.error('Tokenization error:', err);
    showToast(`Tokenization failed: ${err.message}`, 'error');
  }
}
// ============================================
// VISUALIZATION RENDERING
// ============================================
/**
 * Renders the view for `activeTab` into #vizContent, or the empty-state
 * placeholder when there are no tokens to show.
 */
function renderVisualization() {
  const container = document.getElementById('vizContent');

  if (!currentTokens.length) {
    container.innerHTML = `
      <div class="empty-state">
        <div class="empty-state-icon">🔍</div>
        <div class="empty-state-text">Enter text above to visualize tokenization</div>
      </div>
    `;
    return;
  }

  switch (activeTab) {
    case 'tokens':
      renderTokensView(container);
      break;
    case 'bytes':
      renderBytesView(container);
      break;
    case 'ids':
      renderIdsView(container);
      break;
    case 'compare':
      // renderCompareView is async: report a failure instead of leaving an
      // unhandled promise rejection.
      renderCompareView(container).catch(err => {
        console.error('Compare view failed:', err);
        showToast(`Comparison failed: ${err.message}`, 'error');
      });
      break;
  }
}
/**
 * Renders `currentTokens` as coloured chips, each with its id underneath and
 * a hover tooltip showing id, raw text, index and (if applicable) a
 * "Special Token" marker.
 *
 * @param {HTMLElement} container Element whose content is replaced.
 */
function renderTokensView(container) {
  const html = currentTokens.map((token, idx) => {
    // Cycle through the 14 .tc-N colour classes
    const colorClass = `tc-${idx % 14}`;
    const specialClass = token.isSpecial ? 'special-token' : '';
    // Make whitespace visible: spaces become a middle dot and newlines a
    // literal "\n" (replacing a newline with itself had no effect).
    const displayText = escapeHtml(token.text).replace(/ /g, '·').replace(/\n/g, '\\n');
    const specialRow = token.isSpecial
      ? '<div class="tooltip-row"><span class="tooltip-label">Type:</span><span class="tooltip-value" style="color: var(--accent)">Special Token</span></div>'
      : '';
    return `
      <div class="token ${specialClass}">
        <div class="token-tooltip">
          <div class="tooltip-row">
            <span class="tooltip-label">ID:</span>
            <span class="tooltip-value">${token.id}</span>
          </div>
          <div class="tooltip-row">
            <span class="tooltip-label">Text:</span>
            <span class="tooltip-value">"${escapeHtml(token.text)}"</span>
          </div>
          <div class="tooltip-row">
            <span class="tooltip-label">Index:</span>
            <span class="tooltip-value">${token.index}</span>
          </div>
          ${specialRow}
        </div>
        <div class="token-box ${colorClass}">${displayText || '·'}</div>
        <div class="token-id">${token.id}</div>
      </div>
    `;
  }).join('');
  container.innerHTML = `<div class="tokens-container">${html}</div>`;
}
/**
 * Renders, for every token, the UTF-8 bytes of its decoded text: one row per
 * byte with the printable character, the hex value and the decimal value.
 * The bytes are those of `token.text` as decoded, not of the original input.
 *
 * @param {HTMLElement} container Element whose content is replaced.
 */
function renderBytesView(container) {
  const encoder = new TextEncoder();

  const blocks = currentTokens.map((token, tIdx) => {
    const colorClass = `tc-${tIdx % 14}`;
    // Array.from with a mapper yields a plain array of strings;
    // Uint8Array.prototype.map would coerce the rows back to numbers.
    const rows = Array.from(encoder.encode(token.text), (byte, i) => {
      // Only printable ASCII is shown as-is; everything else gets a placeholder dot
      const char = byte >= 32 && byte < 127 ? String.fromCharCode(byte) : '·';
      return `
        <div class="byte-row">
          <span class="byte-char">${escapeHtml(char)}</span>
          <span class="byte-hex">0x${byte.toString(16).padStart(2, '0')}</span>
          <span class="byte-dec" style="width: 3rem; text-align: center; color: var(--text-muted); font-size: 0.8rem;">${byte}</span>
          <span class="byte-token ${colorClass}" style="font-size: 0.75rem; padding: 0.1rem 0.3rem;">Byte ${i}</span>
        </div>
      `;
    }).join('');

    return `<div style="margin-bottom: 0.5rem; padding: 0.5rem; background: var(--bg-secondary); border-radius: 8px;">`
      + `<div style="font-size: 0.8rem; color: var(--text-muted); margin-bottom: 0.25rem;">Token ${tIdx}: "${escapeHtml(token.text)}" (ID: ${token.id})</div>`
      + rows
      + '</div>';
  });

  container.innerHTML = `<div class="byte-view">${blocks.join('')}</div>`;
}
/**
 * Renders the token ids as chips, as a bracketed list, and with two copy
 * buttons (spaced array form and compact JSON form).
 *
 * @param {HTMLElement} container Element whose content is replaced.
 */
function renderIdsView(container) {
  const ids = currentTokens.map(t => t.id);
  const arrayText = `[${ids.join(', ')}]`;
  const jsonText = JSON.stringify(ids);

  const chips = ids.map((id, idx) => `
    <div class="id-chip" title="Index: ${idx}">${id}</div>
  `).join('');

  // Both payloads are passed as quoted strings. Unquoted, the JSON text would
  // be evaluated as an array literal, so the handler received an array rather
  // than text. The ids are numbers, so the payloads contain no quote characters.
  container.innerHTML = `
    <div style="margin-bottom: 1rem;">
      <button type="button" class="copy-btn" onclick="copyToClipboard('${arrayText}')">
        <span>📋</span> Copy Array
      </button>
      <button type="button" class="copy-btn" onclick="copyToClipboard('${jsonText}')" style="margin-left: 0.5rem;">
        <span>📋</span> Copy JSON
      </button>
    </div>
    <div class="id-list">${chips}</div>
    <div style="margin-top: 1rem; padding: 1rem; background: var(--bg-secondary); border-radius: 8px; font-family: 'JetBrains Mono', monospace; font-size: 0.85rem; color: var(--text-secondary); overflow-x: auto;">
      ${arrayText}
    </div>
  `;
}
/**
 * Renders the active model's tokens next to the tokenization of the same
 * text by up to three other models taken from the first four presets.
 * Comparison tokenizers are loaded on first use and cached in
 * `compareTokenizers`. A model that fails to load or encode gets a
 * "Failed to load" card.
 *
 * Loading is asynchronous, so the result is written only if this is still
 * the latest call, the Compare tab is still active and the input text has
 * not changed in the meantime.
 *
 * @param {HTMLElement} container Element whose content is replaced.
 * @returns {Promise<void>}
 */
async function renderCompareView(container) {
  // Sequence number kept on the function object: lets a slow call detect
  // that a newer one has started since.
  const runId = (renderCompareView.latestRun = (renderCompareView.latestRun || 0) + 1);

  // Compact chip markup shared by every card; spaces are shown as a middle dot.
  const chipsHtml = texts => texts.map((t, i) => `
    <div class="token">
      <div class="token-box tc-${i % 14}" style="padding: 0.2rem 0.4rem; font-size: 0.75rem;">${escapeHtml(t).replace(/ /g, '·') || '·'}</div>
    </div>
  `).join('');

  const compareModels = PRESET_MODELS.slice(0, 4).filter(m => m.id !== currentModel);
  const text = document.getElementById('inputText').value;

  let html = '<div class="comparison-grid">';

  // Card for the active model. Its name may come from a user-typed id, so escape it.
  html += `
    <div class="comparison-card" style="border-color: var(--accent);">
      <div class="comparison-header">
        <span class="comparison-name" style="color: var(--accent);">⭐ ${escapeHtml(String(getModelName(currentModel)))}</span>
        <span class="comparison-count">${currentTokens.length} tokens</span>
      </div>
      <div class="tokens-container" style="font-size: 0.75rem;">
        ${chipsHtml(currentTokens.map(t => t.text))}
      </div>
    </div>
  `;

  for (const model of compareModels) {
    try {
      if (!compareTokenizers[model.id]) {
        // See selectModel(): the CDN build is an ES module, so fall back to a
        // dynamic import when no `window.transformers` global is present.
        const { AutoTokenizer } = window.transformers ||
          await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.0');
        compareTokenizers[model.id] = await AutoTokenizer.from_pretrained(model.id);
      }
      const tok = compareTokenizers[model.id];
      const encoding = tok.encode(text);
      const ids = encoding.input_ids || encoding;

      const otherTokens = [];
      for (const id of ids) {
        let txt = '';
        try {
          txt = tok.decode([id], { skip_special_tokens: false });
        } catch (e) {
          txt = `[${id}]`;
        }
        otherTokens.push(txt);
      }

      html += `
        <div class="comparison-card">
          <div class="comparison-header">
            <span class="comparison-name">${escapeHtml(model.name)}</span>
            <span class="comparison-count">${otherTokens.length} tokens</span>
          </div>
          <div class="tokens-container" style="font-size: 0.75rem;">
            ${chipsHtml(otherTokens)}
          </div>
        </div>
      `;
    } catch (e) {
      html += `
        <div class="comparison-card">
          <div class="comparison-header">
            <span class="comparison-name">${escapeHtml(model.name)}</span>
          </div>
          <div style="color: var(--text-muted); font-size: 0.85rem;">Failed to load</div>
        </div>
      `;
    }
  }
  html += '</div>';

  // Drop the result if it is stale: a newer call started, the user left the
  // Compare tab, or the text was edited while tokenizers were loading.
  const stale = runId !== renderCompareView.latestRun
    || activeTab !== 'compare'
    || document.getElementById('inputText').value !== text;
  if (stale) return;

  container.innerHTML = html;
}
// ============================================
// UI HELPERS
// ============================================
// Makes `tab` the active visualization tab and re-renders the visualization.
//   tab - identifier stored in the shared `activeTab` variable.
// The highlighted tab is the `.viz-tab` element that contains the click target,
// so clicks on an element nested inside a tab still highlight the tab itself.
// When no triggering event is available (programmatic call) the highlight is
// left untouched and the view is still re-rendered.
function switchTab(tab) {
    activeTab = tab;
    const clickedTab = window.event?.target?.closest?.('.viz-tab');
    if (clickedTab) {
        document.querySelectorAll('.viz-tab').forEach(el => {
            el.classList.toggle('active', el === clickedTab);
        });
    }
    renderVisualization();
}
// Puts one of the built-in sample texts into the input box and calls
// handleInput().
//   type - one of: 'simple', 'code', 'multilingual', 'math', 'emoji'.
// An unknown type is ignored (with a console warning) rather than writing the
// string "undefined" into the input.
function loadExample(type) {
    // NOTE(review): the non-ASCII samples were restored from mis-encoded text;
    // a few emoji had lost bytes and are best-effort — confirm the intended set.
    const examples = {
        simple: "Hello world! This is a simple example of how tokenization works.",
        code: "function fibonacci(n) {\n if (n <= 1) return n;\n return fibonacci(n - 1) + fibonacci(n - 2);\n}",
        multilingual: "Hello 世界 🌍! Bonjour le monde! ¡Hola mundo! こんにちは世界!",
        math: "The equation $E = mc^2$ shows that energy equals mass times the speed of light squared. ∫(x² + 3x)dx",
        emoji: "🎉🎊 Party time! 🥳🎂🎈🎁🎄🎃🦃🎅🤶🧑‍🎄🎆🎇✨🎀🎋🎍🎎🎏🎐🎑🧧🎀🎁🎗️🎟️🎫🎖️🏆🏅🥇🥈🥉"
    };
    const sample = examples[type];
    // The typeof check also rejects inherited keys such as 'constructor'.
    if (typeof sample !== 'string') {
        console.warn(`loadExample: unknown example type "${type}"`);
        return;
    }
    document.getElementById('inputText').value = sample;
    handleInput();
}
// Empties the input box, then calls handleInput().
function clearText() {
    const inputBox = document.getElementById('inputText');
    inputBox.value = '';
    handleInput();
}
// Copies the current token ids to the clipboard as "[id, id, ...]".
// Does nothing when `currentTokens` is empty.
function copyTokenIds() {
    if (currentTokens.length) {
        const joined = currentTokens.map(({ id }) => id).join(', ');
        copyToClipboard('[' + joined + ']');
    }
}
// Copies `text` to the clipboard and reports the outcome with a toast.
//   text - value to copy.
// Tries the async Clipboard API first. If that is unavailable or rejects, it
// falls back to a temporary textarea with document.execCommand('copy').
// A success toast is shown only when one of the two paths actually copied;
// otherwise an error toast is shown.
async function copyToClipboard(text) {
    try {
        await navigator.clipboard.writeText(text);
        showToast('Copied to clipboard!', 'success');
        return;
    } catch (err) {
        // Clipboard API missing or denied: continue with the fallback below.
    }

    const textarea = document.createElement('textarea');
    textarea.value = text;
    // Keep the helper element from scrolling the page or opening a keyboard.
    textarea.setAttribute('readonly', '');
    textarea.style.position = 'fixed';
    textarea.style.opacity = '0';

    let copied = false;
    try {
        document.body.appendChild(textarea);
        textarea.select();
        // execCommand returns false when the copy was not performed.
        copied = document.execCommand('copy');
    } catch (fallbackErr) {
        copied = false;
    } finally {
        // Always remove the helper element, even if the copy threw.
        textarea.remove();
    }

    if (copied) {
        showToast('Copied to clipboard!', 'success');
    } else {
        showToast('Failed to copy to clipboard', 'error');
    }
}
// Returns the display name for a model id.
//   modelId - model identifier string, e.g. "org/model".
// A PRESET_MODELS entry with a matching `id` supplies its `name`; otherwise
// the text after the last '/' of the id is returned.
function getModelName(modelId) {
    for (const candidate of PRESET_MODELS) {
        if (candidate.id === modelId) {
            return candidate.name;
        }
    }
    const segments = modelId.split('/');
    return segments[segments.length - 1];
}
// Escapes `text` for safe insertion into HTML markup.
//   text - value to escape; null and undefined yield '', other values are
//          converted with String().
// Returns the string with & < > " ' replaced by entities, and U+00A0 written
// as &nbsp;. Escaping the quotes makes the result safe inside attribute values
// as well as element content. No DOM node is created, which matters because
// this runs once per rendered token.
function escapeHtml(text) {
    if (text === null || text === undefined) return '';
    const entities = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
        '\u00A0': '&nbsp;'
    };
    return String(text).replace(/[&<>"'\u00A0]/g, ch => entities[ch]);
}
// ============================================
// LOADING & TOAST
// ============================================
// Shows or hides the loading overlay.
//   show - truthy to display the overlay, falsy to hide it.
//   text - message for the overlay's .loading-text element; an empty value
//          falls back to 'Loading...'. Only used when showing.
function showLoading(show, text = '') {
    const overlay = document.getElementById('loadingOverlay');
    if (show) {
        overlay.querySelector('.loading-text').textContent = text || 'Loading...';
    }
    // The overlay is visible exactly when it lacks the 'hidden' class.
    overlay.classList.toggle('hidden', !show);
}
// Shows a toast notification for three seconds.
//   message - text placed in the toast.
//   type    - 'success' for the success icon and border; any other value
//             (default 'error') uses the warning icon and error border.
// A toast shown while an earlier one is still visible restarts the countdown,
// so the earlier timer cannot hide the newer message early.
function showToast(message, type = 'error') {
    const toast = document.getElementById('toast');
    const msgEl = document.getElementById('toastMessage');
    const iconEl = toast.querySelector('.toast-icon');
    const isSuccess = type === 'success';

    msgEl.textContent = message;
    iconEl.textContent = isSuccess ? '✅' : '⚠️';
    toast.style.borderColor = isSuccess ? 'var(--success)' : 'var(--error)';
    toast.classList.add('show');

    // The pending hide timer is kept on the function object; clearTimeout is a
    // no-op when there is none.
    clearTimeout(showToast.hideTimer);
    showToast.hideTimer = setTimeout(() => toast.classList.remove('show'), 3000);
}
// ============================================
// START
// ============================================
// Run init() once the document has been parsed.
document.addEventListener('DOMContentLoaded', init);
// Handle Enter key in custom model input.
// 'keydown' is used because 'keypress' is deprecated; the isComposing check
// keeps an Enter that only confirms an IME composition from loading a model.
document.getElementById('customModelInput')?.addEventListener('keydown', (e) => {
    if (e.key === 'Enter' && !e.isComposing) loadCustomModel();
});
</script>
</body>
</html>