<!--
shreyask's picture
Upload folder using huggingface_hub
7b6759d verified
-->
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Encoding is declared first so the rest of the head is decoded as UTF-8. -->
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<!-- The em dash is written as a character reference so it survives any re-encoding of the file. -->
<title>Voxtral Realtime 4B &mdash; Live Speech-to-Text</title>
<!-- Warm up connections to the Google Fonts hosts, then request the JetBrains Mono and Inter families. -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700&family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
<style>
/* Global reset: border-box sizing, no default margin or padding on any element. */
* {
  margin: 0;
  padding: 0;
  box-sizing: border-box;
}

/* Page canvas: cream background with a 40px grid drawn by two 1px gradients;
   children are stacked in a horizontally centered column. */
body {
  min-height: 100vh;
  padding: 2rem 1rem;
  display: flex;
  flex-direction: column;
  align-items: center;
  color: #1E1E1E;
  font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
  background-color: #FFFAEB;
  background-image:
    linear-gradient(#E9E2CB 1px, transparent 1px),
    linear-gradient(90deg, #E9E2CB 1px, transparent 1px);
  background-size: 40px 40px;
}

/* Links use the orange accent and are underlined only on hover. */
a {
  text-decoration: none;
  color: #FF8205;
}
a:hover {
  text-decoration: underline;
}

/* Main column: fills the available width up to 680px. */
.container {
  width: 100%;
  max-width: 680px;
}
/* ── Header card ── */
/* Gradient card with a thicker orange top edge.
   `border-top` must follow `border` so it overrides the top side only. */
.header-card {
  margin-bottom: 1.25rem;
  padding: 1.75rem 2rem;
  background: linear-gradient(135deg, #FFFAEB 0%, #FFF0C3 100%);
  border: 2px solid #E9E2CB;
  border-top: 4px solid #FF8205;
  border-radius: 8px;
  box-shadow: 0 4px 24px rgba(0,0,0,0.06);
}

/* Title row: flex layout so the accent word and the remaining text are spaced by `gap`. */
.header-title {
  display: flex;
  align-items: center;
  gap: 0.6rem;
  color: #1E1E1E;
  font-size: 1.6rem;
  font-weight: 700;
  letter-spacing: -0.02em;
}
.header-title .accent {
  color: #FF8205;
}

/* One-line description below the title. */
.header-subtitle {
  margin-top: 0.35rem;
  color: #555;
  font-size: 0.85rem;
}

/* Row of small monospace reference links; wraps on narrow screens. */
.header-links {
  display: flex;
  flex-wrap: wrap;
  align-items: center;
  gap: 0.5rem;
  margin-top: 0.6rem;
  color: #888;
  font-family: 'JetBrains Mono', monospace;
  font-size: 0.72rem;
}
/* Separator glyph between links, tinted to match the card border. */
.header-links .sep {
  color: #E9E2CB;
}
/* ── Load section ── */
/* Card holding the model configuration, the load button and the download progress. */
.load-card {
  background: rgba(255,255,255,0.6);
  border: 2px solid #E9E2CB;
  border-radius: 8px;
  padding: 2rem;
  text-align: center;
  box-shadow: 0 4px 24px rgba(0,0,0,0.04);
  margin-bottom: 1.25rem;
}
/* The card is removed from layout while it carries the `hidden` class. */
.load-card.hidden { display: none; }

/* Row containing the configuration dropdowns. */
.config-row {
  display: flex;
  gap: 0.5rem;
  justify-content: center;
  margin-bottom: 1.25rem;
}
.config-row select {
  background: #FFFAEB;
  border: 1.5px solid #E9E2CB;
  border-radius: 4px;
  padding: 0.5rem 0.75rem;
  color: #1E1E1E;
  font-size: 0.8rem;
  font-family: 'JetBrains Mono', monospace;
  cursor: pointer;
}
/* Any focus: swap the default outline for an accent-coloured border. */
.config-row select:focus {
  outline: none;
  border-color: #FF8205;
}
/* Keyboard focus: restore a clearly visible ring. Declared after the :focus rule so that,
   at equal specificity, it wins. The darker orange keeps at least 3:1 contrast on the cream page. */
.config-row select:focus-visible {
  outline: 2px solid #CC6A04;
  outline-offset: 2px;
}

/* Primary action button. */
.load-btn {
  background: #FF8205;
  border: none;
  border-radius: 4px;
  padding: 0.75rem 2.5rem;
  color: #fff;
  font-family: 'JetBrains Mono', monospace;
  font-size: 0.8rem;
  font-weight: 700;
  text-transform: uppercase;
  letter-spacing: 0.08em;
  cursor: pointer;
  transition: all 0.2s;
}
/* Hover feedback is limited to enabled buttons, so a disabled button no longer darkens under the pointer. */
.load-btn:hover:not(:disabled) { background: #E67300; }
.load-btn:disabled {
  opacity: 0.5;
  cursor: not-allowed;
}

/* Small requirement / caching note below the button. */
.load-hint {
  font-size: 0.7rem;
  color: #888;
  margin-top: 1rem;
  font-family: 'JetBrains Mono', monospace;
}

/* Progress area: kept in layout but transparent until the `visible` class is present. */
.progress-wrap {
  margin-top: 1rem;
  opacity: 0;
  transition: opacity 0.3s;
}
.progress-wrap.visible { opacity: 1; }
.progress-track {
  width: 100%;
  height: 4px;
  background: #E9E2CB;
  border-radius: 2px;
  overflow: hidden;
}
/* Bar fill starts at 0% width; width changes are animated over 0.3s. */
.progress-fill {
  height: 100%;
  width: 0%;
  border-radius: 2px;
  background: #FF8205;
  transition: width 0.3s;
}
/* Status line: min-height reserves one line so the card does not jump while the text is empty. */
.load-status {
  font-family: 'JetBrains Mono', monospace;
  font-size: 0.7rem;
  color: #888;
  margin-top: 0.4rem;
  min-height: 1.2em;
}
/* ── Transcription card ── */
/* Outer card; overflow is clipped so the header and footer follow the rounded corners. */
.transcript-card {
  margin-bottom: 0.75rem;
  overflow: hidden;
  background: #FFFAEB;
  border: 2px solid #E9E2CB;
  border-radius: 8px;
  box-shadow: 0 8px 32px rgba(0,0,0,0.06);
}

/* Header strip: title on the left, status badge on the right. */
.card-header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  padding: 0.6rem 1rem;
  background: rgba(255,255,255,0.6);
  border-bottom: 1px solid #E9E2CB;
}
.card-header-left {
  display: flex;
  align-items: center;
  gap: 0.6rem;
}
.card-title {
  color: #1E1E1E;
  font-family: 'JetBrains Mono', monospace;
  font-size: 0.6rem;
  font-weight: 700;
  letter-spacing: 0.1em;
  text-transform: uppercase;
}

/* Status badge: shared shape and typography; the per-state rules below supply the colours. */
.status-badge {
  display: inline-flex;
  align-items: center;
  gap: 0.4rem;
  padding: 0.2rem 0.6rem;
  border-radius: 2px;
  font-family: 'JetBrains Mono', monospace;
  font-size: 0.55rem;
  font-weight: 700;
  letter-spacing: 0.08em;
  text-transform: uppercase;
}
.status-idle {
  color: #888;
  background: #f0f0f0;
  border: 1px solid #E9E2CB;
}
/* Listening and transcribing share text and border colours and differ only in tint strength. */
.status-listening,
.status-transcribing {
  color: #CC6A04;
  border: 1px solid #FF8205;
}
.status-listening {
  background: rgba(255,130,5,0.15);
}
.status-transcribing {
  background: rgba(255,130,5,0.25);
}

/* Dot inside the badge; inherits the badge text colour through currentColor. */
.status-dot {
  width: 6px;
  height: 6px;
  border-radius: 50%;
  background: currentColor;
}
.status-dot.pulse {
  animation: pulse 1.2s ease-in-out infinite;
}
@keyframes pulse {
  from, to {
    opacity: 1;
    transform: scale(1);
  }
  50% {
    opacity: 0.4;
    transform: scale(0.7);
  }
}

/* Body of the card with a faint 20px grid behind the text. */
.card-content {
  min-height: 180px;
  padding: 1.25rem 1.25rem 1.5rem;
  background-image:
    linear-gradient(rgba(0,0,0,0.02) 1px, transparent 1px),
    linear-gradient(90deg, rgba(0,0,0,0.02) 1px, transparent 1px);
  background-size: 20px 20px;
}

/* Transcript text: whitespace is preserved (pre-wrap) and long words may break. */
#transcript {
  color: #1E1E1E;
  font-family: 'JetBrains Mono', monospace;
  font-size: 0.95rem;
  line-height: 1.8;
  white-space: pre-wrap;
  word-break: break-word;
}
/* Muted italic look while the element carries the `placeholder` class. */
#transcript.placeholder {
  color: #bbb;
  font-style: italic;
}

/* Block cursor that toggles visibility once per second. */
.transcript-cursor {
  display: inline-block;
  width: 8px;
  height: 16px;
  margin-left: 3px;
  vertical-align: middle;
  background: #FF8205;
  animation: blink 1s step-end infinite;
}
@keyframes blink {
  from, to {
    opacity: 1;
  }
  50% {
    opacity: 0;
  }
}

/* Footer strip: two small monospace labels pushed to opposite ends. */
.card-footer {
  display: flex;
  justify-content: space-between;
  padding: 0.4rem 1rem;
  color: #888;
  font-family: 'JetBrains Mono', monospace;
  font-size: 0.6rem;
  background: rgba(255,255,255,0.4);
  border-top: 1px solid #E9E2CB;
}
/* ── Mic + waveform ── */
/* Vertical stack: mic button above the waveform, both centered. */
.controls {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 0.75rem;
  margin-bottom: 1.25rem;
}

/* Round 88px microphone button with its icon centered. */
.mic-btn {
  display: flex;
  align-items: center;
  justify-content: center;
  width: 88px;
  height: 88px;
  cursor: pointer;
  background: rgba(255,255,255,0.7);
  border: 3px solid #E9E2CB;
  border-radius: 50%;
  box-shadow: 0 4px 16px rgba(0,0,0,0.06);
  transition: all 0.2s;
}
.mic-btn:hover {
  background: rgba(255,130,5,0.05);
  border-color: #FF8205;
}
/* `listening` state: accent colours plus an expanding ring drawn with an animated box-shadow. */
.mic-btn.listening {
  background: rgba(255,130,5,0.1);
  border-color: #FF8205;
  box-shadow: 0 0 0 0 rgba(255,130,5,0.3);
  animation: ring 2s ease-out infinite;
}
/* `disabled` class: dimmed and ignores pointer input. */
.mic-btn.disabled {
  opacity: 0.3;
  cursor: not-allowed;
  pointer-events: none;
}
@keyframes ring {
  from {
    box-shadow: 0 0 0 0 rgba(255,130,5,0.3);
  }
  to {
    box-shadow: 0 0 0 20px rgba(255,130,5,0);
  }
}

/* Icon is grey by default and orange while the button has the `listening` class. */
.mic-btn svg {
  width: 36px;
  height: 36px;
  fill: #999;
  transition: fill 0.2s;
}
.mic-btn.listening svg {
  fill: #FF8205;
}

/* Waveform strip: the canvas stretches to fill its fixed-height wrapper. */
.waveform {
  width: 100%;
  max-width: 480px;
  height: 44px;
}
.waveform canvas {
  display: block;
  width: 100%;
  height: 100%;
  background: rgba(255,255,255,0.4);
  border: 1px solid #E9E2CB;
  border-radius: 4px;
}
/* ── Footer ── */
/* Centered, low-emphasis monospace note at the bottom of the column. */
.footer {
  margin-top: 0.5rem;
  text-align: center;
  color: #aaa;
  font-family: 'JetBrains Mono', monospace;
  font-size: 0.65rem;
}
</style>
<script type="module" crossorigin src="/assets/index-BuZjYFHI.js"></script>
</head>
<body>
<div class="container">
<!-- Header -->
<div class="header-card">
  <!-- The page title is a real <h1> so the document has a heading; the class supplies all visual styling. -->
  <h1 class="header-title">
    <span class="accent">Voxtral</span> Realtime 4B
  </h1>
  <p class="header-subtitle">
    Real-time speech transcription running entirely in your browser via WebGPU
  </p>
  <!-- Reference links open in a new tab; rel="noopener" is stated explicitly.
       The dot separators are decorative and hidden from assistive technology. -->
  <div class="header-links">
    <a href="https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602" target="_blank" rel="noopener">mistralai/Voxtral-Mini-4B-Realtime-2602</a>
    <span class="sep" aria-hidden="true">&middot;</span>
    <a href="https://huggingface.co/onnx-community/Voxtral-Mini-4B-Realtime-2602-ONNX" target="_blank" rel="noopener">ONNX weights</a>
    <span class="sep" aria-hidden="true">&middot;</span>
    <a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noopener">transformers.js</a>
  </div>
</div>
<!-- Load Model -->
<div class="load-card" id="loadSection">
  <!-- Configuration dropdowns. aria-label gives each control an accessible name;
       the title tooltip alone is not reliably announced by assistive technology. -->
  <div class="config-row">
    <select id="dtype" title="Quantization" aria-label="Quantization">
      <option value="q4">q4 (~1.5 GB)</option>
      <option value="q4f16">q4f16 (~1.5 GB)</option>
      <option value="fp16">fp16 (~8 GB)</option>
    </select>
    <select id="device" title="Backend" aria-label="Backend">
      <option value="webgpu">WebGPU</option>
      <option value="wasm">WASM (CPU)</option>
    </select>
  </div>
  <!-- The click handler calls window.__loadModel, which is not defined in this file
       (presumably set by the module script loaded in the head; confirm).
       type="button" makes the non-submit intent explicit. -->
  <button class="load-btn" id="loadBtn" type="button" onclick="window.__loadModel()">Load Model</button>
  <div class="load-hint">Requires WebGPU (Chrome 113+, Edge 113+). Models are cached after first download.</div>
  <!-- Progress bar and status line; the wrapper stays transparent until it has the "visible" class. -->
  <div class="progress-wrap" id="progressWrap">
    <div class="progress-track"><div class="progress-fill" id="progressFill"></div></div>
    <div class="load-status" id="loadStatus"></div>
  </div>
</div>
<!-- Active UI -->
<!-- Starts hidden through the inline display:none. -->
<div id="activeUI" style="display:none">
  <div class="controls">
    <!-- Icon-only button: aria-label supplies the accessible name, and the decorative SVG is hidden from assistive technology. -->
    <button class="mic-btn" id="micBtn" type="button" title="Start / stop listening" aria-label="Start / stop listening">
      <svg viewBox="0 0 24 24" aria-hidden="true"><path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm-1-9c0-.55.45-1 1-1s1 .45 1 1v6c0 .55-.45 1-1 1s-1-.45-1-1V5zm6 6c0 2.76-2.24 5-5 5s-5-2.24-5-5H5c0 3.53 2.61 6.43 6 6.92V21h2v-3.08c3.39-.49 6-3.39 6-6.92h-2z"/></svg>
    </button>
    <!-- Canvas for the waveform display; nothing in this file draws to it. -->
    <div class="waveform"><canvas id="waveCanvas"></canvas></div>
  </div>
</div>
<!-- Transcript -->
<!-- Starts hidden through the inline display:none. -->
<div class="transcript-card" id="transcriptCard" style="display:none">
  <!-- Header strip: card title on the left, status badge (initially "Idle") on the right. -->
  <div class="card-header">
    <div class="card-header-left">
      <span class="card-title">Transcript</span>
    </div>
    <span class="status-badge status-idle" id="statusBadge">
      <span class="status-dot" id="statusDot"></span>
      <span id="statusText">Idle</span>
    </span>
  </div>
  <!-- #transcript is styled white-space: pre-wrap, so its text must stay on one line with no padding whitespace. -->
  <div class="card-content">
    <div id="transcript" class="placeholder">Press the mic button and start speaking...</div>
  </div>
  <!-- Footer strip: empty timing slot on the left, static model label on the right. -->
  <div class="card-footer">
    <span id="timing"></span>
    <span>voxtral-realtime-4b &middot; in-browser</span>
  </div>
</div>
<!-- Page footer note with a link to the transformers.js documentation. -->
<div class="footer">
  No data leaves your device &middot; Powered by
  <a href="https://huggingface.co/docs/transformers.js" target="_blank">transformers.js</a>
  + WebGPU
</div>
</div>
</body>
</html>