# Scraped file-viewer header (not part of the program):
#   Author: Reza2kn
#   Commit abc8904 (verified): "Update Space copy for PhaseB FP16 ONNX"
#   File size: 7.75 kB
from pathlib import Path
import json
import gradio as gr
# Directory containing this script; the bundled asset files are read relative to it.
_THIS_FILE = Path(__file__)
ROOT = _THIS_FILE.parent
# Stylesheet passed to gr.Blocks(css=...). Every rule except the container
# width override is scoped under #visualears-root.
_PAGE_CSS = """
.gradio-container { max-width: 1180px !important; }
#visualears-root {
font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
color: #111827;
}
#visualears-root .topbar {
display: flex;
align-items: center;
justify-content: space-between;
gap: 16px;
margin-bottom: 16px;
}
#visualears-root h1 {
font-size: 28px;
line-height: 1.15;
margin: 0;
letter-spacing: 0;
}
#visualears-root .subtle {
color: #4b5563;
margin: 6px 0 0;
font-size: 14px;
}
#visualears-root .controls {
display: grid;
grid-template-columns: repeat(4, minmax(0, 1fr));
gap: 10px;
align-items: end;
margin: 14px 0;
}
#visualears-root button, #visualears-root select, #visualears-root input {
width: 100%;
min-height: 40px;
border: 1px solid #cbd5e1;
border-radius: 7px;
background: white;
color: #111827;
font: inherit;
padding: 8px 10px;
}
#visualears-root button.primary {
background: #111827;
color: white;
border-color: #111827;
}
#visualears-root button:disabled {
opacity: 0.45;
cursor: not-allowed;
}
#visualears-root label {
display: block;
font-size: 12px;
color: #475569;
margin-bottom: 5px;
}
#visualears-root .status {
min-height: 42px;
overflow-wrap: anywhere;
border: 1px solid #d1d5db;
background: #f8fafc;
border-radius: 7px;
padding: 10px 12px;
margin: 10px 0 14px;
}
#visualears-root .transcript {
min-height: 150px;
direction: rtl;
text-align: right;
font-size: 26px;
line-height: 1.65;
border: 1px solid #cbd5e1;
border-radius: 8px;
padding: 18px;
background: #ffffff;
overflow-wrap: anywhere;
white-space: pre-wrap;
}
#visualears-root .stats {
display: grid;
grid-template-columns: repeat(4, minmax(0, 1fr));
gap: 10px;
margin-top: 14px;
}
#visualears-root .stat {
border: 1px solid #d1d5db;
border-radius: 8px;
padding: 12px;
background: #ffffff;
min-height: 74px;
}
#visualears-root .stat b {
display: block;
font-size: 12px;
color: #64748b;
margin-bottom: 8px;
font-weight: 600;
}
#visualears-root .stat span {
font-size: 20px;
color: #111827;
overflow-wrap: anywhere;
}
#visualears-root .meter {
height: 8px;
background: #e5e7eb;
overflow: hidden;
border-radius: 999px;
margin-top: 8px;
}
#visualears-root .meter > div {
height: 100%;
width: 0%;
background: #10b981;
transition: width 120ms linear;
}
@media (max-width: 820px) {
#visualears-root .topbar { align-items: flex-start; flex-direction: column; }
#visualears-root .controls, #visualears-root .stats { grid-template-columns: 1fr 1fr; }
#visualears-root .transcript { font-size: 22px; }
}
@media (max-width: 520px) {
#visualears-root .controls, #visualears-root .stats { grid-template-columns: 1fr; }
}
"""

# Static markup rendered through gr.HTML. The element ids are presumably what
# app.js binds to -- that script is not visible here, so confirm before renaming.
# NOTE: .transcript uses `white-space: pre-wrap`, so the line breaks inside that
# element are rendered literally.
_PAGE_HTML = """
<div id="visualears-root">
<div class="topbar">
<div>
<h1>VisualEars PhaseB Persian ASR FP16</h1>
<p class="subtle">Browser-only WebGPU decode with the PhaseB FP16 ONNX export.</p>
</div>
</div>
<div class="controls">
<button id="load-model" class="primary">Load WebGPU model</button>
<button id="start-mic" disabled>Start mic</button>
<button id="stop-mic" disabled>Stop</button>
<div>
<label for="decode-every">Decode every</label>
<select id="decode-every">
<option value="1.5">1.5 s</option>
<option value="2.5" selected>2.5 s</option>
<option value="4">4 s</option>
</select>
</div>
<div>
<label for="window-seconds">Max utterance</label>
<select id="window-seconds">
<option value="8">8 s</option>
<option value="12" selected>12 s</option>
<option value="18">18 s</option>
<option value="20">20 s</option>
</select>
</div>
<div>
<label for="provider">Provider</label>
<select id="provider">
<option value="webgpu" selected>WebGPU</option>
<option value="wasm">WASM CPU fallback</option>
</select>
</div>
<div>
<label for="noise-gate">Speech gate</label>
<input id="noise-gate" type="range" min="0" max="0.04" value="0.006" step="0.001" />
</div>
<div>
<label>Input level</label>
<div class="meter"><div id="level-bar"></div></div>
</div>
</div>
<div id="status" class="status">Idle. Load the PhaseB FP16 model first; the first load downloads about 232 MB and then uses browser cache.</div>
<div id="transcript" class="transcript">Final:
...
Partial:
...</div>
<div class="stats">
<div class="stat"><b>Decode Time</b><span id="stat-decode">-</span></div>
<div class="stat"><b>RTF</b><span id="stat-rtf">-</span></div>
<div class="stat"><b>Realtime Speed</b><span id="stat-speed">-</span></div>
<div class="stat"><b>Audio Window</b><span id="stat-audio">-</span></div>
<div class="stat"><b>Feature Frames</b><span id="stat-frames">-</span></div>
<div class="stat"><b>Provider</b><span id="stat-provider">-</span></div>
<div class="stat"><b>Heap Used</b><span id="stat-heap">-</span></div>
<div class="stat"><b>GPU Adapter</b><span id="stat-gpu">-</span></div>
<div class="stat"><b>Last Decode</b><span id="stat-last">-</span></div>
</div>
</div>
"""


def _script_safe_json(value, **dumps_kwargs) -> str:
    """Serialize *value* to JSON that is safe inside an inline ``<script>``.

    ``json.dumps`` leaves ``<`` untouched, so a string value containing
    ``</script`` or ``<!--`` could terminate or corrupt the enclosing script
    element. In JSON output ``<`` can only occur inside string literals, so
    rewriting it as the ``\\u003c`` escape yields the same value once the
    browser parses the JavaScript.

    Args:
        value: Any JSON-serializable object.
        **dumps_kwargs: Forwarded unchanged to ``json.dumps``.

    Returns:
        The JSON text with every ``<`` replaced by ``\\u003c``.
    """
    return json.dumps(value, **dumps_kwargs).replace("<", "\\u003c")


def _build_head(tokens, mel_filters, app_js: str) -> str:
    """Return the ``<script type="module">`` element injected into the page head.

    The script publishes the two data tables as ``window.VISUALEARS_TOKENS``
    and ``window.VISUALEARS_MEL_FILTERS`` and then inlines *app_js* verbatim.
    """
    tokens_js = _script_safe_json(tokens, ensure_ascii=False)
    # Compact separators keep the inlined filter table as small as possible.
    filters_js = _script_safe_json(mel_filters, separators=(",", ":"))
    return (
        "<script type=\"module\">\n"
        f"window.VISUALEARS_TOKENS = {tokens_js};\n"
        f"window.VISUALEARS_MEL_FILTERS = {filters_js};\n"
        f"{app_js}\n"
        "</script>"
    )


def main() -> gr.Blocks:
    """Build the Gradio Blocks page for the browser-side ASR demo.

    Reads ``tokens.json``, ``mel_filters_slaney_80x257.json`` and ``app.js``
    from ``ROOT`` (all decoded as UTF-8), inlines them into a module script
    passed as the page ``head``, and renders the static control/transcript
    markup in a single ``gr.HTML`` component. No Python-side event handlers
    are registered in this function.

    Returns:
        The constructed, not yet launched, ``gr.Blocks`` instance.

    Raises:
        FileNotFoundError: If any of the three asset files is missing.
        json.JSONDecodeError: If either JSON asset is malformed.
    """
    tokens = json.loads((ROOT / "tokens.json").read_text(encoding="utf-8"))
    mel_filters = json.loads(
        (ROOT / "mel_filters_slaney_80x257.json").read_text(encoding="utf-8")
    )
    app_js = (ROOT / "app.js").read_text(encoding="utf-8")

    with gr.Blocks(
        title="VisualEars WebGPU ASR",
        head=_build_head(tokens, mel_filters, app_js),
        css=_PAGE_CSS,
    ) as demo:
        gr.HTML(_PAGE_HTML)
    return demo
if __name__ == "__main__":
    # Script entry point: build the page, then start the Gradio server.
    demo = main()
    demo.launch()