| from pathlib import Path |
| import json |
|
|
| import gradio as gr |
|
|
|
|
# Directory that contains this module; the bundled assets (tokens.json,
# mel_filters_slaney_80x257.json, app.js) are resolved relative to it.
ROOT: Path = Path(__file__).parent
|
|
|
|
# Stylesheet handed to gr.Blocks(css=...). Apart from the Gradio container
# width override, every selector is scoped under #visualears-root.
_PAGE_CSS = """
.gradio-container { max-width: 1180px !important; }
#visualears-root {
font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
color: #111827;
}
#visualears-root .topbar {
display: flex;
align-items: center;
justify-content: space-between;
gap: 16px;
margin-bottom: 16px;
}
#visualears-root h1 {
font-size: 28px;
line-height: 1.15;
margin: 0;
letter-spacing: 0;
}
#visualears-root .subtle {
color: #4b5563;
margin: 6px 0 0;
font-size: 14px;
}
#visualears-root .controls {
display: grid;
grid-template-columns: repeat(4, minmax(0, 1fr));
gap: 10px;
align-items: end;
margin: 14px 0;
}
#visualears-root button, #visualears-root select, #visualears-root input {
width: 100%;
min-height: 40px;
border: 1px solid #cbd5e1;
border-radius: 7px;
background: white;
color: #111827;
font: inherit;
padding: 8px 10px;
}
#visualears-root button.primary {
background: #111827;
color: white;
border-color: #111827;
}
#visualears-root button:disabled {
opacity: 0.45;
cursor: not-allowed;
}
#visualears-root label {
display: block;
font-size: 12px;
color: #475569;
margin-bottom: 5px;
}
#visualears-root .status {
min-height: 42px;
overflow-wrap: anywhere;
border: 1px solid #d1d5db;
background: #f8fafc;
border-radius: 7px;
padding: 10px 12px;
margin: 10px 0 14px;
}
#visualears-root .transcript {
min-height: 150px;
direction: rtl;
text-align: right;
font-size: 26px;
line-height: 1.65;
border: 1px solid #cbd5e1;
border-radius: 8px;
padding: 18px;
background: #ffffff;
overflow-wrap: anywhere;
white-space: pre-wrap;
}
#visualears-root .stats {
display: grid;
grid-template-columns: repeat(4, minmax(0, 1fr));
gap: 10px;
margin-top: 14px;
}
#visualears-root .stat {
border: 1px solid #d1d5db;
border-radius: 8px;
padding: 12px;
background: #ffffff;
min-height: 74px;
}
#visualears-root .stat b {
display: block;
font-size: 12px;
color: #64748b;
margin-bottom: 8px;
font-weight: 600;
}
#visualears-root .stat span {
font-size: 20px;
color: #111827;
overflow-wrap: anywhere;
}
#visualears-root .meter {
height: 8px;
background: #e5e7eb;
overflow: hidden;
border-radius: 999px;
margin-top: 8px;
}
#visualears-root .meter > div {
height: 100%;
width: 0%;
background: #10b981;
transition: width 120ms linear;
}
@media (max-width: 820px) {
#visualears-root .topbar { align-items: flex-start; flex-direction: column; }
#visualears-root .controls, #visualears-root .stats { grid-template-columns: 1fr 1fr; }
#visualears-root .transcript { font-size: 22px; }
}
@media (max-width: 520px) {
#visualears-root .controls, #visualears-root .stats { grid-template-columns: 1fr; }
}
"""

# Static markup rendered through gr.HTML. The transcript <div> is styled with
# `white-space: pre-wrap`, so the line breaks inside it are significant and
# must not be re-indented.
# NOTE(review): the element ids are presumably looked up by app.js - confirm
# against that file before renaming any of them.
_PAGE_HTML = """
<div id="visualears-root">
<div class="topbar">
<div>
<h1>VisualEars PhaseB Persian ASR FP16</h1>
<p class="subtle">Browser-only WebGPU decode with the PhaseB FP16 ONNX export.</p>
</div>
</div>

<div class="controls">
<button id="load-model" class="primary">Load WebGPU model</button>
<button id="start-mic" disabled>Start mic</button>
<button id="stop-mic" disabled>Stop</button>
<div>
<label for="decode-every">Decode every</label>
<select id="decode-every">
<option value="1.5">1.5 s</option>
<option value="2.5" selected>2.5 s</option>
<option value="4">4 s</option>
</select>
</div>
<div>
<label for="window-seconds">Max utterance</label>
<select id="window-seconds">
<option value="8">8 s</option>
<option value="12" selected>12 s</option>
<option value="18">18 s</option>
<option value="20">20 s</option>
</select>
</div>
<div>
<label for="provider">Provider</label>
<select id="provider">
<option value="webgpu" selected>WebGPU</option>
<option value="wasm">WASM CPU fallback</option>
</select>
</div>
<div>
<label for="noise-gate">Speech gate</label>
<input id="noise-gate" type="range" min="0" max="0.04" value="0.006" step="0.001" />
</div>
<div>
<label>Input level</label>
<div class="meter"><div id="level-bar"></div></div>
</div>
</div>

<div id="status" class="status">Idle. Load the PhaseB FP16 model first; the first load downloads about 232 MB and then uses browser cache.</div>
<div id="transcript" class="transcript">Final:
...

Partial:
...</div>

<div class="stats">
<div class="stat"><b>Decode Time</b><span id="stat-decode">-</span></div>
<div class="stat"><b>RTF</b><span id="stat-rtf">-</span></div>
<div class="stat"><b>Realtime Speed</b><span id="stat-speed">-</span></div>
<div class="stat"><b>Audio Window</b><span id="stat-audio">-</span></div>
<div class="stat"><b>Feature Frames</b><span id="stat-frames">-</span></div>
<div class="stat"><b>Provider</b><span id="stat-provider">-</span></div>
<div class="stat"><b>Heap Used</b><span id="stat-heap">-</span></div>
<div class="stat"><b>GPU Adapter</b><span id="stat-gpu">-</span></div>
<div class="stat"><b>Last Decode</b><span id="stat-last">-</span></div>
</div>
</div>
"""


def _read_asset(name: str) -> str:
    """Return the UTF-8 text of the file called *name* that sits next to this module."""
    return (ROOT / name).read_text(encoding="utf-8")


def _script_safe_json(value, *, ensure_ascii: bool = True, separators=None) -> str:
    """Serialize *value* to JSON that can be inlined inside a ``<script>`` element.

    ``json.dumps`` does not escape ``<``, so a string value containing
    ``</script>`` or ``<!--`` would terminate or corrupt the surrounding
    script element when the page is parsed. In JSON output ``<`` can only
    occur inside string literals, so rewriting it as the ``\\u003c`` escape
    keeps the decoded value identical while hiding it from the HTML parser.
    """
    serialized = json.dumps(value, ensure_ascii=ensure_ascii, separators=separators)
    return serialized.replace("<", "\\u003c")


def main() -> gr.Blocks:
    """Build the VisualEars demo page as a Gradio ``Blocks`` app.

    Reads ``tokens.json``, ``mel_filters_slaney_80x257.json`` and ``app.js``
    from ``ROOT`` and injects them into the page ``<head>`` as one module
    script that assigns ``window.VISUALEARS_TOKENS`` and
    ``window.VISUALEARS_MEL_FILTERS`` before the contents of ``app.js``.

    Returns:
        The constructed ``gr.Blocks`` instance. It is not launched here.

    Raises:
        OSError: If one of the asset files cannot be read.
        json.JSONDecodeError: If one of the JSON assets is not valid JSON.
    """
    tokens = json.loads(_read_asset("tokens.json"))
    mel_filters = json.loads(_read_asset("mel_filters_slaney_80x257.json"))
    # NOTE: app.js is inlined verbatim. A literal "</script>" inside it would
    # still end the element early; JavaScript source cannot be escaped
    # mechanically, so that file has to avoid the sequence itself.
    app_js = _read_asset("app.js")

    # Tokens keep non-ASCII characters as-is; the filter bank is re-serialized
    # without whitespace between items.
    tokens_json = _script_safe_json(tokens, ensure_ascii=False)
    mel_filters_json = _script_safe_json(mel_filters, separators=(",", ":"))

    head = "\n".join(
        (
            '<script type="module">',
            f"window.VISUALEARS_TOKENS = {tokens_json};",
            f"window.VISUALEARS_MEL_FILTERS = {mel_filters_json};",
            app_js,
            "</script>",
        )
    )

    with gr.Blocks(
        title="VisualEars WebGPU ASR",
        head=head,
        css=_PAGE_CSS,
    ) as demo:
        gr.HTML(_PAGE_HTML)

    return demo
|
|
|
|
if __name__ == "__main__":
    # Executed as a script: build the Blocks app, then launch it.
    demo = main()
    demo.launch()
|
|