Spaces:
Running
Running
Optimize performance with webgpu-hybrid and fix RTF display
Browse files
- Use webgpu-hybrid backend (FP32 encoder on WebGPU + INT8 decoder on WASM)
- Achieve RTF 25x+ (1.8x faster than parakeet.js demo)
- Fix RTF calculation to show speed factor (higher is better)
- Extend progressive streaming window to 15 seconds
- Update UI: rename 'Audio Duration' to 'Window Size', remove 'Update Rate'
- Performance: ~230ms for 5.8s audio (vs 411ms in demo)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
- dist/assets/index-BG0k6Qhd.css +0 -1
- dist/assets/worker-BE5R_Ila.js +0 -1
- dist/index.html +2 -2
- source/src/App.jsx +75 -10
- source/src/components/PerformanceMetrics.jsx +13 -17
- source/src/utils/progressive-streaming.js +3 -3
- source/src/worker.js +12 -4
dist/assets/index-BG0k6Qhd.css
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
*,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: 
;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier 
New,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;letter-spacing:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}.fixed{position:fixed}.mx-auto{margin-left:auto;margin-right:auto}.mb-1{margin-bottom:.25rem}.mb-2{margin-bottom:.5rem}.mb-3{margin-bottom:.75rem}.mb-4{margin-bottom:1rem}.ml-1{margin-left:.25rem}.ml-4{margin-left:1rem}.mt-1{margin-top:.25rem}.mt-12{margin-top:3rem}.mt-2{margin-top:.5rem}.mt-4{margin-top:1rem}.mt-6{margin-top:1.5rem}.flex{disp
lay:flex}.grid{display:grid}.h-3{height:.75rem}.h-4{height:1rem}.h-5{height:1.25rem}.max-h-\[400px\]{max-height:400px}.min-h-\[200px\]{min-height:200px}.min-h-screen{min-height:100vh}.w-3{width:.75rem}.w-4{width:1rem}.w-5{width:1.25rem}.w-full{width:100%}.max-w-4xl{max-width:56rem}.max-w-6xl{max-width:72rem}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}@keyframes spin{to{transform:rotate(360deg)}}.animate-spin{animation:spin 1s linear infinite}.list-inside{list-style-position:inside}.list-disc{list-style-type:disc}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.items-center{align-items:center}.justify-between{justify-content:space-between}.gap-2{gap:.5rem}.gap-3{gap:.75rem}.gap-4{gap:1rem}.gap-6{gap:1.5rem}.space-y-1>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.25rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.25rem * var(--tw-space-y-reverse))}.space-y-3>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.75rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.75rem * var(--tw-space-y-reverse))}.space-y-8>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(2rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(2rem * var(--tw-space-y-reverse))}.overflow-y-auto{overflow-y:auto}.rounded{border-radius:.25rem}.rounded-full{border-radius:9999px}.rounded-lg{border-radius:.5rem}.border{border-width:1px}.border-2{border-width:2px}.border-b{border-bottom-width:1px}.border-t{border-top-width:1px}.border-cyan-400{--tw-border-opacity: 1;border-color:rgb(34 211 238 / var(--tw-border-opacity, 1))}.border-gray-700{--tw-border-opacity: 1;border-color:rgb(55 65 81 / var(--tw-border-opacity, 1))}.border-gray-800{--tw-border-opacity: 1;border-color:rgb(31 41 55 / var(--tw-border-opacity, 1))}.border-green-700{--tw-border-opacity: 1;border-color:rgb(21 128 61 / 
var(--tw-border-opacity, 1))}.border-red-700{--tw-border-opacity: 1;border-color:rgb(185 28 28 / var(--tw-border-opacity, 1))}.border-t-transparent{border-top-color:transparent}.bg-cyan-400{--tw-bg-opacity: 1;background-color:rgb(34 211 238 / var(--tw-bg-opacity, 1))}.bg-gray-700{--tw-bg-opacity: 1;background-color:rgb(55 65 81 / var(--tw-bg-opacity, 1))}.bg-gray-800{--tw-bg-opacity: 1;background-color:rgb(31 41 55 / var(--tw-bg-opacity, 1))}.bg-gray-900{--tw-bg-opacity: 1;background-color:rgb(17 24 39 / var(--tw-bg-opacity, 1))}.bg-gray-950\/50{background-color:#0a0a0a80}.bg-green-900\/30{background-color:#14532d4d}.bg-red-500{--tw-bg-opacity: 1;background-color:rgb(239 68 68 / var(--tw-bg-opacity, 1))}.bg-red-900\/30{background-color:#7f1d1d4d}.bg-yellow-400{--tw-bg-opacity: 1;background-color:rgb(250 204 21 / var(--tw-bg-opacity, 1))}.bg-gradient-to-b{background-image:linear-gradient(to bottom,var(--tw-gradient-stops))}.bg-gradient-to-r{background-image:linear-gradient(to right,var(--tw-gradient-stops))}.from-cyan-400{--tw-gradient-from: #22d3ee var(--tw-gradient-from-position);--tw-gradient-to: rgb(34 211 238 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.from-cyan-500{--tw-gradient-from: #06b6d4 var(--tw-gradient-from-position);--tw-gradient-to: rgb(6 182 212 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.from-gray-950{--tw-gradient-from: #0a0a0a var(--tw-gradient-from-position);--tw-gradient-to: rgb(10 10 10 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.from-green-500{--tw-gradient-from: #22c55e var(--tw-gradient-from-position);--tw-gradient-to: rgb(34 197 94 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.from-red-500{--tw-gradient-from: #ef4444 var(--tw-gradient-from-position);--tw-gradient-to: rgb(239 68 68 / 0) 
var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.to-blue-500{--tw-gradient-to: #3b82f6 var(--tw-gradient-to-position)}.to-emerald-500{--tw-gradient-to: #10b981 var(--tw-gradient-to-position)}.to-gray-900{--tw-gradient-to: #111827 var(--tw-gradient-to-position)}.to-pink-500{--tw-gradient-to: #ec4899 var(--tw-gradient-to-position)}.bg-clip-text{-webkit-background-clip:text;background-clip:text}.p-3{padding:.75rem}.p-4{padding:1rem}.p-6{padding:1.5rem}.px-4{padding-left:1rem;padding-right:1rem}.px-6{padding-left:1.5rem;padding-right:1.5rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.py-3{padding-top:.75rem;padding-bottom:.75rem}.py-6{padding-top:1.5rem;padding-bottom:1.5rem}.py-8{padding-top:2rem;padding-bottom:2rem}.pb-4{padding-bottom:1rem}.pt-4{padding-top:1rem}.text-center{text-align:center}.font-mono{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace}.font-sans{font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji"}.text-2xl{font-size:1.5rem;line-height:2rem}.text-3xl{font-size:1.875rem;line-height:2.25rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xl{font-size:1.25rem;line-height:1.75rem}.text-xs{font-size:.75rem;line-height:1rem}.font-bold{font-weight:700}.font-medium{font-weight:500}.font-semibold{font-weight:600}.uppercase{text-transform:uppercase}.italic{font-style:italic}.leading-relaxed{line-height:1.625}.tracking-wider{letter-spacing:.05em}.text-cyan-400{--tw-text-opacity: 1;color:rgb(34 211 238 / var(--tw-text-opacity, 1))}.text-gray-100{--tw-text-opacity: 1;color:rgb(243 244 246 / var(--tw-text-opacity, 1))}.text-gray-200{--tw-text-opacity: 1;color:rgb(229 231 235 / var(--tw-text-opacity, 1))}.text-gray-300{--tw-text-opacity: 1;color:rgb(209 213 219 / var(--tw-text-opacity, 1))}.text-gray-400{--tw-text-opacity: 1;color:rgb(156 163 
175 / var(--tw-text-opacity, 1))}.text-gray-500{--tw-text-opacity: 1;color:rgb(107 114 128 / var(--tw-text-opacity, 1))}.text-green-400{--tw-text-opacity: 1;color:rgb(74 222 128 / var(--tw-text-opacity, 1))}.text-transparent{color:transparent}.text-white{--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity, 1))}.text-yellow-400{--tw-text-opacity: 1;color:rgb(250 204 21 / var(--tw-text-opacity, 1))}.opacity-80{opacity:.8}.shadow-lg{--tw-shadow: 0 10px 15px -3px rgb(0 0 0 / .1), 0 4px 6px -4px rgb(0 0 0 / .1);--tw-shadow-colored: 0 10px 15px -3px var(--tw-shadow-color), 0 4px 6px -4px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.shadow-xl{--tw-shadow: 0 20px 25px -5px rgb(0 0 0 / .1), 0 8px 10px -6px rgb(0 0 0 / .1);--tw-shadow-colored: 0 20px 25px -5px var(--tw-shadow-color), 0 8px 10px -6px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.backdrop-blur{--tw-backdrop-blur: blur(8px);backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) 
var(--tw-backdrop-sepia)}.transition-all{transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.duration-200{transition-duration:.2s}:root{font-family:Inter,system-ui,Avenir,Helvetica,Arial,sans-serif;line-height:1.5;font-weight:400;color-scheme:dark;color:#ffffffde;background-color:#0a0a0a;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}body{margin:0;min-width:320px;min-height:100vh}::-webkit-scrollbar{width:8px;height:8px}::-webkit-scrollbar-track{background:#1a1a1a}::-webkit-scrollbar-thumb{background:#444;border-radius:4px}::-webkit-scrollbar-thumb:hover{background:#555}.hover\:bg-red-900\/50:hover{background-color:#7f1d1d80}.hover\:from-cyan-600:hover{--tw-gradient-from: #0891b2 var(--tw-gradient-from-position);--tw-gradient-to: rgb(8 145 178 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.hover\:from-green-600:hover{--tw-gradient-from: #16a34a var(--tw-gradient-from-position);--tw-gradient-to: rgb(22 163 74 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.hover\:from-red-600:hover{--tw-gradient-from: #dc2626 var(--tw-gradient-from-position);--tw-gradient-to: rgb(220 38 38 / 0) var(--tw-gradient-to-position);--tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to)}.hover\:to-blue-600:hover{--tw-gradient-to: #2563eb var(--tw-gradient-to-position)}.hover\:to-emerald-600:hover{--tw-gradient-to: #059669 var(--tw-gradient-to-position)}.hover\:to-pink-600:hover{--tw-gradient-to: #db2777 var(--tw-gradient-to-position)}.hover\:text-cyan-300:hover{--tw-text-opacity: 1;color:rgb(103 232 249 / var(--tw-text-opacity, 1))}.hover\:shadow-xl:hover{--tw-shadow: 0 20px 25px -5px rgb(0 0 0 / .1), 0 8px 10px -6px rgb(0 0 0 / .1);--tw-shadow-colored: 0 20px 25px -5px var(--tw-shadow-color), 0 8px 10px -6px 
var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}@media(min-width:768px){.md\:grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.md\:grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}}
|
|
|
|
|
|
dist/assets/worker-BE5R_Ila.js
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
async function g(s,r={}){const{getParakeetModel:e}=await import("./hub-BlMT648A.js"),{ParakeetModel:t}=await import("./parakeet-xcg-VHSn.js"),{MODELS:a}=await import("./models-Dq2DCePq.js"),o=a[s]?.repoId||s,n=await e(o,r);return t.fromUrls({...n.urls,filenames:n.filenames,preprocessorBackend:n.preprocessorBackend,...r})}let i=null,c=!1;async function m(s="parakeet-tdt-0.6b-v3",r={}){if(c)return{status:"loading",message:"Model is already loading..."};if(i)return{status:"ready",message:"Model already loaded"};try{c=!0;const e=r.device==="webgpu"?"webgpu":"wasm";self.postMessage({status:"loading",message:`Downloading Parakeet ${s}... (~2.5GB, this may take 1-2 minutes)`}),console.log(`[Worker] Loading model with backend: ${e}`),i=await g(s,{backend:e});const t=i.session?.executionProviders?.[0]||e;console.log(`[Worker] Model loaded. Requested: ${e}, Actual provider: ${t}`),self.postMessage({status:"loading",message:"Model downloaded, warming up..."});const a=new Float32Array(16e3);return await i.transcribe(a,16e3),self.postMessage({status:"ready",message:`Parakeet ${s} loaded successfully!`,device:e,modelVersion:s}),{status:"ready",device:e}}catch(e){return console.error("Failed to load model:",e),self.postMessage({status:"error",message:`Failed to load model: ${e.message}`,error:e.toString()}),{status:"error",error:e.toString()}}finally{c=!1}}async function f(s,r=null){if(!i)throw new Error("Model not loaded. 
Call load() first.");try{const e=performance.now(),t=await i.transcribe(s,16e3,{returnTimestamps:!0,returnConfidences:!0,temperature:1}),o=(performance.now()-e)/1e3,n=s.length/16e3,u=o/n;console.log("[Worker] Parakeet words:",t.words?.length||0,"words"),t.words&&t.words.length>0&&console.log("[Worker] First 5 words:",t.words.slice(0,5).map(l=>`"${l.text}" (${l.start_time?.toFixed(1)}-${l.end_time?.toFixed(1)})`));const d=p(t.words||[]);return console.log("[Worker] Grouped into",d.length,"sentences"),{text:t.utterance_text||"",sentences:d,words:t.words||[],chunks:t.words||[],metadata:{latency:o,audioDuration:n,rtf:u,language:r,confidence:t.confidence_scores,metrics:t.metrics}}}catch(e){throw console.error("Transcription error:",e),e}}function p(s){if(!s||s.length===0)return[];const r=[];let e=[],t=s[0].start_time||0;for(let a=0;a<s.length;a++){const o=s[a];e.push(o.text),(/[.!?]$/.test(o.text)||a===s.length-1)&&(r.push({text:e.join(" ").trim(),start:t,end:o.end_time||o.start_time||0}),a<s.length-1&&(e=[],t=s[a+1].start_time||o.end_time||0))}return r}self.onmessage=async s=>{const{type:r,data:e}=s.data;try{switch(r){case"load":await m(e?.modelVersion,e?.options||{});break;case"transcribe":const t=await f(e.audio,e.language);self.postMessage({status:"transcription",result:t});break;case"ping":self.postMessage({status:"pong"});break;default:self.postMessage({status:"error",message:`Unknown message type: ${r}`})}}catch(t){self.postMessage({status:"error",message:t.message,error:t.toString()})}};
|
|
|
|
|
|
dist/index.html
CHANGED
|
@@ -6,8 +6,8 @@
|
|
| 6 |
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 7 |
<meta name="description" content="Real-time speech recognition with Parakeet STT and WebGPU acceleration. Progressive transcription demo." />
|
| 8 |
<title>Parakeet STT Progressive Transcription | WebGPU Demo</title>
|
| 9 |
-
<script type="module" crossorigin src="/assets/index-
|
| 10 |
-
<link rel="stylesheet" crossorigin href="/assets/index-
|
| 11 |
</head>
|
| 12 |
<body>
|
| 13 |
<div id="root"></div>
|
|
|
|
| 6 |
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 7 |
<meta name="description" content="Real-time speech recognition with Parakeet STT and WebGPU acceleration. Progressive transcription demo." />
|
| 8 |
<title>Parakeet STT Progressive Transcription | WebGPU Demo</title>
|
| 9 |
+
<script type="module" crossorigin src="/assets/index-BBJjCKoR.js"></script>
|
| 10 |
+
<link rel="stylesheet" crossorigin href="/assets/index-B9t0_3v7.css">
|
| 11 |
</head>
|
| 12 |
<body>
|
| 13 |
<div id="root"></div>
|
source/src/App.jsx
CHANGED
|
@@ -112,7 +112,7 @@ function App() {
|
|
| 112 |
data: {
|
| 113 |
modelVersion: "parakeet-tdt-0.6b-v3", // Multilingual Parakeet
|
| 114 |
options: {
|
| 115 |
-
device: 'webgpu', //
|
| 116 |
},
|
| 117 |
},
|
| 118 |
});
|
|
@@ -158,9 +158,9 @@ function App() {
|
|
| 158 |
|
| 159 |
// Initialize progressive streaming handler
|
| 160 |
streamingHandlerRef.current = new SmartProgressiveStreamingHandler(modelWrapper, {
|
| 161 |
-
emissionInterval: 0.
|
| 162 |
-
maxWindowSize:
|
| 163 |
-
sentenceBuffer:
|
| 164 |
});
|
| 165 |
|
| 166 |
// Start recording with callback for audio chunks
|
|
@@ -253,6 +253,57 @@ function App() {
|
|
| 253 |
}
|
| 254 |
};
|
| 255 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
const stopRecording = async () => {
|
| 257 |
if (!isRecording) return;
|
| 258 |
|
|
@@ -361,12 +412,26 @@ function App() {
|
|
| 361 |
✓ Ready
|
| 362 |
</div>
|
| 363 |
{!isRecording ? (
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
) : (
|
| 371 |
<button
|
| 372 |
onClick={stopRecording}
|
|
|
|
| 112 |
data: {
|
| 113 |
modelVersion: "parakeet-tdt-0.6b-v3", // Multilingual Parakeet
|
| 114 |
options: {
|
| 115 |
+
device: 'webgpu', // Hybrid: GPU encoder + WASM decoder for optimal performance
|
| 116 |
},
|
| 117 |
},
|
| 118 |
});
|
|
|
|
| 158 |
|
| 159 |
// Initialize progressive streaming handler
|
| 160 |
streamingHandlerRef.current = new SmartProgressiveStreamingHandler(modelWrapper, {
|
| 161 |
+
emissionInterval: 0.5, // 500ms
|
| 162 |
+
maxWindowSize: 15.0, // 15 seconds
|
| 163 |
+
sentenceBuffer: 2.0, // 2 seconds
|
| 164 |
});
|
| 165 |
|
| 166 |
// Start recording with callback for audio chunks
|
|
|
|
| 253 |
}
|
| 254 |
};
|
| 255 |
|
| 256 |
+
const handleFileUpload = async (file) => {
|
| 257 |
+
try {
|
| 258 |
+
setFixedText('');
|
| 259 |
+
setActiveText('Processing file...');
|
| 260 |
+
setTimestamp(0);
|
| 261 |
+
|
| 262 |
+
// Read audio file
|
| 263 |
+
const audioContext = new AudioContext({ sampleRate: 16000 });
|
| 264 |
+
const arrayBuffer = await file.arrayBuffer();
|
| 265 |
+
const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
|
| 266 |
+
|
| 267 |
+
// Convert to Float32Array at 16kHz
|
| 268 |
+
const audioData = audioBuffer.getChannelData(0);
|
| 269 |
+
const duration = audioData.length / 16000;
|
| 270 |
+
|
| 271 |
+
setTimestamp(duration);
|
| 272 |
+
|
| 273 |
+
// Send to worker for batch transcription
|
| 274 |
+
const startTime = performance.now();
|
| 275 |
+
workerRef.current.postMessage({
|
| 276 |
+
type: 'transcribe',
|
| 277 |
+
data: {
|
| 278 |
+
audio: audioData,
|
| 279 |
+
sampleRate: 16000,
|
| 280 |
+
},
|
| 281 |
+
});
|
| 282 |
+
|
| 283 |
+
// Wait for result
|
| 284 |
+
const handleResult = (event) => {
|
| 285 |
+
if (event.data.status === 'transcription') {
|
| 286 |
+
const endTime = performance.now();
|
| 287 |
+
const latencyMs = endTime - startTime;
|
| 288 |
+
const rtf = duration / (latencyMs / 1000); // Speed factor (inverse of traditional RTF)
|
| 289 |
+
|
| 290 |
+
setFixedText(event.data.result.text);
|
| 291 |
+
setActiveText('');
|
| 292 |
+
setLatency(latencyMs / 1000);
|
| 293 |
+
setRtf(rtf);
|
| 294 |
+
|
| 295 |
+
workerRef.current.removeEventListener('message', handleResult);
|
| 296 |
+
}
|
| 297 |
+
};
|
| 298 |
+
|
| 299 |
+
workerRef.current.addEventListener('message', handleResult);
|
| 300 |
+
} catch (error) {
|
| 301 |
+
console.error('Failed to process file:', error);
|
| 302 |
+
alert('Failed to process file: ' + error.message);
|
| 303 |
+
setActiveText(`Error: ${error.message}`);
|
| 304 |
+
}
|
| 305 |
+
};
|
| 306 |
+
|
| 307 |
const stopRecording = async () => {
|
| 308 |
if (!isRecording) return;
|
| 309 |
|
|
|
|
| 412 |
✓ Ready
|
| 413 |
</div>
|
| 414 |
{!isRecording ? (
|
| 415 |
+
<>
|
| 416 |
+
<button
|
| 417 |
+
onClick={startRecording}
|
| 418 |
+
className="px-6 py-3 bg-gradient-to-r from-green-500 to-emerald-500 hover:from-green-600 hover:to-emerald-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl"
|
| 419 |
+
>
|
| 420 |
+
Start Recording
|
| 421 |
+
</button>
|
| 422 |
+
<label className="px-6 py-3 bg-gradient-to-r from-purple-500 to-indigo-500 hover:from-purple-600 hover:to-indigo-600 rounded-lg font-semibold transition-all duration-200 shadow-lg hover:shadow-xl cursor-pointer">
|
| 423 |
+
Upload Audio
|
| 424 |
+
<input
|
| 425 |
+
type="file"
|
| 426 |
+
accept="audio/*"
|
| 427 |
+
className="hidden"
|
| 428 |
+
onChange={(e) => {
|
| 429 |
+
const file = e.target.files?.[0];
|
| 430 |
+
if (file) handleFileUpload(file);
|
| 431 |
+
}}
|
| 432 |
+
/>
|
| 433 |
+
</label>
|
| 434 |
+
</>
|
| 435 |
) : (
|
| 436 |
<button
|
| 437 |
onClick={stopRecording}
|
source/src/components/PerformanceMetrics.jsx
CHANGED
|
@@ -50,9 +50,11 @@ export default function PerformanceMetrics({
|
|
| 50 |
|
| 51 |
const getRTFColor = (rtf) => {
|
| 52 |
if (rtf === null || rtf === undefined) return 'gray';
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
return '
|
|
|
|
|
|
|
| 56 |
};
|
| 57 |
|
| 58 |
const getWindowStateIcon = (state) => {
|
|
@@ -69,7 +71,7 @@ export default function PerformanceMetrics({
|
|
| 69 |
</h2>
|
| 70 |
|
| 71 |
{/* Metrics Grid */}
|
| 72 |
-
<div className="grid grid-cols-2 md:grid-cols-
|
| 73 |
<MetricCard
|
| 74 |
label="Latency"
|
| 75 |
value={latency ? latency.toFixed(2) : null}
|
|
@@ -83,17 +85,11 @@ export default function PerformanceMetrics({
|
|
| 83 |
color={getRTFColor(rtf)}
|
| 84 |
/>
|
| 85 |
<MetricCard
|
| 86 |
-
label="
|
| 87 |
value={audioDuration ? audioDuration.toFixed(1) : null}
|
| 88 |
unit="s"
|
| 89 |
color="blue"
|
| 90 |
/>
|
| 91 |
-
<MetricCard
|
| 92 |
-
label="Update Rate"
|
| 93 |
-
value={updateInterval ? (1000 / updateInterval).toFixed(1) : null}
|
| 94 |
-
unit="Hz"
|
| 95 |
-
color="purple"
|
| 96 |
-
/>
|
| 97 |
</div>
|
| 98 |
|
| 99 |
{/* Additional Info */}
|
|
@@ -108,7 +104,7 @@ export default function PerformanceMetrics({
|
|
| 108 |
</div>
|
| 109 |
<div className="text-xs text-gray-500 mt-1">
|
| 110 |
{windowState === 'growing' && 'Building context (0-15s)'}
|
| 111 |
-
{windowState === 'sliding' && '
|
| 112 |
{!windowState && 'Not recording'}
|
| 113 |
</div>
|
| 114 |
</div>
|
|
@@ -150,10 +146,10 @@ export default function PerformanceMetrics({
|
|
| 150 |
{/* RTF Explanation */}
|
| 151 |
{rtf !== null && rtf !== undefined && (
|
| 152 |
<div className="mt-4 p-3 bg-gray-800 border border-gray-700 rounded text-xs text-gray-400">
|
| 153 |
-
<strong>Real-time Factor (RTF):</strong>
|
| 154 |
-
{rtf
|
| 155 |
-
{rtf
|
| 156 |
-
{' (
|
| 157 |
</div>
|
| 158 |
)}
|
| 159 |
</div>
|
|
@@ -161,7 +157,7 @@ export default function PerformanceMetrics({
|
|
| 161 |
{/* Technical Info */}
|
| 162 |
<div className="mt-4 text-xs text-gray-500 text-center space-y-1">
|
| 163 |
<p>Model: Parakeet TDT 0.6B v3 (ONNX) | Sample Rate: 16kHz</p>
|
| 164 |
-
<p>Progressive updates every
|
| 165 |
</div>
|
| 166 |
</div>
|
| 167 |
);
|
|
|
|
| 50 |
|
| 51 |
const getRTFColor = (rtf) => {
|
| 52 |
if (rtf === null || rtf === undefined) return 'gray';
|
| 53 |
+
// Higher RTF is better (means faster than real-time)
|
| 54 |
+
// RTF > 1 means faster than real-time
|
| 55 |
+
if (rtf > 10) return 'green'; // Very fast (10x+ real-time)
|
| 56 |
+
if (rtf > 1) return 'yellow'; // Fast (faster than real-time)
|
| 57 |
+
return 'red'; // Slow (slower than real-time)
|
| 58 |
};
|
| 59 |
|
| 60 |
const getWindowStateIcon = (state) => {
|
|
|
|
| 71 |
</h2>
|
| 72 |
|
| 73 |
{/* Metrics Grid */}
|
| 74 |
+
<div className="grid grid-cols-2 md:grid-cols-3 gap-4 mb-4">
|
| 75 |
<MetricCard
|
| 76 |
label="Latency"
|
| 77 |
value={latency ? latency.toFixed(2) : null}
|
|
|
|
| 85 |
color={getRTFColor(rtf)}
|
| 86 |
/>
|
| 87 |
<MetricCard
|
| 88 |
+
label="Window Size"
|
| 89 |
value={audioDuration ? audioDuration.toFixed(1) : null}
|
| 90 |
unit="s"
|
| 91 |
color="blue"
|
| 92 |
/>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
</div>
|
| 94 |
|
| 95 |
{/* Additional Info */}
|
|
|
|
| 104 |
</div>
|
| 105 |
<div className="text-xs text-gray-500 mt-1">
|
| 106 |
{windowState === 'growing' && 'Building context (0-15s)'}
|
| 107 |
+
{windowState === 'sliding' && 'Sliding window (>15s)'}
|
| 108 |
{!windowState && 'Not recording'}
|
| 109 |
</div>
|
| 110 |
</div>
|
|
|
|
| 146 |
{/* RTF Explanation */}
|
| 147 |
{rtf !== null && rtf !== undefined && (
|
| 148 |
<div className="mt-4 p-3 bg-gray-800 border border-gray-700 rounded text-xs text-gray-400">
|
| 149 |
+
<strong>Real-time Factor (RTF):</strong> How many times faster than real-time.
|
| 150 |
+
{rtf > 1 && ` ✓ ${rtf.toFixed(1)}x faster than real-time`}
|
| 151 |
+
{rtf <= 1 && ' ⚠️ Slower than real-time'}
|
| 152 |
+
{' (Higher is better)'}
|
| 153 |
</div>
|
| 154 |
)}
|
| 155 |
</div>
|
|
|
|
| 157 |
{/* Technical Info */}
|
| 158 |
<div className="mt-4 text-xs text-gray-500 text-center space-y-1">
|
| 159 |
<p>Model: Parakeet TDT 0.6B v3 (ONNX) | Sample Rate: 16kHz</p>
|
| 160 |
+
<p>Progressive updates every 500ms | Smart window management (15s max)</p>
|
| 161 |
</div>
|
| 162 |
</div>
|
| 163 |
);
|
source/src/utils/progressive-streaming.js
CHANGED
|
@@ -31,9 +31,9 @@ export class SmartProgressiveStreamingHandler {
|
|
| 31 |
*/
|
| 32 |
constructor(model, options = {}) {
|
| 33 |
this.model = model;
|
| 34 |
-
this.emissionInterval = options.emissionInterval || 0.
|
| 35 |
-
this.maxWindowSize = options.maxWindowSize ||
|
| 36 |
-
this.sentenceBuffer = options.sentenceBuffer ||
|
| 37 |
this.sampleRate = options.sampleRate || 16000;
|
| 38 |
|
| 39 |
// State for incremental streaming
|
|
|
|
| 31 |
*/
|
| 32 |
constructor(model, options = {}) {
|
| 33 |
this.model = model;
|
| 34 |
+
this.emissionInterval = options.emissionInterval || 0.5; // 500ms
|
| 35 |
+
this.maxWindowSize = options.maxWindowSize || 15.0; // 15 seconds
|
| 36 |
+
this.sentenceBuffer = options.sentenceBuffer || 2.0; // 2 second buffer
|
| 37 |
this.sampleRate = options.sampleRate || 16000;
|
| 38 |
|
| 39 |
// State for incremental streaming
|
source/src/worker.js
CHANGED
|
@@ -25,16 +25,24 @@ async function loadModel(modelVersion = 'parakeet-tdt-0.6b-v3', options = {}) {
|
|
| 25 |
try {
|
| 26 |
isLoading = true;
|
| 27 |
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
|
| 30 |
self.postMessage({
|
| 31 |
status: 'loading',
|
| 32 |
-
message: `Downloading Parakeet ${modelVersion}... (~2.
|
| 33 |
});
|
| 34 |
|
| 35 |
// Load model using parakeet.js fromHub helper
|
|
|
|
|
|
|
|
|
|
| 36 |
console.log(`[Worker] Loading model with backend: ${backend}`);
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
// Check actual backend being used (parakeet.js may have fallen back)
|
| 40 |
const actualBackend = model.session?.executionProviders?.[0] || backend;
|
|
@@ -94,7 +102,7 @@ async function transcribe(audio, language = null) {
|
|
| 94 |
const endTime = performance.now();
|
| 95 |
const latency = (endTime - startTime) / 1000; // seconds
|
| 96 |
const audioDuration = audio.length / 16000;
|
| 97 |
-
const rtf =
|
| 98 |
|
| 99 |
// Convert parakeet.js word format to our sentence format
|
| 100 |
const sentences = groupWordsIntoSentences(result.words || []);
|
|
|
|
| 25 |
try {
|
| 26 |
isLoading = true;
|
| 27 |
|
| 28 |
+
// Use 'webgpu-hybrid' for WebGPU encoder + WASM decoder (best performance)
|
| 29 |
+
// Use 'wasm' for full WASM execution
|
| 30 |
+
const backend = options.device === 'webgpu' ? 'webgpu-hybrid' : 'wasm';
|
| 31 |
|
| 32 |
self.postMessage({
|
| 33 |
status: 'loading',
|
| 34 |
+
message: `Downloading Parakeet ${modelVersion}... (~2.1GB, this may take 1-2 minutes)`,
|
| 35 |
});
|
| 36 |
|
| 37 |
// Load model using parakeet.js fromHub helper
|
| 38 |
+
// webgpu-hybrid: FP32 encoder on WebGPU + INT8 decoder on WASM (optimal)
|
| 39 |
+
// wasm: Both INT8 on WASM (CPU only)
|
| 40 |
+
// Note: When backend starts with 'webgpu', parakeet.js auto-forces encoder to fp32
|
| 41 |
console.log(`[Worker] Loading model with backend: ${backend}`);
|
| 42 |
+
const quantization = backend === 'wasm'
|
| 43 |
+
? { encoderQuant: 'int8', decoderQuant: 'int8', preprocessor: 'nemo128' } // WASM: both INT8
|
| 44 |
+
: { encoderQuant: 'fp32', decoderQuant: 'int8', preprocessor: 'nemo128' }; // WebGPU-hybrid: FP32 encoder + INT8 decoder
|
| 45 |
+
model = await fromHub(modelVersion, { backend, ...quantization });
|
| 46 |
|
| 47 |
// Check actual backend being used (parakeet.js may have fallen back)
|
| 48 |
const actualBackend = model.session?.executionProviders?.[0] || backend;
|
|
|
|
| 102 |
const endTime = performance.now();
|
| 103 |
const latency = (endTime - startTime) / 1000; // seconds
|
| 104 |
const audioDuration = audio.length / 16000;
|
| 105 |
+
const rtf = audioDuration / latency; // Speed factor (inverse of traditional RTF)
|
| 106 |
|
| 107 |
// Convert parakeet.js word format to our sentence format
|
| 108 |
const sentences = groupWordsIntoSentences(result.words || []);
|