Spaces:
Running
Running
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>Soprano TTS — Browser ONNX</title> | |
| <style> | |
| /* Global reset: border-box sizing and zero margin/padding on every element and pseudo-element. */ | |
| *,*::before,*::after{box-sizing:border-box;margin:0;padding:0} | |
| /* Design tokens: colour palette and font stacks, consumed through var() by the rules below. */ | |
| :root{--bg:#0a0a0f;--surface:#13131f;--surface2:#1a1a2e;--border:#2a2a40;--text:#e4e4ef;--text2:#9898b0;--accent:#6366f1;--accent2:#818cf8;--green:#22c55e;--red:#ef4444;--yellow:#eab308;--font:'Segoe UI',system-ui,-apple-system,sans-serif;--mono:'SF Mono','Fira Code',monospace} | |
| /* Page shell: body centres a single column (.app) capped at 720px wide. */ | |
| body{background:var(--bg);color:var(--text);font-family:var(--font);min-height:100vh;display:flex;justify-content:center;padding:1.5rem} | |
| .app{max-width:720px;width:100%;display:flex;flex-direction:column;gap:1.5rem} | |
| /* Header: the h1 text is painted with a gradient by clipping the background to the glyphs. */ | |
| header{text-align:center;padding:1rem 0} | |
| header h1{font-size:1.8rem;background:linear-gradient(135deg,var(--accent),#a78bfa);-webkit-background-clip:text;-webkit-text-fill-color:transparent;font-weight:700} | |
| header p{color:var(--text2);font-size:.85rem;margin-top:.35rem} | |
| /* Card container shared by the input panel and the status/metrics panel. */ | |
| .card{background:var(--surface);border:1px solid var(--border);border-radius:12px;padding:1.25rem} | |
| /* Text input: the default outline is removed; focus is signalled by the accent border colour instead. */ | |
| textarea{width:100%;min-height:100px;background:var(--surface2);border:1px solid var(--border);border-radius:8px;color:var(--text);font-family:var(--font);font-size:.95rem;padding:.75rem;resize:vertical;outline:none;transition:border-color .2s} | |
| textarea:focus{border-color:var(--accent)} | |
| /* Row under the textarea (character counter and model label). */ | |
| .meta{display:flex;justify-content:space-between;align-items:center;margin-top:.5rem;font-size:.75rem;color:var(--text2)} | |
| /* Sample-text shortcut buttons. */ | |
| .samples{display:flex;flex-wrap:wrap;gap:.4rem;margin-top:.75rem} | |
| .samples button{background:var(--surface2);border:1px solid var(--border);color:var(--text2);border-radius:6px;padding:.3rem .6rem;font-size:.72rem;cursor:pointer;transition:all .15s} | |
| .samples button:hover{border-color:var(--accent);color:var(--text)} | |
| /* Generate / Stop buttons; hover styles apply only while the button is enabled. */ | |
| .controls{display:flex;gap:.6rem;margin-top:1rem} | |
| .btn{display:flex;align-items:center;gap:.45rem;padding:.6rem 1.2rem;border-radius:8px;font-size:.9rem;font-weight:600;cursor:pointer;border:none;transition:all .15s} | |
| .btn-primary{background:var(--accent);color:#fff} | |
| .btn-primary:hover:not(:disabled){background:var(--accent2)} | |
| .btn-primary:disabled{opacity:.45;cursor:not-allowed} | |
| .btn-stop{background:var(--surface2);color:var(--text2);border:1px solid var(--border)} | |
| .btn-stop:hover:not(:disabled){border-color:var(--red);color:var(--red)} | |
| .btn-stop:disabled{opacity:.3;cursor:not-allowed} | |
| /* Status line: the dot colour/animation is selected by a state class (loading, ready, running, error). */ | |
| .status-row{display:flex;align-items:center;gap:.5rem;font-size:.8rem} | |
| .dot{width:8px;height:8px;border-radius:50%;background:var(--text2);transition:background .3s} | |
| .dot.loading{background:var(--yellow);animation:pulse 1s infinite} | |
| .dot.ready{background:var(--green)} | |
| .dot.running{background:var(--accent);animation:pulse .7s infinite} | |
| .dot.error{background:var(--red)} | |
| @keyframes pulse{0%,100%{opacity:1}50%{opacity:.4}} | |
| /* Two-column grid of metric tiles. */ | |
| .metrics{display:grid;grid-template-columns:1fr 1fr;gap:.75rem;margin-top:1rem} | |
| .metric-card{background:var(--surface2);border-radius:8px;padding:.65rem .85rem} | |
| .metric-card label{font-size:.7rem;color:var(--text2);text-transform:uppercase;letter-spacing:.04em} | |
| .metric-card .val{font-size:1.3rem;font-weight:700;font-family:var(--mono);margin-top:.15rem} | |
| /* Progress bar: hidden by default (display:none); the inner fill starts at 0% width and animates width changes. */ | |
| .progress-outer{width:100%;height:6px;background:var(--surface2);border-radius:3px;margin-top:.75rem;overflow:hidden;display:none} | |
| .progress-inner{height:100%;background:linear-gradient(90deg,var(--accent),#a78bfa);border-radius:3px;transition:width .3s;width:0%} | |
| /* Visualizer canvas (CSS size only; the drawing-buffer size is not set here). */ | |
| canvas{width:100%;height:100px;border-radius:8px;background:var(--surface2);margin-top:.75rem} | |
| /* Log panel: hidden by default (display:none). */ | |
| .log{margin-top:.5rem;max-height:80px;overflow-y:auto;font-family:var(--mono);font-size:.7rem;color:var(--text2);line-height:1.5;padding:.5rem;background:var(--surface2);border-radius:6px;display:none} | |
| /* Footer credits. */ | |
| footer{text-align:center;font-size:.7rem;color:var(--text2);padding:.5rem 0} | |
| footer a{color:var(--accent2);text-decoration:none} | |
| </style> | |
| </head> | |
| <body> | |
| <div class="app"> | |
| <header> | |
| <h1>🎧 Soprano TTS</h1> | |
| <p>Real-time neural text-to-speech — running entirely in your browser via ONNX Runtime WASM</p> | |
| </header> | |
| <!-- Input panel: text to synthesize, sample shortcuts, Generate/Stop controls, model-load progress. --> | |
| <div class="card"> | |
| <!-- The placeholder is not an accessible name, so the textarea is named with aria-label. --> | |
| <textarea id="text-input" aria-label="Text to synthesize" placeholder="Type or paste text to synthesize..." maxlength="500">Hello, welcome to Soprano. This is a demonstration of real-time text to speech running entirely in your browser.</textarea> | |
| <div class="meta"> | |
| <span id="char-count">0</span>/500 | |
| <span id="model-label">int8 backbone + int8 decoder</span> | |
| </div> | |
| <!-- Each sample button carries its text in data-t; the script copies it into the textarea. --> | |
| <div class="samples"> | |
| <button type="button" data-t="Hello, welcome to Soprano. This is a demonstration of real-time text to speech running entirely in your browser.">Demo greeting</button> | |
| <button type="button" data-t="Wow, congratulations! That's absolutely fantastic news! I'm so thrilled for you!">Excited</button> | |
| <button type="button" data-t="I completely understand how frustrating this must be for you. Let me take care of this right away and make sure we get it resolved.">Empathetic</button> | |
| <button type="button" data-t="Great question! I'd be happy to walk you through this step by step. First, let's start with the basics.">Helpful guide</button> | |
| </div> | |
| <div class="controls"> | |
| <button class="btn btn-primary" id="btn-gen" type="button" disabled>Loading models…</button> | |
| <button class="btn btn-stop" id="btn-stop" type="button" disabled>■ Stop</button> | |
| </div> | |
| <div class="progress-outer" id="progress-bar"><div class="progress-inner" id="progress-fill"></div></div> | |
| </div> | |
| <!-- Status panel: role="status" makes status-text updates a polite live region; the dot and canvas are decorative. --> | |
| <div class="card"> | |
| <div class="status-row" role="status"> | |
| <span class="dot" id="dot" aria-hidden="true"></span> | |
| <span id="status-text">Initializing…</span> | |
| </div> | |
| <div class="metrics"> | |
| <div class="metric-card"><label>Time to First Byte</label><div class="val" id="m-ttfb">--<small style="font-size:.6em;font-weight:400"> ms</small></div></div> | |
| <div class="metric-card"><label>Real-Time Factor</label><div class="val" id="m-rtf">--<small style="font-size:.6em;font-weight:400"> ×</small></div></div> | |
| </div> | |
| <canvas id="viz" aria-hidden="true"></canvas> | |
| <div class="log" id="log"></div> | |
| </div> | |
| <!-- External links open in a new tab; rel="noopener noreferrer" keeps the opened page from reaching window.opener. --> | |
| <footer>Single-file by <a href="https://huggingface.co/Nekochu" target="_blank" rel="noopener noreferrer">Nekochu</a> · Soprano by <a href="https://github.com/ekwek1/soprano" target="_blank" rel="noopener noreferrer">ekwek1</a> · ONNX port by <a href="https://huggingface.co/KevinAHM" target="_blank" rel="noopener noreferrer">KevinAHM</a> · Apache-2.0</footer> | |
| </div> | |
| <!-- ═══════════════════════════════════════════════════════ --> | |
| <!-- INFERENCE WORKER — stored as text, loaded as blob URL --> | |
| <!-- ═══════════════════════════════════════════════════════ --> | |
| <script id="worker-src" type="text/js-worker"> | |
| /* Soprano Inference Worker — self-contained */ | |
| /* This text is not executed in place: the main-thread script reads it from the #worker-src element and starts it as a Worker from a blob URL. */ | |
| console.log('[Worker] Starting…'); | |
| self.postMessage({type:'status',status:'Worker started',state:'idle'}); | |
| /* onnxruntime-web version; used for both the script import below and the wasm asset path set in loadModels(). */ | |
| const ORT_VER = '1.20.0'; | |
| /* A failed import is only logged here; `ort` then stays undefined and the first use in loadModels() throws. */ | |
| try { importScripts('https://cdn.jsdelivr.net/npm/onnxruntime-web@'+ORT_VER+'/dist/ort.min.js'); } | |
| catch(e){ console.error('ORT load fail',e); } | |
| /* Remote locations of the two ONNX model files. */ | |
| const HF_BASE = 'https://huggingface.co/spaces/KevinAHM/soprano-web-onnx/resolve/main'; | |
| const MODELS = { | |
| backbone: HF_BASE + '/onnx/soprano_backbone_kv_int8.onnx', | |
| decoder: HF_BASE + '/onnx/soprano_decoder_int8.onnx' | |
| }; | |
| /* RECEPTIVE_FIELD and TOKEN_SIZE drive the hidden-state window and audio slice arithmetic in genLoop(); SAMPLE_RATE converts a chunk's sample count to seconds. */ | |
| const RECEPTIVE_FIELD = 4, TOKEN_SIZE = 2048, SAMPLE_RATE = 32000; | |
| /* Worker-wide state: the two inference sessions, the tokenizer, and the generating / ready flags. */ | |
| let backboneSession=null, decoderSession=null, tokenizer=null, isGenerating=false, isReady=false; | |
| /* ── Text preprocessing ─────────────────────────────────── */ | |
| const ONES=['','one','two','three','four','five','six','seven','eight','nine','ten','eleven','twelve','thirteen','fourteen','fifteen','sixteen','seventeen','eighteen','nineteen']; | |
| const TENS=['','','twenty','thirty','forty','fifty','sixty','seventy','eighty','ninety']; | |
| const ORD1=['','first','second','third','fourth','fifth','sixth','seventh','eighth','ninth','tenth','eleventh','twelfth','thirteenth','fourteenth','fifteenth','sixteenth','seventeenth','eighteenth','nineteenth']; | |
| const ORD10=['','','twentieth','thirtieth','fortieth','fiftieth','sixtieth','seventieth','eightieth','ninetieth']; | |
| function n2w(num,o={}){const{andword='',zero='zero',group=0}=o;if(num===0)return zero;const c=n=>{if(n<20)return ONES[n];if(n<100)return TENS[n/10|0]+(n%10?' '+ONES[n%10]:'');if(n<1e3)return ONES[n/100|0]+' hundred'+(n%100?(andword?' '+andword+' ':' ')+c(n%100):'');if(n<1e6){const t=n/1e3|0;return c(t)+' thousand'+(n%1e3?' '+c(n%1e3):'');}if(n<1e9){const m=n/1e6|0;return c(m)+' million'+(n%1e6?' '+c(n%1e6):'');}const b=n/1e9|0;return c(b)+' billion'+(n%1e9?' '+c(n%1e9):'');};if(group===2&&num>1e3&&num<1e4){const h=num/100|0,l=num%100;if(l===0)return c(h)+' hundred';if(l<10)return c(h)+' '+(zero==='oh'?'oh':zero)+' '+ONES[l];return c(h)+' '+c(l);}return c(num);} | |
| function o2w(num){if(num<20)return ORD1[num]||n2w(num)+'th';if(num<100){const t=num/10|0,o=num%10;if(o===0)return ORD10[t];return TENS[t]+' '+ORD1[o];}const c=n2w(num);if(c.endsWith('y'))return c.slice(0,-1)+'ieth';if(c.endsWith('one'))return c.slice(0,-3)+'first';if(c.endsWith('two'))return c.slice(0,-3)+'second';if(c.endsWith('three'))return c.slice(0,-5)+'third';if(c.endsWith('ve'))return c.slice(0,-2)+'fth';if(c.endsWith('e'))return c.slice(0,-1)+'th';if(c.endsWith('t'))return c+'h';return c+'th';} | |
| /* Replacement table applied per character by toAscii(): accented Latin letters, ligatures, and curly quotes / ellipsis / en and em dashes mapped to ASCII. */ | |
| const UMAP={'à':'a','á':'a','â':'a','ã':'a','ä':'a','å':'a','æ':'ae','ç':'c','è':'e','é':'e','ê':'e','ë':'e','ì':'i','í':'i','î':'i','ï':'i','ñ':'n','ò':'o','ó':'o','ô':'o','õ':'o','ö':'o','ø':'o','ù':'u','ú':'u','û':'u','ü':'u','ý':'y','ÿ':'y','ß':'ss','œ':'oe','ð':'d','þ':'th','À':'A','Á':'A','Â':'A','Ã':'A','Ä':'A','Å':'A','Æ':'AE','Ç':'C','È':'E','É':'E','Ê':'E','Ë':'E','Ì':'I','Í':'I','Î':'I','Ï':'I','Ñ':'N','Ò':'O','Ó':'O','Ô':'O','Õ':'O','Ö':'O','Ø':'O','Ù':'U','Ú':'U','Û':'U','Ü':'U','Ý':'Y','\u201C':'"','\u201D':'"','\u2018':"'",'\u2019':"'",'\u2026':'...','\u2013':'-','\u2014':'-'}; | |
| /* Maps each UTF-16 code unit of t through UMAP (unmapped units pass through), then NFD-normalizes and strips combining marks U+0300..U+036F. Characters with no mapping and no decomposition are returned unchanged. */ | |
| function toAscii(t){return t.split('').map(c=>UMAP[c]||c).join('').normalize('NFD').replace(/[\u0300-\u036f]/g,'');} | |
| /* Title/company abbreviations: matched case-insensitively at a word start and only when followed by a period (the period is consumed by the replacement). */ | |
| const ABBR=[[/\bmrs\./gi,'misuss'],[/\bms\./gi,'miss'],[/\bmr\./gi,'mister'],[/\bdr\./gi,'doctor'],[/\bst\./gi,'saint'],[/\bco\./gi,'company'],[/\bjr\./gi,'junior'],[/\bmaj\./gi,'major'],[/\bgen\./gi,'general'],[/\bdrs\./gi,'doctors'],[/\brev\./gi,'reverend'],[/\blt\./gi,'lieutenant'],[/\bhon\./gi,'honorable'],[/\bsgt\./gi,'sergeant'],[/\bcapt\./gi,'captain'],[/\besq\./gi,'esquire'],[/\bltd\./gi,'limited'],[/\bcol\./gi,'colonel'],[/\bft\./gi,'fort']]; | |
| /* Case-sensitive whole-word abbreviations (no "i" flag, \b on both sides), so "TTS" matches but "tts" does not. */ | |
| const CABBR=[[/\bTTS\b/g,'text to speech'],[/\bHz\b/g,'hertz'],[/\bkHz\b/g,'kilohertz'],[/\bKB\b/g,'kilobyte'],[/\bMB\b/g,'megabyte'],[/\bGB\b/g,'gigabyte'],[/\bTB\b/g,'terabyte'],[/\bAPI\b/g,'a p i'],[/\bCLI\b/g,'c l i'],[/\bCPU\b/g,'c p u'],[/\bGPU\b/g,'g p u'],[/\bAve\b/g,'avenue'],[/\betc\b/g,'etcetera']]; | |
| /* Applies every ABBR rule, then every CABBR rule, in array order, and returns the rewritten text. */ | |
| function expAbbr(t){for(const[r,s]of[...ABBR,...CABBR])t=t.replace(r,s);return t;} | |
| function normNums(t){ | |
| t=t.replace(/#(\d)/g,'number $1'); | |
| t=t.replace(/(\d)([KMBT])/gi,(_,n,s)=>{const m={k:'thousand',m:'million',b:'billion',t:'trillion'};return n+' '+m[s.toLowerCase()];}); | |
| for(let i=0;i<2;i++)t=t.replace(/(\d)([a-z])|([a-z])(\d)/gi,(m,d1,l1,l2,d2)=>{if(d1&&l1)return d1+' '+l1;if(l2&&d2)return l2+' '+d2;return m;}); | |
| t=t.replace(/(\d[\d,]+\d)/g,m=>m.replace(/,/g,'')); | |
| t=t.replace(/£([\d,]*\d+)/g,(_,a)=>a.replace(/,/g,'')+' pounds'); | |
| t=t.replace(/\$([\d.,]*\d+)/g,(_,a)=>{const p=a.replace(/,/g,'').split('.');const d=parseInt(p[0])||0;const c=p[1]?parseInt(p[1]):0;if(d&&c)return d+' '+(d===1?'dollar':'dollars')+', '+c+' '+(c===1?'cent':'cents');if(d)return d+' '+(d===1?'dollar':'dollars');if(c)return c+' '+(c===1?'cent':'cents');return'zero dollars';}); | |
| t=t.replace(/(\d)\s?\*\s?(\d)/g,'$1 times $2'); | |
| t=t.replace(/(\d)\s?\/\s?(\d)/g,'$1 over $2'); | |
| t=t.replace(/(\d)\s?\+\s?(\d)/g,'$1 plus $2'); | |
| t=t.replace(/(\d)?\s?-\s?(\d)/g,(_,a,b)=>(a?a:'')+' minus '+b); | |
| t=t.replace(/(\d+)(st|nd|rd|th)/gi,(_,n)=>o2w(parseInt(n))); | |
| t=t.replace(/\d+/g,m=>{const n=parseInt(m);if(n>1e3&&n<3e3){if(n===2e3)return'two thousand';if(n>2e3&&n<2010)return'two thousand '+n2w(n%100);if(n%100===0)return n2w(Math.floor(n/100))+' hundred';return n2w(n,{zero:'oh',group:2});}return n2w(n);}); | |
| return t; | |
| } | |
| const SPEC=[[/@/g,' at '],[/&/g,' and '],[/%/g,' percent '],[/:/g,'.'],[/;/g,','],[/\+/g,' plus '],[/\\/g,' backslash '],[/~/g,' about '],[/<=/g,' less than or equal to '],[/>=/g,' greater than or equal to '],[/</g,' less than '],[/>/g,' greater than '],[/=/g,' equals '],[/\//g,' slash '],[/_/g,' ']]; | |
| function normSpec(t){t=t.replace(/https?:\/\//gi,'h t t p s colon slash slash ');t=t.replace(/(.) - (.)/g,'$1, $2');t=t.replace(/([A-Z])\.([A-Z])/gi,'$1 dot $2');t=t.replace(/[\(\[\{][^\)\]\}]*[\)\]\}](.)?/g,(m,a)=>{let r=m.replace(/[\(\[\{]/g,', ').replace(/[\)\]\}]/g,', ');if(a&&/[$.!?,]/.test(a))r=r.slice(0,-2)+a;return r;});return t;} | |
| function expSpec(t){for(const[r,s]of SPEC)t=t.replace(r,s);return t;} | |
| function cleanText(t){t=toAscii(t);t=t.split('\n').map(l=>{l=l.trim();if(!l)return'';if(!/[.!?]$/.test(l))l+='.';return l;}).join(' ');t=normNums(t);t=normSpec(t);t=expAbbr(t);t=expSpec(t);t=t.toLowerCase();t=t.replace(/[^A-Za-z !\$%&'\*\+,\-./0123456789<>\?_]/g,'');t=t.replace(/[<>\/_+]/g,'');t=t.replace(/\s+/g,' ').replace(/ ([.\?!,])/g,'$1');t=t.replace(/\.\.\.+/g,'[E]').replace(/,+/g,',').replace(/[.,]*\.[.,]*/g,'.').replace(/[.,!]*![.,!]*/g,'!').replace(/[.,!?]*\?[.,!?]*/g,'?').replace(/\[E\]/g,'...');return t.trim();} | |
| function preprocessText(text){ | |
| text=text.trim();const ct=cleanText(text); | |
| let sents=ct.split(/(?<=[.!?])\s+/).filter(s=>s.trim()); | |
| if(!sents.length)return ct?['[STOP][TEXT]'+ct+'[START]']:[]; | |
| if(sents.length>1){const mg=[];for(let i=0;i<sents.length;i++){const c=sents[i];if(c.length<30){if(mg.length>0)mg[mg.length-1]=(mg[mg.length-1]+' '+c).trim();else if(i+1<sents.length)sents[i+1]=(c+' '+sents[i+1]).trim();else mg.push(c);}else mg.push(c);}sents=mg;} | |
| const ps=[];for(let i=0;i<sents.length;i+=3){const b=sents.slice(i,i+3).join(' ');ps.push('[STOP][TEXT]'+b+'[START]');}return ps; | |
| } | |
| /* ── Sampling ───────────────────────────────────────────── */ | |
| /* Sampling hyper-parameters read by sample(). */ | |
| const SP={temperature:.3,topK:50,topP:.95,repetitionPenalty:1.2}; | |
| /* Scratch buffers reused across calls (allocated when the top-k size changes): heap token ids, heap scores, sort order, softmax weights. */ | |
| let _tki=null,_tks=null,_tko=null,_tke=null; | |
| /* | |
|  * Draws the next token id (as a BigInt) from a logits tensor. | |
|  * logitsTensor: dims[1] is treated as the sequence length and dims[2] as the vocabulary size; only the last position is read. | |
|  * seenMask: array indexed by token id; truthy entries get the repetition penalty. | |
|  * NOTE(review): when top-k is not strictly between 0 and the vocabulary size, the function falls through and returns 0n. | |
|  */ | |
| function sample(logitsTensor,seenMask){ | |
| /* Slice out the logits of the final sequence position. */ | |
| let raw=logitsTensor.data;const vs=logitsTensor.dims[2];const off=(logitsTensor.dims[1]-1)*vs; | |
| const data=raw.subarray?raw.subarray(off):raw.slice(off); | |
| const{temperature,topK,topP,repetitionPenalty}=SP; | |
| const useRP=repetitionPenalty!==1;const invT=1/temperature;const k=Math.min(topK,vs); | |
| if(k>0&&k<vs){ | |
| if(!_tki||_tki.length!==k){_tki=new Int32Array(k);_tks=new Float32Array(k);_tke=new Float64Array(k);_tko=Array.from({length:k},(_,i)=>i);} | |
| /* Single pass over the vocabulary keeping the k best scores in a min-heap (hs = scores, hi = token ids, root = smallest kept score). */ | |
| const hi=_tki,hs=_tks;let sz=0; | |
| for(let t=0;t<vs;t++){ | |
| /* Temperature scaling first, then the repetition penalty: negative scores are multiplied, non-negative ones divided. */ | |
| let s=data[t]*invT;if(useRP&&seenMask[t])s=s<0?s*repetitionPenalty:s/repetitionPenalty; | |
| /* Heap not full yet: insert with sift-up. */ | |
| if(sz<k){let p=sz++;while(p>0){const pr=(p-1)>>1;if(hs[pr]<=s)break;hs[p]=hs[pr];hi[p]=hi[pr];p=pr;}hs[p]=s;hi[p]=t;} | |
| /* Heap full and the candidate beats the current minimum: replace the root with sift-down. */ | |
| else if(s>hs[0]){let p=0;while(p<(k>>1)){let l=(p<<1)+1,r=l+1,sm=l;if(r<k&&hs[r]<hs[l])sm=r;if(hs[sm]>=s)break;hs[p]=hs[sm];hi[p]=hi[sm];p=sm;}hs[p]=s;hi[p]=t;} | |
| } | |
| /* Order the kept candidates by descending score (od holds heap indices). */ | |
| const eb=_tke,od=_tko;for(let i=0;i<k;i++)od[i]=i;od.sort((a,b)=>hs[b]-hs[a]); | |
| /* Unnormalized softmax weights relative to the best score. */ | |
| const mx=hs[od[0]];let se=0;for(let i=0;i<k;i++){const w=Math.exp(hs[od[i]]-mx);eb[i]=w;se+=w;} | |
| /* Top-p: keep the shortest prefix whose cumulative weight reaches topP of the total. */ | |
| let kp=k;if(topP<1){const th=topP*se;let cm=0;for(let i=0;i<k;i++){cm+=eb[i];if(cm>=th){kp=i+1;break;}}} | |
| let sm2=0;for(let i=0;i<kp;i++)sm2+=eb[i]; | |
| /* Draw proportionally to weight among the kept prefix; fall back to the best candidate if the loop does not return. */ | |
| let r=Math.random()*sm2;for(let i=0;i<kp;i++){r-=eb[i];if(r<=0)return BigInt(hi[od[i]]);} | |
| return BigInt(hi[od[0]]); | |
| } | |
| return 0n; | |
| } | |
| /* ── Worker message handler ─────────────────────────────── */ | |
| self.onmessage=async(e)=>{ | |
| const{type,data}=e.data; | |
| if(type==='load'){try{await loadModels();postMessage({type:'loaded'});}catch(err){postMessage({type:'error',error:err.toString()});}} | |
| else if(type==='generate'){if(!isReady){postMessage({type:'error',error:'Models not loaded'});return;}if(isGenerating)return;try{await startGen(data.text);}catch(err){postMessage({type:'error',error:err.toString()});}} | |
| else if(type==='stop'){isGenerating=false;postMessage({type:'status',status:'Stopped',state:'idle'});} | |
| }; | |
| /* | |
|  * Loads the backbone session, the decoder session and the tokenizer, posting 'status' and 'progress' messages along the way. | |
|  * Sets isReady and posts a final 'Ready' status on success; any failure propagates to the caller as a rejection. | |
|  * NOTE(review): the early return keys on backboneSession alone, so a retry after a failure that happened later | |
|  * (decoder or tokenizer) returns immediately without finishing the load. | |
|  */ | |
| async function loadModels(){ | |
| if(backboneSession)return; | |
| postMessage({type:'status',status:'Loading ONNX Runtime…',state:'loading'}); | |
| /* Point the runtime at the same CDN build the script was imported from. */ | |
| const cdnBase='https://cdn.jsdelivr.net/npm/onnxruntime-web@'+ORT_VER+'/dist/'; | |
| ort.env.wasm.wasmPaths=cdnBase; | |
| /* Single-threaded unless the page is cross-origin isolated; otherwise up to 8 threads, bounded by hardwareConcurrency. */ | |
| if(!self.crossOriginIsolated)ort.env.wasm.numThreads=1; | |
| else if(typeof navigator!=='undefined'&&navigator.hardwareConcurrency)ort.env.wasm.numThreads=Math.min(navigator.hardwareConcurrency,8); | |
| const bbOpts={executionProviders:['wasm'],freeDimensionOverrides:{batch:1},graphOptimizationLevel:'all'}; | |
| postMessage({type:'status',status:'Downloading backbone (~81 MB)…',state:'loading'}); | |
| postMessage({type:'progress',value:5}); | |
| /* The backbone is created straight from its URL. */ | |
| backboneSession=await ort.InferenceSession.create(MODELS.backbone,bbOpts); | |
| postMessage({type:'progress',value:45}); | |
| postMessage({type:'status',status:'Downloading decoder (~31 MB)…',state:'loading'}); | |
| /* The decoder is fetched manually so that an optional "<decoder>.data" sidecar can be passed as external data. */ | |
| const decBuf=await fetch(MODELS.decoder).then(r=>{if(!r.ok)throw new Error('Decoder fetch failed: '+r.statusText);return r.arrayBuffer();}); | |
| let dataBuf=null; | |
| /* Best effort: a missing or failing sidecar request is ignored and the decoder is loaded without external data. */ | |
| try{const dr=await fetch(MODELS.decoder+'.data');if(dr.ok)dataBuf=await dr.arrayBuffer();}catch(e){} | |
| const decOpts={executionProviders:['wasm'],freeDimensionOverrides:{batch:1}}; | |
| if(dataBuf)decOpts.externalData=[{data:new Uint8Array(dataBuf),path:MODELS.decoder.split('/').pop()+'.data'}]; | |
| decoderSession=await ort.InferenceSession.create(new Uint8Array(decBuf),decOpts); | |
| postMessage({type:'progress',value:75}); | |
| postMessage({type:'status',status:'Loading tokenizer…',state:'loading'}); | |
| /* The tokenizer library is imported dynamically and restricted to remote models. */ | |
| const transformers=await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0'); | |
| const{AutoTokenizer,env}=transformers; | |
| env.allowLocalModels=false;env.allowRemoteModels=true; | |
| tokenizer=await AutoTokenizer.from_pretrained('KevinAHM/soprano-1.1-onnx'); | |
| postMessage({type:'progress',value:100}); | |
| isReady=true; | |
| postMessage({type:'status',status:'Ready',state:'idle'}); | |
| postMessage({type:'model_status',status:'ready',text:'Ready'}); | |
| } | |
| async function startGen(text){ | |
| isGenerating=true; | |
| postMessage({type:'status',status:'Generating…',state:'running'}); | |
| const prompts=preprocessText(text);let isFirst=true,cumSamples=0; | |
| for(const prompt of prompts){ | |
| if(!isGenerating)break; | |
| const{input_ids}=await tokenizer(prompt); | |
| const bs=await genLoop(input_ids.data,isFirst,cumSamples); | |
| cumSamples+=bs;isFirst=false; | |
| } | |
| if(isGenerating){postMessage({type:'stream_ended'});postMessage({type:'status',status:'Finished',state:'idle'});} | |
| isGenerating=false; | |
| } | |
| async function genLoop(promptTokens,isFirstBatch,cumSamples){ | |
| const batch=1,nLayers=17,hDim=128,pLen=promptTokens.length,vocabSize=8192,maxNew=512; | |
| const seenMask=new Uint8Array(vocabSize); | |
| for(let i=0;i<pLen;i++){const t=Number(promptTokens[i]);if(t>=0&&t<vocabSize)seenMask[t]=1;} | |
| let pkv={}; | |
| for(let i=0;i<nLayers;i++){ | |
| pkv['past_key_values.'+i+'.key']=new ort.Tensor('float32',new Float32Array(0),[batch,1,0,hDim]); | |
| pkv['past_key_values.'+i+'.value']=new ort.Tensor('float32',new Float32Array(0),[batch,1,0,hDim]); | |
| } | |
| const maxSL=pLen+maxNew;const amData=new BigInt64Array(maxSL);amData.fill(1n);let curSL=pLen; | |
| const nidD=new BigInt64Array(1),npidD=new BigInt64Array(1); | |
| const nidT=new ort.Tensor('int64',nidD,[batch,1]),npidT=new ort.Tensor('int64',npidD,[batch,1]); | |
| let cIds=new ort.Tensor('int64',BigInt64Array.from(promptTokens),[batch,pLen]); | |
| let cAM=new ort.Tensor('int64',amData.subarray(0,curSL),[batch,curSL]); | |
| let cPids=new ort.Tensor('int64',BigInt64Array.from({length:pLen},(_,i)=>BigInt(i)),[batch,pLen]); | |
| const hBuf=[];let totalSamp=0;const tgt=8;let chunkCtr=tgt,firstChunk=true; | |
| let lastDecP=Promise.resolve(),chunkBBT=0; | |
| if(isFirstBatch)postMessage({type:'generation_started',data:{time:performance.now()}}); | |
| for(let i=0;i<maxNew;i++){ | |
| if(!isGenerating)break; | |
| if(i%4===0)await new Promise(r=>setTimeout(r,0)); | |
| const inputs={input_ids:cIds,attention_mask:cAM,position_ids:cPids,...pkv}; | |
| const bbS=performance.now(); | |
| const out=await backboneSession.run(inputs); | |
| chunkBBT+=(performance.now()-bbS); | |
| const bn=backboneSession.outputNames; | |
| const logits=out[bn[0]],lhs=out[bn[bn.length-1]]; | |
| for(let j=0;j<nLayers;j++){pkv['past_key_values.'+j+'.key']=out[bn[1+j*2]];pkv['past_key_values.'+j+'.value']=out[bn[2+j*2]];} | |
| const nTok=sample(logits,seenMask);const fin=(nTok===3n);const ntn=Number(nTok); | |
| if(ntn>=0&&ntn<vocabSize)seenMask[ntn]=1; | |
| const sl=lhs.dims[1],hd=lhs.dims[2]; | |
| const ltsRaw=lhs.data.subarray((sl-1)*hd,sl*hd); | |
| const lts=new Float32Array(ltsRaw); | |
| if(i>0&&!fin){hBuf.push(new Float32Array(lts));if(hBuf.length>2*RECEPTIVE_FIELD+tgt)hBuf.splice(0,hBuf.length-(2*RECEPTIVE_FIELD+tgt));} | |
| if(fin||hBuf.length>=RECEPTIVE_FIELD+tgt){ | |
| if(fin||chunkCtr===tgt){ | |
| const win=hBuf.slice(-hBuf.length);const ws=win.length; | |
| const di=new Float32Array(512*ws);for(let w=0;w<ws;w++)for(let d=0;d<512;d++)di[d*ws+w]=win[w][d]; | |
| const isLast=fin,capCC=chunkCtr,capFC=firstChunk,capBB=chunkBBT;chunkBBT=0; | |
| lastDecP=lastDecP.then(async()=>{ | |
| const ds=performance.now(); | |
| const dOut=await decoderSession.run({[decoderSession.inputNames[0]]:new ort.Tensor('float32',di,[1,512,ws])}); | |
| const dd=performance.now()-ds;const audio=dOut[decoderSession.outputNames[0]].data; | |
| let ac; | |
| if(isLast){const si=audio.length-(RECEPTIVE_FIELD+capCC-1)*TOKEN_SIZE+TOKEN_SIZE;ac=audio.subarray(si);} | |
| else{const si=audio.length-(RECEPTIVE_FIELD+tgt)*TOKEN_SIZE+TOKEN_SIZE;const ei=audio.length-RECEPTIVE_FIELD*TOKEN_SIZE+TOKEN_SIZE;ac=audio.subarray(si,ei);} | |
| postMessage({type:'audio_chunk',data:ac,metrics:{bbTime:capBB,decTime:dd,chunkDuration:ac.length/SAMPLE_RATE,isFirst:capFC&&isFirstBatch}},[ac.buffer]); | |
| }); | |
| firstChunk=false;chunkCtr=0; | |
| } | |
| chunkCtr++; | |
| } | |
| if(fin)break; | |
| nidD[0]=nTok;cIds=nidT;curSL+=1; | |
| cAM=new ort.Tensor('int64',amData.subarray(0,curSL),[1,curSL]); | |
| npidD[0]=BigInt(curSL-1);cPids=npidT; | |
| } | |
| await lastDecP;return totalSamp; | |
| } | |
| </script> | |
| <!-- ═══════════════════════════════════════════════════════ --> | |
| <!-- MAIN THREAD --> | |
| <!-- ═══════════════════════════════════════════════════════ --> | |
| <script> | |
| const SAMPLE_RATE = 32000; | |
| /* ── Simple PCM Player (BufferSource scheduling) ─────── */ | |
| class PCMPlayer { | |
| constructor(ctx) { | |
| this.ctx = ctx; | |
| this.t = 0; | |
| this.gain = ctx.createGain(); | |
| this.gain.connect(ctx.destination); | |
| this.analyser = ctx.createAnalyser(); | |
| this.gain.connect(this.analyser); | |
| } | |
| play(f32) { | |
| if (this.ctx.state !== 'running') return; | |
| const buf = this.ctx.createBuffer(1, f32.length, this.ctx.sampleRate); | |
| buf.copyToChannel(f32, 0); | |
| const src = this.ctx.createBufferSource(); | |
| src.buffer = buf; | |
| const g = this.ctx.createGain(); | |
| src.connect(g); g.connect(this.gain); | |
| const now = this.ctx.currentTime; | |
| if (this.t < now) this.t = now; | |
| g.gain.setValueAtTime(0, this.t); | |
| g.gain.linearRampToValueAtTime(1, this.t + 0.005); | |
| const dur = buf.duration; | |
| g.gain.setValueAtTime(1, this.t + dur - 0.005); | |
| g.gain.linearRampToValueAtTime(0, this.t + dur); | |
| src.start(this.t); | |
| this.t += dur; | |
| src.onended = () => { src.disconnect(); g.disconnect(); }; | |
| } | |
| reset() { | |
| this.t = 0; | |
| const old = this.gain; | |
| this.gain = this.ctx.createGain(); | |
| this.gain.connect(this.ctx.destination); | |
| this.analyser = this.ctx.createAnalyser(); | |
| this.gain.connect(this.analyser); | |
| if (old) { const n = this.ctx.currentTime; old.gain.setValueAtTime(old.gain.value, n); old.gain.linearRampToValueAtTime(0, n + 0.05); setTimeout(() => old.disconnect(), 120); } | |
| } | |
| } | |
| /* ── App ─────────────────────────────────────────────── */ | |
| class SopranoApp { | |
| constructor() { | |
| this.els = { | |
| input: document.getElementById('text-input'), | |
| gen: document.getElementById('btn-gen'), | |
| stop: document.getElementById('btn-stop'), | |
| dot: document.getElementById('dot'), | |
| status: document.getElementById('status-text'), | |
| ttfb: document.getElementById('m-ttfb'), | |
| rtf: document.getElementById('m-rtf'), | |
| charCount: document.getElementById('char-count'), | |
| progBar: document.getElementById('progress-bar'), | |
| progFill: document.getElementById('progress-fill'), | |
| canvas: document.getElementById('viz'), | |
| log: document.getElementById('log'), | |
| }; | |
| this.isGen = false; | |
| this.genStart = 0; | |
| this.lastChunkT = 0; | |
| this.rtfAvg = 0; | |
| this.audioCtx = null; | |
| this.player = null; | |
| this.worker = null; | |
| this.bind(); | |
| this.initAudio(); | |
| this.initViz(); | |
| } | |
| bind() { | |
| this.els.gen.onclick = () => this.generate(); | |
| this.els.stop.onclick = () => this.stopGen(); | |
| this.els.input.oninput = () => { this.els.charCount.textContent = this.els.input.value.length; }; | |
| this.els.input.onkeydown = (e) => { if ((e.ctrlKey || e.metaKey) && e.key === 'Enter') this.generate(); }; | |
| document.querySelectorAll('.samples button').forEach(b => { | |
| b.onclick = () => { this.els.input.value = b.dataset.t; this.els.charCount.textContent = b.dataset.t.length; }; | |
| }); | |
| this.els.charCount.textContent = this.els.input.value.length; | |
| } | |
| async initAudio() { | |
| this.audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: SAMPLE_RATE, latencyHint: 'interactive' }); | |
| this.player = new PCMPlayer(this.audioCtx); | |
| this.spawnWorker(); | |
| } | |
| spawnWorker() { | |
| const code = document.getElementById('worker-src').textContent; | |
| const blob = new Blob([code], { type: 'application/javascript' }); | |
| const url = URL.createObjectURL(blob); | |
| this.worker = new Worker(url); | |
| URL.revokeObjectURL(url); | |
| this.worker.onmessage = (e) => { | |
| const { type, data, error, status, state, metrics, value } = e.data; | |
| switch (type) { | |
| case 'status': | |
| this.setStatus(status, state); break; | |
| case 'progress': | |
| this.els.progBar.style.display = 'block'; | |
| this.els.progFill.style.width = value + '%'; | |
| if (value >= 100) setTimeout(() => { this.els.progBar.style.display = 'none'; }, 600); | |
| break; | |
| case 'model_status': | |
| break; | |
| case 'loaded': | |
| this.els.gen.disabled = false; | |
| this.els.gen.textContent = '▶ Generate'; | |
| break; | |
| case 'generation_started': | |
| break; | |
| case 'audio_chunk': | |
| this.onChunk(data, metrics); break; | |
| case 'stream_ended': | |
| this.onEnd(); break; | |
| case 'error': | |
| console.error('Worker error:', error); | |
| this.setStatus('Error: ' + error, 'error'); | |
| this.resetUI(); | |
| break; | |
| } | |
| }; | |
| this.worker.postMessage({ type: 'load' }); | |
| } | |
| async generate() { | |
| this.genStart = performance.now(); | |
| if (this.isGen) return; | |
| if (this.audioCtx.state === 'suspended') await this.audioCtx.resume(); | |
| const text = this.els.input.value.trim(); | |
| if (!text) return; | |
| this.isGen = true; | |
| this.els.gen.disabled = true; | |
| this.els.stop.disabled = false; | |
| this.player.reset(); | |
| this.els.ttfb.innerHTML = '--<small style="font-size:.6em;font-weight:400"> ms</small>'; | |
| this.els.rtf.innerHTML = '--<small style="font-size:.6em;font-weight:400"> ×</small>'; | |
| this.rtfAvg = 0; | |
| this.worker.postMessage({ type: 'generate', data: { text } }); | |
| } | |
| stopGen() { | |
| if (!this.isGen) return; | |
| this.worker.postMessage({ type: 'stop' }); | |
| this.onEnd(); | |
| } | |
| onChunk(audioData, metrics) { | |
| if (!this.isGen) return; | |
| this.player.play(audioData instanceof Float32Array ? audioData : new Float32Array(audioData)); | |
| if (metrics.isFirst) { | |
| const ttfb = Math.round(performance.now() - this.genStart); | |
| this.els.ttfb.innerHTML = ttfb + '<small style="font-size:.6em;font-weight:400"> ms</small>'; | |
| } | |
| const now = performance.now(); | |
| const dt = (now - this.lastChunkT) / 1000; | |
| this.lastChunkT = now; | |
| if (dt > 0 && metrics.chunkDuration) { | |
| const inst = metrics.chunkDuration / dt; | |
| this.rtfAvg = this.rtfAvg === 0 ? inst : this.rtfAvg * 0.8 + inst * 0.2; | |
| const color = this.rtfAvg >= 1 ? 'var(--green)' : 'var(--red)'; | |
| this.els.rtf.innerHTML = this.rtfAvg.toFixed(2) + '<small style="font-size:.6em;font-weight:400;color:' + color + '"> ×</small>'; | |
| } | |
| } | |
| onEnd() { | |
| this.resetUI(); | |
| this.isGen = false; | |
| this.setStatus('Finished', 'ready'); | |
| } | |
| resetUI() { | |
| this.els.gen.disabled = false; | |
| this.els.gen.textContent = '▶ Generate'; | |
| this.els.stop.disabled = true; | |
| } | |
| setStatus(text, state) { | |
| this.els.status.textContent = text; | |
| this.els.dot.className = 'dot ' + (state || ''); | |
| } | |
| /* ── Visualizer ──────────────────────────────────────── */ | |
| initViz() { | |
| const c = this.els.canvas; | |
| if (!c) return; | |
| this.vizCtx = c.getContext('2d'); | |
| this.resizeCanvas(); | |
| window.addEventListener('resize', () => this.resizeCanvas()); | |
| const draw = () => { | |
| requestAnimationFrame(draw); | |
| if (!this.player || !this.player.analyser) return; | |
| const bl = this.player.analyser.frequencyBinCount; | |
| const arr = new Uint8Array(bl); | |
| this.player.analyser.getByteFrequencyData(arr); | |
| const ctx = this.vizCtx, w = c.width / (devicePixelRatio || 1), h = c.height / (devicePixelRatio || 1); | |
| ctx.clearRect(0, 0, w, h); | |
| const bars = 80, bw = w / bars, spb = Math.floor(bl / bars); | |
| for (let i = 0; i < bars; i++) { | |
| let sum = 0; | |
| for (let j = 0; j < spb; j++) sum += arr[i * spb + j]; | |
| const avg = sum / spb; | |
| const bh = (avg / 255) * h * 0.85; | |
| const grd = ctx.createLinearGradient(0, h, 0, h - bh); | |
| grd.addColorStop(0, 'rgba(99,102,241,0.25)'); | |
| grd.addColorStop(1, 'rgba(139,92,246,0.8)'); | |
| ctx.fillStyle = grd; | |
| ctx.beginPath(); | |
| ctx.roundRect(i * bw + 1, h - bh, bw - 2, bh, [2, 2, 0, 0]); | |
| ctx.fill(); | |
| } | |
| }; | |
| requestAnimationFrame(draw); | |
| } | |
| resizeCanvas() { | |
| const c = this.els.canvas; if (!c) return; | |
| const p = c.parentElement, w = p.clientWidth, h = 100, dpr = devicePixelRatio || 1; | |
| c.width = w * dpr; c.height = h * dpr; | |
| c.style.width = w + 'px'; c.style.height = h + 'px'; | |
| this.vizCtx.scale(dpr, dpr); | |
| } | |
| } | |
| document.addEventListener('DOMContentLoaded', () => { window.app = new SopranoApp(); }); | |
| </script> | |
| </body> | |
| </html> |