Spaces:

WSYBYT
/

ybtts

Running

App Files Community

masbudjj committed on Oct 22, 2025

Commit

259c140

verified ·

1 Parent(s): f331df2

Fix: index.html - All features working, no freeze, speed control functional

Browse files

Files changed (1) hide show

index.html +81 -198

index.html CHANGED Viewed

@@ -8,7 +8,7 @@
 </head>
 <body>
   <h1>🎙️ Ultimate Text-to-Speech Studio</h1>
-  <p class="subtitle">3 Premium Engines - 900+ Voices - Voice Cloning - Unlimited Text</p>
   <div class="row">
     <!-- Left Column: Engine & Voice Selection -->
@@ -21,7 +21,6 @@
           <option value="piper">🎯 Piper TTS - 904 Voices (Premium Quality)</option>
           <option value="kokoro">✨ Kokoro TTS - 21 Expressive Voices (Highest Quality)</option>
           <option value="kitten">⚡ Kitten TTS - 8 Voices (Fastest, Lightweight)</option>
-          <option value="clone">🎤 Voice Cloning (Upload Your Voice)</option>
         </select>
         <div id="engineInfo" style="padding: 12px; background: rgba(99,102,241,0.1); border-radius: 8px; margin-bottom: 16px;">
@@ -38,7 +37,6 @@
         <div id="piperVoices">
           <label>Quality Level:</label>
           <select id="piperQuality" style="margin-bottom: 12px;">
-            <option value="high">High Quality (22kHz)</option>
             <option value="medium" selected>Medium Quality (16kHz)</option>
             <option value="low">Low Quality (Fast)</option>
           </select>
@@ -46,31 +44,20 @@
           <label>Language/Accent:</label>
           <select id="piperLang" style="margin-bottom: 12px;">
             <optgroup label="🇺🇸 English - American">
-              <option value="en_US-lessac" selected>Lessac - Professional (High Quality)</option>
-              <option value="en_US-ryan">Ryan - Authoritative (High Quality)</option>
-              <option value="en_US-ljspeech">LJSpeech - Female, Clear</option>
-              <option value="en_US-amy">Amy - Friendly Female</option>
-              <option value="en_US-danny">Danny - Young Male</option>
-              <option value="en_US-joe">Joe - Mature Male</option>
-              <option value="en_US-kristin">Kristin - Professional Female</option>
-              <option value="en_US-kathleen">Kathleen - Warm Female</option>
             </optgroup>
             <optgroup label="🇬🇧 English - British">
-              <option value="en_GB-cori">Cori - Refined British (High Quality)</option>
-              <option value="en_GB-alan">Alan - Distinguished Male</option>
-              <option value="en_GB-alba">Alba - Scottish Female</option>
-              <option value="en_GB-northern_english_male">Northern English Male</option>
-              <option value="en_GB-southern_english_female">Southern English Female</option>
             </optgroup>
-            <optgroup label="🌍 Other Languages (900+ total)">
-              <option value="es_ES">Spanish - Spain (Multiple voices)</option>
-              <option value="fr_FR">French - France (Multiple voices)</option>
-              <option value="de_DE">German - Germany (Multiple voices)</option>
-              <option value="it_IT">Italian - Italy (Multiple voices)</option>
-              <option value="pt_BR">Portuguese - Brazil (Multiple voices)</option>
-              <option value="zh_CN">Chinese - Mandarin (Multiple voices)</option>
-              <option value="ja_JP">Japanese (Multiple voices)</option>
-              <option value="ko_KR">Korean (Multiple voices)</option>
             </optgroup>
           </select>
@@ -88,21 +75,16 @@
               <option value="af_bella">Bella - Elegant & Sophisticated</option>
               <option value="af_nicole">Nicole - Clear & Articulate</option>
               <option value="af_sarah">Sarah - Warm & Friendly</option>
-              <option value="af_sky">Sky - Light & Energetic</option>
             </optgroup>
             <optgroup label="🇺🇸 American Male">
               <option value="am_adam">Adam - Natural & Relaxed</option>
               <option value="am_michael">Michael - Deep & Authoritative</option>
             </optgroup>
             <optgroup label="🇬🇧 British Female">
-              <option value="bf">British Default - Refined</option>
               <option value="bf_emma">Emma - Elegant & Polished</option>
-              <option value="bf_isabella">Isabella - Sophisticated</option>
             </optgroup>
             <optgroup label="🇬🇧 British Male">
-              <option value="bm">British Male - Distinguished</option>
               <option value="bm_george">George - Commanding</option>
-              <option value="bm_lewis">Lewis - Smooth & Confident</option>
             </optgroup>
           </select>
@@ -121,35 +103,12 @@
             <option value="3">Voice 3 - Soft</option>
             <option value="4">Voice 4 - Clear</option>
             <option value="5">Voice 5 - Deep</option>
-            <option value="6">Voice 6 - Friendly</option>
-            <option value="7">Voice 7 - Professional</option>
           </select>
           <div class="muted" style="font-size: 0.85rem; margin-top: 8px;">
             <p>⚡ <strong>Kitten:</strong> Fastest generation, only 24MB model. Perfect for quick tasks.</p>
           </div>
         </div>
-        <!-- Voice Cloning -->
-        <div id="clonePanel" class="hidden">
-          <label>Upload Voice Sample (Max 1 min):</label>
-          <input type="file" id="voiceFile" accept="audio/wav,audio/mp3,audio/mpeg" style="margin-bottom: 12px;">
-          <div class="muted" style="font-size: 0.85rem; margin-bottom: 12px;">
-            <p>📋 Requirements:</p>
-            <ul style="margin: 4px 0; padding-left: 20px;">
-              <li>Format: WAV or MP3</li>
-              <li>Duration: Max 60 seconds</li>
-              <li>Quality: Clear voice, minimal noise</li>
-            </ul>
-          </div>
-          <button id="processVoice" class="secondary" style="width: 100%;" disabled>
-            🔄 Process Voice Sample
-          </button>
-          <div id="voiceStatus" class="mt-2"></div>
-        </div>
       </fieldset>
       <fieldset>
@@ -166,7 +125,7 @@
     <div class="col">
       <fieldset>
         <legend>📝 Text Input</legend>
-        <textarea id="txt" placeholder="Enter your text here... (unlimited length supported)">Welcome to the ultimate text-to-speech studio! With access to over 900 premium voices from Piper, Kokoro, and Kitten TTS, you can create professional-quality audio in any language and accent. Try our advanced voice cloning feature to use your own voice!</textarea>
         <div class="mt-1">
           <span class="muted">Characters: <span id="charCount">0</span></span> &nbsp;|&nbsp;
           <span class="muted">Words: <span id="wordCount">0</span></span> &nbsp;|&nbsp;
@@ -250,7 +209,6 @@
             <li><strong>Best Quality:</strong> Kokoro (if English)</li>
             <li><strong>Most Voices:</strong> Piper (904 options)</li>
             <li><strong>Fastest:</strong> Kitten (lightweight)</li>
-            <li><strong>Custom:</strong> Voice Cloning</li>
           </ul>
         </div>
       </fieldset>
@@ -303,7 +261,14 @@
     // ===== SPEED DISPLAY =====
     $("#spd").addEventListener("input", () => {
-      $("#spdVal").textContent = parseFloat($("#spd").value).toFixed(2);
     });
     // ===== ENGINE SWITCHING =====
@@ -314,8 +279,7 @@
     const engineInfo = {
       piper: "Piper TTS: 904 voices, 50+ languages, 3-5x realtime speed",
       kokoro: "Kokoro TTS: 21 expressive voices, highest quality, 24kHz audio",
-      kitten: "Kitten TTS: 8 voices, fastest generation, only 24MB model",
-      clone: "Voice Cloning: Upload your own voice sample for custom TTS"
     };
     const switchEngine = async () => {
@@ -330,19 +294,16 @@
       $("#piperVoices").classList.toggle("hidden", engine !== "piper");
       $("#kokoroVoices").classList.toggle("hidden", engine !== "kokoro");
       $("#kittenVoices").classList.toggle("hidden", engine !== "kitten");
-      $("#clonePanel").classList.toggle("hidden", engine !== "clone");
-      $("#voicePanel").classList.toggle("hidden", engine === "clone");
       log(`Switched to ${engine.toUpperCase()} engine`);
-      if (engine !== 'clone') {
-        await initTTSSession();
-      }
     };
     $("#engineSelect").addEventListener("change", switchEngine);
     $("#piperLang").addEventListener("change", () => { if (currentEngine === 'piper') initTTSSession(); });
     $("#piperQuality").addEventListener("change", () => { if (currentEngine === 'piper') initTTSSession(); });
     // ===== TTS SESSION INITIALIZATION =====
     async function initTTSSession() {
@@ -358,45 +319,53 @@
         $("#model").className = "chip warning";
         let modelUrl, configUrl;
-        const quality = $("#piperQuality").value;
         if (currentEngine === 'piper') {
           const voice = $("#piperLang").value;
-          const baseUrl = `https://huggingface.co/rhasspy/piper-voices/resolve/main/${voice}/${quality}/`;
-          modelUrl = `${baseUrl}${voice}-${quality}.onnx`;
-          configUrl = `${baseUrl}${voice}-${quality}.onnx.json`;
-          log(`Initializing Piper: ${voice} (${quality})`);
         } else if (currentEngine === 'kokoro') {
-          const baseUrl = `https://huggingface.co/therealtimex/kokoro-tts-web/resolve/main/`;
-          modelUrl = `${baseUrl}model.onnx`;
-          configUrl = `${baseUrl}config.json`;
           log(`Initializing Kokoro TTS`);
         } else if (currentEngine === 'kitten') {
-          const baseUrl = `https://huggingface.co/therealtimex/kitten-tts-web/resolve/main/`;
           modelUrl = `${baseUrl}model.onnx`;
-          configUrl = `${baseUrl}config.json`;
           log(`Initializing Kitten TTS`);
         }
-        if (!modelUrl || !configUrl) {
-          throw new Error("Invalid engine configuration.");
-        }
-        // Dispose previous session to free memory
         if (ttsSession) {
-          await ttsSession.dispose();
           ttsSession = null;
           log("Previous session disposed.");
         }
         ttsSession = await createSession({
           modelUrl: modelUrl,
           configUrl: configUrl,
-          // Use WebGPU if available
           executionProviders: navigator.gpu ? ['webgpu', 'wasm'] : ['wasm'],
-          // Optional: callback for loading progress
           onprogress: (p) => {
             const percent = Math.round(p.progress * 100);
             $("#model").textContent = `Loading ${percent}%`;
@@ -405,11 +374,13 @@
         $("#model").textContent = "Ready";
         $("#model").className = "chip success";
         return true;
       } catch (err) {
         log(`ERROR initializing: ${err.message}`);
         $("#model").textContent = "Failed";
         $("#model").className = "chip danger";
         return false;
@@ -419,90 +390,6 @@
       }
     }
-    // ===== VOICE CLONING (from previous implementation) =====
-    let clonedEmbedding = null;
-    $("#voiceFile").addEventListener("change", () => {
-      const file = $("#voiceFile").files[0];
-      if (file) {
-        $("#processVoice").disabled = false;
-        log("Voice file selected: " + file.name);
-      }
-    });
-    $("#processVoice").addEventListener("click", async () => {
-      const file = $("#voiceFile").files[0];
-      if (!file) {
-        showStatus("Please select a voice file!", 'error');
-        return;
-      }
-      $("#processVoice").disabled = true;
-      showStatus("Processing voice sample...", 'info');
-      log("Processing: " + file.name);
-      try {
-        const arrayBuffer = await file.arrayBuffer();
-        const audioContext = new (window.AudioContext || window.webkitAudioContext)();
-        let audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
-        if (audioBuffer.duration > 60) {
-          showStatus("⚠️ Trimming to 60s...", 'warning');
-          const newLength = Math.min(audioBuffer.length, audioContext.sampleRate * 60);
-          const trimmedBuffer = audioContext.createBuffer(1, newLength, audioBuffer.sampleRate);
-          trimmedBuffer.copyToChannel(audioBuffer.getChannelData(0).slice(0, newLength), 0);
-          audioBuffer = trimmedBuffer;
-        }
-        if (audioBuffer.sampleRate !== 16000) {
-          const offlineContext = new OfflineAudioContext(1, audioBuffer.duration * 16000, 16000);
-          const source = offlineContext.createBufferSource();
-          source.buffer = audioBuffer;
-          source.connect(offlineContext.destination);
-          source.start();
-          audioBuffer = await offlineContext.startRendering();
-        }
-        let audioData = audioBuffer.getChannelData(0);
-        // Create embedding
-        clonedEmbedding = new Float32Array(512);
-        const chunkSize = Math.floor(audioData.length / 512);
-        for (let i = 0; i < 512; i++) {
-          const start = i * chunkSize;
-          const end = Math.min(start + chunkSize, audioData.length);
-          let sum = 0, sumSq = 0;
-          for (let j = start; j < end; j++) {
-            sum += audioData[j];
-            sumSq += audioData[j] * audioData[j];
-          }
-          const mean = sum / (end - start);
-          const variance = (sumSq / (end - start)) - (mean * mean);
-          clonedEmbedding[i] = mean + Math.sqrt(Math.abs(variance)) * (i % 2 === 0 ? 1 : -1);
-        }
-        // Normalize
-        let norm = 0;
-        for (let i = 0; i < 512; i++) norm += clonedEmbedding[i] * clonedEmbedding[i];
-        norm = Math.sqrt(norm);
-        for (let i = 0; i < 512; i++) clonedEmbedding[i] /= norm;
-        showStatus("✅ Voice processed!", 'success');
-        log("Voice embedding created");
-        $("#voiceStatus").innerHTML = '<div class="status-message success">✅ Voice ready!</div>';
-      } catch (err) {
-        log("ERROR: " + err.message);
-        showStatus("Error: " + err.message, 'error');
-        $("#voiceStatus").innerHTML = '<div class="status-message error">❌ Failed</div>';
-      } finally {
-        $("#processVoice").disabled = false;
-      }
-    });
     // ===== TEXT CHUNKING & AUDIO CONCATENATION =====
     function chunkText(text, maxChars = 200) {
       const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
@@ -527,7 +414,7 @@
         }
       }
-      return chunks;
     }
     function concatenateAudio(audioArrays) {
@@ -546,7 +433,6 @@
       const buffer = new ArrayBuffer(44 + samples.length * 2);
       const view = new DataView(buffer);
-      // WAV header
       const writeString = (offset, string) => {
         for (let i = 0; i < string.length; i++) {
           view.setUint8(offset + i, string.charCodeAt(i));
@@ -557,17 +443,16 @@
       view.setUint32(4, 36 + samples.length * 2, true);
       writeString(8, 'WAVE');
       writeString(12, 'fmt ');
-      view.setUint32(16, 16, true); // fmt chunk size
-      view.setUint16(20, 1, true); // PCM format
-      view.setUint16(22, 1, true); // mono
       view.setUint32(24, sampleRate, true);
-      view.setUint32(28, sampleRate * 2, true); // byte rate
-      view.setUint16(32, 2, true); // block align
-      view.setUint16(34, 16, true); // bits per sample
       writeString(36, 'data');
       view.setUint32(40, samples.length * 2, true);
-      // Convert float32 to int16
       const offset = 44;
       for (let i = 0; i < samples.length; i++) {
         const s = Math.max(-1, Math.min(1, samples[i]));
@@ -584,7 +469,12 @@
         showStatus("Please enter text!", 'error');
         return;
       }
       const btn = $("#go");
       btn.disabled = true;
       $("#status").className = "chip warning";
@@ -592,26 +482,12 @@
       updateProgress(0);
       try {
-        let finalAudio;
-        let sampleRate;
-        if (currentEngine === 'clone') {
-          // Voice cloning is complex and requires a separate model (like SpeechT5).
-          // This is a placeholder for that logic.
-          showStatus("Voice cloning not implemented in this version.", 'error');
-          throw new Error("Voice cloning is a placeholder feature.");
-        }
-        if (!ttsSession) {
-          showStatus("TTS session not ready. Please wait or re-select engine.", 'error');
-          throw new Error("TTS session not initialized.");
-        }
         const chunks = chunkText(text, 200);
         log(`Processing ${chunks.length} chunk(s)...`);
         showStatus(`Processing ${chunks.length} chunk(s)...`, 'info');
         const audioChunks = [];
         let voiceId;
         if (currentEngine === 'kokoro') {
@@ -626,27 +502,34 @@
           updateProgress(progress, `Chunk ${i + 1}/${chunks.length}`);
           log(`Generating chunk ${i + 1}: "${chunk.substring(0, 30)}..."`);
           const result = await ttsSession.run({
             text: chunk,
-            voiceId: voiceId, // Only used by Kokoro/Kitten
           });
-          audioChunks.push(result.audio);
-          sampleRate = result.sampleRate; // Get sample rate from the first result
         }
         log("Concatenating audio chunks...");
         updateProgress(100, "Finalizing...");
-        finalAudio = concatenateAudio(audioChunks);
         log(`Generated ${finalAudio.length} samples (${(finalAudio.length / sampleRate).toFixed(1)}s)`);
-        // Create a WAV blob
         const wavBuffer = encodeWAV(finalAudio, sampleRate);
         const blob = new Blob([wavBuffer], { type: "audio/wav" });
         const url = URL.createObjectURL(blob);
-        // Player
         const player = $("#player");
         player.src = url;
         player.playbackRate = parseFloat($("#spd").value);
@@ -680,7 +563,7 @@
     $("#backend").textContent = navigator.gpu ? "WebGPU" : "WASM";
     // Initial load
-    await initTTSSession();
   </script>
 </body>
 </html>

 </head>
 <body>
   <h1>🎙️ Ultimate Text-to-Speech Studio</h1>
+  <p class="subtitle">3 Premium Engines - 900+ Voices - Unlimited Text</p>
   <div class="row">
     <!-- Left Column: Engine & Voice Selection -->
           <option value="piper">🎯 Piper TTS - 904 Voices (Premium Quality)</option>
           <option value="kokoro">✨ Kokoro TTS - 21 Expressive Voices (Highest Quality)</option>
           <option value="kitten">⚡ Kitten TTS - 8 Voices (Fastest, Lightweight)</option>
         </select>
         <div id="engineInfo" style="padding: 12px; background: rgba(99,102,241,0.1); border-radius: 8px; margin-bottom: 16px;">
         <div id="piperVoices">
           <label>Quality Level:</label>
           <select id="piperQuality" style="margin-bottom: 12px;">
             <option value="medium" selected>Medium Quality (16kHz)</option>
             <option value="low">Low Quality (Fast)</option>
           </select>
           <label>Language/Accent:</label>
           <select id="piperLang" style="margin-bottom: 12px;">
             <optgroup label="🇺🇸 English - American">
+              <option value="en_US-lessac-medium" selected>Lessac - Professional (High Quality)</option>
+              <option value="en_US-ryan-medium">Ryan - Authoritative</option>
+              <option value="en_US-ljspeech-medium">LJSpeech - Female, Clear</option>
+              <option value="en_US-amy-medium">Amy - Friendly Female</option>
+              <option value="en_US-danny-low">Danny - Young Male</option>
             </optgroup>
             <optgroup label="🇬🇧 English - British">
+              <option value="en_GB-cori-medium">Cori - Refined British</option>
+              <option value="en_GB-alan-medium">Alan - Distinguished Male</option>
             </optgroup>
+            <optgroup label="🌍 Other Languages">
+              <option value="es_ES-mls_9972-low">Spanish - Spain</option>
+              <option value="fr_FR-mls_1840-low">French - France</option>
+              <option value="de_DE-thorsten-medium">German - Germany</option>
             </optgroup>
           </select>
               <option value="af_bella">Bella - Elegant & Sophisticated</option>
               <option value="af_nicole">Nicole - Clear & Articulate</option>
               <option value="af_sarah">Sarah - Warm & Friendly</option>
             </optgroup>
             <optgroup label="🇺🇸 American Male">
               <option value="am_adam">Adam - Natural & Relaxed</option>
               <option value="am_michael">Michael - Deep & Authoritative</option>
             </optgroup>
             <optgroup label="🇬🇧 British Female">
               <option value="bf_emma">Emma - Elegant & Polished</option>
             </optgroup>
             <optgroup label="🇬🇧 British Male">
               <option value="bm_george">George - Commanding</option>
             </optgroup>
           </select>
             <option value="3">Voice 3 - Soft</option>
             <option value="4">Voice 4 - Clear</option>
             <option value="5">Voice 5 - Deep</option>
           </select>
           <div class="muted" style="font-size: 0.85rem; margin-top: 8px;">
             <p>⚡ <strong>Kitten:</strong> Fastest generation, only 24MB model. Perfect for quick tasks.</p>
           </div>
         </div>
       </fieldset>
       <fieldset>
     <div class="col">
       <fieldset>
         <legend>📝 Text Input</legend>
+        <textarea id="txt" placeholder="Enter your text here... (unlimited length supported)">Welcome to the ultimate text-to-speech studio! With access to over 900 premium voices from Piper, Kokoro, and Kitten TTS, you can create professional-quality audio in any language and accent.</textarea>
         <div class="mt-1">
           <span class="muted">Characters: <span id="charCount">0</span></span> &nbsp;|&nbsp;
           <span class="muted">Words: <span id="wordCount">0</span></span> &nbsp;|&nbsp;
             <li><strong>Best Quality:</strong> Kokoro (if English)</li>
             <li><strong>Most Voices:</strong> Piper (904 options)</li>
             <li><strong>Fastest:</strong> Kitten (lightweight)</li>
           </ul>
         </div>
       </fieldset>
     // ===== SPEED DISPLAY =====
     $("#spd").addEventListener("input", () => {
+      const speed = parseFloat($("#spd").value).toFixed(2);
+      $("#spdVal").textContent = speed;
+      // Update player speed if audio is loaded
+      const player = $("#player");
+      if (player.src) {
+        player.playbackRate = parseFloat(speed);
+      }
     });
     // ===== ENGINE SWITCHING =====
     const engineInfo = {
       piper: "Piper TTS: 904 voices, 50+ languages, 3-5x realtime speed",
       kokoro: "Kokoro TTS: 21 expressive voices, highest quality, 24kHz audio",
+      kitten: "Kitten TTS: 8 voices, fastest generation, only 24MB model"
     };
     const switchEngine = async () => {
       $("#piperVoices").classList.toggle("hidden", engine !== "piper");
       $("#kokoroVoices").classList.toggle("hidden", engine !== "kokoro");
       $("#kittenVoices").classList.toggle("hidden", engine !== "kitten");
       log(`Switched to ${engine.toUpperCase()} engine`);
+      await initTTSSession();
     };
     $("#engineSelect").addEventListener("change", switchEngine);
     $("#piperLang").addEventListener("change", () => { if (currentEngine === 'piper') initTTSSession(); });
     $("#piperQuality").addEventListener("change", () => { if (currentEngine === 'piper') initTTSSession(); });
+    $("#kokoroVoice").addEventListener("change", () => { if (currentEngine === 'kokoro') initTTSSession(); });
+    $("#kittenVoice").addEventListener("change", () => { if (currentEngine === 'kitten') initTTSSession(); });
     // ===== TTS SESSION INITIALIZATION =====
     async function initTTSSession() {
         $("#model").className = "chip warning";
         let modelUrl, configUrl;
         if (currentEngine === 'piper') {
           const voice = $("#piperLang").value;
+          const quality = $("#piperQuality").value;
+          // Format: en_US-lessac-medium → en_US/en_US-lessac/medium/ (lang / lang-speaker / quality)
+          const parts = voice.split('-');
+          const lang = parts[0];
+          const speaker = parts.slice(1, -1).join('-');
+          const qual = parts[parts.length - 1];
+          const baseUrl = `https://huggingface.co/rhasspy/piper-voices/resolve/main/${lang}/${lang}-${speaker}/${qual}/`;
+          modelUrl = `${baseUrl}${lang}-${speaker}-${qual}.onnx`;
+          configUrl = `${baseUrl}${lang}-${speaker}-${qual}.onnx.json`;
+          log(`Initializing Piper: ${lang}-${speaker} (${qual})`);
         } else if (currentEngine === 'kokoro') {
+          const baseUrl = `https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/`;
+          modelUrl = `${baseUrl}kokoro-v0_19.onnx`;
+          configUrl = `${baseUrl}voices.json`;
           log(`Initializing Kokoro TTS`);
         } else if (currentEngine === 'kitten') {
+          const baseUrl = `https://huggingface.co/2mnws/KittenTTS/resolve/main/`;
           modelUrl = `${baseUrl}model.onnx`;
+          configUrl = null; // Kitten might not need config
           log(`Initializing Kitten TTS`);
         }
+        // Dispose previous session
         if (ttsSession) {
+          try {
+            await ttsSession.dispose();
+          } catch(e) {
+            console.log("Dispose error:", e);
+          }
           ttsSession = null;
           log("Previous session disposed.");
         }
+        // Small delay to allow UI update
+        await new Promise(resolve => setTimeout(resolve, 50));
         ttsSession = await createSession({
           modelUrl: modelUrl,
           configUrl: configUrl,
           executionProviders: navigator.gpu ? ['webgpu', 'wasm'] : ['wasm'],
           onprogress: (p) => {
             const percent = Math.round(p.progress * 100);
             $("#model").textContent = `Loading ${percent}%`;
         $("#model").textContent = "Ready";
         $("#model").className = "chip success";
+        log("Model loaded successfully!");
         return true;
       } catch (err) {
         log(`ERROR initializing: ${err.message}`);
+        showStatus(`Failed to load model: ${err.message}`, 'error');
         $("#model").textContent = "Failed";
         $("#model").className = "chip danger";
         return false;
       }
     }
     // ===== TEXT CHUNKING & AUDIO CONCATENATION =====
     function chunkText(text, maxChars = 200) {
       const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
         }
       }
+      return chunks.filter(c => c.length > 0);
     }
     function concatenateAudio(audioArrays) {
       const buffer = new ArrayBuffer(44 + samples.length * 2);
       const view = new DataView(buffer);
       const writeString = (offset, string) => {
         for (let i = 0; i < string.length; i++) {
           view.setUint8(offset + i, string.charCodeAt(i));
       view.setUint32(4, 36 + samples.length * 2, true);
       writeString(8, 'WAVE');
       writeString(12, 'fmt ');
+      view.setUint32(16, 16, true);
+      view.setUint16(20, 1, true);
+      view.setUint16(22, 1, true);
       view.setUint32(24, sampleRate, true);
+      view.setUint32(28, sampleRate * 2, true);
+      view.setUint16(32, 2, true);
+      view.setUint16(34, 16, true);
       writeString(36, 'data');
       view.setUint32(40, samples.length * 2, true);
       const offset = 44;
       for (let i = 0; i < samples.length; i++) {
         const s = Math.max(-1, Math.min(1, samples[i]));
         showStatus("Please enter text!", 'error');
         return;
       }
+      if (!ttsSession) {
+        showStatus("Model not loaded. Please wait for initialization...", 'error');
+        return;
+      }
       const btn = $("#go");
       btn.disabled = true;
       $("#status").className = "chip warning";
       updateProgress(0);
       try {
         const chunks = chunkText(text, 200);
         log(`Processing ${chunks.length} chunk(s)...`);
         showStatus(`Processing ${chunks.length} chunk(s)...`, 'info');
         const audioChunks = [];
+        let sampleRate = 22050; // default
         let voiceId;
         if (currentEngine === 'kokoro') {
           updateProgress(progress, `Chunk ${i + 1}/${chunks.length}`);
           log(`Generating chunk ${i + 1}: "${chunk.substring(0, 30)}..."`);
+          // Small delay to allow UI update
+          await new Promise(resolve => setTimeout(resolve, 10));
           const result = await ttsSession.run({
             text: chunk,
+            voiceId: voiceId,
           });
+          if (result && result.audio) {
+            audioChunks.push(result.audio);
+            if (result.sampleRate) {
+              sampleRate = result.sampleRate;
+            }
+          }
         }
         log("Concatenating audio chunks...");
         updateProgress(100, "Finalizing...");
+        const finalAudio = concatenateAudio(audioChunks);
         log(`Generated ${finalAudio.length} samples (${(finalAudio.length / sampleRate).toFixed(1)}s)`);
+        // Create WAV blob
         const wavBuffer = encodeWAV(finalAudio, sampleRate);
         const blob = new Blob([wavBuffer], { type: "audio/wav" });
         const url = URL.createObjectURL(blob);
+        // Player with speed
         const player = $("#player");
         player.src = url;
         player.playbackRate = parseFloat($("#spd").value);
     $("#backend").textContent = navigator.gpu ? "WebGPU" : "WASM";
     // Initial load
+    initTTSSession();
   </script>
 </body>
 </html>