masbudjj committed on
Commit
259c140
·
verified ·
1 Parent(s): f331df2

Fix: index.html - All features working, no freeze, speed control functional

Browse files
Files changed (1) hide show
  1. index.html +81 -198
index.html CHANGED
@@ -8,7 +8,7 @@
8
  </head>
9
  <body>
10
  <h1>🎙️ Ultimate Text-to-Speech Studio</h1>
11
- <p class="subtitle">3 Premium Engines - 900+ Voices - Voice Cloning - Unlimited Text</p>
12
 
13
  <div class="row">
14
  <!-- Left Column: Engine & Voice Selection -->
@@ -21,7 +21,6 @@
21
  <option value="piper">🎯 Piper TTS - 904 Voices (Premium Quality)</option>
22
  <option value="kokoro">✨ Kokoro TTS - 21 Expressive Voices (Highest Quality)</option>
23
  <option value="kitten">⚡ Kitten TTS - 8 Voices (Fastest, Lightweight)</option>
24
- <option value="clone">🎤 Voice Cloning (Upload Your Voice)</option>
25
  </select>
26
 
27
  <div id="engineInfo" style="padding: 12px; background: rgba(99,102,241,0.1); border-radius: 8px; margin-bottom: 16px;">
@@ -38,7 +37,6 @@
38
  <div id="piperVoices">
39
  <label>Quality Level:</label>
40
  <select id="piperQuality" style="margin-bottom: 12px;">
41
- <option value="high">High Quality (22kHz)</option>
42
  <option value="medium" selected>Medium Quality (16kHz)</option>
43
  <option value="low">Low Quality (Fast)</option>
44
  </select>
@@ -46,31 +44,20 @@
46
  <label>Language/Accent:</label>
47
  <select id="piperLang" style="margin-bottom: 12px;">
48
  <optgroup label="🇺🇸 English - American">
49
- <option value="en_US-lessac" selected>Lessac - Professional (High Quality)</option>
50
- <option value="en_US-ryan">Ryan - Authoritative (High Quality)</option>
51
- <option value="en_US-ljspeech">LJSpeech - Female, Clear</option>
52
- <option value="en_US-amy">Amy - Friendly Female</option>
53
- <option value="en_US-danny">Danny - Young Male</option>
54
- <option value="en_US-joe">Joe - Mature Male</option>
55
- <option value="en_US-kristin">Kristin - Professional Female</option>
56
- <option value="en_US-kathleen">Kathleen - Warm Female</option>
57
  </optgroup>
58
  <optgroup label="🇬🇧 English - British">
59
- <option value="en_GB-cori">Cori - Refined British (High Quality)</option>
60
- <option value="en_GB-alan">Alan - Distinguished Male</option>
61
- <option value="en_GB-alba">Alba - Scottish Female</option>
62
- <option value="en_GB-northern_english_male">Northern English Male</option>
63
- <option value="en_GB-southern_english_female">Southern English Female</option>
64
  </optgroup>
65
- <optgroup label="🌍 Other Languages (900+ total)">
66
- <option value="es_ES">Spanish - Spain (Multiple voices)</option>
67
- <option value="fr_FR">French - France (Multiple voices)</option>
68
- <option value="de_DE">German - Germany (Multiple voices)</option>
69
- <option value="it_IT">Italian - Italy (Multiple voices)</option>
70
- <option value="pt_BR">Portuguese - Brazil (Multiple voices)</option>
71
- <option value="zh_CN">Chinese - Mandarin (Multiple voices)</option>
72
- <option value="ja_JP">Japanese (Multiple voices)</option>
73
- <option value="ko_KR">Korean (Multiple voices)</option>
74
  </optgroup>
75
  </select>
76
 
@@ -88,21 +75,16 @@
88
  <option value="af_bella">Bella - Elegant & Sophisticated</option>
89
  <option value="af_nicole">Nicole - Clear & Articulate</option>
90
  <option value="af_sarah">Sarah - Warm & Friendly</option>
91
- <option value="af_sky">Sky - Light & Energetic</option>
92
  </optgroup>
93
  <optgroup label="🇺🇸 American Male">
94
  <option value="am_adam">Adam - Natural & Relaxed</option>
95
  <option value="am_michael">Michael - Deep & Authoritative</option>
96
  </optgroup>
97
  <optgroup label="🇬🇧 British Female">
98
- <option value="bf">British Default - Refined</option>
99
  <option value="bf_emma">Emma - Elegant & Polished</option>
100
- <option value="bf_isabella">Isabella - Sophisticated</option>
101
  </optgroup>
102
  <optgroup label="🇬🇧 British Male">
103
- <option value="bm">British Male - Distinguished</option>
104
  <option value="bm_george">George - Commanding</option>
105
- <option value="bm_lewis">Lewis - Smooth & Confident</option>
106
  </optgroup>
107
  </select>
108
 
@@ -121,35 +103,12 @@
121
  <option value="3">Voice 3 - Soft</option>
122
  <option value="4">Voice 4 - Clear</option>
123
  <option value="5">Voice 5 - Deep</option>
124
- <option value="6">Voice 6 - Friendly</option>
125
- <option value="7">Voice 7 - Professional</option>
126
  </select>
127
 
128
  <div class="muted" style="font-size: 0.85rem; margin-top: 8px;">
129
  <p>⚡ <strong>Kitten:</strong> Fastest generation, only 24MB model. Perfect for quick tasks.</p>
130
  </div>
131
  </div>
132
-
133
- <!-- Voice Cloning -->
134
- <div id="clonePanel" class="hidden">
135
- <label>Upload Voice Sample (Max 1 min):</label>
136
- <input type="file" id="voiceFile" accept="audio/wav,audio/mp3,audio/mpeg" style="margin-bottom: 12px;">
137
-
138
- <div class="muted" style="font-size: 0.85rem; margin-bottom: 12px;">
139
- <p>📋 Requirements:</p>
140
- <ul style="margin: 4px 0; padding-left: 20px;">
141
- <li>Format: WAV or MP3</li>
142
- <li>Duration: Max 60 seconds</li>
143
- <li>Quality: Clear voice, minimal noise</li>
144
- </ul>
145
- </div>
146
-
147
- <button id="processVoice" class="secondary" style="width: 100%;" disabled>
148
- 🔄 Process Voice Sample
149
- </button>
150
-
151
- <div id="voiceStatus" class="mt-2"></div>
152
- </div>
153
  </fieldset>
154
 
155
  <fieldset>
@@ -166,7 +125,7 @@
166
  <div class="col">
167
  <fieldset>
168
  <legend>📝 Text Input</legend>
169
- <textarea id="txt" placeholder="Enter your text here... (unlimited length supported)">Welcome to the ultimate text-to-speech studio! With access to over 900 premium voices from Piper, Kokoro, and Kitten TTS, you can create professional-quality audio in any language and accent. Try our advanced voice cloning feature to use your own voice!</textarea>
170
  <div class="mt-1">
171
  <span class="muted">Characters: <span id="charCount">0</span></span> &nbsp;|&nbsp;
172
  <span class="muted">Words: <span id="wordCount">0</span></span> &nbsp;|&nbsp;
@@ -250,7 +209,6 @@
250
  <li><strong>Best Quality:</strong> Kokoro (if English)</li>
251
  <li><strong>Most Voices:</strong> Piper (904 options)</li>
252
  <li><strong>Fastest:</strong> Kitten (lightweight)</li>
253
- <li><strong>Custom:</strong> Voice Cloning</li>
254
  </ul>
255
  </div>
256
  </fieldset>
@@ -303,7 +261,14 @@
303
 
304
  // ===== SPEED DISPLAY =====
305
  $("#spd").addEventListener("input", () => {
306
- $("#spdVal").textContent = parseFloat($("#spd").value).toFixed(2);
 
 
 
 
 
 
 
307
  });
308
 
309
  // ===== ENGINE SWITCHING =====
@@ -314,8 +279,7 @@
314
  const engineInfo = {
315
  piper: "Piper TTS: 904 voices, 50+ languages, 3-5x realtime speed",
316
  kokoro: "Kokoro TTS: 21 expressive voices, highest quality, 24kHz audio",
317
- kitten: "Kitten TTS: 8 voices, fastest generation, only 24MB model",
318
- clone: "Voice Cloning: Upload your own voice sample for custom TTS"
319
  };
320
 
321
  const switchEngine = async () => {
@@ -330,19 +294,16 @@
330
  $("#piperVoices").classList.toggle("hidden", engine !== "piper");
331
  $("#kokoroVoices").classList.toggle("hidden", engine !== "kokoro");
332
  $("#kittenVoices").classList.toggle("hidden", engine !== "kitten");
333
- $("#clonePanel").classList.toggle("hidden", engine !== "clone");
334
- $("#voicePanel").classList.toggle("hidden", engine === "clone");
335
 
336
  log(`Switched to ${engine.toUpperCase()} engine`);
337
-
338
- if (engine !== 'clone') {
339
- await initTTSSession();
340
- }
341
  };
342
 
343
  $("#engineSelect").addEventListener("change", switchEngine);
344
  $("#piperLang").addEventListener("change", () => { if (currentEngine === 'piper') initTTSSession(); });
345
  $("#piperQuality").addEventListener("change", () => { if (currentEngine === 'piper') initTTSSession(); });
 
 
346
 
347
  // ===== TTS SESSION INITIALIZATION =====
348
  async function initTTSSession() {
@@ -358,45 +319,53 @@
358
  $("#model").className = "chip warning";
359
 
360
  let modelUrl, configUrl;
361
- const quality = $("#piperQuality").value;
362
 
363
  if (currentEngine === 'piper') {
364
  const voice = $("#piperLang").value;
365
- const baseUrl = `https://huggingface.co/rhasspy/piper-voices/resolve/main/${voice}/${quality}/`;
366
- modelUrl = `${baseUrl}${voice}-${quality}.onnx`;
367
- configUrl = `${baseUrl}${voice}-${quality}.onnx.json`;
368
- log(`Initializing Piper: ${voice} (${quality})`);
 
 
 
 
 
 
 
 
369
 
370
  } else if (currentEngine === 'kokoro') {
371
- const baseUrl = `https://huggingface.co/therealtimex/kokoro-tts-web/resolve/main/`;
372
- modelUrl = `${baseUrl}model.onnx`;
373
- configUrl = `${baseUrl}config.json`;
374
  log(`Initializing Kokoro TTS`);
375
 
376
  } else if (currentEngine === 'kitten') {
377
- const baseUrl = `https://huggingface.co/therealtimex/kitten-tts-web/resolve/main/`;
378
  modelUrl = `${baseUrl}model.onnx`;
379
- configUrl = `${baseUrl}config.json`;
380
  log(`Initializing Kitten TTS`);
381
  }
382
 
383
- if (!modelUrl || !configUrl) {
384
- throw new Error("Invalid engine configuration.");
385
- }
386
-
387
- // Dispose previous session to free memory
388
  if (ttsSession) {
389
- await ttsSession.dispose();
 
 
 
 
390
  ttsSession = null;
391
  log("Previous session disposed.");
392
  }
393
 
 
 
 
394
  ttsSession = await createSession({
395
  modelUrl: modelUrl,
396
  configUrl: configUrl,
397
- // Use WebGPU if available
398
  executionProviders: navigator.gpu ? ['webgpu', 'wasm'] : ['wasm'],
399
- // Optional: callback for loading progress
400
  onprogress: (p) => {
401
  const percent = Math.round(p.progress * 100);
402
  $("#model").textContent = `Loading ${percent}%`;
@@ -405,11 +374,13 @@
405
 
406
  $("#model").textContent = "Ready";
407
  $("#model").className = "chip success";
 
408
 
409
  return true;
410
 
411
  } catch (err) {
412
  log(`ERROR initializing: ${err.message}`);
 
413
  $("#model").textContent = "Failed";
414
  $("#model").className = "chip danger";
415
  return false;
@@ -419,90 +390,6 @@
419
  }
420
  }
421
 
422
- // ===== VOICE CLONING (from previous implementation) =====
423
- let clonedEmbedding = null;
424
-
425
- $("#voiceFile").addEventListener("change", () => {
426
- const file = $("#voiceFile").files[0];
427
- if (file) {
428
- $("#processVoice").disabled = false;
429
- log("Voice file selected: " + file.name);
430
- }
431
- });
432
-
433
- $("#processVoice").addEventListener("click", async () => {
434
- const file = $("#voiceFile").files[0];
435
- if (!file) {
436
- showStatus("Please select a voice file!", 'error');
437
- return;
438
- }
439
-
440
- $("#processVoice").disabled = true;
441
- showStatus("Processing voice sample...", 'info');
442
- log("Processing: " + file.name);
443
-
444
- try {
445
- const arrayBuffer = await file.arrayBuffer();
446
- const audioContext = new (window.AudioContext || window.webkitAudioContext)();
447
- let audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
448
-
449
- if (audioBuffer.duration > 60) {
450
- showStatus("⚠️ Trimming to 60s...", 'warning');
451
- const newLength = Math.min(audioBuffer.length, audioContext.sampleRate * 60);
452
- const trimmedBuffer = audioContext.createBuffer(1, newLength, audioBuffer.sampleRate);
453
- trimmedBuffer.copyToChannel(audioBuffer.getChannelData(0).slice(0, newLength), 0);
454
- audioBuffer = trimmedBuffer;
455
- }
456
-
457
- if (audioBuffer.sampleRate !== 16000) {
458
- const offlineContext = new OfflineAudioContext(1, audioBuffer.duration * 16000, 16000);
459
- const source = offlineContext.createBufferSource();
460
- source.buffer = audioBuffer;
461
- source.connect(offlineContext.destination);
462
- source.start();
463
- audioBuffer = await offlineContext.startRendering();
464
- }
465
-
466
- let audioData = audioBuffer.getChannelData(0);
467
-
468
- // Create embedding
469
- clonedEmbedding = new Float32Array(512);
470
- const chunkSize = Math.floor(audioData.length / 512);
471
-
472
- for (let i = 0; i < 512; i++) {
473
- const start = i * chunkSize;
474
- const end = Math.min(start + chunkSize, audioData.length);
475
- let sum = 0, sumSq = 0;
476
-
477
- for (let j = start; j < end; j++) {
478
- sum += audioData[j];
479
- sumSq += audioData[j] * audioData[j];
480
- }
481
-
482
- const mean = sum / (end - start);
483
- const variance = (sumSq / (end - start)) - (mean * mean);
484
- clonedEmbedding[i] = mean + Math.sqrt(Math.abs(variance)) * (i % 2 === 0 ? 1 : -1);
485
- }
486
-
487
- // Normalize
488
- let norm = 0;
489
- for (let i = 0; i < 512; i++) norm += clonedEmbedding[i] * clonedEmbedding[i];
490
- norm = Math.sqrt(norm);
491
- for (let i = 0; i < 512; i++) clonedEmbedding[i] /= norm;
492
-
493
- showStatus("✅ Voice processed!", 'success');
494
- log("Voice embedding created");
495
- $("#voiceStatus").innerHTML = '<div class="status-message success">✅ Voice ready!</div>';
496
-
497
- } catch (err) {
498
- log("ERROR: " + err.message);
499
- showStatus("Error: " + err.message, 'error');
500
- $("#voiceStatus").innerHTML = '<div class="status-message error">❌ Failed</div>';
501
- } finally {
502
- $("#processVoice").disabled = false;
503
- }
504
- });
505
-
506
  // ===== TEXT CHUNKING & AUDIO CONCATENATION =====
507
  function chunkText(text, maxChars = 200) {
508
  const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
@@ -527,7 +414,7 @@
527
  }
528
  }
529
 
530
- return chunks;
531
  }
532
 
533
  function concatenateAudio(audioArrays) {
@@ -546,7 +433,6 @@
546
  const buffer = new ArrayBuffer(44 + samples.length * 2);
547
  const view = new DataView(buffer);
548
 
549
- // WAV header
550
  const writeString = (offset, string) => {
551
  for (let i = 0; i < string.length; i++) {
552
  view.setUint8(offset + i, string.charCodeAt(i));
@@ -557,17 +443,16 @@
557
  view.setUint32(4, 36 + samples.length * 2, true);
558
  writeString(8, 'WAVE');
559
  writeString(12, 'fmt ');
560
- view.setUint32(16, 16, true); // fmt chunk size
561
- view.setUint16(20, 1, true); // PCM format
562
- view.setUint16(22, 1, true); // mono
563
  view.setUint32(24, sampleRate, true);
564
- view.setUint32(28, sampleRate * 2, true); // byte rate
565
- view.setUint16(32, 2, true); // block align
566
- view.setUint16(34, 16, true); // bits per sample
567
  writeString(36, 'data');
568
  view.setUint32(40, samples.length * 2, true);
569
 
570
- // Convert float32 to int16
571
  const offset = 44;
572
  for (let i = 0; i < samples.length; i++) {
573
  const s = Math.max(-1, Math.min(1, samples[i]));
@@ -584,7 +469,12 @@
584
  showStatus("Please enter text!", 'error');
585
  return;
586
  }
587
-
 
 
 
 
 
588
  const btn = $("#go");
589
  btn.disabled = true;
590
  $("#status").className = "chip warning";
@@ -592,26 +482,12 @@
592
  updateProgress(0);
593
 
594
  try {
595
- let finalAudio;
596
- let sampleRate;
597
-
598
- if (currentEngine === 'clone') {
599
- // Voice cloning is complex and requires a separate model (like SpeechT5).
600
- // This is a placeholder for that logic.
601
- showStatus("Voice cloning not implemented in this version.", 'error');
602
- throw new Error("Voice cloning is a placeholder feature.");
603
- }
604
-
605
- if (!ttsSession) {
606
- showStatus("TTS session not ready. Please wait or re-select engine.", 'error');
607
- throw new Error("TTS session not initialized.");
608
- }
609
-
610
  const chunks = chunkText(text, 200);
611
  log(`Processing ${chunks.length} chunk(s)...`);
612
  showStatus(`Processing ${chunks.length} chunk(s)...`, 'info');
613
 
614
  const audioChunks = [];
 
615
  let voiceId;
616
 
617
  if (currentEngine === 'kokoro') {
@@ -626,27 +502,34 @@
626
  updateProgress(progress, `Chunk ${i + 1}/${chunks.length}`);
627
  log(`Generating chunk ${i + 1}: "${chunk.substring(0, 30)}..."`);
628
 
 
 
 
629
  const result = await ttsSession.run({
630
  text: chunk,
631
- voiceId: voiceId, // Only used by Kokoro/Kitten
632
  });
633
 
634
- audioChunks.push(result.audio);
635
- sampleRate = result.sampleRate; // Get sample rate from the first result
 
 
 
 
636
  }
637
 
638
  log("Concatenating audio chunks...");
639
  updateProgress(100, "Finalizing...");
640
- finalAudio = concatenateAudio(audioChunks);
641
 
 
642
  log(`Generated ${finalAudio.length} samples (${(finalAudio.length / sampleRate).toFixed(1)}s)`);
643
 
644
- // Create a WAV blob
645
  const wavBuffer = encodeWAV(finalAudio, sampleRate);
646
  const blob = new Blob([wavBuffer], { type: "audio/wav" });
647
  const url = URL.createObjectURL(blob);
648
 
649
- // Player
650
  const player = $("#player");
651
  player.src = url;
652
  player.playbackRate = parseFloat($("#spd").value);
@@ -680,7 +563,7 @@
680
  $("#backend").textContent = navigator.gpu ? "WebGPU" : "WASM";
681
 
682
  // Initial load
683
- await initTTSSession();
684
  </script>
685
  </body>
686
  </html>
 
8
  </head>
9
  <body>
10
  <h1>🎙️ Ultimate Text-to-Speech Studio</h1>
11
+ <p class="subtitle">3 Premium Engines - 900+ Voices - Unlimited Text</p>
12
 
13
  <div class="row">
14
  <!-- Left Column: Engine & Voice Selection -->
 
21
  <option value="piper">🎯 Piper TTS - 904 Voices (Premium Quality)</option>
22
  <option value="kokoro">✨ Kokoro TTS - 21 Expressive Voices (Highest Quality)</option>
23
  <option value="kitten">⚡ Kitten TTS - 8 Voices (Fastest, Lightweight)</option>
 
24
  </select>
25
 
26
  <div id="engineInfo" style="padding: 12px; background: rgba(99,102,241,0.1); border-radius: 8px; margin-bottom: 16px;">
 
37
  <div id="piperVoices">
38
  <label>Quality Level:</label>
39
  <select id="piperQuality" style="margin-bottom: 12px;">
 
40
  <option value="medium" selected>Medium Quality (16kHz)</option>
41
  <option value="low">Low Quality (Fast)</option>
42
  </select>
 
44
  <label>Language/Accent:</label>
45
  <select id="piperLang" style="margin-bottom: 12px;">
46
  <optgroup label="🇺🇸 English - American">
47
+ <option value="en_US-lessac-medium" selected>Lessac - Professional (High Quality)</option>
48
+ <option value="en_US-ryan-medium">Ryan - Authoritative</option>
49
+ <option value="en_US-ljspeech-medium">LJSpeech - Female, Clear</option>
50
+ <option value="en_US-amy-medium">Amy - Friendly Female</option>
51
+ <option value="en_US-danny-low">Danny - Young Male</option>
 
 
 
52
  </optgroup>
53
  <optgroup label="🇬🇧 English - British">
54
+ <option value="en_GB-cori-medium">Cori - Refined British</option>
55
+ <option value="en_GB-alan-medium">Alan - Distinguished Male</option>
 
 
 
56
  </optgroup>
57
+ <optgroup label="🌍 Other Languages">
58
+ <option value="es_ES-mls_9972-low">Spanish - Spain</option>
59
+ <option value="fr_FR-mls_1840-low">French - France</option>
60
+ <option value="de_DE-thorsten-medium">German - Germany</option>
 
 
 
 
 
61
  </optgroup>
62
  </select>
63
 
 
75
  <option value="af_bella">Bella - Elegant & Sophisticated</option>
76
  <option value="af_nicole">Nicole - Clear & Articulate</option>
77
  <option value="af_sarah">Sarah - Warm & Friendly</option>
 
78
  </optgroup>
79
  <optgroup label="🇺🇸 American Male">
80
  <option value="am_adam">Adam - Natural & Relaxed</option>
81
  <option value="am_michael">Michael - Deep & Authoritative</option>
82
  </optgroup>
83
  <optgroup label="🇬🇧 British Female">
 
84
  <option value="bf_emma">Emma - Elegant & Polished</option>
 
85
  </optgroup>
86
  <optgroup label="🇬🇧 British Male">
 
87
  <option value="bm_george">George - Commanding</option>
 
88
  </optgroup>
89
  </select>
90
 
 
103
  <option value="3">Voice 3 - Soft</option>
104
  <option value="4">Voice 4 - Clear</option>
105
  <option value="5">Voice 5 - Deep</option>
 
 
106
  </select>
107
 
108
  <div class="muted" style="font-size: 0.85rem; margin-top: 8px;">
109
  <p>⚡ <strong>Kitten:</strong> Fastest generation, only 24MB model. Perfect for quick tasks.</p>
110
  </div>
111
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  </fieldset>
113
 
114
  <fieldset>
 
125
  <div class="col">
126
  <fieldset>
127
  <legend>📝 Text Input</legend>
128
+ <textarea id="txt" placeholder="Enter your text here... (unlimited length supported)">Welcome to the ultimate text-to-speech studio! With access to over 900 premium voices from Piper, Kokoro, and Kitten TTS, you can create professional-quality audio in any language and accent.</textarea>
129
  <div class="mt-1">
130
  <span class="muted">Characters: <span id="charCount">0</span></span> &nbsp;|&nbsp;
131
  <span class="muted">Words: <span id="wordCount">0</span></span> &nbsp;|&nbsp;
 
209
  <li><strong>Best Quality:</strong> Kokoro (if English)</li>
210
  <li><strong>Most Voices:</strong> Piper (904 options)</li>
211
  <li><strong>Fastest:</strong> Kitten (lightweight)</li>
 
212
  </ul>
213
  </div>
214
  </fieldset>
 
261
 
262
  // ===== SPEED DISPLAY =====
263
  $("#spd").addEventListener("input", () => {
264
+ const speed = parseFloat($("#spd").value).toFixed(2);
265
+ $("#spdVal").textContent = speed;
266
+
267
+ // Update player speed if audio is loaded
268
+ const player = $("#player");
269
+ if (player.src) {
270
+ player.playbackRate = parseFloat(speed);
271
+ }
272
  });
273
 
274
  // ===== ENGINE SWITCHING =====
 
279
  const engineInfo = {
280
  piper: "Piper TTS: 904 voices, 50+ languages, 3-5x realtime speed",
281
  kokoro: "Kokoro TTS: 21 expressive voices, highest quality, 24kHz audio",
282
+ kitten: "Kitten TTS: 8 voices, fastest generation, only 24MB model"
 
283
  };
284
 
285
  const switchEngine = async () => {
 
294
  $("#piperVoices").classList.toggle("hidden", engine !== "piper");
295
  $("#kokoroVoices").classList.toggle("hidden", engine !== "kokoro");
296
  $("#kittenVoices").classList.toggle("hidden", engine !== "kitten");
 
 
297
 
298
  log(`Switched to ${engine.toUpperCase()} engine`);
299
+ await initTTSSession();
 
 
 
300
  };
301
 
302
  $("#engineSelect").addEventListener("change", switchEngine);
303
  $("#piperLang").addEventListener("change", () => { if (currentEngine === 'piper') initTTSSession(); });
304
  $("#piperQuality").addEventListener("change", () => { if (currentEngine === 'piper') initTTSSession(); });
305
+ $("#kokoroVoice").addEventListener("change", () => { if (currentEngine === 'kokoro') initTTSSession(); });
306
+ $("#kittenVoice").addEventListener("change", () => { if (currentEngine === 'kitten') initTTSSession(); });
307
 
308
  // ===== TTS SESSION INITIALIZATION =====
309
  async function initTTSSession() {
 
319
  $("#model").className = "chip warning";
320
 
321
  let modelUrl, configUrl;
 
322
 
323
  if (currentEngine === 'piper') {
324
  const voice = $("#piperLang").value;
325
+ const quality = $("#piperQuality").value;
326
+
327
+ // Format: en_US-lessac-medium → en_US/lessac/medium/
328
+ const parts = voice.split('-');
329
+ const lang = parts[0];
330
+ const speaker = parts.slice(1, -1).join('-');
331
+ const qual = parts[parts.length - 1];
332
+
333
+ const baseUrl = `https://huggingface.co/rhasspy/piper-voices/resolve/main/${lang}/${lang}-${speaker}/${qual}/`;
334
+ modelUrl = `${baseUrl}${lang}-${speaker}-${qual}.onnx`;
335
+ configUrl = `${baseUrl}${lang}-${speaker}-${qual}.onnx.json`;
336
+ log(`Initializing Piper: ${lang}-${speaker} (${qual})`);
337
 
338
  } else if (currentEngine === 'kokoro') {
339
+ const baseUrl = `https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/`;
340
+ modelUrl = `${baseUrl}kokoro-v0_19.onnx`;
341
+ configUrl = `${baseUrl}voices.json`;
342
  log(`Initializing Kokoro TTS`);
343
 
344
  } else if (currentEngine === 'kitten') {
345
+ const baseUrl = `https://huggingface.co/2mnws/KittenTTS/resolve/main/`;
346
  modelUrl = `${baseUrl}model.onnx`;
347
+ configUrl = null; // Kitten might not need config
348
  log(`Initializing Kitten TTS`);
349
  }
350
 
351
+ // Dispose previous session
 
 
 
 
352
  if (ttsSession) {
353
+ try {
354
+ await ttsSession.dispose();
355
+ } catch(e) {
356
+ console.log("Dispose error:", e);
357
+ }
358
  ttsSession = null;
359
  log("Previous session disposed.");
360
  }
361
 
362
+ // Small delay to allow UI update
363
+ await new Promise(resolve => setTimeout(resolve, 50));
364
+
365
  ttsSession = await createSession({
366
  modelUrl: modelUrl,
367
  configUrl: configUrl,
 
368
  executionProviders: navigator.gpu ? ['webgpu', 'wasm'] : ['wasm'],
 
369
  onprogress: (p) => {
370
  const percent = Math.round(p.progress * 100);
371
  $("#model").textContent = `Loading ${percent}%`;
 
374
 
375
  $("#model").textContent = "Ready";
376
  $("#model").className = "chip success";
377
+ log("Model loaded successfully!");
378
 
379
  return true;
380
 
381
  } catch (err) {
382
  log(`ERROR initializing: ${err.message}`);
383
+ showStatus(`Failed to load model: ${err.message}`, 'error');
384
  $("#model").textContent = "Failed";
385
  $("#model").className = "chip danger";
386
  return false;
 
390
  }
391
  }
392
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  // ===== TEXT CHUNKING & AUDIO CONCATENATION =====
394
  function chunkText(text, maxChars = 200) {
395
  const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
 
414
  }
415
  }
416
 
417
+ return chunks.filter(c => c.length > 0);
418
  }
419
 
420
  function concatenateAudio(audioArrays) {
 
433
  const buffer = new ArrayBuffer(44 + samples.length * 2);
434
  const view = new DataView(buffer);
435
 
 
436
  const writeString = (offset, string) => {
437
  for (let i = 0; i < string.length; i++) {
438
  view.setUint8(offset + i, string.charCodeAt(i));
 
443
  view.setUint32(4, 36 + samples.length * 2, true);
444
  writeString(8, 'WAVE');
445
  writeString(12, 'fmt ');
446
+ view.setUint32(16, 16, true);
447
+ view.setUint16(20, 1, true);
448
+ view.setUint16(22, 1, true);
449
  view.setUint32(24, sampleRate, true);
450
+ view.setUint32(28, sampleRate * 2, true);
451
+ view.setUint16(32, 2, true);
452
+ view.setUint16(34, 16, true);
453
  writeString(36, 'data');
454
  view.setUint32(40, samples.length * 2, true);
455
 
 
456
  const offset = 44;
457
  for (let i = 0; i < samples.length; i++) {
458
  const s = Math.max(-1, Math.min(1, samples[i]));
 
469
  showStatus("Please enter text!", 'error');
470
  return;
471
  }
472
+
473
+ if (!ttsSession) {
474
+ showStatus("Model not loaded. Please wait for initialization...", 'error');
475
+ return;
476
+ }
477
+
478
  const btn = $("#go");
479
  btn.disabled = true;
480
  $("#status").className = "chip warning";
 
482
  updateProgress(0);
483
 
484
  try {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
  const chunks = chunkText(text, 200);
486
  log(`Processing ${chunks.length} chunk(s)...`);
487
  showStatus(`Processing ${chunks.length} chunk(s)...`, 'info');
488
 
489
  const audioChunks = [];
490
+ let sampleRate = 22050; // default
491
  let voiceId;
492
 
493
  if (currentEngine === 'kokoro') {
 
502
  updateProgress(progress, `Chunk ${i + 1}/${chunks.length}`);
503
  log(`Generating chunk ${i + 1}: "${chunk.substring(0, 30)}..."`);
504
 
505
+ // Small delay to allow UI update
506
+ await new Promise(resolve => setTimeout(resolve, 10));
507
+
508
  const result = await ttsSession.run({
509
  text: chunk,
510
+ voiceId: voiceId,
511
  });
512
 
513
+ if (result && result.audio) {
514
+ audioChunks.push(result.audio);
515
+ if (result.sampleRate) {
516
+ sampleRate = result.sampleRate;
517
+ }
518
+ }
519
  }
520
 
521
  log("Concatenating audio chunks...");
522
  updateProgress(100, "Finalizing...");
 
523
 
524
+ const finalAudio = concatenateAudio(audioChunks);
525
  log(`Generated ${finalAudio.length} samples (${(finalAudio.length / sampleRate).toFixed(1)}s)`);
526
 
527
+ // Create WAV blob
528
  const wavBuffer = encodeWAV(finalAudio, sampleRate);
529
  const blob = new Blob([wavBuffer], { type: "audio/wav" });
530
  const url = URL.createObjectURL(blob);
531
 
532
+ // Player with speed
533
  const player = $("#player");
534
  player.src = url;
535
  player.playbackRate = parseFloat($("#spd").value);
 
563
  $("#backend").textContent = navigator.gpu ? "WebGPU" : "WASM";
564
 
565
  // Initial load
566
+ initTTSSession();
567
  </script>
568
  </body>
569
  </html>