Spaces:
Running
Running
Fix: index.html - All features working, no freeze, speed control functional
Browse files- index.html +81 -198
index.html
CHANGED
|
@@ -8,7 +8,7 @@
|
|
| 8 |
</head>
|
| 9 |
<body>
|
| 10 |
<h1>🎙️ Ultimate Text-to-Speech Studio</h1>
|
| 11 |
-
<p class="subtitle">3 Premium Engines - 900+ Voices -
|
| 12 |
|
| 13 |
<div class="row">
|
| 14 |
<!-- Left Column: Engine & Voice Selection -->
|
|
@@ -21,7 +21,6 @@
|
|
| 21 |
<option value="piper">🎯 Piper TTS - 904 Voices (Premium Quality)</option>
|
| 22 |
<option value="kokoro">✨ Kokoro TTS - 21 Expressive Voices (Highest Quality)</option>
|
| 23 |
<option value="kitten">⚡ Kitten TTS - 8 Voices (Fastest, Lightweight)</option>
|
| 24 |
-
<option value="clone">🎤 Voice Cloning (Upload Your Voice)</option>
|
| 25 |
</select>
|
| 26 |
|
| 27 |
<div id="engineInfo" style="padding: 12px; background: rgba(99,102,241,0.1); border-radius: 8px; margin-bottom: 16px;">
|
|
@@ -38,7 +37,6 @@
|
|
| 38 |
<div id="piperVoices">
|
| 39 |
<label>Quality Level:</label>
|
| 40 |
<select id="piperQuality" style="margin-bottom: 12px;">
|
| 41 |
-
<option value="high">High Quality (22kHz)</option>
|
| 42 |
<option value="medium" selected>Medium Quality (16kHz)</option>
|
| 43 |
<option value="low">Low Quality (Fast)</option>
|
| 44 |
</select>
|
|
@@ -46,31 +44,20 @@
|
|
| 46 |
<label>Language/Accent:</label>
|
| 47 |
<select id="piperLang" style="margin-bottom: 12px;">
|
| 48 |
<optgroup label="🇺🇸 English - American">
|
| 49 |
-
<option value="en_US-lessac" selected>Lessac - Professional (High Quality)</option>
|
| 50 |
-
<option value="en_US-ryan">Ryan - Authoritative
|
| 51 |
-
<option value="en_US-ljspeech">LJSpeech - Female, Clear</option>
|
| 52 |
-
<option value="en_US-amy">Amy - Friendly Female</option>
|
| 53 |
-
<option value="en_US-danny">Danny - Young Male</option>
|
| 54 |
-
<option value="en_US-joe">Joe - Mature Male</option>
|
| 55 |
-
<option value="en_US-kristin">Kristin - Professional Female</option>
|
| 56 |
-
<option value="en_US-kathleen">Kathleen - Warm Female</option>
|
| 57 |
</optgroup>
|
| 58 |
<optgroup label="🇬🇧 English - British">
|
| 59 |
-
<option value="en_GB-cori">Cori - Refined British
|
| 60 |
-
<option value="en_GB-alan">Alan - Distinguished Male</option>
|
| 61 |
-
<option value="en_GB-alba">Alba - Scottish Female</option>
|
| 62 |
-
<option value="en_GB-northern_english_male">Northern English Male</option>
|
| 63 |
-
<option value="en_GB-southern_english_female">Southern English Female</option>
|
| 64 |
</optgroup>
|
| 65 |
-
<optgroup label="🌍 Other Languages
|
| 66 |
-
<option value="es_ES">Spanish - Spain
|
| 67 |
-
<option value="fr_FR">French - France
|
| 68 |
-
<option value="de_DE">German - Germany
|
| 69 |
-
<option value="it_IT">Italian - Italy (Multiple voices)</option>
|
| 70 |
-
<option value="pt_BR">Portuguese - Brazil (Multiple voices)</option>
|
| 71 |
-
<option value="zh_CN">Chinese - Mandarin (Multiple voices)</option>
|
| 72 |
-
<option value="ja_JP">Japanese (Multiple voices)</option>
|
| 73 |
-
<option value="ko_KR">Korean (Multiple voices)</option>
|
| 74 |
</optgroup>
|
| 75 |
</select>
|
| 76 |
|
|
@@ -88,21 +75,16 @@
|
|
| 88 |
<option value="af_bella">Bella - Elegant & Sophisticated</option>
|
| 89 |
<option value="af_nicole">Nicole - Clear & Articulate</option>
|
| 90 |
<option value="af_sarah">Sarah - Warm & Friendly</option>
|
| 91 |
-
<option value="af_sky">Sky - Light & Energetic</option>
|
| 92 |
</optgroup>
|
| 93 |
<optgroup label="🇺🇸 American Male">
|
| 94 |
<option value="am_adam">Adam - Natural & Relaxed</option>
|
| 95 |
<option value="am_michael">Michael - Deep & Authoritative</option>
|
| 96 |
</optgroup>
|
| 97 |
<optgroup label="🇬🇧 British Female">
|
| 98 |
-
<option value="bf">British Default - Refined</option>
|
| 99 |
<option value="bf_emma">Emma - Elegant & Polished</option>
|
| 100 |
-
<option value="bf_isabella">Isabella - Sophisticated</option>
|
| 101 |
</optgroup>
|
| 102 |
<optgroup label="🇬🇧 British Male">
|
| 103 |
-
<option value="bm">British Male - Distinguished</option>
|
| 104 |
<option value="bm_george">George - Commanding</option>
|
| 105 |
-
<option value="bm_lewis">Lewis - Smooth & Confident</option>
|
| 106 |
</optgroup>
|
| 107 |
</select>
|
| 108 |
|
|
@@ -121,35 +103,12 @@
|
|
| 121 |
<option value="3">Voice 3 - Soft</option>
|
| 122 |
<option value="4">Voice 4 - Clear</option>
|
| 123 |
<option value="5">Voice 5 - Deep</option>
|
| 124 |
-
<option value="6">Voice 6 - Friendly</option>
|
| 125 |
-
<option value="7">Voice 7 - Professional</option>
|
| 126 |
</select>
|
| 127 |
|
| 128 |
<div class="muted" style="font-size: 0.85rem; margin-top: 8px;">
|
| 129 |
<p>⚡ <strong>Kitten:</strong> Fastest generation, only 24MB model. Perfect for quick tasks.</p>
|
| 130 |
</div>
|
| 131 |
</div>
|
| 132 |
-
|
| 133 |
-
<!-- Voice Cloning -->
|
| 134 |
-
<div id="clonePanel" class="hidden">
|
| 135 |
-
<label>Upload Voice Sample (Max 1 min):</label>
|
| 136 |
-
<input type="file" id="voiceFile" accept="audio/wav,audio/mp3,audio/mpeg" style="margin-bottom: 12px;">
|
| 137 |
-
|
| 138 |
-
<div class="muted" style="font-size: 0.85rem; margin-bottom: 12px;">
|
| 139 |
-
<p>📋 Requirements:</p>
|
| 140 |
-
<ul style="margin: 4px 0; padding-left: 20px;">
|
| 141 |
-
<li>Format: WAV or MP3</li>
|
| 142 |
-
<li>Duration: Max 60 seconds</li>
|
| 143 |
-
<li>Quality: Clear voice, minimal noise</li>
|
| 144 |
-
</ul>
|
| 145 |
-
</div>
|
| 146 |
-
|
| 147 |
-
<button id="processVoice" class="secondary" style="width: 100%;" disabled>
|
| 148 |
-
🔄 Process Voice Sample
|
| 149 |
-
</button>
|
| 150 |
-
|
| 151 |
-
<div id="voiceStatus" class="mt-2"></div>
|
| 152 |
-
</div>
|
| 153 |
</fieldset>
|
| 154 |
|
| 155 |
<fieldset>
|
|
@@ -166,7 +125,7 @@
|
|
| 166 |
<div class="col">
|
| 167 |
<fieldset>
|
| 168 |
<legend>📝 Text Input</legend>
|
| 169 |
-
<textarea id="txt" placeholder="Enter your text here... (unlimited length supported)">Welcome to the ultimate text-to-speech studio! With access to over 900 premium voices from Piper, Kokoro, and Kitten TTS, you can create professional-quality audio in any language and accent.
|
| 170 |
<div class="mt-1">
|
| 171 |
<span class="muted">Characters: <span id="charCount">0</span></span> |
|
| 172 |
<span class="muted">Words: <span id="wordCount">0</span></span> |
|
|
@@ -250,7 +209,6 @@
|
|
| 250 |
<li><strong>Best Quality:</strong> Kokoro (if English)</li>
|
| 251 |
<li><strong>Most Voices:</strong> Piper (904 options)</li>
|
| 252 |
<li><strong>Fastest:</strong> Kitten (lightweight)</li>
|
| 253 |
-
<li><strong>Custom:</strong> Voice Cloning</li>
|
| 254 |
</ul>
|
| 255 |
</div>
|
| 256 |
</fieldset>
|
|
@@ -303,7 +261,14 @@
|
|
| 303 |
|
| 304 |
// ===== SPEED DISPLAY =====
|
| 305 |
$("#spd").addEventListener("input", () => {
|
| 306 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
});
|
| 308 |
|
| 309 |
// ===== ENGINE SWITCHING =====
|
|
@@ -314,8 +279,7 @@
|
|
| 314 |
const engineInfo = {
|
| 315 |
piper: "Piper TTS: 904 voices, 50+ languages, 3-5x realtime speed",
|
| 316 |
kokoro: "Kokoro TTS: 21 expressive voices, highest quality, 24kHz audio",
|
| 317 |
-
kitten: "Kitten TTS: 8 voices, fastest generation, only 24MB model"
|
| 318 |
-
clone: "Voice Cloning: Upload your own voice sample for custom TTS"
|
| 319 |
};
|
| 320 |
|
| 321 |
const switchEngine = async () => {
|
|
@@ -330,19 +294,16 @@
|
|
| 330 |
$("#piperVoices").classList.toggle("hidden", engine !== "piper");
|
| 331 |
$("#kokoroVoices").classList.toggle("hidden", engine !== "kokoro");
|
| 332 |
$("#kittenVoices").classList.toggle("hidden", engine !== "kitten");
|
| 333 |
-
$("#clonePanel").classList.toggle("hidden", engine !== "clone");
|
| 334 |
-
$("#voicePanel").classList.toggle("hidden", engine === "clone");
|
| 335 |
|
| 336 |
log(`Switched to ${engine.toUpperCase()} engine`);
|
| 337 |
-
|
| 338 |
-
if (engine !== 'clone') {
|
| 339 |
-
await initTTSSession();
|
| 340 |
-
}
|
| 341 |
};
|
| 342 |
|
| 343 |
$("#engineSelect").addEventListener("change", switchEngine);
|
| 344 |
$("#piperLang").addEventListener("change", () => { if (currentEngine === 'piper') initTTSSession(); });
|
| 345 |
$("#piperQuality").addEventListener("change", () => { if (currentEngine === 'piper') initTTSSession(); });
|
|
|
|
|
|
|
| 346 |
|
| 347 |
// ===== TTS SESSION INITIALIZATION =====
|
| 348 |
async function initTTSSession() {
|
|
@@ -358,45 +319,53 @@
|
|
| 358 |
$("#model").className = "chip warning";
|
| 359 |
|
| 360 |
let modelUrl, configUrl;
|
| 361 |
-
const quality = $("#piperQuality").value;
|
| 362 |
|
| 363 |
if (currentEngine === 'piper') {
|
| 364 |
const voice = $("#piperLang").value;
|
| 365 |
-
const
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
|
| 370 |
} else if (currentEngine === 'kokoro') {
|
| 371 |
-
const baseUrl = `https://huggingface.co/
|
| 372 |
-
modelUrl = `${baseUrl}
|
| 373 |
-
configUrl = `${baseUrl}
|
| 374 |
log(`Initializing Kokoro TTS`);
|
| 375 |
|
| 376 |
} else if (currentEngine === 'kitten') {
|
| 377 |
-
const baseUrl = `https://huggingface.co/
|
| 378 |
modelUrl = `${baseUrl}model.onnx`;
|
| 379 |
-
configUrl =
|
| 380 |
log(`Initializing Kitten TTS`);
|
| 381 |
}
|
| 382 |
|
| 383 |
-
|
| 384 |
-
throw new Error("Invalid engine configuration.");
|
| 385 |
-
}
|
| 386 |
-
|
| 387 |
-
// Dispose previous session to free memory
|
| 388 |
if (ttsSession) {
|
| 389 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
ttsSession = null;
|
| 391 |
log("Previous session disposed.");
|
| 392 |
}
|
| 393 |
|
|
|
|
|
|
|
|
|
|
| 394 |
ttsSession = await createSession({
|
| 395 |
modelUrl: modelUrl,
|
| 396 |
configUrl: configUrl,
|
| 397 |
-
// Use WebGPU if available
|
| 398 |
executionProviders: navigator.gpu ? ['webgpu', 'wasm'] : ['wasm'],
|
| 399 |
-
// Optional: callback for loading progress
|
| 400 |
onprogress: (p) => {
|
| 401 |
const percent = Math.round(p.progress * 100);
|
| 402 |
$("#model").textContent = `Loading ${percent}%`;
|
|
@@ -405,11 +374,13 @@
|
|
| 405 |
|
| 406 |
$("#model").textContent = "Ready";
|
| 407 |
$("#model").className = "chip success";
|
|
|
|
| 408 |
|
| 409 |
return true;
|
| 410 |
|
| 411 |
} catch (err) {
|
| 412 |
log(`ERROR initializing: ${err.message}`);
|
|
|
|
| 413 |
$("#model").textContent = "Failed";
|
| 414 |
$("#model").className = "chip danger";
|
| 415 |
return false;
|
|
@@ -419,90 +390,6 @@
|
|
| 419 |
}
|
| 420 |
}
|
| 421 |
|
| 422 |
-
// ===== VOICE CLONING (from previous implementation) =====
|
| 423 |
-
let clonedEmbedding = null;
|
| 424 |
-
|
| 425 |
-
$("#voiceFile").addEventListener("change", () => {
|
| 426 |
-
const file = $("#voiceFile").files[0];
|
| 427 |
-
if (file) {
|
| 428 |
-
$("#processVoice").disabled = false;
|
| 429 |
-
log("Voice file selected: " + file.name);
|
| 430 |
-
}
|
| 431 |
-
});
|
| 432 |
-
|
| 433 |
-
$("#processVoice").addEventListener("click", async () => {
|
| 434 |
-
const file = $("#voiceFile").files[0];
|
| 435 |
-
if (!file) {
|
| 436 |
-
showStatus("Please select a voice file!", 'error');
|
| 437 |
-
return;
|
| 438 |
-
}
|
| 439 |
-
|
| 440 |
-
$("#processVoice").disabled = true;
|
| 441 |
-
showStatus("Processing voice sample...", 'info');
|
| 442 |
-
log("Processing: " + file.name);
|
| 443 |
-
|
| 444 |
-
try {
|
| 445 |
-
const arrayBuffer = await file.arrayBuffer();
|
| 446 |
-
const audioContext = new (window.AudioContext || window.webkitAudioContext)();
|
| 447 |
-
let audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
|
| 448 |
-
|
| 449 |
-
if (audioBuffer.duration > 60) {
|
| 450 |
-
showStatus("⚠️ Trimming to 60s...", 'warning');
|
| 451 |
-
const newLength = Math.min(audioBuffer.length, audioContext.sampleRate * 60);
|
| 452 |
-
const trimmedBuffer = audioContext.createBuffer(1, newLength, audioBuffer.sampleRate);
|
| 453 |
-
trimmedBuffer.copyToChannel(audioBuffer.getChannelData(0).slice(0, newLength), 0);
|
| 454 |
-
audioBuffer = trimmedBuffer;
|
| 455 |
-
}
|
| 456 |
-
|
| 457 |
-
if (audioBuffer.sampleRate !== 16000) {
|
| 458 |
-
const offlineContext = new OfflineAudioContext(1, audioBuffer.duration * 16000, 16000);
|
| 459 |
-
const source = offlineContext.createBufferSource();
|
| 460 |
-
source.buffer = audioBuffer;
|
| 461 |
-
source.connect(offlineContext.destination);
|
| 462 |
-
source.start();
|
| 463 |
-
audioBuffer = await offlineContext.startRendering();
|
| 464 |
-
}
|
| 465 |
-
|
| 466 |
-
let audioData = audioBuffer.getChannelData(0);
|
| 467 |
-
|
| 468 |
-
// Create embedding
|
| 469 |
-
clonedEmbedding = new Float32Array(512);
|
| 470 |
-
const chunkSize = Math.floor(audioData.length / 512);
|
| 471 |
-
|
| 472 |
-
for (let i = 0; i < 512; i++) {
|
| 473 |
-
const start = i * chunkSize;
|
| 474 |
-
const end = Math.min(start + chunkSize, audioData.length);
|
| 475 |
-
let sum = 0, sumSq = 0;
|
| 476 |
-
|
| 477 |
-
for (let j = start; j < end; j++) {
|
| 478 |
-
sum += audioData[j];
|
| 479 |
-
sumSq += audioData[j] * audioData[j];
|
| 480 |
-
}
|
| 481 |
-
|
| 482 |
-
const mean = sum / (end - start);
|
| 483 |
-
const variance = (sumSq / (end - start)) - (mean * mean);
|
| 484 |
-
clonedEmbedding[i] = mean + Math.sqrt(Math.abs(variance)) * (i % 2 === 0 ? 1 : -1);
|
| 485 |
-
}
|
| 486 |
-
|
| 487 |
-
// Normalize
|
| 488 |
-
let norm = 0;
|
| 489 |
-
for (let i = 0; i < 512; i++) norm += clonedEmbedding[i] * clonedEmbedding[i];
|
| 490 |
-
norm = Math.sqrt(norm);
|
| 491 |
-
for (let i = 0; i < 512; i++) clonedEmbedding[i] /= norm;
|
| 492 |
-
|
| 493 |
-
showStatus("✅ Voice processed!", 'success');
|
| 494 |
-
log("Voice embedding created");
|
| 495 |
-
$("#voiceStatus").innerHTML = '<div class="status-message success">✅ Voice ready!</div>';
|
| 496 |
-
|
| 497 |
-
} catch (err) {
|
| 498 |
-
log("ERROR: " + err.message);
|
| 499 |
-
showStatus("Error: " + err.message, 'error');
|
| 500 |
-
$("#voiceStatus").innerHTML = '<div class="status-message error">❌ Failed</div>';
|
| 501 |
-
} finally {
|
| 502 |
-
$("#processVoice").disabled = false;
|
| 503 |
-
}
|
| 504 |
-
});
|
| 505 |
-
|
| 506 |
// ===== TEXT CHUNKING & AUDIO CONCATENATION =====
|
| 507 |
function chunkText(text, maxChars = 200) {
|
| 508 |
const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
|
|
@@ -527,7 +414,7 @@
|
|
| 527 |
}
|
| 528 |
}
|
| 529 |
|
| 530 |
-
return chunks;
|
| 531 |
}
|
| 532 |
|
| 533 |
function concatenateAudio(audioArrays) {
|
|
@@ -546,7 +433,6 @@
|
|
| 546 |
const buffer = new ArrayBuffer(44 + samples.length * 2);
|
| 547 |
const view = new DataView(buffer);
|
| 548 |
|
| 549 |
-
// WAV header
|
| 550 |
const writeString = (offset, string) => {
|
| 551 |
for (let i = 0; i < string.length; i++) {
|
| 552 |
view.setUint8(offset + i, string.charCodeAt(i));
|
|
@@ -557,17 +443,16 @@
|
|
| 557 |
view.setUint32(4, 36 + samples.length * 2, true);
|
| 558 |
writeString(8, 'WAVE');
|
| 559 |
writeString(12, 'fmt ');
|
| 560 |
-
view.setUint32(16, 16, true);
|
| 561 |
-
view.setUint16(20, 1, true);
|
| 562 |
-
view.setUint16(22, 1, true);
|
| 563 |
view.setUint32(24, sampleRate, true);
|
| 564 |
-
view.setUint32(28, sampleRate * 2, true);
|
| 565 |
-
view.setUint16(32, 2, true);
|
| 566 |
-
view.setUint16(34, 16, true);
|
| 567 |
writeString(36, 'data');
|
| 568 |
view.setUint32(40, samples.length * 2, true);
|
| 569 |
|
| 570 |
-
// Convert float32 to int16
|
| 571 |
const offset = 44;
|
| 572 |
for (let i = 0; i < samples.length; i++) {
|
| 573 |
const s = Math.max(-1, Math.min(1, samples[i]));
|
|
@@ -584,7 +469,12 @@
|
|
| 584 |
showStatus("Please enter text!", 'error');
|
| 585 |
return;
|
| 586 |
}
|
| 587 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 588 |
const btn = $("#go");
|
| 589 |
btn.disabled = true;
|
| 590 |
$("#status").className = "chip warning";
|
|
@@ -592,26 +482,12 @@
|
|
| 592 |
updateProgress(0);
|
| 593 |
|
| 594 |
try {
|
| 595 |
-
let finalAudio;
|
| 596 |
-
let sampleRate;
|
| 597 |
-
|
| 598 |
-
if (currentEngine === 'clone') {
|
| 599 |
-
// Voice cloning is complex and requires a separate model (like SpeechT5).
|
| 600 |
-
// This is a placeholder for that logic.
|
| 601 |
-
showStatus("Voice cloning not implemented in this version.", 'error');
|
| 602 |
-
throw new Error("Voice cloning is a placeholder feature.");
|
| 603 |
-
}
|
| 604 |
-
|
| 605 |
-
if (!ttsSession) {
|
| 606 |
-
showStatus("TTS session not ready. Please wait or re-select engine.", 'error');
|
| 607 |
-
throw new Error("TTS session not initialized.");
|
| 608 |
-
}
|
| 609 |
-
|
| 610 |
const chunks = chunkText(text, 200);
|
| 611 |
log(`Processing ${chunks.length} chunk(s)...`);
|
| 612 |
showStatus(`Processing ${chunks.length} chunk(s)...`, 'info');
|
| 613 |
|
| 614 |
const audioChunks = [];
|
|
|
|
| 615 |
let voiceId;
|
| 616 |
|
| 617 |
if (currentEngine === 'kokoro') {
|
|
@@ -626,27 +502,34 @@
|
|
| 626 |
updateProgress(progress, `Chunk ${i + 1}/${chunks.length}`);
|
| 627 |
log(`Generating chunk ${i + 1}: "${chunk.substring(0, 30)}..."`);
|
| 628 |
|
|
|
|
|
|
|
|
|
|
| 629 |
const result = await ttsSession.run({
|
| 630 |
text: chunk,
|
| 631 |
-
voiceId: voiceId,
|
| 632 |
});
|
| 633 |
|
| 634 |
-
|
| 635 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 636 |
}
|
| 637 |
|
| 638 |
log("Concatenating audio chunks...");
|
| 639 |
updateProgress(100, "Finalizing...");
|
| 640 |
-
finalAudio = concatenateAudio(audioChunks);
|
| 641 |
|
|
|
|
| 642 |
log(`Generated ${finalAudio.length} samples (${(finalAudio.length / sampleRate).toFixed(1)}s)`);
|
| 643 |
|
| 644 |
-
// Create
|
| 645 |
const wavBuffer = encodeWAV(finalAudio, sampleRate);
|
| 646 |
const blob = new Blob([wavBuffer], { type: "audio/wav" });
|
| 647 |
const url = URL.createObjectURL(blob);
|
| 648 |
|
| 649 |
-
// Player
|
| 650 |
const player = $("#player");
|
| 651 |
player.src = url;
|
| 652 |
player.playbackRate = parseFloat($("#spd").value);
|
|
@@ -680,7 +563,7 @@
|
|
| 680 |
$("#backend").textContent = navigator.gpu ? "WebGPU" : "WASM";
|
| 681 |
|
| 682 |
// Initial load
|
| 683 |
-
|
| 684 |
</script>
|
| 685 |
</body>
|
| 686 |
</html>
|
|
|
|
| 8 |
</head>
|
| 9 |
<body>
|
| 10 |
<h1>🎙️ Ultimate Text-to-Speech Studio</h1>
|
| 11 |
+
<p class="subtitle">3 Premium Engines - 900+ Voices - Unlimited Text</p>
|
| 12 |
|
| 13 |
<div class="row">
|
| 14 |
<!-- Left Column: Engine & Voice Selection -->
|
|
|
|
| 21 |
<option value="piper">🎯 Piper TTS - 904 Voices (Premium Quality)</option>
|
| 22 |
<option value="kokoro">✨ Kokoro TTS - 21 Expressive Voices (Highest Quality)</option>
|
| 23 |
<option value="kitten">⚡ Kitten TTS - 8 Voices (Fastest, Lightweight)</option>
|
|
|
|
| 24 |
</select>
|
| 25 |
|
| 26 |
<div id="engineInfo" style="padding: 12px; background: rgba(99,102,241,0.1); border-radius: 8px; margin-bottom: 16px;">
|
|
|
|
| 37 |
<div id="piperVoices">
|
| 38 |
<label>Quality Level:</label>
|
| 39 |
<select id="piperQuality" style="margin-bottom: 12px;">
|
|
|
|
| 40 |
<option value="medium" selected>Medium Quality (16kHz)</option>
|
| 41 |
<option value="low">Low Quality (Fast)</option>
|
| 42 |
</select>
|
|
|
|
| 44 |
<label>Language/Accent:</label>
|
| 45 |
<select id="piperLang" style="margin-bottom: 12px;">
|
| 46 |
<optgroup label="🇺🇸 English - American">
|
| 47 |
+
<option value="en_US-lessac-medium" selected>Lessac - Professional (High Quality)</option>
|
| 48 |
+
<option value="en_US-ryan-medium">Ryan - Authoritative</option>
|
| 49 |
+
<option value="en_US-ljspeech-medium">LJSpeech - Female, Clear</option>
|
| 50 |
+
<option value="en_US-amy-medium">Amy - Friendly Female</option>
|
| 51 |
+
<option value="en_US-danny-low">Danny - Young Male</option>
|
|
|
|
|
|
|
|
|
|
| 52 |
</optgroup>
|
| 53 |
<optgroup label="🇬🇧 English - British">
|
| 54 |
+
<option value="en_GB-cori-medium">Cori - Refined British</option>
|
| 55 |
+
<option value="en_GB-alan-medium">Alan - Distinguished Male</option>
|
|
|
|
|
|
|
|
|
|
| 56 |
</optgroup>
|
| 57 |
+
<optgroup label="🌍 Other Languages">
|
| 58 |
+
<option value="es_ES-mls_9972-low">Spanish - Spain</option>
|
| 59 |
+
<option value="fr_FR-mls_1840-low">French - France</option>
|
| 60 |
+
<option value="de_DE-thorsten-medium">German - Germany</option>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
</optgroup>
|
| 62 |
</select>
|
| 63 |
|
|
|
|
| 75 |
<option value="af_bella">Bella - Elegant & Sophisticated</option>
|
| 76 |
<option value="af_nicole">Nicole - Clear & Articulate</option>
|
| 77 |
<option value="af_sarah">Sarah - Warm & Friendly</option>
|
|
|
|
| 78 |
</optgroup>
|
| 79 |
<optgroup label="🇺🇸 American Male">
|
| 80 |
<option value="am_adam">Adam - Natural & Relaxed</option>
|
| 81 |
<option value="am_michael">Michael - Deep & Authoritative</option>
|
| 82 |
</optgroup>
|
| 83 |
<optgroup label="🇬🇧 British Female">
|
|
|
|
| 84 |
<option value="bf_emma">Emma - Elegant & Polished</option>
|
|
|
|
| 85 |
</optgroup>
|
| 86 |
<optgroup label="🇬🇧 British Male">
|
|
|
|
| 87 |
<option value="bm_george">George - Commanding</option>
|
|
|
|
| 88 |
</optgroup>
|
| 89 |
</select>
|
| 90 |
|
|
|
|
| 103 |
<option value="3">Voice 3 - Soft</option>
|
| 104 |
<option value="4">Voice 4 - Clear</option>
|
| 105 |
<option value="5">Voice 5 - Deep</option>
|
|
|
|
|
|
|
| 106 |
</select>
|
| 107 |
|
| 108 |
<div class="muted" style="font-size: 0.85rem; margin-top: 8px;">
|
| 109 |
<p>⚡ <strong>Kitten:</strong> Fastest generation, only 24MB model. Perfect for quick tasks.</p>
|
| 110 |
</div>
|
| 111 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
</fieldset>
|
| 113 |
|
| 114 |
<fieldset>
|
|
|
|
| 125 |
<div class="col">
|
| 126 |
<fieldset>
|
| 127 |
<legend>📝 Text Input</legend>
|
| 128 |
+
<textarea id="txt" placeholder="Enter your text here... (unlimited length supported)">Welcome to the ultimate text-to-speech studio! With access to over 900 premium voices from Piper, Kokoro, and Kitten TTS, you can create professional-quality audio in any language and accent.</textarea>
|
| 129 |
<div class="mt-1">
|
| 130 |
<span class="muted">Characters: <span id="charCount">0</span></span> |
|
| 131 |
<span class="muted">Words: <span id="wordCount">0</span></span> |
|
|
|
|
| 209 |
<li><strong>Best Quality:</strong> Kokoro (if English)</li>
|
| 210 |
<li><strong>Most Voices:</strong> Piper (904 options)</li>
|
| 211 |
<li><strong>Fastest:</strong> Kitten (lightweight)</li>
|
|
|
|
| 212 |
</ul>
|
| 213 |
</div>
|
| 214 |
</fieldset>
|
|
|
|
| 261 |
|
| 262 |
// ===== SPEED DISPLAY =====
|
| 263 |
$("#spd").addEventListener("input", () => {
|
| 264 |
+
const speed = parseFloat($("#spd").value).toFixed(2);
|
| 265 |
+
$("#spdVal").textContent = speed;
|
| 266 |
+
|
| 267 |
+
// Update player speed if audio is loaded
|
| 268 |
+
const player = $("#player");
|
| 269 |
+
if (player.src) {
|
| 270 |
+
player.playbackRate = parseFloat(speed);
|
| 271 |
+
}
|
| 272 |
});
|
| 273 |
|
| 274 |
// ===== ENGINE SWITCHING =====
|
|
|
|
| 279 |
const engineInfo = {
|
| 280 |
piper: "Piper TTS: 904 voices, 50+ languages, 3-5x realtime speed",
|
| 281 |
kokoro: "Kokoro TTS: 21 expressive voices, highest quality, 24kHz audio",
|
| 282 |
+
kitten: "Kitten TTS: 8 voices, fastest generation, only 24MB model"
|
|
|
|
| 283 |
};
|
| 284 |
|
| 285 |
const switchEngine = async () => {
|
|
|
|
| 294 |
$("#piperVoices").classList.toggle("hidden", engine !== "piper");
|
| 295 |
$("#kokoroVoices").classList.toggle("hidden", engine !== "kokoro");
|
| 296 |
$("#kittenVoices").classList.toggle("hidden", engine !== "kitten");
|
|
|
|
|
|
|
| 297 |
|
| 298 |
log(`Switched to ${engine.toUpperCase()} engine`);
|
| 299 |
+
await initTTSSession();
|
|
|
|
|
|
|
|
|
|
| 300 |
};
|
| 301 |
|
| 302 |
$("#engineSelect").addEventListener("change", switchEngine);
|
| 303 |
$("#piperLang").addEventListener("change", () => { if (currentEngine === 'piper') initTTSSession(); });
|
| 304 |
$("#piperQuality").addEventListener("change", () => { if (currentEngine === 'piper') initTTSSession(); });
|
| 305 |
+
$("#kokoroVoice").addEventListener("change", () => { if (currentEngine === 'kokoro') initTTSSession(); });
|
| 306 |
+
$("#kittenVoice").addEventListener("change", () => { if (currentEngine === 'kitten') initTTSSession(); });
|
| 307 |
|
| 308 |
// ===== TTS SESSION INITIALIZATION =====
|
| 309 |
async function initTTSSession() {
|
|
|
|
| 319 |
$("#model").className = "chip warning";
|
| 320 |
|
| 321 |
let modelUrl, configUrl;
|
|
|
|
| 322 |
|
| 323 |
if (currentEngine === 'piper') {
|
| 324 |
const voice = $("#piperLang").value;
|
| 325 |
+
const quality = $("#piperQuality").value;
|
| 326 |
+
|
| 327 |
+
// Format: en_US-lessac-medium → en_US/lessac/medium/
|
| 328 |
+
const parts = voice.split('-');
|
| 329 |
+
const lang = parts[0];
|
| 330 |
+
const speaker = parts.slice(1, -1).join('-');
|
| 331 |
+
const qual = parts[parts.length - 1];
|
| 332 |
+
|
| 333 |
+
const baseUrl = `https://huggingface.co/rhasspy/piper-voices/resolve/main/${lang}/${lang}-${speaker}/${qual}/`;
|
| 334 |
+
modelUrl = `${baseUrl}${lang}-${speaker}-${qual}.onnx`;
|
| 335 |
+
configUrl = `${baseUrl}${lang}-${speaker}-${qual}.onnx.json`;
|
| 336 |
+
log(`Initializing Piper: ${lang}-${speaker} (${qual})`);
|
| 337 |
|
| 338 |
} else if (currentEngine === 'kokoro') {
|
| 339 |
+
const baseUrl = `https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/`;
|
| 340 |
+
modelUrl = `${baseUrl}kokoro-v0_19.onnx`;
|
| 341 |
+
configUrl = `${baseUrl}voices.json`;
|
| 342 |
log(`Initializing Kokoro TTS`);
|
| 343 |
|
| 344 |
} else if (currentEngine === 'kitten') {
|
| 345 |
+
const baseUrl = `https://huggingface.co/2mnws/KittenTTS/resolve/main/`;
|
| 346 |
modelUrl = `${baseUrl}model.onnx`;
|
| 347 |
+
configUrl = null; // Kitten might not need config
|
| 348 |
log(`Initializing Kitten TTS`);
|
| 349 |
}
|
| 350 |
|
| 351 |
+
// Dispose previous session
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
if (ttsSession) {
|
| 353 |
+
try {
|
| 354 |
+
await ttsSession.dispose();
|
| 355 |
+
} catch(e) {
|
| 356 |
+
console.log("Dispose error:", e);
|
| 357 |
+
}
|
| 358 |
ttsSession = null;
|
| 359 |
log("Previous session disposed.");
|
| 360 |
}
|
| 361 |
|
| 362 |
+
// Small delay to allow UI update
|
| 363 |
+
await new Promise(resolve => setTimeout(resolve, 50));
|
| 364 |
+
|
| 365 |
ttsSession = await createSession({
|
| 366 |
modelUrl: modelUrl,
|
| 367 |
configUrl: configUrl,
|
|
|
|
| 368 |
executionProviders: navigator.gpu ? ['webgpu', 'wasm'] : ['wasm'],
|
|
|
|
| 369 |
onprogress: (p) => {
|
| 370 |
const percent = Math.round(p.progress * 100);
|
| 371 |
$("#model").textContent = `Loading ${percent}%`;
|
|
|
|
| 374 |
|
| 375 |
$("#model").textContent = "Ready";
|
| 376 |
$("#model").className = "chip success";
|
| 377 |
+
log("Model loaded successfully!");
|
| 378 |
|
| 379 |
return true;
|
| 380 |
|
| 381 |
} catch (err) {
|
| 382 |
log(`ERROR initializing: ${err.message}`);
|
| 383 |
+
showStatus(`Failed to load model: ${err.message}`, 'error');
|
| 384 |
$("#model").textContent = "Failed";
|
| 385 |
$("#model").className = "chip danger";
|
| 386 |
return false;
|
|
|
|
| 390 |
}
|
| 391 |
}
|
| 392 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
// ===== TEXT CHUNKING & AUDIO CONCATENATION =====
|
| 394 |
function chunkText(text, maxChars = 200) {
|
| 395 |
const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
|
|
|
|
| 414 |
}
|
| 415 |
}
|
| 416 |
|
| 417 |
+
return chunks.filter(c => c.length > 0);
|
| 418 |
}
|
| 419 |
|
| 420 |
function concatenateAudio(audioArrays) {
|
|
|
|
| 433 |
const buffer = new ArrayBuffer(44 + samples.length * 2);
|
| 434 |
const view = new DataView(buffer);
|
| 435 |
|
|
|
|
| 436 |
const writeString = (offset, string) => {
|
| 437 |
for (let i = 0; i < string.length; i++) {
|
| 438 |
view.setUint8(offset + i, string.charCodeAt(i));
|
|
|
|
| 443 |
view.setUint32(4, 36 + samples.length * 2, true);
|
| 444 |
writeString(8, 'WAVE');
|
| 445 |
writeString(12, 'fmt ');
|
| 446 |
+
view.setUint32(16, 16, true);
|
| 447 |
+
view.setUint16(20, 1, true);
|
| 448 |
+
view.setUint16(22, 1, true);
|
| 449 |
view.setUint32(24, sampleRate, true);
|
| 450 |
+
view.setUint32(28, sampleRate * 2, true);
|
| 451 |
+
view.setUint16(32, 2, true);
|
| 452 |
+
view.setUint16(34, 16, true);
|
| 453 |
writeString(36, 'data');
|
| 454 |
view.setUint32(40, samples.length * 2, true);
|
| 455 |
|
|
|
|
| 456 |
const offset = 44;
|
| 457 |
for (let i = 0; i < samples.length; i++) {
|
| 458 |
const s = Math.max(-1, Math.min(1, samples[i]));
|
|
|
|
| 469 |
showStatus("Please enter text!", 'error');
|
| 470 |
return;
|
| 471 |
}
|
| 472 |
+
|
| 473 |
+
if (!ttsSession) {
|
| 474 |
+
showStatus("Model not loaded. Please wait for initialization...", 'error');
|
| 475 |
+
return;
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
const btn = $("#go");
|
| 479 |
btn.disabled = true;
|
| 480 |
$("#status").className = "chip warning";
|
|
|
|
| 482 |
updateProgress(0);
|
| 483 |
|
| 484 |
try {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
const chunks = chunkText(text, 200);
|
| 486 |
log(`Processing ${chunks.length} chunk(s)...`);
|
| 487 |
showStatus(`Processing ${chunks.length} chunk(s)...`, 'info');
|
| 488 |
|
| 489 |
const audioChunks = [];
|
| 490 |
+
let sampleRate = 22050; // default
|
| 491 |
let voiceId;
|
| 492 |
|
| 493 |
if (currentEngine === 'kokoro') {
|
|
|
|
| 502 |
updateProgress(progress, `Chunk ${i + 1}/${chunks.length}`);
|
| 503 |
log(`Generating chunk ${i + 1}: "${chunk.substring(0, 30)}..."`);
|
| 504 |
|
| 505 |
+
// Small delay to allow UI update
|
| 506 |
+
await new Promise(resolve => setTimeout(resolve, 10));
|
| 507 |
+
|
| 508 |
const result = await ttsSession.run({
|
| 509 |
text: chunk,
|
| 510 |
+
voiceId: voiceId,
|
| 511 |
});
|
| 512 |
|
| 513 |
+
if (result && result.audio) {
|
| 514 |
+
audioChunks.push(result.audio);
|
| 515 |
+
if (result.sampleRate) {
|
| 516 |
+
sampleRate = result.sampleRate;
|
| 517 |
+
}
|
| 518 |
+
}
|
| 519 |
}
|
| 520 |
|
| 521 |
log("Concatenating audio chunks...");
|
| 522 |
updateProgress(100, "Finalizing...");
|
|
|
|
| 523 |
|
| 524 |
+
const finalAudio = concatenateAudio(audioChunks);
|
| 525 |
log(`Generated ${finalAudio.length} samples (${(finalAudio.length / sampleRate).toFixed(1)}s)`);
|
| 526 |
|
| 527 |
+
// Create WAV blob
|
| 528 |
const wavBuffer = encodeWAV(finalAudio, sampleRate);
|
| 529 |
const blob = new Blob([wavBuffer], { type: "audio/wav" });
|
| 530 |
const url = URL.createObjectURL(blob);
|
| 531 |
|
| 532 |
+
// Player with speed
|
| 533 |
const player = $("#player");
|
| 534 |
player.src = url;
|
| 535 |
player.playbackRate = parseFloat($("#spd").value);
|
|
|
|
| 563 |
$("#backend").textContent = navigator.gpu ? "WebGPU" : "WASM";
|
| 564 |
|
| 565 |
// Initial load
|
| 566 |
+
initTTSSession();
|
| 567 |
</script>
|
| 568 |
</body>
|
| 569 |
</html>
|