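<!-- NOTE(review): the lines below look like hosting-page residue captured with the file, not document content; kept inside a comment so no text precedes the doctype.
ybtts / index.html
masbudjj's picture
Update index.html (#14)
0a6c002 verified
raw
history blame
25.7 kB
-->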
<!doctype html>
<html lang="en">
<head>
  <!-- charset is declared first so the emoji in the title decode as UTF-8. -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width,initial-scale=1">
  <title>🎙️ Ultimate TTS - 900+ Premium Voices</title>
  <link rel="stylesheet" href="assets/style.css">
</head>
<body>
<h1>🎙️ Ultimate Text-to-Speech Studio</h1>
<p class="subtitle">3 Premium Engines - 900+ Voices - Voice Cloning - Unlimited Text</p>
<div class="row">
<!-- Left Column: Engine & Voice Selection -->
<div class="col">
<fieldset>
  <legend>🎭 TTS Engine</legend>
  <!-- The label is bound to the select via "for" so assistive technology announces it. -->
  <label for="engineSelect">Choose Engine:</label>
  <select id="engineSelect" style="font-size: 0.9rem; padding: 10px; margin-bottom: 16px;">
    <option value="piper">🎯 Piper TTS - 904 Voices (Premium Quality)</option>
    <option value="kokoro">✨ Kokoro TTS - 21 Expressive Voices (Highest Quality)</option>
    <option value="kitten">⚡ Kitten TTS - 8 Voices (Fastest, Lightweight)</option>
    <option value="clone">🎤 Voice Cloning (Upload Your Voice)</option>
  </select>
  <!-- The paragraph inside this box is rewritten by the page script whenever the engine changes. -->
  <div id="engineInfo" style="padding: 12px; background: rgba(99,102,241,0.1); border-radius: 8px; margin-bottom: 16px;">
    <p class="muted" style="font-size: 0.85rem; margin: 0;">
      <strong>Piper TTS:</strong> 904 voices, 50+ languages, 3-5x realtime speed
    </p>
  </div>
</fieldset>
<fieldset id="voicePanel">
<legend>🎤 Voice Selection</legend>
<!-- Piper Voices -->
<div id="piperVoices">
  <!-- Piper controls: quality tier plus voice. Both values are read by the page script to build the model URL. -->
  <label for="piperQuality">Quality Level:</label>
  <select id="piperQuality" style="margin-bottom: 12px;">
    <option value="high">High Quality (22kHz)</option>
    <option value="medium" selected>Medium Quality (16kHz)</option>
    <option value="low">Low Quality (Fast)</option>
  </select>
  <label for="piperLang">Language/Accent:</label>
  <select id="piperLang" style="margin-bottom: 12px;">
    <optgroup label="🇺🇸 English - American">
      <option value="en_US-lessac" selected>Lessac - Professional (High Quality)</option>
      <option value="en_US-ryan">Ryan - Authoritative (High Quality)</option>
      <option value="en_US-ljspeech">LJSpeech - Female, Clear</option>
      <option value="en_US-amy">Amy - Friendly Female</option>
      <option value="en_US-danny">Danny - Young Male</option>
      <option value="en_US-joe">Joe - Mature Male</option>
      <option value="en_US-kristin">Kristin - Professional Female</option>
      <option value="en_US-kathleen">Kathleen - Warm Female</option>
    </optgroup>
    <optgroup label="🇬🇧 English - British">
      <option value="en_GB-cori">Cori - Refined British (High Quality)</option>
      <option value="en_GB-alan">Alan - Distinguished Male</option>
      <option value="en_GB-alba">Alba - Scottish Female</option>
      <option value="en_GB-northern_english_male">Northern English Male</option>
      <option value="en_GB-southern_english_female">Southern English Female</option>
    </optgroup>
    <!-- NOTE(review): these values carry a locale only, with no voice name; confirm how the script is expected to resolve them to a model. -->
    <optgroup label="🌍 Other Languages (900+ total)">
      <option value="es_ES">Spanish - Spain (Multiple voices)</option>
      <option value="fr_FR">French - France (Multiple voices)</option>
      <option value="de_DE">German - Germany (Multiple voices)</option>
      <option value="it_IT">Italian - Italy (Multiple voices)</option>
      <option value="pt_BR">Portuguese - Brazil (Multiple voices)</option>
      <option value="zh_CN">Chinese - Mandarin (Multiple voices)</option>
      <option value="ja_JP">Japanese (Multiple voices)</option>
      <option value="ko_KR">Korean (Multiple voices)</option>
    </optgroup>
  </select>
  <div class="muted" style="font-size: 0.85rem; margin-top: 8px;">
    <p>💡 <strong>Tip:</strong> "Lessac" and "Ryan" offer the best quality for English.</p>
  </div>
</div>
<!-- Kokoro Voices -->
<div id="kokoroVoices" class="hidden">
  <!-- Kokoro voice picker; hidden until the Kokoro engine is selected. -->
  <label for="kokoroVoice">Choose Voice:</label>
  <select id="kokoroVoice" style="margin-bottom: 12px;">
    <optgroup label="🇺🇸 American Female">
      <option value="af" selected>Default - Neutral &amp; Professional</option>
      <option value="af_bella">Bella - Elegant &amp; Sophisticated</option>
      <option value="af_nicole">Nicole - Clear &amp; Articulate</option>
      <option value="af_sarah">Sarah - Warm &amp; Friendly</option>
      <option value="af_sky">Sky - Light &amp; Energetic</option>
    </optgroup>
    <optgroup label="🇺🇸 American Male">
      <option value="am_adam">Adam - Natural &amp; Relaxed</option>
      <option value="am_michael">Michael - Deep &amp; Authoritative</option>
    </optgroup>
    <optgroup label="🇬🇧 British Female">
      <option value="bf">British Default - Refined</option>
      <option value="bf_emma">Emma - Elegant &amp; Polished</option>
      <option value="bf_isabella">Isabella - Sophisticated</option>
    </optgroup>
    <optgroup label="🇬🇧 British Male">
      <option value="bm">British Male - Distinguished</option>
      <option value="bm_george">George - Commanding</option>
      <option value="bm_lewis">Lewis - Smooth &amp; Confident</option>
    </optgroup>
  </select>
  <div class="muted" style="font-size: 0.85rem; margin-top: 8px;">
    <p>⭐ <strong>Kokoro:</strong> Highest quality, most expressive voices. 24kHz audio.</p>
  </div>
</div>
<!-- Kitten Voices -->
<div id="kittenVoices" class="hidden">
  <!-- Kitten voice picker; option values are numeric ids parsed by the page script. -->
  <label for="kittenVoice">Choose Voice:</label>
  <select id="kittenVoice" style="margin-bottom: 12px;">
    <option value="0" selected>Voice 0 - Neutral</option>
    <option value="1">Voice 1 - Warm</option>
    <option value="2">Voice 2 - Bright</option>
    <option value="3">Voice 3 - Soft</option>
    <option value="4">Voice 4 - Clear</option>
    <option value="5">Voice 5 - Deep</option>
    <option value="6">Voice 6 - Friendly</option>
    <option value="7">Voice 7 - Professional</option>
  </select>
  <div class="muted" style="font-size: 0.85rem; margin-top: 8px;">
    <p>⚡ <strong>Kitten:</strong> Fastest generation, only 24MB model. Perfect for quick tasks.</p>
  </div>
</div>
<!-- Voice Cloning -->
<div id="clonePanel" class="hidden">
  <!-- Voice-sample upload; the button stays disabled until the script sees a selected file. -->
  <label for="voiceFile">Upload Voice Sample (Max 1 min):</label>
  <input type="file" id="voiceFile" accept="audio/wav,audio/mp3,audio/mpeg" style="margin-bottom: 12px;">
  <div class="muted" style="font-size: 0.85rem; margin-bottom: 12px;">
    <p>📋 Requirements:</p>
    <ul style="margin: 4px 0; padding-left: 20px;">
      <li>Format: WAV or MP3</li>
      <li>Duration: Max 60 seconds</li>
      <li>Quality: Clear voice, minimal noise</li>
    </ul>
  </div>
  <!-- Explicit type so the button can never act as a submit control. -->
  <button type="button" id="processVoice" class="secondary" style="width: 100%;" disabled>
    🔄 Process Voice Sample
  </button>
  <div id="voiceStatus" class="mt-2"></div>
</div>
</fieldset>
<fieldset>
  <legend>⚙️ Settings</legend>
  <!-- The label is bound to the range input; the span shows the current value and is updated by the page script. -->
  <label for="spd">
    Speed <span id="spdVal">1.00</span>x
  </label>
  <input id="spd" type="range" min="0.5" max="2" step="0.05" value="1.0">
</fieldset>
</div>
<!-- Middle Column: Text & Generation -->
<div class="col">
<fieldset>
  <legend>📝 Text Input</legend>
  <!-- The placeholder is not a label, so the textarea gets an explicit accessible name. -->
  <textarea id="txt" aria-label="Text to convert to speech" placeholder="Enter your text here... (unlimited length supported)">Welcome to the ultimate text-to-speech studio! With access to over 900 premium voices from Piper, Kokoro, and Kitten TTS, you can create professional-quality audio in any language and accent. Try our advanced voice cloning feature to use your own voice!</textarea>
  <!-- Counters are filled in by the page script on load and on every input event. -->
  <div class="mt-1">
    <span class="muted">Characters: <span id="charCount">0</span></span> &nbsp;|&nbsp;
    <span class="muted">Words: <span id="wordCount">0</span></span> &nbsp;|&nbsp;
    <span class="muted">Chunks: <span id="chunkCount">0</span></span>
  </div>
</fieldset>
<fieldset>
  <legend>🎙️ Generate Audio</legend>
  <!-- Explicit type so the button can never act as a submit control. -->
  <button type="button" id="go" style="width: 100%; margin-bottom: 16px;">
    🎤 Generate Speech
  </button>
  <!-- Live region: status text written by the script is announced to screen readers. -->
  <div id="statusBox" class="mb-2" role="status"></div>
  <!-- Progress Bar -->
  <div id="progressBox" class="hidden mb-2">
    <div style="background: rgba(255,255,255,0.1); border-radius: 8px; overflow: hidden; height: 24px;">
      <div id="progressBar" style="background: linear-gradient(90deg, var(--primary), var(--secondary)); height: 100%; width: 0%; transition: width 0.3s; display: flex; align-items: center; justify-content: center;">
        <span id="progressText" style="font-size: 0.75rem; font-weight: 600;">0%</span>
      </div>
    </div>
  </div>
  <audio id="player" controls class="hidden"></audio>
  <!-- The link receives its href from the script once audio exists; it stays hidden until then. -->
  <div id="downloadBox" class="hidden mt-2">
    <a id="download" download="tts.wav" style="width: 100%; text-align: center;">
      💾 Download Audio
    </a>
  </div>
</fieldset>
</div>
<!-- Right Column: Status & Info -->
<div class="col">
<fieldset>
  <legend>💻 System Status</legend>
  <!-- Status chips; text and classes are updated by the page script. -->
  <div style="display: flex; flex-wrap: wrap; gap: 4px;">
    <span id="backend" class="chip">Init...</span>
    <span id="model" class="chip">Ready</span>
    <span id="engine" class="chip">Piper</span>
    <span id="status" class="chip">Idle</span>
  </div>
</fieldset>
<fieldset>
  <legend>📜 Activity Log</legend>
  <!-- Filled by the script's log() helper, newest entry first. -->
  <div id="log" class="mono" style="font-size: 0.75rem;"></div>
</fieldset>
<fieldset>
  <legend>ℹ️ Engine Comparison</legend>
  <div class="muted" style="font-size: 0.85rem;">
    <!-- Header cells carry scope="col" so each data cell is announced with its column heading. -->
    <table style="width: 100%; border-collapse: collapse;">
      <thead>
        <tr style="border-bottom: 1px solid rgba(255,255,255,0.1);">
          <th scope="col" style="text-align: left; padding: 4px;">Engine</th>
          <th scope="col" style="text-align: center; padding: 4px;">Voices</th>
          <th scope="col" style="text-align: center; padding: 4px;">Quality</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td style="padding: 4px;"><strong>Piper</strong></td>
          <td style="text-align: center; padding: 4px;">904</td>
          <td style="text-align: center; padding: 4px;">⭐⭐⭐⭐</td>
        </tr>
        <tr>
          <td style="padding: 4px;"><strong>Kokoro</strong></td>
          <td style="text-align: center; padding: 4px;">21</td>
          <td style="text-align: center; padding: 4px;">⭐⭐⭐⭐⭐</td>
        </tr>
        <tr>
          <td style="padding: 4px;"><strong>Kitten</strong></td>
          <td style="text-align: center; padding: 4px;">8</td>
          <td style="text-align: center; padding: 4px;">⭐⭐⭐</td>
        </tr>
      </tbody>
    </table>
    <p class="mt-1"><strong>💡 Recommendation:</strong></p>
    <ul style="margin: 8px 0; padding-left: 20px; line-height: 1.8;">
      <li><strong>Best Quality:</strong> Kokoro (if English)</li>
      <li><strong>Most Voices:</strong> Piper (904 options)</li>
      <li><strong>Fastest:</strong> Kitten (lightweight)</li>
      <li><strong>Custom:</strong> Voice Cloning</li>
    </ul>
  </div>
</fieldset>
</div>
</div>
<script type="module">
// Import onnx-tts-web library
import { createSession } from 'https://cdn.jsdelivr.net/npm/onnx-tts-web@latest/dist/index.js';
// Shorthand for document.querySelector: first element matching the CSS selector, or null.
const $ = (selector) => document.querySelector(selector);
// ===== UTILITIES =====
/**
 * Put a timestamped entry at the top of the #log element and mirror the raw
 * message to the browser console.
 * Only the first 25 lines of the existing log text are carried over.
 * @param {string} msg - Message to record.
 */
const log = (msg) => {
  const logEl = $("#log");
  const stamp = new Date().toLocaleTimeString();
  const carriedOver = logEl.textContent.split('\n').slice(0, 25).join('\n');
  logEl.textContent = `[${stamp}] ${msg}\n` + carriedOver;
  console.log(msg);
};
/**
 * Show a message in the #statusBox element.
 * @param {string} msg - Text to display.
 * @param {string} [type='info'] - Appended to the "status-message" class
 *   (the script passes 'info', 'success', 'warning' or 'error').
 */
const showStatus = (msg, type = 'info') => {
  const statusEl = $("#statusBox");
  statusEl.textContent = msg;
  statusEl.className = `status-message ${type}`;
};
/**
 * Set the progress bar width and caption; the bar is shown only while
 * percent is greater than zero.
 * @param {number} percent - Completion in the range 0-100.
 * @param {?string} [text=null] - Caption; falls back to the rounded percentage.
 */
const updateProgress = (percent, text = null) => {
  const caption = text || `${Math.round(percent)}%`;
  $("#progressBar").style.width = `${percent}%`;
  $("#progressText").textContent = caption;
  // force=true adds "hidden", force=false removes it.
  $("#progressBox").classList.toggle("hidden", !(percent > 0));
};
// ===== TEXT STATS =====
/**
 * Refresh the character / word / chunk counters under the textarea.
 *
 * The chunk counter now comes from chunkText() on the trimmed text with the
 * same 200-character limit the generate handler uses, so the number shown
 * matches the number of chunks that will be synthesized. The previous
 * ceil(chars / 200) estimate ignored sentence boundaries.
 */
const updateCounts = () => {
  const text = $("#txt").value;
  const chars = text.length;
  const words = text.trim().split(/\s+/).filter(Boolean).length;
  // chunkText is a hoisted function declaration, so it is callable from here.
  const chunks = chunkText(text.trim(), 200).length;
  $("#charCount").textContent = chars;
  $("#wordCount").textContent = words;
  $("#chunkCount").textContent = chunks;
};
$("#txt").addEventListener("input", updateCounts);
// Populate the counters for the default text already in the textarea.
updateCounts();
// ===== SPEED DISPLAY =====
// Keep the numeric speed read-out in sync with the slider, and apply the new
// rate to the audio element straight away so already-generated audio follows
// the slider without having to regenerate.
$("#spd").addEventListener("input", () => {
  const rate = parseFloat($("#spd").value);
  $("#spdVal").textContent = rate.toFixed(2);
  $("#player").playbackRate = rate;
});
// ===== ENGINE SWITCHING =====
// Engine currently selected in #engineSelect; updated by switchEngine().
let currentEngine = 'piper';
// Active TTS session; stays null until initTTSSession() assigns one.
let ttsSession = null;
// Guard flag: true while initTTSSession() is running.
let isInitializing = false;
// One-line description per engine, rendered into #engineInfo by switchEngine().
const engineInfo = {
piper: "Piper TTS: 904 voices, 50+ languages, 3-5x realtime speed",
kokoro: "Kokoro TTS: 21 expressive voices, highest quality, 24kHz audio",
kitten: "Kitten TTS: 8 voices, fastest generation, only 24MB model",
clone: "Voice Cloning: Upload your own voice sample for custom TTS"
};
/**
 * Handle a change of the engine <select>: record the new engine, refresh the
 * info box and status chip, show the matching voice sub-panel, and (for every
 * engine except "clone") load the engine's model.
 */
const switchEngine = async () => {
  const engine = $("#engineSelect").value;
  currentEngine = engine;

  // Update info (engineInfo holds static strings, so innerHTML is safe here).
  $("#engineInfo").querySelector("p").innerHTML = `<strong>${engineInfo[engine]}</strong>`;
  $("#engine").textContent = engine.charAt(0).toUpperCase() + engine.slice(1);

  // Show only the sub-panel that belongs to the selected engine.
  $("#piperVoices").classList.toggle("hidden", engine !== "piper");
  $("#kokoroVoices").classList.toggle("hidden", engine !== "kokoro");
  $("#kittenVoices").classList.toggle("hidden", engine !== "kitten");
  $("#clonePanel").classList.toggle("hidden", engine !== "clone");
  // #clonePanel lives inside the #voicePanel fieldset, so the fieldset must
  // stay visible for every engine. Hiding it for "clone" made the upload UI
  // unreachable.
  $("#voicePanel").classList.remove("hidden");

  log(`Switched to ${engine.toUpperCase()} engine`);
  if (engine !== 'clone') {
    await initTTSSession();
  }
};
// Reload the Piper model when its voice or quality changes, but only while
// Piper is the active engine.
const reloadPiperModel = () => {
  if (currentEngine === 'piper') initTTSSession();
};
$("#engineSelect").addEventListener("change", switchEngine);
for (const selector of ["#piperLang", "#piperQuality"]) {
  $(selector).addEventListener("change", reloadPiperModel);
}
// ===== TTS SESSION INITIALIZATION =====
// Set when a (re)initialization is requested while another one is still
// running, so the latest engine/voice selection is loaded afterwards.
let reinitRequested = false;

/**
 * Build the model/config URLs for a Piper voice.
 *
 * Files in the rhasspy/piper-voices repository are laid out as
 *   <lang>/<lang>_<REGION>/<voice>/<quality>/<lang>_<REGION>-<voice>-<quality>.onnx
 * so a select value such as "en_US-lessac" is split into its parts first.
 *
 * @param {string} voice - Value of #piperLang, e.g. "en_US-lessac".
 * @param {string} quality - Value of #piperQuality ("low" | "medium" | "high").
 * @returns {{modelUrl: string, configUrl: string}}
 * @throws {Error} When the value names only a locale (e.g. "es_ES") and
 *   therefore no concrete voice.
 */
function resolvePiperUrls(voice, quality) {
  const parts = /^([a-z]{2,3})_([A-Z]{2})-(\w+)$/.exec(voice);
  if (!parts) {
    throw new Error(`No specific Piper voice is configured for "${voice}".`);
  }
  const [, family, region, speaker] = parts;
  const locale = `${family}_${region}`;
  const stem = `${locale}-${speaker}-${quality}`;
  const baseUrl = `https://huggingface.co/rhasspy/piper-voices/resolve/main/${family}/${locale}/${speaker}/${quality}/`;
  // NOTE(review): not every voice is published in every quality tier; a
  // missing combination surfaces as a load error from createSession().
  return { modelUrl: `${baseUrl}${stem}.onnx`, configUrl: `${baseUrl}${stem}.onnx.json` };
}

/**
 * Load the model for the currently selected engine into `ttsSession`.
 *
 * Only one load runs at a time. A call made while a load is in progress
 * returns false immediately, but is remembered and replayed once the running
 * load finishes, so the session ends up matching the latest UI selection.
 *
 * @returns {Promise<boolean>} true when a session is ready, false when the
 *   load failed or the call was deferred.
 */
async function initTTSSession() {
  if (isInitializing) {
    reinitRequested = true;
    log("Initialization already in progress. Please wait.");
    return false;
  }
  isInitializing = true;
  $("#go").disabled = true;
  let ready = false;
  try {
    $("#model").textContent = "Loading...";
    $("#model").className = "chip warning";
    let modelUrl, configUrl;
    if (currentEngine === 'piper') {
      const voice = $("#piperLang").value;
      const quality = $("#piperQuality").value;
      ({ modelUrl, configUrl } = resolvePiperUrls(voice, quality));
      log(`Initializing Piper: ${voice} (${quality})`);
    } else if (currentEngine === 'kokoro') {
      const baseUrl = `https://huggingface.co/therealtimex/kokoro-tts-web/resolve/main/`;
      modelUrl = `${baseUrl}model.onnx`;
      configUrl = `${baseUrl}config.json`;
      log(`Initializing Kokoro TTS`);
    } else if (currentEngine === 'kitten') {
      const baseUrl = `https://huggingface.co/therealtimex/kitten-tts-web/resolve/main/`;
      modelUrl = `${baseUrl}model.onnx`;
      configUrl = `${baseUrl}config.json`;
      log(`Initializing Kitten TTS`);
    }
    if (!modelUrl || !configUrl) {
      throw new Error("Invalid engine configuration.");
    }
    // Dispose previous session to free memory. The shared reference is
    // cleared first so a failing dispose() cannot leave a stale session in use.
    if (ttsSession) {
      const staleSession = ttsSession;
      ttsSession = null;
      await staleSession.dispose();
      log("Previous session disposed.");
    }
    // NOTE(review): option names follow the original call; confirm them
    // against the onnx-tts-web documentation.
    ttsSession = await createSession({
      modelUrl: modelUrl,
      configUrl: configUrl,
      // Use WebGPU if available
      executionProviders: navigator.gpu ? ['webgpu', 'wasm'] : ['wasm'],
      // Optional: callback for loading progress
      onprogress: (p) => {
        const fraction = Number(p && p.progress);
        // Skip updates without a numeric progress instead of showing "NaN%".
        if (Number.isFinite(fraction)) {
          $("#model").textContent = `Loading ${Math.round(fraction * 100)}%`;
        }
      }
    });
    $("#model").textContent = "Ready";
    $("#model").className = "chip success";
    ready = true;
  } catch (err) {
    log(`ERROR initializing: ${err.message}`);
    $("#model").textContent = "Failed";
    $("#model").className = "chip danger";
  } finally {
    isInitializing = false;
    $("#go").disabled = false;
  }
  // Replay a request that arrived during the load. The "clone" engine has no
  // model to load, so nothing is replayed for it.
  if (reinitRequested) {
    reinitRequested = false;
    if (currentEngine !== 'clone') {
      return initTTSSession();
    }
  }
  return ready;
}
// ===== VOICE CLONING (from previous implementation) =====
// Embedding produced by the "Process Voice Sample" handler; null until then.
let clonedEmbedding = null;
// Enable the process button only while a file is selected, so clearing the
// selection disables it again.
$("#voiceFile").addEventListener("change", () => {
  const file = $("#voiceFile").files[0];
  $("#processVoice").disabled = !file;
  if (file) {
    log("Voice file selected: " + file.name);
  }
});
/**
 * "Process Voice Sample" click handler.
 *
 * Decodes the selected file, trims it to 60 s, resamples it to 16 kHz mono and
 * reduces it to a 512-value unit-length vector stored in `clonedEmbedding`.
 * NOTE(review): the vector is built from per-segment mean/variance statistics;
 * whether any model accepts it as a speaker embedding is not visible here.
 */
$("#processVoice").addEventListener("click", async () => {
  const EMBEDDING_SIZE = 512;
  const TARGET_RATE = 16000;
  const MAX_SECONDS = 60;

  const file = $("#voiceFile").files[0];
  if (!file) {
    showStatus("Please select a voice file!", 'error');
    return;
  }
  $("#processVoice").disabled = true;
  showStatus("Processing voice sample...", 'info');
  log("Processing: " + file.name);
  let audioContext = null;
  try {
    const arrayBuffer = await file.arrayBuffer();
    audioContext = new (window.AudioContext || window.webkitAudioContext)();
    let audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
    if (audioBuffer.duration > MAX_SECONDS) {
      showStatus("⚠️ Trimming to 60s...", 'warning');
      // Frame count is derived from the buffer's own sample rate.
      const newLength = Math.min(audioBuffer.length, Math.floor(audioBuffer.sampleRate * MAX_SECONDS));
      const trimmedBuffer = audioContext.createBuffer(1, newLength, audioBuffer.sampleRate);
      trimmedBuffer.copyToChannel(audioBuffer.getChannelData(0).subarray(0, newLength), 0);
      audioBuffer = trimmedBuffer;
    }
    if (audioBuffer.sampleRate !== TARGET_RATE) {
      // OfflineAudioContext needs a positive integer frame count.
      const frames = Math.max(1, Math.ceil(audioBuffer.duration * TARGET_RATE));
      const offlineContext = new OfflineAudioContext(1, frames, TARGET_RATE);
      const source = offlineContext.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(offlineContext.destination);
      source.start();
      audioBuffer = await offlineContext.startRendering();
    }
    const audioData = audioBuffer.getChannelData(0);
    // Fewer samples than embedding slots would give empty segments (0 / 0).
    if (audioData.length < EMBEDDING_SIZE) {
      throw new Error("Voice sample is too short.");
    }
    // Create embedding: one value per equally sized segment of the signal.
    const embedding = new Float32Array(EMBEDDING_SIZE);
    const chunkSize = Math.floor(audioData.length / EMBEDDING_SIZE);
    for (let i = 0; i < EMBEDDING_SIZE; i++) {
      const start = i * chunkSize;
      const end = Math.min(start + chunkSize, audioData.length);
      let sum = 0, sumSq = 0;
      for (let j = start; j < end; j++) {
        sum += audioData[j];
        sumSq += audioData[j] * audioData[j];
      }
      const mean = sum / (end - start);
      const variance = (sumSq / (end - start)) - (mean * mean);
      embedding[i] = mean + Math.sqrt(Math.abs(variance)) * (i % 2 === 0 ? 1 : -1);
    }
    // Normalize to unit length; an all-zero (silent) sample cannot be normalized.
    let norm = 0;
    for (let i = 0; i < EMBEDDING_SIZE; i++) norm += embedding[i] * embedding[i];
    norm = Math.sqrt(norm);
    if (!Number.isFinite(norm) || norm === 0) {
      throw new Error("Voice sample is silent or could not be analysed.");
    }
    for (let i = 0; i < EMBEDDING_SIZE; i++) embedding[i] /= norm;
    // Publish only a fully computed embedding.
    clonedEmbedding = embedding;
    showStatus("✅ Voice processed!", 'success');
    log("Voice embedding created");
    $("#voiceStatus").innerHTML = '<div class="status-message success">✅ Voice ready!</div>';
  } catch (err) {
    log("ERROR: " + err.message);
    showStatus("Error: " + err.message, 'error');
    $("#voiceStatus").innerHTML = '<div class="status-message error">❌ Failed</div>';
  } finally {
    $("#processVoice").disabled = false;
    // Release the audio context; browsers cap how many may be open at once.
    if (audioContext && typeof audioContext.close === 'function') {
      try {
        await audioContext.close();
      } catch (closeErr) {
        // Best effort: a failed close must not hide the processing result.
        console.warn(closeErr);
      }
    }
  }
});
// ===== TEXT CHUNKING & AUDIO CONCATENATION =====
/**
 * Break one over-long segment into pieces of at most `limit` characters,
 * cutting at the last space inside the window when there is one.
 * @param {string} segment - Text longer than `limit`.
 * @param {number} limit - Maximum piece length (>= 1).
 * @returns {string[]} Trimmed, non-empty pieces in original order.
 */
function splitOversizedSegment(segment, limit) {
  const pieces = [];
  let rest = segment.trim();
  while (rest.length > limit) {
    let cut = rest.lastIndexOf(' ', limit);
    // No usable space in the window: fall back to a hard cut.
    if (cut <= 0) cut = limit;
    pieces.push(rest.slice(0, cut).trim());
    rest = rest.slice(cut).trim();
  }
  if (rest) pieces.push(rest);
  return pieces;
}

/**
 * Split text into chunks of at most `maxChars` characters, preferring
 * sentence boundaries (".", "!", "?").
 *
 * Differences from the previous implementation:
 *  - trailing text without terminal punctuation is kept instead of dropped;
 *  - any sentence longer than `maxChars` is split on its own, rather than
 *    only when it happened to be the first chunk;
 *  - a non-positive or non-numeric `maxChars` is clamped to 1 instead of
 *    looping forever.
 *
 * @param {string} text - Text to split.
 * @param {number} [maxChars=200] - Maximum chunk length.
 * @returns {string[]} Trimmed, non-empty chunks in reading order.
 */
function chunkText(text, maxChars = 200) {
  const limit = Math.max(1, Math.floor(maxChars) || 1);
  // First alternative: a sentence including its terminator(s).
  // Second alternative: whatever is left at the end without a terminator.
  const sentences = String(text ?? "").match(/[^.!?]*[.!?]+|[^.!?]+$/g) || [];
  const chunks = [];
  let current = "";
  const flush = () => {
    const trimmed = current.trim();
    if (trimmed) chunks.push(trimmed);
    current = "";
  };
  for (const sentence of sentences) {
    if (sentence.length > limit) {
      flush();
      for (const piece of splitOversizedSegment(sentence, limit)) {
        chunks.push(piece);
      }
    } else if ((current + sentence).length <= limit) {
      current += sentence;
    } else {
      flush();
      current = sentence;
    }
  }
  flush();
  return chunks;
}
/**
 * Join several sample arrays into one Float32Array, preserving order.
 * @param {ArrayLike<number>[]} audioArrays - Sample arrays to join.
 * @returns {Float32Array} All samples laid out back to back.
 */
function concatenateAudio(audioArrays) {
  let totalLength = 0;
  for (const part of audioArrays) {
    totalLength += part.length;
  }
  const merged = new Float32Array(totalLength);
  // The accumulator is the write position for the next part.
  audioArrays.reduce((writePos, part) => {
    merged.set(part, writePos);
    return writePos + part.length;
  }, 0);
  return merged;
}
// ===== GENERATION =====
/**
 * "Generate Speech" click handler: splits the text into chunks, synthesizes
 * each chunk with the active session, joins the audio, and wires the result
 * into the player and the download link.
 */
$("#go").addEventListener("click", async () => {
  const text = $("#txt").value.trim();
  if (!text) {
    showStatus("Please enter text!", 'error');
    return;
  }
  const btn = $("#go");
  btn.disabled = true;
  $("#status").className = "chip warning";
  $("#status").textContent = "Generating...";
  updateProgress(0);
  try {
    if (currentEngine === 'clone') {
      // Voice cloning is complex and requires a separate model (like SpeechT5).
      // This is a placeholder for that logic.
      showStatus("Voice cloning not implemented in this version.", 'error');
      throw new Error("Voice cloning is a placeholder feature.");
    }
    // Pin the session for the whole run so an engine switch in the middle of
    // generation cannot mix two models into one output.
    const session = ttsSession;
    if (!session) {
      showStatus("TTS session not ready. Please wait or re-select engine.", 'error');
      throw new Error("TTS session not initialized.");
    }
    const chunks = chunkText(text, 200);
    log(`Processing ${chunks.length} chunk(s)...`);
    showStatus(`Processing ${chunks.length} chunk(s)...`, 'info');
    const audioChunks = [];
    let voiceId;
    if (currentEngine === 'kokoro') {
      voiceId = $("#kokoroVoice").value;
    } else if (currentEngine === 'kitten') {
      voiceId = parseInt($("#kittenVoice").value, 10);
    }
    let sampleRate;
    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      const progress = ((i + 1) / chunks.length) * 100;
      updateProgress(progress, `Chunk ${i + 1}/${chunks.length}`);
      log(`Generating chunk ${i + 1}: "${chunk.substring(0, 30)}..."`);
      const result = await session.run({
        text: chunk,
        voiceId: voiceId, // Only used by Kokoro/Kitten
      });
      audioChunks.push(result.audio);
      sampleRate = result.sampleRate; // Overwritten per chunk; assumed identical across chunks.
    }
    log("Concatenating audio chunks...");
    updateProgress(100, "Finalizing...");
    const finalAudio = concatenateAudio(audioChunks);
    log(`Generated ${finalAudio.length} samples (${(finalAudio.length / sampleRate).toFixed(1)}s)`);
    // Create a WAV blob
    // NOTE(review): encodeWAV is called exactly as in the original code;
    // confirm its signature against the library.
    const blob = new Blob([session.encodeWAV(finalAudio)], { type: "audio/wav" });
    const url = URL.createObjectURL(blob);
    // Remember the URL of the previous result so it can be released below.
    const previousUrl = $("#download").getAttribute("href");
    // Player
    const player = $("#player");
    player.src = url;
    player.playbackRate = parseFloat($("#spd").value);
    player.classList.remove("hidden");
    // Download
    $("#download").href = url;
    $("#download").download = `tts-${currentEngine}-${Date.now()}.wav`;
    $("#downloadBox").classList.remove("hidden");
    // Free the blob behind the previous result; without this every
    // generation keeps its audio in memory until the page is closed.
    if (previousUrl && previousUrl.startsWith("blob:")) {
      URL.revokeObjectURL(previousUrl);
    }
    $("#status").className = "chip success";
    $("#status").textContent = "Success";
    showStatus("✅ Audio generated successfully!", 'success');
    updateProgress(0);
  } catch (err) {
    log(`ERROR: ${err.message}`);
    console.error(err);
    $("#status").className = "chip danger";
    $("#status").textContent = "Error";
    showStatus(`Error: ${err.message}`, 'error');
    updateProgress(0);
  } finally {
    // Stay disabled while a model load is running; initTTSSession()
    // re-enables the button when it finishes.
    btn.disabled = isInitializing;
  }
});
// ===== INITIALIZATION =====
// Startup: announce the page, report the execution backend in the status
// chip (WebGPU when navigator.gpu exists, otherwise WASM), then load the
// default engine's model.
log("🎉 Ultimate TTS Studio Ready!");
$("#backend").className = "chip success";
$("#backend").textContent = navigator.gpu ? "WebGPU" : "WASM";
// Initial load
await initTTSSession();
</script>
</body>
</html>