Spaces:

WSYBYT
/

ybtts

Running

App Files Files Community

ybtts / index.html

masbudjj

Update index.html (#14)

0a6c002 verified 6 months ago

raw

history blame

25.7 kB

	<!doctype html>
	<html lang="en">
	<head>
	<meta charset="utf-8" />
	<meta name="viewport" content="width=device-width,initial-scale=1" />
	<title>🎙️ Ultimate TTS - 900+ Premium Voices</title>
	<link rel="stylesheet" href="assets/style.css" />
	</head>
	<body>
	<h1>🎙️ Ultimate Text-to-Speech Studio</h1>
	<p class="subtitle">3 Premium Engines - 900+ Voices - Voice Cloning - Unlimited Text</p>

	<div class="row">
	<!-- Left Column: Engine & Voice Selection -->
	<div class="col">
	<fieldset>
	<legend>🎭 TTS Engine</legend>

	<!-- "for" ties the label to the engine dropdown so assistive tech announces it and clicking the text focuses the control. -->
	<label for="engineSelect">Choose Engine:</label>
	<select id="engineSelect" style="font-size: 0.9rem; padding: 10px; margin-bottom: 16px;">
	<option value="piper">🎯 Piper TTS - 904 Voices (Premium Quality)</option>
	<option value="kokoro">✨ Kokoro TTS - 21 Expressive Voices (Highest Quality)</option>
	<option value="kitten">⚡ Kitten TTS - 8 Voices (Fastest, Lightweight)</option>
	<option value="clone">🎤 Voice Cloning (Upload Your Voice)</option>
	</select>

	<div id="engineInfo" style="padding: 12px; background: rgba(99,102,241,0.1); border-radius: 8px; margin-bottom: 16px;">
	<p class="muted" style="font-size: 0.85rem; margin: 0;">
	<strong>Piper TTS:</strong> 904 voices, 50+ languages, 3-5x realtime speed
	</p>
	</div>
	</fieldset>

	<fieldset id="voicePanel">
	<legend>🎤 Voice Selection</legend>

	<!-- Piper Voices -->
	<div id="piperVoices">
	<!-- Associated with the quality dropdown via "for" so the control has an accessible name. -->
	<label for="piperQuality">Quality Level:</label>
	<select id="piperQuality" style="margin-bottom: 12px;">
	<option value="high">High Quality (22kHz)</option>
	<option value="medium" selected>Medium Quality (16kHz)</option>
	<option value="low">Low Quality (Fast)</option>
	</select>

	<!-- Associated with the Piper voice dropdown via "for" so the control has an accessible name. -->
	<label for="piperLang">Language/Accent:</label>
	<select id="piperLang" style="margin-bottom: 12px;">
	<optgroup label="🇺🇸 English - American">
	<option value="en_US-lessac" selected>Lessac - Professional (High Quality)</option>
	<option value="en_US-ryan">Ryan - Authoritative (High Quality)</option>
	<option value="en_US-ljspeech">LJSpeech - Female, Clear</option>
	<option value="en_US-amy">Amy - Friendly Female</option>
	<option value="en_US-danny">Danny - Young Male</option>
	<option value="en_US-joe">Joe - Mature Male</option>
	<option value="en_US-kristin">Kristin - Professional Female</option>
	<option value="en_US-kathleen">Kathleen - Warm Female</option>
	</optgroup>
	<optgroup label="🇬🇧 English - British">
	<option value="en_GB-cori">Cori - Refined British (High Quality)</option>
	<option value="en_GB-alan">Alan - Distinguished Male</option>
	<option value="en_GB-alba">Alba - Scottish Female</option>
	<option value="en_GB-northern_english_male">Northern English Male</option>
	<option value="en_GB-southern_english_female">Southern English Female</option>
	</optgroup>
	<optgroup label="🌍 Other Languages (900+ total)">
	<option value="es_ES">Spanish - Spain (Multiple voices)</option>
	<option value="fr_FR">French - France (Multiple voices)</option>
	<option value="de_DE">German - Germany (Multiple voices)</option>
	<option value="it_IT">Italian - Italy (Multiple voices)</option>
	<option value="pt_BR">Portuguese - Brazil (Multiple voices)</option>
	<option value="zh_CN">Chinese - Mandarin (Multiple voices)</option>
	<option value="ja_JP">Japanese (Multiple voices)</option>
	<option value="ko_KR">Korean (Multiple voices)</option>
	</optgroup>
	</select>

	<div class="muted" style="font-size: 0.85rem; margin-top: 8px;">
	<p>💡 <strong>Tip:</strong> "Lessac" and "Ryan" offer the best quality for English.</p>
	</div>
	</div>

	<!-- Kokoro Voices -->
	<div id="kokoroVoices" class="hidden">
	<!-- Associated with the Kokoro voice dropdown via "for" so the control has an accessible name. -->
	<label for="kokoroVoice">Choose Voice:</label>
	<select id="kokoroVoice" style="margin-bottom: 12px;">
	<optgroup label="🇺🇸 American Female">
	<option value="af" selected>Default - Neutral & Professional</option>
	<option value="af_bella">Bella - Elegant & Sophisticated</option>
	<option value="af_nicole">Nicole - Clear & Articulate</option>
	<option value="af_sarah">Sarah - Warm & Friendly</option>
	<option value="af_sky">Sky - Light & Energetic</option>
	</optgroup>
	<optgroup label="🇺🇸 American Male">
	<option value="am_adam">Adam - Natural & Relaxed</option>
	<option value="am_michael">Michael - Deep & Authoritative</option>
	</optgroup>
	<optgroup label="🇬🇧 British Female">
	<option value="bf">British Default - Refined</option>
	<option value="bf_emma">Emma - Elegant & Polished</option>
	<option value="bf_isabella">Isabella - Sophisticated</option>
	</optgroup>
	<optgroup label="🇬🇧 British Male">
	<option value="bm">British Male - Distinguished</option>
	<option value="bm_george">George - Commanding</option>
	<option value="bm_lewis">Lewis - Smooth & Confident</option>
	</optgroup>
	</select>

	<div class="muted" style="font-size: 0.85rem; margin-top: 8px;">
	<p>⭐ <strong>Kokoro:</strong> Highest quality, most expressive voices. 24kHz audio.</p>
	</div>
	</div>

	<!-- Kitten Voices -->
	<div id="kittenVoices" class="hidden">
	<!-- Associated with the Kitten voice dropdown via "for" so the control has an accessible name. -->
	<label for="kittenVoice">Choose Voice:</label>
	<select id="kittenVoice" style="margin-bottom: 12px;">
	<option value="0" selected>Voice 0 - Neutral</option>
	<option value="1">Voice 1 - Warm</option>
	<option value="2">Voice 2 - Bright</option>
	<option value="3">Voice 3 - Soft</option>
	<option value="4">Voice 4 - Clear</option>
	<option value="5">Voice 5 - Deep</option>
	<option value="6">Voice 6 - Friendly</option>
	<option value="7">Voice 7 - Professional</option>
	</select>

	<div class="muted" style="font-size: 0.85rem; margin-top: 8px;">
	<p>⚡ <strong>Kitten:</strong> Fastest generation, only 24MB model. Perfect for quick tasks.</p>
	</div>
	</div>

	<!-- Voice Cloning -->
	<div id="clonePanel" class="hidden">
	<!-- Associated with the file input via "for" so the upload control has an accessible name. -->
	<label for="voiceFile">Upload Voice Sample (Max 1 min):</label>
	<input type="file" id="voiceFile" accept="audio/wav,audio/mp3,audio/mpeg" style="margin-bottom: 12px;">

	<div class="muted" style="font-size: 0.85rem; margin-bottom: 12px;">
	<p>📋 Requirements:</p>
	<ul style="margin: 4px 0; padding-left: 20px;">
	<li>Format: WAV or MP3</li>
	<li>Duration: Max 60 seconds</li>
	<li>Quality: Clear voice, minimal noise</li>
	</ul>
	</div>

	<button id="processVoice" class="secondary" style="width: 100%;" disabled>
	🔄 Process Voice Sample
	</button>

	<div id="voiceStatus" class="mt-2"></div>
	</div>
	</fieldset>

	<fieldset>
	<legend>⚙️ Settings</legend>

	<!-- The range input follows this label rather than sitting inside it, so "for" is needed to associate the two. -->
	<label for="spd">
	  Speed <span id="spdVal">1.00</span>x
	</label>
	<input id="spd" type="range" min="0.5" max="2" step="0.05" value="1.0">
	</fieldset>
	</div>

	<!-- Middle Column: Text & Generation -->
	<div class="col">
	<fieldset>
	<legend>📝 Text Input</legend>
	<textarea id="txt" placeholder="Enter your text here... (unlimited length supported)">Welcome to the ultimate text-to-speech studio! With access to over 900 premium voices from Piper, Kokoro, and Kitten TTS, you can create professional-quality audio in any language and accent. Try our advanced voice cloning feature to use your own voice!</textarea>
	<!-- Live text statistics; the spans are filled in by updateCounts() in the page script. -->
	<div class="mt-1">
	  <span class="muted">Characters: <span id="charCount">0</span></span> |
	  <span class="muted">Words: <span id="wordCount">0</span></span> |
	  <span class="muted">Chunks: <span id="chunkCount">0</span></span>
	</div>
	</fieldset>

	<fieldset>
	<legend>🎙️ Generate Audio</legend>

	<button id="go" style="width: 100%; margin-bottom: 16px;">
	🎤 Generate Speech
	</button>

	<!-- Status text is injected by script; role="status" makes it a polite live region so screen readers announce updates. -->
	<div id="statusBox" class="mb-2" role="status"></div>

	<!-- Progress Bar -->
	<div id="progressBox" class="hidden mb-2">
	<div style="background: rgba(255,255,255,0.1); border-radius: 8px; overflow: hidden; height: 24px;">
	<div id="progressBar" style="background: linear-gradient(90deg, var(--primary), var(--secondary)); height: 100%; width: 0%; transition: width 0.3s; display: flex; align-items: center; justify-content: center;">
	<span id="progressText" style="font-size: 0.75rem; font-weight: 600;">0%</span>
	</div>
	</div>
	</div>

	<audio id="player" controls class="hidden"></audio>

	<div id="downloadBox" class="hidden mt-2">
	<a id="download" download="tts.wav" style="width: 100%; text-align: center;">
	💾 Download Audio
	</a>
	</div>
	</fieldset>
	</div>

	<!-- Right Column: Status & Info -->
	<div class="col">
	<fieldset>
	<legend>💻 System Status</legend>
	<div style="display: flex; flex-wrap: wrap; gap: 4px;">
	<span id="backend" class="chip">Init...</span>
	<span id="model" class="chip">Ready</span>
	<span id="engine" class="chip">Piper</span>
	<span id="status" class="chip">Idle</span>
	</div>
	</fieldset>

	<fieldset>
	<legend>📜 Activity Log</legend>
	<div id="log" class="mono" style="font-size: 0.75rem;"></div>
	</fieldset>

	<fieldset>
	<legend>ℹ️ Engine Comparison</legend>
	<div class="muted" style="font-size: 0.85rem;">
	<!-- Engine comparison: header row in <thead> with column-scoped header cells, data rows in <tbody>. -->
	<table style="width: 100%; border-collapse: collapse;">
	  <thead>
	    <tr style="border-bottom: 1px solid rgba(255,255,255,0.1);">
	      <th scope="col" style="text-align: left; padding: 4px;">Engine</th>
	      <th scope="col" style="text-align: center; padding: 4px;">Voices</th>
	      <th scope="col" style="text-align: center; padding: 4px;">Quality</th>
	    </tr>
	  </thead>
	  <tbody>
	    <tr>
	      <td style="padding: 4px;"><strong>Piper</strong></td>
	      <td style="text-align: center; padding: 4px;">904</td>
	      <td style="text-align: center; padding: 4px;">⭐⭐⭐⭐</td>
	    </tr>
	    <tr>
	      <td style="padding: 4px;"><strong>Kokoro</strong></td>
	      <td style="text-align: center; padding: 4px;">21</td>
	      <td style="text-align: center; padding: 4px;">⭐⭐⭐⭐⭐</td>
	    </tr>
	    <tr>
	      <td style="padding: 4px;"><strong>Kitten</strong></td>
	      <td style="text-align: center; padding: 4px;">8</td>
	      <td style="text-align: center; padding: 4px;">⭐⭐⭐</td>
	    </tr>
	  </tbody>
	</table>

	<p class="mt-1"><strong>💡 Recommendation:</strong></p>
	<ul style="margin: 8px 0; padding-left: 20px; line-height: 1.8;">
	<li><strong>Best Quality:</strong> Kokoro (if English)</li>
	<li><strong>Most Voices:</strong> Piper (904 options)</li>
	<li><strong>Fastest:</strong> Kitten (lightweight)</li>
	<li><strong>Custom:</strong> Voice Cloning</li>
	</ul>
	</div>
	</fieldset>
	</div>
	</div>

	<script type="module">
	// Import onnx-tts-web library
	import { createSession } from 'https://cdn.jsdelivr.net/npm/onnx-tts-web@latest/dist/index.js';

	// Shorthand for document.querySelector: returns the first element matching
	// the CSS selector `q`, or null when nothing matches.
	const $ = (q) => document.querySelector(q);

	// ===== UTILITIES =====
	// Writes a timestamped entry at the top of the #log panel (newest first),
	// keeping only the leading 25 lines of what was already there, and mirrors
	// the raw message to the browser console.
	const log = (msg) => {
	  const panel = $("#log");
	  const stamp = new Date().toLocaleTimeString();
	  const earlier = panel.textContent.split('\n').slice(0, 25).join('\n');
	  panel.textContent = `[${stamp}] ${msg}\n` + earlier;
	  console.log(msg);
	};

	// Shows `msg` in the #statusBox banner. `type` becomes the second CSS class
	// next to "status-message" (callers pass 'info', 'success', 'warning' or
	// 'error'); any classes previously on the box are replaced.
	const showStatus = (msg, type = 'info') => {
	  Object.assign($("#statusBox"), {
	    className: `status-message ${type}`,
	    textContent: msg,
	  });
	};

	// Updates the generation progress bar.
	//   percent - fill width in percent; any value that is not > 0 hides the bar.
	//   text    - optional caption; falls back to the rounded percentage.
	const updateProgress = (percent, text = null) => {
	  $("#progressBar").style.width = percent + "%";
	  $("#progressText").textContent = text || (Math.round(percent) + "%");
	  // Written as !(percent > 0) so NaN/undefined also hide the bar.
	  $("#progressBox").classList.toggle("hidden", !(percent > 0));
	};

	// ===== TEXT STATS =====
	// Recomputes the character / word / chunk counters shown under the textarea.
	const updateCounts = () => {
	  const text = $("#txt").value;
	  const chars = text.length;
	  const words = text.trim().split(/\s+/).filter(Boolean).length;
	  // Ask the real chunker (same arguments the Generate handler uses) instead
	  // of estimating with chars / 200, so the number shown matches the number
	  // of chunks that will actually be synthesized.
	  const chunks = chunkText(text.trim(), 200).length;

	  $("#charCount").textContent = chars;
	  $("#wordCount").textContent = words;
	  $("#chunkCount").textContent = chunks;
	};
	// Keep the counters live while typing, and fill them once for the preset text.
	$("#txt").addEventListener("input", updateCounts);
	updateCounts();

	// ===== SPEED DISPLAY =====
	// Reflect the slider value in the label and apply it to the audio player
	// right away, so changing speed after a clip was generated takes effect
	// without regenerating.
	$("#spd").addEventListener("input", () => {
	  const rate = parseFloat($("#spd").value);
	  $("#spdVal").textContent = rate.toFixed(2);
	  $("#player").playbackRate = rate;
	});

	// ===== ENGINE SWITCHING =====
	// Value of the currently selected <option> in #engineSelect
	// ('piper' | 'kokoro' | 'kitten' | 'clone').
	let currentEngine = 'piper';
	// Session object returned by createSession(); null until a model has loaded.
	let ttsSession = null;
	// Re-entrancy guard: true while initTTSSession() is loading a model.
	let isInitializing = false;

	// One-line description per engine, shown in the #engineInfo box.
	const engineInfo = {
	piper: "Piper TTS: 904 voices, 50+ languages, 3-5x realtime speed",
	kokoro: "Kokoro TTS: 21 expressive voices, highest quality, 24kHz audio",
	kitten: "Kitten TTS: 8 voices, fastest generation, only 24MB model",
	clone: "Voice Cloning: Upload your own voice sample for custom TTS"
	};

	// Handles a change of the engine dropdown: records the engine, refreshes the
	// info box and status chip, shows the matching voice sub-panel, and (for
	// every engine except 'clone') loads the corresponding model.
	const switchEngine = async () => {
	  const engine = $("#engineSelect").value;
	  currentEngine = engine;

	  // Update info (engineInfo holds static strings defined in this script).
	  $("#engineInfo").querySelector("p").innerHTML = `<strong>${engineInfo[engine]}</strong>`;
	  $("#engine").textContent = engine.charAt(0).toUpperCase() + engine.slice(1);

	  // Show exactly one sub-panel inside the Voice Selection fieldset.
	  $("#piperVoices").classList.toggle("hidden", engine !== "piper");
	  $("#kokoroVoices").classList.toggle("hidden", engine !== "kokoro");
	  $("#kittenVoices").classList.toggle("hidden", engine !== "kitten");
	  $("#clonePanel").classList.toggle("hidden", engine !== "clone");
	  // #clonePanel is nested inside #voicePanel, so the fieldset itself must
	  // stay visible for every engine; hiding it for 'clone' would hide the
	  // upload controls as well (assuming .hidden hides the element).
	  $("#voicePanel").classList.remove("hidden");

	  log(`Switched to ${engine.toUpperCase()} engine`);

	  if (engine !== 'clone') {
	    await initTTSSession();
	  }
	};

	// Engine changes go through switchEngine; changing either Piper dropdown
	// reloads the model, but only while Piper is the active engine.
	$("#engineSelect").addEventListener("change", switchEngine);
	for (const selector of ["#piperLang", "#piperQuality"]) {
	  $(selector).addEventListener("change", () => {
	    if (currentEngine === 'piper') initTTSSession();
	  });
	}

	// ===== TTS SESSION INITIALIZATION =====
	// Set when a (re)load is requested while another load is still running.
	// The running load checks it when it finishes and loads again, so the
	// session always ends up matching the latest selection in the UI.
	let reinitRequested = false;

	// Returns { modelUrl, configUrl } for the current engine selection, or null
	// when the engine has no downloadable model (e.g. 'clone').
	// Throws when the selected Piper entry does not name a concrete voice.
	function resolveModelUrls() {
	  if (currentEngine === 'piper') {
	    const voice = $("#piperLang").value;
	    const quality = $("#piperQuality").value;
	    // Option values look like "en_US-lessac". The rhasspy/piper-voices repo
	    // stores files as <lang>/<lang>_<REGION>/<speaker>/<quality>/<voice>-<quality>.onnx
	    // NOTE(review): confirm against the repo's voices.json; not every voice
	    // is published in every quality level.
	    const parts = /^([a-z]{2,3})_([A-Z]{2})-(.+)$/.exec(voice);
	    if (!parts) {
	      // Entries such as "es_ES" name only a locale, so no file can be resolved.
	      throw new Error(`No specific Piper voice is configured for "${voice}".`);
	    }
	    const [, lang, region, speaker] = parts;
	    const baseUrl = `https://huggingface.co/rhasspy/piper-voices/resolve/main/${lang}/${lang}_${region}/${speaker}/${quality}/`;
	    log(`Initializing Piper: ${voice} (${quality})`);
	    return {
	      modelUrl: `${baseUrl}${voice}-${quality}.onnx`,
	      configUrl: `${baseUrl}${voice}-${quality}.onnx.json`,
	    };
	  }

	  if (currentEngine === 'kokoro') {
	    const baseUrl = `https://huggingface.co/therealtimex/kokoro-tts-web/resolve/main/`;
	    log(`Initializing Kokoro TTS`);
	    return { modelUrl: `${baseUrl}model.onnx`, configUrl: `${baseUrl}config.json` };
	  }

	  if (currentEngine === 'kitten') {
	    const baseUrl = `https://huggingface.co/therealtimex/kitten-tts-web/resolve/main/`;
	    log(`Initializing Kitten TTS`);
	    return { modelUrl: `${baseUrl}model.onnx`, configUrl: `${baseUrl}config.json` };
	  }

	  return null;
	}

	// Performs one load attempt for the current selection.
	// Resolves to true on success, false on failure (the error is logged and the
	// #model chip shows "Failed"); never rejects.
	async function loadSelectedModel() {
	  try {
	    $("#model").textContent = "Loading...";
	    $("#model").className = "chip warning";

	    // Drop the previous session first: it frees memory, and it guarantees
	    // that a failed load never leaves a session for a different voice active.
	    if (ttsSession) {
	      const previous = ttsSession;
	      ttsSession = null;
	      await previous.dispose();
	      log("Previous session disposed.");
	    }

	    const urls = resolveModelUrls();
	    if (!urls || !urls.modelUrl || !urls.configUrl) {
	      throw new Error("Invalid engine configuration.");
	    }

	    ttsSession = await createSession({
	      modelUrl: urls.modelUrl,
	      configUrl: urls.configUrl,
	      // Use WebGPU if available
	      executionProviders: navigator.gpu ? ['webgpu', 'wasm'] : ['wasm'],
	      // Optional: callback for loading progress
	      onprogress: (p) => {
	        // Skip events without a numeric progress so the chip never shows "NaN%".
	        if (!p || !Number.isFinite(p.progress)) return;
	        const percent = Math.round(p.progress * 100);
	        $("#model").textContent = `Loading ${percent}%`;
	      }
	    });

	    $("#model").textContent = "Ready";
	    $("#model").className = "chip success";
	    return true;
	  } catch (err) {
	    log(`ERROR initializing: ${err.message}`);
	    $("#model").textContent = "Failed";
	    $("#model").className = "chip danger";
	    return false;
	  }
	}

	// ===== TTS SESSION INITIALIZATION =====
	// Loads the model for the current engine/voice selection into `ttsSession`.
	// Only one load runs at a time. A call made while a load is in progress
	// returns false immediately, but is remembered, and the running call loads
	// again once it finishes. The Generate button is disabled while loading.
	// Returns: Promise<boolean> - true when the last load attempt succeeded.
	async function initTTSSession() {
	  if (isInitializing) {
	    reinitRequested = true;
	    log("Initialization already in progress. Please wait.");
	    return false;
	  }
	  isInitializing = true;
	  $("#go").disabled = true;

	  let ready = false;
	  try {
	    do {
	      reinitRequested = false;
	      ready = await loadSelectedModel();
	      // 'clone' has no model to load, so a queued request is dropped if the
	      // user has moved on to that engine in the meantime.
	    } while (reinitRequested && currentEngine !== 'clone');
	  } finally {
	    isInitializing = false;
	    $("#go").disabled = false;
	  }
	  return ready;
	}

	// ===== VOICE CLONING (from previous implementation) =====
	// Result of the last successful "Process Voice Sample" run (a normalized
	// Float32Array), or null when no sample has been processed yet.
	let clonedEmbedding = null;

	// Enable the process button only while a file is actually selected; the
	// selection can become empty again when the user cancels the file picker.
	$("#voiceFile").addEventListener("change", () => {
	  const file = $("#voiceFile").files[0];
	  $("#processVoice").disabled = !file;
	  if (file) {
	    log("Voice file selected: " + file.name);
	  }
	});

	// Decodes the uploaded sample, trims it to 60 s, resamples it to 16 kHz mono
	// and reduces it to a 512-value, unit-length vector stored in
	// `clonedEmbedding`. The vector is built from per-segment mean and standard
	// deviation of the waveform; no model is involved in computing it.
	$("#processVoice").addEventListener("click", async () => {
	  const file = $("#voiceFile").files[0];
	  if (!file) {
	    showStatus("Please select a voice file!", 'error');
	    return;
	  }

	  const EMBEDDING_SIZE = 512;
	  const TARGET_RATE = 16000;
	  const MAX_SECONDS = 60;

	  $("#processVoice").disabled = true;
	  showStatus("Processing voice sample...", 'info');
	  log("Processing: " + file.name);

	  // Kept outside the try block so it can be closed in `finally`.
	  let audioContext = null;

	  try {
	    const arrayBuffer = await file.arrayBuffer();
	    audioContext = new (window.AudioContext || window.webkitAudioContext)();
	    let audioBuffer = await audioContext.decodeAudioData(arrayBuffer);

	    if (audioBuffer.duration > MAX_SECONDS) {
	      showStatus("⚠️ Trimming to 60s...", 'warning');
	      // Use the buffer's own rate so length and rate always agree.
	      const newLength = Math.min(audioBuffer.length, Math.floor(audioBuffer.sampleRate * MAX_SECONDS));
	      const trimmedBuffer = audioContext.createBuffer(1, newLength, audioBuffer.sampleRate);
	      trimmedBuffer.copyToChannel(audioBuffer.getChannelData(0).subarray(0, newLength), 0);
	      audioBuffer = trimmedBuffer;
	    }

	    if (audioBuffer.sampleRate !== TARGET_RATE) {
	      // The frame count must be a positive integer.
	      const frames = Math.max(1, Math.ceil(audioBuffer.duration * TARGET_RATE));
	      const offlineContext = new OfflineAudioContext(1, frames, TARGET_RATE);
	      const source = offlineContext.createBufferSource();
	      source.buffer = audioBuffer;
	      source.connect(offlineContext.destination);
	      source.start();
	      audioBuffer = await offlineContext.startRendering();
	    }

	    const audioData = audioBuffer.getChannelData(0);
	    if (audioData.length < EMBEDDING_SIZE) {
	      // Fewer samples than segments would give empty segments (0/0 = NaN).
	      throw new Error("Voice sample is too short to process.");
	    }

	    // Build into a local array so a failure never leaves a half-written or
	    // NaN vector in `clonedEmbedding`.
	    const embedding = new Float32Array(EMBEDDING_SIZE);
	    const chunkSize = Math.floor(audioData.length / EMBEDDING_SIZE);

	    for (let i = 0; i < EMBEDDING_SIZE; i++) {
	      const start = i * chunkSize;
	      const end = Math.min(start + chunkSize, audioData.length);
	      let sum = 0, sumSq = 0;

	      for (let j = start; j < end; j++) {
	        sum += audioData[j];
	        sumSq += audioData[j] * audioData[j];
	      }

	      const mean = sum / (end - start);
	      const variance = (sumSq / (end - start)) - (mean * mean);
	      // abs() guards against a tiny negative variance from rounding; the
	      // sign of the deviation term alternates between segments.
	      embedding[i] = mean + Math.sqrt(Math.abs(variance)) * (i % 2 === 0 ? 1 : -1);
	    }

	    // Normalize to unit length.
	    let norm = 0;
	    for (let i = 0; i < EMBEDDING_SIZE; i++) norm += embedding[i] * embedding[i];
	    norm = Math.sqrt(norm);
	    if (!(norm > 0)) {
	      // An all-zero (silent) sample would divide by zero below.
	      throw new Error("Voice sample appears to be silent.");
	    }
	    for (let i = 0; i < EMBEDDING_SIZE; i++) embedding[i] /= norm;

	    clonedEmbedding = embedding;

	    showStatus("✅ Voice processed!", 'success');
	    log("Voice embedding created");
	    $("#voiceStatus").innerHTML = '<div class="status-message success">✅ Voice ready!</div>';

	  } catch (err) {
	    log("ERROR: " + err.message);
	    showStatus("Error: " + err.message, 'error');
	    $("#voiceStatus").innerHTML = '<div class="status-message error">❌ Failed</div>';
	  } finally {
	    $("#processVoice").disabled = false;
	    // Release the audio context created for this click.
	    if (audioContext) {
	      try {
	        await audioContext.close();
	      } catch (closeErr) {
	        // Best-effort cleanup: a failed close must not mask the real outcome.
	        console.warn(closeErr);
	      }
	    }
	  }
	});

	// ===== TEXT CHUNKING & AUDIO CONCATENATION =====
	/**
	 * Splits text into chunks of at most `maxChars` characters for synthesis.
	 *
	 * Sentences (runs ending in '.', '!' or '?', plus any trailing text without
	 * a terminator) are packed greedily into chunks. A single sentence longer
	 * than the limit is split on its own, at a space where possible, otherwise
	 * mid-word. Chunks are trimmed and empty chunks are never returned.
	 *
	 * @param {string} text - Text to split.
	 * @param {number} [maxChars=200] - Maximum chunk length; values that are not
	 *   a finite number >= 1 fall back to 200.
	 * @returns {string[]} Chunks in original order; [] for empty/blank text.
	 */
	function chunkText(text, maxChars = 200) {
	  const limit = Number.isFinite(maxChars) && maxChars >= 1 ? Math.floor(maxChars) : 200;

	  // Breaks one over-long, already trimmed sentence into pieces <= limit.
	  const splitOversized = (sentence) => {
	    const pieces = [];
	    let rest = sentence;
	    while (rest.length > limit) {
	      // Prefer the last space at or before the limit; cut mid-word only
	      // when the first `limit` characters contain no space.
	      let cut = rest.lastIndexOf(" ", limit);
	      if (cut <= 0) cut = limit;
	      pieces.push(rest.slice(0, cut).trim());
	      rest = rest.slice(cut).trim();
	    }
	    if (rest) pieces.push(rest);
	    return pieces;
	  };

	  // The "|$" alternative keeps text after the last terminator, which a
	  // pattern requiring a trailing [.!?] would silently drop.
	  const sentences = text.match(/[^.!?]+(?:[.!?]+|$)/g) || [text];
	  const chunks = [];
	  let currentChunk = "";

	  const flush = () => {
	    const trimmed = currentChunk.trim();
	    if (trimmed) chunks.push(trimmed);
	    currentChunk = "";
	  };

	  for (const sentence of sentences) {
	    if ((currentChunk + sentence).length <= limit) {
	      currentChunk += sentence;
	      continue;
	    }
	    flush();
	    const trimmed = sentence.trim();
	    if (trimmed.length <= limit) {
	      currentChunk = sentence;
	    } else {
	      chunks.push(...splitOversized(trimmed));
	    }
	  }
	  flush();

	  return chunks;
	}

	/**
	 * Joins several sample arrays end to end into one Float32Array.
	 *
	 * @param {ArrayLike<number>[]} audioArrays - Audio pieces, in playback order.
	 * @returns {Float32Array} A new array holding all samples back to back.
	 */
	function concatenateAudio(audioArrays) {
	  let totalLength = 0;
	  for (const part of audioArrays) {
	    totalLength += part.length;
	  }

	  const merged = new Float32Array(totalLength);
	  // The accumulator is the write position for the next piece.
	  audioArrays.reduce((writePos, part) => {
	    merged.set(part, writePos);
	    return writePos + part.length;
	  }, 0);
	  return merged;
	}

	// ===== GENERATION =====
	// Object URL of the most recently generated clip. It is revoked once a newer
	// clip replaces it, so repeated generations do not keep every WAV blob alive.
	let lastAudioUrl = null;

	// ===== GENERATION =====
	// Click handler for "Generate Speech": splits the text into chunks, runs
	// each chunk through the loaded session, joins the audio, and wires the
	// resulting WAV into the player and the download link. Errors are reported
	// through the log, the #status chip and the status banner.
	$("#go").addEventListener("click", async () => {
	  const text = $("#txt").value.trim();
	  if (!text) {
	    showStatus("Please enter text!", 'error');
	    return;
	  }

	  const btn = $("#go");
	  btn.disabled = true;
	  $("#status").className = "chip warning";
	  $("#status").textContent = "Generating...";
	  updateProgress(0);

	  try {
	    if (currentEngine === 'clone') {
	      // Voice cloning is complex and requires a separate model (like SpeechT5).
	      // This is a placeholder for that logic.
	      showStatus("Voice cloning not implemented in this version.", 'error');
	      throw new Error("Voice cloning is a placeholder feature.");
	    }

	    if (!ttsSession) {
	      showStatus("TTS session not ready. Please wait or re-select engine.", 'error');
	      throw new Error("TTS session not initialized.");
	    }

	    // Snapshot the session and engine: both are module-level state that an
	    // engine switch can replace while the awaits below are pending.
	    const session = ttsSession;
	    const engine = currentEngine;

	    const chunks = chunkText(text, 200);
	    log(`Processing ${chunks.length} chunk(s)...`);
	    showStatus(`Processing ${chunks.length} chunk(s)...`, 'info');

	    // Piper selects its voice through the loaded model, so voiceId stays
	    // undefined for it.
	    let voiceId;
	    if (engine === 'kokoro') {
	      voiceId = $("#kokoroVoice").value;
	    } else if (engine === 'kitten') {
	      voiceId = parseInt($("#kittenVoice").value, 10);
	    }

	    const audioChunks = [];
	    let sampleRate;

	    for (let i = 0; i < chunks.length; i++) {
	      const chunk = chunks[i];
	      const progress = ((i + 1) / chunks.length) * 100;
	      updateProgress(progress, `Chunk ${i + 1}/${chunks.length}`);
	      log(`Generating chunk ${i + 1}: "${chunk.substring(0, 30)}..."`);

	      const result = await session.run({
	        text: chunk,
	        voiceId: voiceId, // Only used by Kokoro/Kitten
	      });

	      audioChunks.push(result.audio);
	      // Overwritten on every chunk; the last chunk's rate is the one logged.
	      sampleRate = result.sampleRate;
	    }

	    log("Concatenating audio chunks...");
	    updateProgress(100, "Finalizing...");
	    const finalAudio = concatenateAudio(audioChunks);

	    log(`Generated ${finalAudio.length} samples (${(finalAudio.length / sampleRate).toFixed(1)}s)`);

	    // Create a WAV blob
	    const blob = new Blob([session.encodeWAV(finalAudio)], { type: "audio/wav" });
	    const url = URL.createObjectURL(blob);

	    // Player
	    const player = $("#player");
	    player.src = url;
	    player.playbackRate = parseFloat($("#spd").value);
	    player.classList.remove("hidden");

	    // Download
	    $("#download").href = url;
	    $("#download").download = `tts-${engine}-${Date.now()}.wav`;
	    $("#downloadBox").classList.remove("hidden");

	    // The player and link now point at the new URL, so the old blob can go.
	    if (lastAudioUrl) URL.revokeObjectURL(lastAudioUrl);
	    lastAudioUrl = url;

	    $("#status").className = "chip success";
	    $("#status").textContent = "Success";
	    showStatus("✅ Audio generated successfully!", 'success');
	    updateProgress(0);

	  } catch (err) {
	    log(`ERROR: ${err.message}`);
	    console.error(err);
	    $("#status").className = "chip danger";
	    $("#status").textContent = "Error";
	    showStatus(`Error: ${err.message}`, 'error');
	    updateProgress(0);
	  } finally {
	    btn.disabled = false;
	  }
	});

	// ===== INITIALIZATION =====
	// Announce readiness and show which execution backend the browser offers
	// (WebGPU when navigator.gpu exists, otherwise WASM).
	log("🎉 Ultimate TTS Studio Ready!");
	const backendChip = $("#backend");
	backendChip.className = "chip success";
	backendChip.textContent = navigator.gpu ? "WebGPU" : "WASM";

	// Load the model for the engine/voice that is preselected in the markup.
	await initTTSSession();
	</script>
	</body>
	</html>