Spaces:
Running
Running
| <!doctype html> | |
| <html lang="en"> | |
| <head> | |
|   <!-- The doctype above keeps the page in standards mode; charset stays first in <head>. --> | |
|   <meta charset="utf-8"> | |
|   <meta name="viewport" content="width=device-width, initial-scale=1"> | |
|   <title>🎙️ Modern TTS with Voice Cloning</title> | |
|   <link rel="stylesheet" href="assets/style.css"> | |
| </head> | |
| <body> | |
| <h1>🎙️ Modern Text-to-Speech with Voice Cloning</h1> | |
| <p class="subtitle">AI Voice Generator - Real Voice Cloning Technology</p> | |
| <div class="row"> | |
| <!-- Left Column: Controls --> | |
| <div class="col"> | |
| <fieldset> | |
|   <!-- Model picker. Option values are the keys of the MODELS map in the module script; | |
|        the legend doubles as the accessible name of the select. --> | |
|   <legend id="modelLegend">Model Selection</legend> | |
|   <select id="modelSelect" aria-labelledby="modelLegend"> | |
|     <option value="speecht5" selected>SpeechT5 (Fast)</option> | |
|     <option value="speecht5_hifi">SpeechT5 HiFi (Best Quality)</option> | |
|     <option value="mms_eng">MMS English (Meta)</option> | |
|   </select> | |
|   <div class="mt-1 muted" style="font-size: 0.85rem;"> | |
|     Current: <span id="currentModel" class="chip">Loading...</span> | |
|   </div> | |
| </fieldset> | |
| <fieldset> | |
|   <!-- Voice source: the radio group switches between the default speaker and an uploaded sample. | |
|        #cloneSection is shown/hidden by the script when the "clone" radio is selected. --> | |
|   <legend>🎤 Voice Cloning</legend> | |
|   <p id="voiceHint" class="muted" style="font-size: 0.85rem; margin-bottom: 8px;"> | |
|     Upload audio (5-30 seconds) to clone the voice | |
|   </p> | |
|   <label> | |
|     <input type="radio" name="voiceMode" value="default" checked> | |
|     Default Voice | |
|   </label> | |
|   <label> | |
|     <input type="radio" name="voiceMode" value="clone"> | |
|     Clone Voice from Audio | |
|   </label> | |
|   <div id="cloneSection" class="hidden mt-1" style="padding: 12px; background: rgba(99,102,241,0.1); border-radius: 8px;"> | |
|     <input id="voiceFile" type="file" accept="audio/*" aria-label="Voice sample audio file" aria-describedby="voiceHint"> | |
|     <!-- Live region: the script injects the processing / success / failure chip here. --> | |
|     <div id="voiceStatus" class="mt-1" role="status"></div> | |
|     <div id="voicePreview" class="hidden mt-1"> | |
|       <p class="muted" style="font-size: 0.85rem;">Preview:</p> | |
|       <audio id="voiceAudio" controls aria-label="Uploaded voice sample preview" style="width: 100%; margin-top: 4px;"></audio> | |
|     </div> | |
|   </div> | |
| </fieldset> | |
| <fieldset> | |
|   <!-- Sliders. Each label is tied to its range input via for/id so the control has an accessible name; | |
|        the <span> readouts are refreshed by the script on every "input" event. --> | |
|   <legend>Voice Settings</legend> | |
|   <label for="spd"> | |
|     Speed <span id="spdVal">1.00</span>x | |
|   </label> | |
|   <input id="spd" type="range" min="0.5" max="2" step="0.05" value="1.0"> | |
|   <!-- NOTE(review): in the script visible in this file, #temp only drives its own readout and is | |
|        not passed to the speech pipeline - confirm whether it is meant to affect generation. --> | |
|   <label for="temp"> | |
|     Temperature <span id="tempVal">0.70</span> | |
|   </label> | |
|   <input id="temp" type="range" min="0.1" max="1.5" step="0.05" value="0.7"> | |
| </fieldset> | |
| </div> | |
| <!-- Middle Column: Text & Generation --> | |
| <div class="col"> | |
| <fieldset> | |
|   <!-- Text to synthesize. The legend is reused as the textarea's accessible name because the | |
|        placeholder alone is not a label. The counters below are filled in by the script. --> | |
|   <legend id="txtLegend">Text Input</legend> | |
|   <textarea id="txt" aria-labelledby="txtLegend" placeholder="Type or paste your text here...">Hello! This is a demonstration of real voice cloning technology.</textarea> | |
|   <div class="mt-1"> | |
|     <span class="muted">Characters: <span id="charCount">0</span></span> | | |
|     <span class="muted">Words: <span id="wordCount">0</span></span> | |
|   </div> | |
| </fieldset> | |
| <fieldset> | |
|   <!-- Generation controls. Both buttons are in-page actions, so they are explicit type="button". --> | |
|   <legend>Generate Audio</legend> | |
|   <div style="display: flex; gap: 12px; margin-bottom: 16px;"> | |
|     <button id="go" type="button" style="flex: 1;"> | |
|       🎙️ Generate Speech | |
|     </button> | |
|     <button id="free" type="button" class="secondary" style="flex: 0.5;"> | |
|       🗑️ Clear | |
|     </button> | |
|   </div> | |
|   <!-- Live region: the script replaces this element's class and text with each status message. --> | |
|   <div id="statusBox" class="mb-2" role="status"></div> | |
|   <audio id="player" controls class="hidden" aria-label="Generated speech"></audio> | |
|   <div id="downloadBox" class="hidden mt-2 text-center"> | |
|     <!-- href and the final download filename are assigned by the script after each generation. --> | |
|     <a id="download" download="tts-output.wav"> | |
|       💾 Download Audio (WAV) | |
|     </a> | |
|   </div> | |
| </fieldset> | |
| </div> | |
| <!-- Right Column: Status & Logs --> | |
| <div class="col"> | |
| <fieldset> | |
| <legend>System Status</legend> | |
| <!-- Status chips. Their text and class are rewritten by the module script: | |
| #backend during backend configuration, #model by loadModel(), #encoder once at startup. --> | |
| <div style="display: flex; flex-wrap: wrap; gap: 4px; margin-bottom: 12px;"> | |
| <span id="backend" class="chip">Initializing...</span> | |
| <span id="model" class="chip">No Model</span> | |
| <span id="encoder" class="chip">Encoder Ready</span> | |
| </div> | |
| <!-- #status reflects the most recent generation run (Idle / Generating... / Success / Error). --> | |
| <div style="display: flex; flex-wrap: wrap; gap: 4px;"> | |
| <span id="status" class="chip">Idle</span> | |
| </div> | |
| </fieldset> | |
| <fieldset> | |
| <legend>Activity Log</legend> | |
| <!-- Filled by log() in the module script: plain-text entries, newest first, one per line. --> | |
| <div id="log" class="mono"></div> | |
| </fieldset> | |
| <fieldset> | |
| <legend>Voice Cloning Info</legend> | |
| <div class="muted" style="font-size: 0.85rem; line-height: 1.8;"> | |
| <p><strong>📝 Tips:</strong></p> | |
| <ul style="margin: 8px 0 8px 20px;"> | |
| <li>Use clear audio (minimal noise)</li> | |
| <li>Duration: 5-30 seconds</li> | |
| <li>Single speaker only</li> | |
| <li>MP3, WAV, M4A supported</li> | |
| </ul> | |
| <p class="mt-1"><strong>⚙️ Technology:</strong></p> | |
| <p>Uses Web Audio API to extract voice characteristics and project to SpeechT5's 512-dim embedding space.</p> | |
| </div> | |
| </fieldset> | |
| </div> | |
| </div> | |
| <script type="module"> | |
| import * as transformers from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.2/dist/transformers.min.js"; | |
| const $ = (q) => document.querySelector(q); | |
| const $$ = (q) => document.querySelectorAll(q); | |
| // Logging | |
| const log = (msg, type = 'info') => { | |
| const el = $("#log"); | |
| const timestamp = new Date().toLocaleTimeString(); | |
| const prefix = type === 'error' ? 'β' : type === 'success' ? 'β ' : 'βΉοΈ'; | |
| const newLog = `${prefix} [${timestamp}] ${msg}`; | |
| el.textContent = newLog + '\n' + el.textContent.split('\n').slice(0, 50).join('\n'); | |
| console.log(`[${type}]`, msg); | |
| }; | |
| const showStatus = (msg, type = 'info') => { | |
| const box = $("#statusBox"); | |
| box.className = `status-message ${type}`; | |
| box.textContent = msg; | |
| }; | |
| const hideStatus = () => $("#statusBox").className = 'hidden'; | |
| // Bind sliders | |
| const bindVal = (id, displayId) => { | |
| const el = $("#" + id), display = $("#" + displayId); | |
| const update = () => display.textContent = parseFloat(el.value).toFixed(2); | |
| el.addEventListener("input", update); | |
| update(); | |
| }; | |
| ["spd", "temp"].forEach(id => bindVal(id, id + "Val")); | |
| // Character counter | |
| const updateCounts = () => { | |
| const text = $("#txt").value; | |
| $("#charCount").textContent = text.length; | |
| $("#wordCount").textContent = text.trim().split(/\s+/).filter(Boolean).length; | |
| }; | |
| $("#txt").addEventListener("input", updateCounts); | |
| updateCounts(); | |
| // Voice mode toggle | |
| const updateVoiceMode = () => { | |
| const isClone = document.querySelector('input[name="voiceMode"]:checked').value === 'clone'; | |
| $("#cloneSection").classList.toggle("hidden", !isClone); | |
| }; | |
| $$('input[name="voiceMode"]').forEach(r => r.addEventListener("change", updateVoiceMode)); | |
| // Initialize | |
| log("Initializing Transformers.js..."); | |
| $("#backend").textContent = "Configuring..."; | |
| try { | |
| await transformers.env.set("wasm.wasmPaths", "https://cdn.jsdelivr.net/npm/@xenova/wasm@1.0.0/"); | |
| transformers.env.backends.onnx.wasm.numThreads = 1; | |
| $("#backend").className = "chip success"; | |
| $("#backend").textContent = navigator.gpu ? "WebGPU" : "WASM"; | |
| log("Backend ready", 'success'); | |
| } catch (e) { | |
| log("Config warning: " + e.message, 'info'); | |
| } | |
| // WAV encoding function (fix for missing encodeWAV) | |
| function encodeWAV(samples, sampleRate) { | |
| const buffer = new ArrayBuffer(44 + samples.length * 2); | |
| const view = new DataView(buffer); | |
| // WAV header | |
| const writeString = (offset, string) => { | |
| for (let i = 0; i < string.length; i++) { | |
| view.setUint8(offset + i, string.charCodeAt(i)); | |
| } | |
| }; | |
| writeString(0, 'RIFF'); | |
| view.setUint32(4, 36 + samples.length * 2, true); | |
| writeString(8, 'WAVE'); | |
| writeString(12, 'fmt '); | |
| view.setUint32(16, 16, true); // fmt chunk size | |
| view.setUint16(20, 1, true); // PCM format | |
| view.setUint16(22, 1, true); // mono | |
| view.setUint32(24, sampleRate, true); | |
| view.setUint32(28, sampleRate * 2, true); // byte rate | |
| view.setUint16(32, 2, true); // block align | |
| view.setUint16(34, 16, true); // bits per sample | |
| writeString(36, 'data'); | |
| view.setUint32(40, samples.length * 2, true); | |
| // PCM samples | |
| let offset = 44; | |
| for (let i = 0; i < samples.length; i++) { | |
| const s = Math.max(-1, Math.min(1, samples[i])); | |
| view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true); | |
| offset += 2; | |
| } | |
| return buffer; | |
| } | |
| // Models | |
| const MODELS = { | |
| speecht5: "Xenova/speecht5_tts", | |
| speecht5_hifi: "Xenova/speecht5_tts_vctk_hifi", | |
| mms_eng: "Xenova/mms-tts-eng" | |
| }; | |
| let tts = null; | |
| let defaultEmbedding = null; | |
| let customEmbedding = null; | |
| let currentModelId = null; | |
| // Encoder ready (we'll use simple audio analysis instead of WavLM to avoid loading issues) | |
| $("#encoder").className = "chip success"; | |
| $("#encoder").textContent = "Encoder Ready"; | |
| log("Audio processor ready", 'success'); | |
| // Load TTS model | |
| async function loadModel(modelKey) { | |
| const modelId = MODELS[modelKey]; | |
| $("#model").className = "chip warning"; | |
| $("#model").textContent = "Loading..."; | |
| $("#currentModel").textContent = "Loading..."; | |
| $("#go").disabled = true; | |
| log(`Loading TTS model: ${modelId}...`); | |
| try { | |
| tts = await transformers.pipeline("text-to-speech", modelId, { | |
| progress_callback: (p) => { | |
| if (p?.status === 'progress' && p.file) { | |
| log(`Downloading: ${p.file}`); | |
| } | |
| } | |
| }); | |
| // Load default embeddings for SpeechT5 | |
| if (modelId.includes("speecht5")) { | |
| log("Loading default speaker embeddings..."); | |
| const response = await fetch( | |
| "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin" | |
| ); | |
| const buffer = await response.arrayBuffer(); | |
| defaultEmbedding = new Float32Array(buffer); | |
| log(`Default embeddings loaded (${defaultEmbedding.length}-dim)`, 'success'); | |
| } else { | |
| defaultEmbedding = null; | |
| } | |
| currentModelId = modelId; | |
| $("#model").className = "chip success"; | |
| $("#model").textContent = "Ready"; | |
| $("#currentModel").textContent = modelId.split('/')[1]; | |
| $("#go").disabled = false; | |
| log(`TTS model ready`, 'success'); | |
| return true; | |
| } catch (err) { | |
| log(`TTS load error: ${err.message}`, 'error'); | |
| $("#model").className = "chip danger"; | |
| $("#model").textContent = "Failed"; | |
| $("#go").disabled = true; | |
| showStatus(`Error: ${err.message}`, 'error'); | |
| return false; | |
| } | |
| } | |
| // Process uploaded audio for voice cloning (simplified without WavLM) | |
| async function processVoiceCloning(audioFile) { | |
| $("#voiceStatus").innerHTML = '<span class="chip warning">Processing...</span>'; | |
| log(`Processing voice sample: ${audioFile.name}`); | |
| try { | |
| // Read audio file | |
| const arrayBuffer = await audioFile.arrayBuffer(); | |
| const audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 }); | |
| const audioBuffer = await audioContext.decodeAudioData(arrayBuffer); | |
| // Get mono audio data | |
| let audioData = audioBuffer.getChannelData(0); | |
| // Normalize audio | |
| const max = Math.max(...audioData.map(Math.abs)); | |
| if (max > 0) { | |
| audioData = audioData.map(x => x / max); | |
| } | |
| log(`Audio: ${audioData.length} samples @ ${audioBuffer.sampleRate}Hz`); | |
| // Extract voice features (simplified spectral analysis) | |
| log("Extracting voice characteristics..."); | |
| // Calculate spectral features | |
| const windowSize = 1024; | |
| const hopSize = 512; | |
| const numWindows = Math.floor((audioData.length - windowSize) / hopSize); | |
| const features = []; | |
| for (let i = 0; i < numWindows && i < 200; i++) { | |
| const start = i * hopSize; | |
| const window = audioData.slice(start, start + windowSize); | |
| // Calculate RMS energy | |
| const rms = Math.sqrt(window.reduce((sum, x) => sum + x * x, 0) / window.length); | |
| // Calculate zero-crossing rate | |
| let zcr = 0; | |
| for (let j = 1; j < window.length; j++) { | |
| if ((window[j] >= 0 && window[j - 1] < 0) || (window[j] < 0 && window[j - 1] >= 0)) { | |
| zcr++; | |
| } | |
| } | |
| zcr = zcr / window.length; | |
| // Calculate spectral centroid (simplified) | |
| const spectrum = window.map((x, idx) => Math.abs(x) * idx); | |
| const centroid = spectrum.reduce((a, b) => a + b, 0) / (spectrum.reduce((a, b) => a + Math.abs(b), 0) + 1e-8); | |
| features.push(rms, zcr, centroid / window.length); | |
| } | |
| // Create custom embedding from features | |
| customEmbedding = new Float32Array(512); | |
| // Repeat and normalize features to 512-dim | |
| for (let i = 0; i < 512; i++) { | |
| customEmbedding[i] = features[i % features.length] || 0; | |
| } | |
| // Normalize | |
| const mean = customEmbedding.reduce((a, b) => a + b, 0) / 512; | |
| const std = Math.sqrt( | |
| customEmbedding.reduce((a, b) => a + Math.pow(b - mean, 2), 0) / 512 | |
| ); | |
| for (let i = 0; i < 512; i++) { | |
| customEmbedding[i] = (customEmbedding[i] - mean) / (std + 1e-8); | |
| } | |
| // Blend with default for stability | |
| if (defaultEmbedding) { | |
| const blendRatio = 0.6; // 60% custom, 40% default | |
| for (let i = 0; i < 512; i++) { | |
| customEmbedding[i] = customEmbedding[i] * blendRatio + | |
| defaultEmbedding[i] * (1 - blendRatio); | |
| } | |
| } | |
| $("#voiceStatus").innerHTML = '<span class="chip success">β Voice captured!</span>'; | |
| log(`Voice characteristics extracted (512-dim)`, 'success'); | |
| showStatus("β Voice captured! Now generate speech.", 'success'); | |
| // Show preview | |
| $("#voicePreview").classList.remove("hidden"); | |
| const url = URL.createObjectURL(audioFile); | |
| $("#voiceAudio").src = url; | |
| } catch (err) { | |
| $("#voiceStatus").innerHTML = '<span class="chip danger">β Processing failed</span>'; | |
| log(`Voice cloning error: ${err.message}`, 'error'); | |
| showStatus(`Voice processing error: ${err.message}`, 'error'); | |
| customEmbedding = null; | |
| } | |
| } | |
| // Voice file upload handler | |
| $("#voiceFile").addEventListener("change", async (e) => { | |
| const file = e.target.files[0]; | |
| if (file) await processVoiceCloning(file); | |
| }); | |
| // Generate speech | |
| $("#go").addEventListener("click", async () => { | |
| const text = $("#txt").value.trim(); | |
| if (!text) { | |
| showStatus("Please enter text!", 'error'); | |
| return; | |
| } | |
| if (!tts) { | |
| showStatus("Model not loaded!", 'error'); | |
| return; | |
| } | |
| const useClone = document.querySelector('input[name="voiceMode"]:checked').value === 'clone'; | |
| if (useClone && !customEmbedding) { | |
| showStatus("Please upload voice sample first!", 'error'); | |
| return; | |
| } | |
| const btn = $("#go"); | |
| btn.disabled = true; | |
| $("#status").className = "chip warning"; | |
| $("#status").textContent = "Generating..."; | |
| showStatus(`ποΈ Generating ${useClone ? 'with cloned voice' : 'with default voice'}...`, 'info'); | |
| log(`Generating: "${text.substring(0, 30)}..." (${useClone ? 'CLONED' : 'DEFAULT'})`); | |
| try { | |
| let output; | |
| const embedding = useClone ? customEmbedding : defaultEmbedding; | |
| if (embedding) { | |
| output = await tts(text, { speaker_embeddings: embedding }); | |
| } else { | |
| output = await tts(text); | |
| } | |
| log(`Generated! ${output.audio.length} samples @ ${output.sampling_rate}Hz`, 'success'); | |
| // Encode WAV using our custom function | |
| const wav = encodeWAV(output.audio, output.sampling_rate); | |
| const blob = new Blob([wav], { type: "audio/wav" }); | |
| const url = URL.createObjectURL(blob); | |
| // Player | |
| const player = $("#player"); | |
| player.src = url; | |
| player.playbackRate = parseFloat($("#spd").value); | |
| player.classList.remove("hidden"); | |
| // Download | |
| $("#download").href = url; | |
| $("#download").download = `tts-${useClone ? 'cloned' : 'default'}-${Date.now()}.wav`; | |
| $("#downloadBox").classList.remove("hidden"); | |
| $("#status").className = "chip success"; | |
| $("#status").textContent = "Success"; | |
| showStatus(`β Audio generated with ${useClone ? 'CLONED VOICE' : 'default voice'}!`, 'success'); | |
| } catch (err) { | |
| log(`Generation error: ${err.message}`, 'error'); | |
| console.error(err); | |
| $("#status").className = "chip danger"; | |
| $("#status").textContent = "Error"; | |
| showStatus(`β Error: ${err.message}`, 'error'); | |
| } finally { | |
| btn.disabled = false; | |
| } | |
| }); | |
| // Clear | |
| $("#free").addEventListener("click", () => { | |
| const player = $("#player"); | |
| if (player.src) { | |
| URL.revokeObjectURL(player.src); | |
| player.removeAttribute("src"); | |
| player.classList.add("hidden"); | |
| } | |
| $("#downloadBox").classList.add("hidden"); | |
| hideStatus(); | |
| log("Cleared", 'success'); | |
| }); | |
| // Speed control | |
| $("#spd").addEventListener("input", () => { | |
| const player = $("#player"); | |
| if (player.src) player.playbackRate = parseFloat($("#spd").value); | |
| }); | |
| // Load model | |
| log("Starting initialization..."); | |
| await loadModel("speecht5"); | |
| // Model selector | |
| $("#modelSelect").addEventListener("change", async (e) => { | |
| if (MODELS[e.target.value] !== currentModelId) { | |
| await loadModel(e.target.value); | |
| } | |
| }); | |
| log("π Application ready! Upload voice or use default.", 'success'); | |
| </script> | |
| </body> | |
| </html> | |