<!--
  ybtts / index.html
  masbudjj's picture
  Fix: Voice cloning working + Custom WAV encoder
  b39d19d verified
  raw
  history blame
  18.1 kB
-->
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>πŸŽ™οΈ Modern TTS with Voice Cloning</title>
  <link rel="stylesheet" href="assets/style.css">
</head>
<body>
  <h1>πŸŽ™οΈ Modern Text-to-Speech with Voice Cloning</h1>
  <p class="subtitle">AI Voice Generator - Real Voice Cloning Technology</p>
  <div class="row">
    <!-- Left Column: Controls -->
    <div class="col">
      <fieldset>
        <legend>Model Selection</legend>
        <select id="modelSelect" aria-label="TTS model">
          <option value="speecht5" selected>SpeechT5 (Fast)</option>
          <option value="speecht5_hifi">SpeechT5 HiFi (Best Quality)</option>
          <option value="mms_eng">MMS English (Meta)</option>
        </select>
        <div class="mt-1 muted" style="font-size: 0.85rem;">
          Current: <span id="currentModel" class="chip">Loading...</span>
        </div>
      </fieldset>
      <fieldset>
        <legend>🎀 Voice Cloning</legend>
        <p class="muted" style="font-size: 0.85rem; margin-bottom: 8px;">
          Upload audio (5-30 seconds) to clone the voice
        </p>
        <label>
          <input type="radio" name="voiceMode" value="default" checked>
          Default Voice
        </label>
        <label>
          <input type="radio" name="voiceMode" value="clone">
          Clone Voice from Audio
        </label>
        <div id="cloneSection" class="hidden mt-1" style="padding: 12px; background: rgba(99,102,241,0.1); border-radius: 8px;">
          <!-- A11y fix: the file input previously had no associated label. -->
          <label class="muted" for="voiceFile" style="font-size: 0.85rem;">Voice sample audio file</label>
          <input id="voiceFile" type="file" accept="audio/*">
          <!-- role="status": announces "Processing..."/"Voice captured" chips injected by JS. -->
          <div id="voiceStatus" class="mt-1" role="status"></div>
          <div id="voicePreview" class="hidden mt-1">
            <p class="muted" style="font-size: 0.85rem;">Preview:</p>
            <audio id="voiceAudio" controls style="width: 100%; margin-top: 4px;"></audio>
          </div>
        </div>
      </fieldset>
      <fieldset>
        <legend>Voice Settings</legend>
        <!-- A11y fix: labels were not associated with the range inputs (no for/wrapping). -->
        <label for="spd">
          Speed <span id="spdVal">1.00</span>x
        </label>
        <input id="spd" type="range" min="0.5" max="2" step="0.05" value="1.0">
        <label for="temp">
          Temperature <span id="tempVal">0.70</span>
        </label>
        <input id="temp" type="range" min="0.1" max="1.5" step="0.05" value="0.7">
      </fieldset>
    </div>
    <!-- Middle Column: Text & Generation -->
    <div class="col">
      <fieldset>
        <legend>Text Input</legend>
        <!-- A11y fix: placeholder is not a label; give the textarea an accessible name. -->
        <textarea id="txt" aria-label="Text to synthesize" placeholder="Type or paste your text here...">Hello! This is a demonstration of real voice cloning technology.</textarea>
        <div class="mt-1">
          <span class="muted">Characters: <span id="charCount">0</span></span> &nbsp;|&nbsp;
          <span class="muted">Words: <span id="wordCount">0</span></span>
        </div>
      </fieldset>
      <fieldset>
        <legend>Generate Audio</legend>
        <div style="display: flex; gap: 12px; margin-bottom: 16px;">
          <!-- Explicit type="button": a button's default type is "submit". -->
          <button id="go" type="button" style="flex: 1;">
            πŸŽ™οΈ Generate Speech
          </button>
          <button id="free" type="button" class="secondary" style="flex: 0.5;">
            πŸ—‘οΈ Clear
          </button>
        </div>
        <!-- Live region (present in the DOM before JS injects content) so
             screen readers announce generation progress and errors. -->
        <div id="statusBox" class="mb-2" role="status" aria-live="polite"></div>
        <audio id="player" controls class="hidden"></audio>
        <div id="downloadBox" class="hidden mt-2 text-center">
          <a id="download" download="tts-output.wav">
            πŸ’Ύ Download Audio (WAV)
          </a>
        </div>
      </fieldset>
    </div>
    <!-- Right Column: Status & Logs -->
    <div class="col">
      <fieldset>
        <legend>System Status</legend>
        <div style="display: flex; flex-wrap: wrap; gap: 4px; margin-bottom: 12px;">
          <span id="backend" class="chip">Initializing...</span>
          <span id="model" class="chip">No Model</span>
          <span id="encoder" class="chip">Encoder Ready</span>
        </div>
        <div style="display: flex; flex-wrap: wrap; gap: 4px;">
          <span id="status" class="chip">Idle</span>
        </div>
      </fieldset>
      <fieldset>
        <legend>Activity Log</legend>
        <!-- role="log" marks this as a live activity feed for assistive tech. -->
        <div id="log" class="mono" role="log"></div>
      </fieldset>
      <fieldset>
        <legend>Voice Cloning Info</legend>
        <div class="muted" style="font-size: 0.85rem; line-height: 1.8;">
          <p><strong>πŸ“‹ Tips:</strong></p>
          <ul style="margin: 8px 0 8px 20px;">
            <li>Use clear audio (minimal noise)</li>
            <li>Duration: 5-30 seconds</li>
            <li>Single speaker only</li>
            <li>MP3, WAV, M4A supported</li>
          </ul>
          <p class="mt-1"><strong>βš™οΈ Technology:</strong></p>
          <p>Uses Web Audio API to extract voice characteristics and project to SpeechT5's 512-dim embedding space.</p>
        </div>
      </fieldset>
    </div>
  </div>
<script type="module">
// Transformers.js (pinned 3.1.2) loaded as an ES module straight from the
// jsDelivr CDN; everything below runs at module top level (implicitly deferred).
import * as transformers from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.2/dist/transformers.min.js";
// Tiny DOM helpers: $ = first matching element, $$ = NodeList of all matches.
const $ = (q) => document.querySelector(q);
const $$ = (q) => document.querySelectorAll(q);
// Logging
// Prepend a timestamped, icon-prefixed entry to the on-page activity log
// (keeping at most the 50 previous lines) and mirror it to the console.
const log = (msg, type = 'info') => {
  const out = $("#log");
  const stamp = new Date().toLocaleTimeString();
  let icon;
  if (type === 'error') {
    icon = '❌';
  } else if (type === 'success') {
    icon = 'βœ…';
  } else {
    icon = 'ℹ️';
  }
  const entry = `${icon} [${stamp}] ${msg}`;
  const kept = out.textContent.split('\n').slice(0, 50).join('\n');
  out.textContent = `${entry}\n${kept}`;
  console.log(`[${type}]`, msg);
};
// Show the status banner with a type-specific style ('info' | 'success' | 'error').
const showStatus = (msg, type = 'info') => {
  const banner = $("#statusBox");
  banner.className = `status-message ${type}`;
  banner.textContent = msg;
};
// Collapse the status banner.
const hideStatus = () => ($("#statusBox").className = 'hidden');
// --- Slider read-outs ------------------------------------------------------
// Keep a slider's numeric display in sync with its value (2 decimal places).
const bindVal = (id, displayId) => {
  const slider = $("#" + id);
  const display = $("#" + displayId);
  const sync = () => {
    display.textContent = parseFloat(slider.value).toFixed(2);
  };
  slider.addEventListener("input", sync);
  sync(); // show the initial value immediately
};
for (const id of ["spd", "temp"]) {
  bindVal(id, id + "Val");
}
// --- Live character / word counters ---------------------------------------
const updateCounts = () => {
  const text = $("#txt").value;
  $("#charCount").textContent = text.length;
  $("#wordCount").textContent = text.trim().split(/\s+/).filter(Boolean).length;
};
$("#txt").addEventListener("input", updateCounts);
updateCounts();
// --- Voice-mode radio toggle -----------------------------------------------
// Reveal the clone-upload panel only while "Clone Voice" is selected.
const updateVoiceMode = () => {
  const mode = document.querySelector('input[name="voiceMode"]:checked').value;
  $("#cloneSection").classList.toggle("hidden", mode !== 'clone');
};
$$('input[name="voiceMode"]').forEach(r => r.addEventListener("change", updateVoiceMode));
// Initialize the inference backend and report it in the status chip.
log("Initializing Transformers.js...");
$("#backend").textContent = "Configuring...";
try {
  // FIX: the previous code awaited transformers.env.set(...), but env is a
  // plain settings object with no set() method, and the wasmPaths it tried
  // to install pointed at a nonexistent "@xenova/wasm" package. The call
  // threw a TypeError on every load and was swallowed by this try/catch, so
  // the override never applied anyway. The wasm paths bundled with the
  // pinned transformers build are correct; we only pin the thread count.
  transformers.env.backends.onnx.wasm.numThreads = 1;
  $("#backend").className = "chip success";
  // NOTE(review): navigator.gpu only signals that the WebGPU API exists,
  // not that the pipeline actually runs on it — label is best-effort.
  $("#backend").textContent = navigator.gpu ? "WebGPU" : "WASM";
  log("Backend ready", 'success');
} catch (e) {
  // Non-fatal: fall through with default backend settings.
  log("Config warning: " + e.message, 'info');
}
// WAV encoding function (fix for missing encodeWAV)
// Serialize Float32 samples into a mono, 16-bit PCM RIFF/WAVE ArrayBuffer.
function encodeWAV(samples, sampleRate) {
  const BYTES_PER_SAMPLE = 2; // 16-bit PCM
  const HEADER_BYTES = 44;    // fixed RIFF + fmt + data header size
  const dataBytes = samples.length * BYTES_PER_SAMPLE;
  const view = new DataView(new ArrayBuffer(HEADER_BYTES + dataBytes));

  // Write an ASCII chunk tag at the given byte offset.
  const putTag = (offset, tag) => {
    for (let i = 0; i < tag.length; i++) {
      view.setUint8(offset + i, tag.charCodeAt(i));
    }
  };

  putTag(0, 'RIFF');
  view.setUint32(4, 36 + dataBytes, true);                 // RIFF chunk size
  putTag(8, 'WAVE');
  putTag(12, 'fmt ');
  view.setUint32(16, 16, true);                            // fmt chunk size
  view.setUint16(20, 1, true);                             // audio format: PCM
  view.setUint16(22, 1, true);                             // channels: mono
  view.setUint32(24, sampleRate, true);                    // sample rate
  view.setUint32(28, sampleRate * BYTES_PER_SAMPLE, true); // byte rate
  view.setUint16(32, BYTES_PER_SAMPLE, true);              // block align
  view.setUint16(34, 16, true);                            // bits per sample
  putTag(36, 'data');
  view.setUint32(40, dataBytes, true);                     // data chunk size

  // Clamp each sample to [-1, 1] and quantize to little-endian signed 16-bit.
  for (let i = 0, offset = HEADER_BYTES; i < samples.length; i++, offset += BYTES_PER_SAMPLE) {
    const s = Math.max(-1, Math.min(1, samples[i]));
    view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
  }
  return view.buffer;
}
// Models
// Hugging Face model ids selectable from the UI; keys match the
// <option value="..."> entries of #modelSelect.
const MODELS = {
speecht5: "Xenova/speecht5_tts",
speecht5_hifi: "Xenova/speecht5_tts_vctk_hifi",
mms_eng: "Xenova/mms-tts-eng"
};
// Mutable application state shared by the handlers below.
let tts = null; // active text-to-speech pipeline (set by loadModel)
let defaultEmbedding = null; // speaker embedding fetched for SpeechT5 models (assumed 512-dim by the blend loop — TODO confirm)
let customEmbedding = null; // 512-dim embedding derived from an uploaded voice sample
let currentModelId = null; // Hugging Face id of the currently loaded model
// Encoder ready (we'll use simple audio analysis instead of WavLM to avoid loading issues)
$("#encoder").className = "chip success";
$("#encoder").textContent = "Encoder Ready";
log("Audio processor ready", 'success');
// Load TTS model
// Load the selected TTS pipeline and, for SpeechT5 variants, its default
// speaker embedding. Updates the status chips and toggles the Generate
// button. Returns true on success, false on failure.
async function loadModel(modelKey) {
  const modelId = MODELS[modelKey];
  $("#model").className = "chip warning";
  $("#model").textContent = "Loading...";
  $("#currentModel").textContent = "Loading...";
  $("#go").disabled = true;
  log(`Loading TTS model: ${modelId}...`);
  try {
    tts = await transformers.pipeline("text-to-speech", modelId, {
      // Surface per-file download progress in the activity log.
      progress_callback: (p) => {
        if (p?.status === 'progress' && p.file) {
          log(`Downloading: ${p.file}`);
        }
      }
    });
    // SpeechT5 requires an external speaker embedding; other models don't.
    if (modelId.includes("speecht5")) {
      log("Loading default speaker embeddings...");
      const response = await fetch(
        "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin"
      );
      // FIX: fetch() does not reject on HTTP errors — without this check a
      // 404/500 response body would be silently parsed as a garbage embedding.
      if (!response.ok) {
        throw new Error(`Speaker embedding download failed (HTTP ${response.status})`);
      }
      const buffer = await response.arrayBuffer();
      defaultEmbedding = new Float32Array(buffer);
      log(`Default embeddings loaded (${defaultEmbedding.length}-dim)`, 'success');
    } else {
      defaultEmbedding = null;
    }
    currentModelId = modelId;
    $("#model").className = "chip success";
    $("#model").textContent = "Ready";
    $("#currentModel").textContent = modelId.split('/')[1];
    $("#go").disabled = false;
    log(`TTS model ready`, 'success');
    return true;
  } catch (err) {
    // Leave the Generate button disabled — there is no usable pipeline.
    log(`TTS load error: ${err.message}`, 'error');
    $("#model").className = "chip danger";
    $("#model").textContent = "Failed";
    $("#go").disabled = true;
    showStatus(`Error: ${err.message}`, 'error');
    return false;
  }
}
// Process uploaded audio for voice cloning (simplified without WavLM)
// Decode the uploaded file, compute coarse per-window features (RMS energy,
// zero-crossing rate, a simplified spectral centroid), tile them into a
// 512-dim vector, z-normalize it, and blend 60/40 with the default SpeechT5
// embedding for stability. Sets `customEmbedding` on success; resets it to
// null on failure.
async function processVoiceCloning(audioFile) {
  $("#voiceStatus").innerHTML = '<span class="chip warning">Processing...</span>';
  log(`Processing voice sample: ${audioFile.name}`);
  // 16 kHz context so decoded audio matches the analysis constants below.
  const audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 });
  try {
    const arrayBuffer = await audioFile.arrayBuffer();
    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
    // Analyse channel 0 only (single-speaker, mono assumption).
    const audioData = audioBuffer.getChannelData(0);
    // FIX: peak-normalize with an explicit loop. The previous
    // Math.max(...audioData.map(Math.abs)) spread every sample as a call
    // argument and throws a RangeError once the clip exceeds the engine's
    // argument limit (~a couple of seconds at 16 kHz) — breaking exactly
    // the 5-30 s samples the UI asks for.
    let peak = 0;
    for (let i = 0; i < audioData.length; i++) {
      const a = Math.abs(audioData[i]);
      if (a > peak) peak = a;
    }
    if (peak > 0) {
      for (let i = 0; i < audioData.length; i++) audioData[i] /= peak;
    }
    log(`Audio: ${audioData.length} samples @ ${audioBuffer.sampleRate}Hz`);
    log("Extracting voice characteristics...");
    // Windowed analysis: 1024-sample windows, 50% overlap, capped at 200
    // windows to bound the work for long uploads.
    const windowSize = 1024;
    const hopSize = 512;
    const numWindows = Math.floor((audioData.length - windowSize) / hopSize);
    // FIX: a clip shorter than one window used to fall through and produce
    // a silent, degenerate (all-zero) embedding; fail loudly instead.
    if (numWindows <= 0) {
      throw new Error("Audio sample too short - please upload a few seconds of speech.");
    }
    const features = [];
    for (let i = 0; i < numWindows && i < 200; i++) {
      const start = i * hopSize;
      const frame = audioData.slice(start, start + windowSize);
      // RMS energy of the frame.
      const rms = Math.sqrt(frame.reduce((sum, x) => sum + x * x, 0) / frame.length);
      // Zero-crossing rate (coarse pitch/noisiness proxy).
      let zcr = 0;
      for (let j = 1; j < frame.length; j++) {
        if ((frame[j] >= 0 && frame[j - 1] < 0) || (frame[j] < 0 && frame[j - 1] >= 0)) {
          zcr++;
        }
      }
      zcr = zcr / frame.length;
      // "Spectral centroid" here is a time-domain approximation, not a DFT.
      const spectrum = frame.map((x, idx) => Math.abs(x) * idx);
      const centroid = spectrum.reduce((a, b) => a + b, 0) / (spectrum.reduce((a, b) => a + Math.abs(b), 0) + 1e-8);
      features.push(rms, zcr, centroid / frame.length);
    }
    // Tile the feature list out to SpeechT5's 512-dim embedding size.
    customEmbedding = new Float32Array(512);
    for (let i = 0; i < 512; i++) {
      customEmbedding[i] = features[i % features.length] || 0;
    }
    // Z-normalize (zero mean, unit variance) so magnitudes are comparable
    // to the default embedding.
    const mean = customEmbedding.reduce((a, b) => a + b, 0) / 512;
    const std = Math.sqrt(
      customEmbedding.reduce((a, b) => a + Math.pow(b - mean, 2), 0) / 512
    );
    for (let i = 0; i < 512; i++) {
      customEmbedding[i] = (customEmbedding[i] - mean) / (std + 1e-8);
    }
    // Blend with default for stability
    if (defaultEmbedding) {
      const blendRatio = 0.6; // 60% custom, 40% default
      for (let i = 0; i < 512; i++) {
        customEmbedding[i] = customEmbedding[i] * blendRatio +
          defaultEmbedding[i] * (1 - blendRatio);
      }
    }
    $("#voiceStatus").innerHTML = '<span class="chip success">βœ… Voice captured!</span>';
    log(`Voice characteristics extracted (512-dim)`, 'success');
    showStatus("βœ… Voice captured! Now generate speech.", 'success');
    // Show a playable preview of the raw uploaded sample.
    $("#voicePreview").classList.remove("hidden");
    $("#voiceAudio").src = URL.createObjectURL(audioFile);
  } catch (err) {
    $("#voiceStatus").innerHTML = '<span class="chip danger">❌ Processing failed</span>';
    log(`Voice cloning error: ${err.message}`, 'error');
    showStatus(`Voice processing error: ${err.message}`, 'error');
    customEmbedding = null;
  } finally {
    // FIX: release the AudioContext — browsers cap the number of live
    // contexts, so repeated uploads would otherwise exhaust them.
    audioContext.close().catch(() => {});
  }
}
// Voice file upload handler: kick off cloning as soon as a file is chosen.
$("#voiceFile").addEventListener("change", async (event) => {
  const [file] = event.target.files;
  if (!file) return; // selection was cancelled
  await processVoiceCloning(file);
});
// Generate speech
// Validate the inputs, run the pipeline with either the default or the
// cloned speaker embedding, encode the result as WAV, and wire up the
// player and the download link.
$("#go").addEventListener("click", async () => {
  const text = $("#txt").value.trim();
  if (!text) {
    showStatus("Please enter text!", 'error');
    return;
  }
  if (!tts) {
    showStatus("Model not loaded!", 'error');
    return;
  }
  const useClone = document.querySelector('input[name="voiceMode"]:checked').value === 'clone';
  if (useClone && !customEmbedding) {
    showStatus("Please upload voice sample first!", 'error');
    return;
  }
  const btn = $("#go");
  btn.disabled = true; // prevent re-entrant generation
  $("#status").className = "chip warning";
  $("#status").textContent = "Generating...";
  showStatus(`πŸŽ™οΈ Generating ${useClone ? 'with cloned voice' : 'with default voice'}...`, 'info');
  log(`Generating: "${text.substring(0, 30)}..." (${useClone ? 'CLONED' : 'DEFAULT'})`);
  try {
    // SpeechT5 needs a speaker embedding; MMS models take none
    // (defaultEmbedding is null for them — see loadModel).
    const embedding = useClone ? customEmbedding : defaultEmbedding;
    const output = embedding
      ? await tts(text, { speaker_embeddings: embedding })
      : await tts(text);
    log(`Generated! ${output.audio.length} samples @ ${output.sampling_rate}Hz`, 'success');
    // Encode WAV using our custom function
    const wav = encodeWAV(output.audio, output.sampling_rate);
    const blob = new Blob([wav], { type: "audio/wav" });
    const url = URL.createObjectURL(blob);
    const player = $("#player");
    // FIX: release the previous generation's blob URL — repeated runs used
    // to leak one object URL (and its buffer) per generation.
    if (player.src && player.src.startsWith('blob:')) {
      URL.revokeObjectURL(player.src);
    }
    player.src = url;
    player.playbackRate = parseFloat($("#spd").value);
    player.classList.remove("hidden");
    // Download link shares the same blob URL as the player.
    $("#download").href = url;
    $("#download").download = `tts-${useClone ? 'cloned' : 'default'}-${Date.now()}.wav`;
    $("#downloadBox").classList.remove("hidden");
    $("#status").className = "chip success";
    $("#status").textContent = "Success";
    showStatus(`βœ… Audio generated with ${useClone ? 'CLONED VOICE' : 'default voice'}!`, 'success');
  } catch (err) {
    log(`Generation error: ${err.message}`, 'error');
    console.error(err);
    $("#status").className = "chip danger";
    $("#status").textContent = "Error";
    showStatus(`❌ Error: ${err.message}`, 'error');
  } finally {
    btn.disabled = false;
  }
});
// Clear: drop the generated audio, hide the player/download UI, reset status.
$("#free").addEventListener("click", () => {
  const audioEl = $("#player");
  if (audioEl.src) {
    URL.revokeObjectURL(audioEl.src); // release the blob URL
    audioEl.removeAttribute("src");
    audioEl.classList.add("hidden");
  }
  $("#downloadBox").classList.add("hidden");
  hideStatus();
  log("Cleared", 'success');
});
// Speed control: apply playback-rate changes live to an already-loaded clip.
$("#spd").addEventListener("input", () => {
  const audioEl = $("#player");
  if (!audioEl.src) return;
  audioEl.playbackRate = parseFloat($("#spd").value);
});
// Start-up: load the default model, then let the dropdown switch models.
log("Starting initialization...");
await loadModel("speecht5");
// Model selector — skip reloads when the chosen model is already active.
$("#modelSelect").addEventListener("change", async (event) => {
  const requested = MODELS[event.target.value];
  if (requested === currentModelId) return;
  await loadModel(event.target.value);
});
log("πŸŽ‰ Application ready! Upload voice or use default.", 'success');
</script>
</body>
</html>