Feature: Real Voice Cloning with WavLM Speaker Encoder

#4
by masbudjj - opened
Files changed (1) hide show
  1. index.html +244 -113
index.html CHANGED
@@ -3,12 +3,12 @@
3
  <head>
4
  <meta charset="utf-8" />
5
  <meta name="viewport" content="width=device-width,initial-scale=1" />
6
- <title>🎙️ Modern TTS - Browser AI Voice Generator</title>
7
  <link rel="stylesheet" href="assets/style.css" />
8
  </head>
9
  <body>
10
- <h1>🎙️ Modern Text-to-Speech</h1>
11
- <p class="subtitle">AI Voice Generator - Fully Client-Side with Transformers.js</p>
12
 
13
  <div class="row">
14
  <!-- Left Column: Controls -->
@@ -26,41 +26,43 @@
26
  </fieldset>
27
 
28
  <fieldset>
29
- <legend>Voice Settings</legend>
 
 
 
30
 
31
  <label>
32
- Speed <span id="spdVal">1.00</span>x
 
33
  </label>
34
- <input id="spd" type="range" min="0.5" max="2" step="0.05" value="1.0">
35
-
36
  <label>
37
- Temperature <span id="tempVal">0.70</span>
 
38
  </label>
39
- <input id="temp" type="range" min="0.1" max="1.5" step="0.05" value="0.7">
40
 
41
- <label>
42
- Top P (Nucleus Sampling) <span id="toppVal">0.80</span>
43
- </label>
44
- <input id="topp" type="range" min="0.01" max="1" step="0.01" value="0.8">
 
 
 
 
 
45
  </fieldset>
46
 
47
  <fieldset>
48
- <legend>Advanced Options</legend>
49
-
50
- <label>
51
- <input id="doSample" type="checkbox" checked>
52
- Enable Sampling (More Natural)
53
- </label>
54
 
55
  <label>
56
- Top K <span id="topkVal">0</span> <span class="muted">(0 = auto)</span>
57
  </label>
58
- <input id="topk" type="range" min="0" max="50" step="1" value="0">
59
 
60
  <label>
61
- Repetition Penalty <span id="rpVal">1.00</span>
62
  </label>
63
- <input id="rp" type="range" min="0.8" max="2" step="0.05" value="1.0">
64
  </fieldset>
65
  </div>
66
 
@@ -68,7 +70,7 @@
68
  <div class="col">
69
  <fieldset>
70
  <legend>Text Input</legend>
71
- <textarea id="txt" placeholder="Type or paste your text here...">Hello! This is a modern text-to-speech demo powered by Transformers.js.</textarea>
72
  <div class="mt-1">
73
  <span class="muted">Characters: <span id="charCount">0</span></span> &nbsp;|&nbsp;
74
  <span class="muted">Words: <span id="wordCount">0</span></span>
@@ -106,6 +108,9 @@
106
  <div style="display: flex; flex-wrap: wrap; gap: 4px; margin-bottom: 12px;">
107
  <span id="backend" class="chip">Initializing...</span>
108
  <span id="model" class="chip">No Model</span>
 
 
 
109
  <span id="status" class="chip">Idle</span>
110
  </div>
111
  </fieldset>
@@ -116,17 +121,17 @@
116
  </fieldset>
117
 
118
  <fieldset>
119
- <legend>Information</legend>
120
  <div class="muted" style="font-size: 0.85rem; line-height: 1.8;">
121
- <p><strong>✨ Features:</strong></p>
122
  <ul style="margin: 8px 0 8px 20px;">
123
- <li>100% Browser-based (No Server)</li>
124
- <li>3 AI Models Available</li>
125
- <li>WebGPU/WASM Acceleration</li>
126
- <li>Advanced Voice Control</li>
127
- <li>Instant Download</li>
128
  </ul>
129
- <p class="mt-1"><strong>💡 Tip:</strong> First load downloads model weights (~50-100MB). Subsequent runs use cache.</p>
 
130
  </div>
131
  </fieldset>
132
  </div>
@@ -136,54 +141,52 @@
136
  import * as transformers from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.2/dist/transformers.min.js";
137
 
138
  const $ = (q) => document.querySelector(q);
 
139
 
140
- // Logging utility
141
  const log = (msg, type = 'info') => {
142
  const el = $("#log");
143
  const timestamp = new Date().toLocaleTimeString();
144
  const prefix = type === 'error' ? '❌' : type === 'success' ? 'βœ…' : 'ℹ️';
145
  const newLog = `${prefix} [${timestamp}] ${msg}`;
146
- el.textContent = newLog + '\n' + el.textContent;
147
  console.log(`[${type}]`, msg);
148
  };
149
 
150
- // Status message utility
151
  const showStatus = (msg, type = 'info') => {
152
  const box = $("#statusBox");
153
  box.className = `status-message ${type}`;
154
  box.textContent = msg;
155
  };
156
 
157
- const hideStatus = () => {
158
- $("#statusBox").className = 'hidden';
159
- };
160
 
161
- // Bind value displays
162
  const bindVal = (id, displayId) => {
163
- const el = $("#" + id);
164
- const display = $("#" + displayId);
165
- const update = () => {
166
- const isInt = ['topk'].includes(id);
167
- display.textContent = isInt ? el.value : parseFloat(el.value).toFixed(2);
168
- };
169
  el.addEventListener("input", update);
170
  update();
171
  };
 
172
 
173
- // Bind all sliders
174
- ["spd", "temp", "topp", "topk", "rp"].forEach(id => bindVal(id, id + "Val"));
175
-
176
- // Character/word counter
177
  const updateCounts = () => {
178
  const text = $("#txt").value;
179
  $("#charCount").textContent = text.length;
180
- const words = text.trim().split(/\s+/).filter(Boolean).length;
181
- $("#wordCount").textContent = words;
182
  };
183
  $("#txt").addEventListener("input", updateCounts);
184
  updateCounts();
185
 
186
- // Initialize transformers.js
 
 
 
 
 
 
 
187
  log("Initializing Transformers.js...");
188
  $("#backend").textContent = "Configuring...";
189
 
@@ -191,20 +194,14 @@
191
  await transformers.env.set("wasm.wasmPaths", "https://cdn.jsdelivr.net/npm/@xenova/wasm@1.0.0/");
192
  transformers.env.backends.onnx.wasm.numThreads = 1;
193
 
194
- if (navigator.gpu) {
195
- $("#backend").className = "chip success";
196
- $("#backend").textContent = "WebGPU Ready";
197
- log("WebGPU acceleration available", 'success');
198
- } else {
199
- $("#backend").className = "chip warning";
200
- $("#backend").textContent = "WASM";
201
- log("Using WASM", 'info');
202
- }
203
  } catch (e) {
204
  log("Config warning: " + e.message, 'info');
205
  }
206
 
207
- // Available models
208
  const MODELS = {
209
  speecht5: "Xenova/speecht5_tts",
210
  speecht5_hifi: "Xenova/speecht5_tts_vctk_hifi",
@@ -212,38 +209,66 @@
212
  };
213
 
214
  let tts = null;
215
- let speakerEmbeddings = null;
 
 
216
  let currentModelId = null;
217
 
218
- // Load model function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  async function loadModel(modelKey) {
220
  const modelId = MODELS[modelKey];
221
-
222
  $("#model").className = "chip warning";
223
  $("#model").textContent = "Loading...";
224
  $("#currentModel").textContent = "Loading...";
225
  $("#go").disabled = true;
226
- log(`Loading model: ${modelId}...`);
227
 
228
  try {
229
- // Load TTS model
230
  tts = await transformers.pipeline("text-to-speech", modelId, {
231
- progress_callback: (progress) => {
232
- if (progress?.status === 'progress' && progress.file) {
233
- log(`Downloading: ${progress.file}...`);
234
  }
235
  }
236
  });
237
 
238
- // Load default speaker embeddings for SpeechT5
239
  if (modelId.includes("speecht5")) {
240
- log("Loading speaker embeddings...");
241
- speakerEmbeddings = await transformers.env.loadRemoteFile(
242
  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin"
243
  );
244
- log("Speaker embeddings loaded", 'success');
 
 
245
  } else {
246
- speakerEmbeddings = null;
247
  }
248
 
249
  currentModelId = modelId;
@@ -251,40 +276,140 @@
251
  $("#model").textContent = "Ready";
252
  $("#currentModel").textContent = modelId.split('/')[1];
253
  $("#go").disabled = false;
254
- log(`Model ready: ${modelId}`, 'success');
255
-
256
  return true;
257
  } catch (err) {
258
- log(`Failed to load model: ${err.message}`, 'error');
259
  $("#model").className = "chip danger";
260
  $("#model").textContent = "Failed";
261
  $("#go").disabled = true;
262
- showStatus(`Error loading model: ${err.message}`, 'error');
263
  return false;
264
  }
265
  }
266
 
267
- // Load default model
268
- await loadModel("speecht5");
 
 
269
 
270
- // Model selector
271
- $("#modelSelect").addEventListener("change", async (e) => {
272
- const selectedModel = e.target.value;
273
- if (MODELS[selectedModel] !== currentModelId) {
274
- await loadModel(selectedModel);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  }
 
 
276
  });
277
 
278
  // Generate speech
279
  $("#go").addEventListener("click", async () => {
280
  const text = $("#txt").value.trim();
281
  if (!text) {
282
- showStatus("Please enter some text first!", 'error');
283
  return;
284
  }
285
 
286
  if (!tts) {
287
- showStatus("Model not loaded yet. Please wait...", 'error');
 
 
 
 
 
 
 
288
  return;
289
  }
290
 
@@ -292,48 +417,43 @@
292
  btn.disabled = true;
293
  $("#status").className = "chip warning";
294
  $("#status").textContent = "Generating...";
295
- showStatus("πŸŽ™οΈ Generating speech... This may take a moment.", 'info');
296
- log(`Generating: "${text.substring(0, 30)}..."`);
297
 
298
  try {
299
  let output;
 
300
 
301
- // Generate based on model type
302
- if (speakerEmbeddings) {
303
- // SpeechT5 needs speaker embeddings
304
- output = await tts(text, {
305
- speaker_embeddings: speakerEmbeddings
306
- });
307
  } else {
308
- // Other models
309
  output = await tts(text);
310
  }
311
 
312
- log(`Generated! Sample rate: ${output.sampling_rate}Hz`, 'success');
313
 
314
- // Encode to WAV
315
  const wav = transformers.utils.encodeWAV(output.audio, output.sampling_rate);
316
  const blob = new Blob([wav], { type: "audio/wav" });
317
  const url = URL.createObjectURL(blob);
318
 
319
- // Setup player
320
  const player = $("#player");
321
  player.src = url;
322
  player.playbackRate = parseFloat($("#spd").value);
323
  player.classList.remove("hidden");
324
 
325
- // Setup download
326
- const downloadLink = $("#download");
327
- downloadLink.href = url;
328
- downloadLink.download = `tts-${Date.now()}.wav`;
329
  $("#downloadBox").classList.remove("hidden");
330
 
331
  $("#status").className = "chip success";
332
  $("#status").textContent = "Success";
333
- showStatus("βœ… Audio generated! Click play or download.", 'success');
334
 
335
  } catch (err) {
336
- log(`Generation failed: ${err.message}`, 'error');
337
  console.error(err);
338
  $("#status").className = "chip danger";
339
  $("#status").textContent = "Error";
@@ -343,7 +463,7 @@
343
  }
344
  });
345
 
346
- // Free memory
347
  $("#free").addEventListener("click", () => {
348
  const player = $("#player");
349
  if (player.src) {
@@ -351,21 +471,32 @@
351
  player.removeAttribute("src");
352
  player.classList.add("hidden");
353
  }
354
-
355
  $("#downloadBox").classList.add("hidden");
356
  hideStatus();
357
- log("Memory cleared", 'success');
358
  });
359
 
360
- // Update playback speed
361
  $("#spd").addEventListener("input", () => {
362
  const player = $("#player");
363
- if (player.src) {
364
- player.playbackRate = parseFloat($("#spd").value);
 
 
 
 
 
 
 
 
 
 
 
 
365
  }
366
  });
367
 
368
- log("Application ready!", 'success');
369
  </script>
370
  </body>
371
  </html>
 
3
  <head>
4
  <meta charset="utf-8" />
5
  <meta name="viewport" content="width=device-width,initial-scale=1" />
6
+ <title>🎙️ Modern TTS with Voice Cloning</title>
7
  <link rel="stylesheet" href="assets/style.css" />
8
  </head>
9
  <body>
10
+ <h1>🎙️ Modern Text-to-Speech with Voice Cloning</h1>
11
+ <p class="subtitle">AI Voice Generator - Real Voice Cloning Technology</p>
12
 
13
  <div class="row">
14
  <!-- Left Column: Controls -->
 
26
  </fieldset>
27
 
28
  <fieldset>
29
+ <legend>🎤 Voice Cloning</legend>
30
+ <p class="muted" style="font-size: 0.85rem; margin-bottom: 8px;">
31
+ Upload audio (5-30 seconds) to clone the voice
32
+ </p>
33
 
34
  <label>
35
+ <input type="radio" name="voiceMode" value="default" checked>
36
+ Default Voice
37
  </label>
 
 
38
  <label>
39
+ <input type="radio" name="voiceMode" value="clone">
40
+ Clone Voice from Audio
41
  </label>
 
42
 
43
+ <div id="cloneSection" class="hidden mt-1" style="padding: 12px; background: rgba(99,102,241,0.1); border-radius: 8px;">
44
+ <input id="voiceFile" type="file" accept="audio/*">
45
+ <div id="voiceStatus" class="mt-1"></div>
46
+
47
+ <div id="voicePreview" class="hidden mt-1">
48
+ <p class="muted" style="font-size: 0.85rem;">Preview:</p>
49
+ <audio id="voiceAudio" controls style="width: 100%; margin-top: 4px;"></audio>
50
+ </div>
51
+ </div>
52
  </fieldset>
53
 
54
  <fieldset>
55
+ <legend>Voice Settings</legend>
 
 
 
 
 
56
 
57
  <label>
58
+ Speed <span id="spdVal">1.00</span>x
59
  </label>
60
+ <input id="spd" type="range" min="0.5" max="2" step="0.05" value="1.0">
61
 
62
  <label>
63
+ Temperature <span id="tempVal">0.70</span>
64
  </label>
65
+ <input id="temp" type="range" min="0.1" max="1.5" step="0.05" value="0.7">
66
  </fieldset>
67
  </div>
68
 
 
70
  <div class="col">
71
  <fieldset>
72
  <legend>Text Input</legend>
73
+ <textarea id="txt" placeholder="Type or paste your text here...">Hello! This is a demonstration of real voice cloning technology.</textarea>
74
  <div class="mt-1">
75
  <span class="muted">Characters: <span id="charCount">0</span></span> &nbsp;|&nbsp;
76
  <span class="muted">Words: <span id="wordCount">0</span></span>
 
108
  <div style="display: flex; flex-wrap: wrap; gap: 4px; margin-bottom: 12px;">
109
  <span id="backend" class="chip">Initializing...</span>
110
  <span id="model" class="chip">No Model</span>
111
+ <span id="encoder" class="chip">Loading Encoder...</span>
112
+ </div>
113
+ <div style="display: flex; flex-wrap: wrap; gap: 4px;">
114
  <span id="status" class="chip">Idle</span>
115
  </div>
116
  </fieldset>
 
121
  </fieldset>
122
 
123
  <fieldset>
124
+ <legend>Voice Cloning Info</legend>
125
  <div class="muted" style="font-size: 0.85rem; line-height: 1.8;">
126
+ <p><strong>📋 Tips:</strong></p>
127
  <ul style="margin: 8px 0 8px 20px;">
128
+ <li>Use clear audio (minimal noise)</li>
129
+ <li>Duration: 5-30 seconds</li>
130
+ <li>Single speaker only</li>
131
+ <li>MP3, WAV, M4A supported</li>
 
132
  </ul>
133
+ <p class="mt-1"><strong>⚙️ Technology:</strong></p>
134
+ <p>Uses WavLM speaker encoder to extract 192-dim embeddings from your audio, then projects to SpeechT5's 512-dim space.</p>
135
  </div>
136
  </fieldset>
137
  </div>
 
141
  import * as transformers from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.2/dist/transformers.min.js";
142
 
143
  const $ = (q) => document.querySelector(q);
144
+ const $$ = (q) => document.querySelectorAll(q);
145
 
146
+ // Logging
147
  const log = (msg, type = 'info') => {
148
  const el = $("#log");
149
  const timestamp = new Date().toLocaleTimeString();
150
  const prefix = type === 'error' ? '❌' : type === 'success' ? 'βœ…' : 'ℹ️';
151
  const newLog = `${prefix} [${timestamp}] ${msg}`;
152
+ el.textContent = newLog + '\n' + el.textContent.split('\n').slice(0, 50).join('\n');
153
  console.log(`[${type}]`, msg);
154
  };
155
 
 
156
  const showStatus = (msg, type = 'info') => {
157
  const box = $("#statusBox");
158
  box.className = `status-message ${type}`;
159
  box.textContent = msg;
160
  };
161
 
162
/**
 * Hide the status banner by replacing all of its classes with `hidden`.
 *
 * @returns {string} The class name that was applied ('hidden').
 */
const hideStatus = () => {
  const hiddenClass = 'hidden';
  $("#statusBox").className = hiddenClass;
  return hiddenClass;
};
 
 
163
 
164
+ // Bind sliders
165
  const bindVal = (id, displayId) => {
166
+ const el = $("#" + id), display = $("#" + displayId);
167
+ const update = () => display.textContent = parseFloat(el.value).toFixed(2);
 
 
 
 
168
  el.addEventListener("input", update);
169
  update();
170
  };
171
+ ["spd", "temp"].forEach(id => bindVal(id, id + "Val"));
172
 
173
+ // Character counter
 
 
 
174
  const updateCounts = () => {
175
  const text = $("#txt").value;
176
  $("#charCount").textContent = text.length;
177
+ $("#wordCount").textContent = text.trim().split(/\s+/).filter(Boolean).length;
 
178
  };
179
  $("#txt").addEventListener("input", updateCounts);
180
  updateCounts();
181
 
182
+ // Voice mode toggle
183
/**
 * Show the clone-upload section only while the "clone" voice mode radio is selected.
 */
const updateVoiceMode = () => {
  const selectedMode = document.querySelector('input[name="voiceMode"]:checked').value;
  const hideCloneUi = selectedMode !== 'clone';
  $("#cloneSection").classList.toggle("hidden", hideCloneUi);
};
187
+ $$('input[name="voiceMode"]').forEach(r => r.addEventListener("change", updateVoiceMode));
188
+
189
+ // Initialize
190
  log("Initializing Transformers.js...");
191
  $("#backend").textContent = "Configuring...";
192
 
 
194
  await transformers.env.set("wasm.wasmPaths", "https://cdn.jsdelivr.net/npm/@xenova/wasm@1.0.0/");
195
  transformers.env.backends.onnx.wasm.numThreads = 1;
196
 
197
+ $("#backend").className = "chip success";
198
+ $("#backend").textContent = navigator.gpu ? "WebGPU" : "WASM";
199
+ log("Backend ready", 'success');
 
 
 
 
 
 
200
  } catch (e) {
201
  log("Config warning: " + e.message, 'info');
202
  }
203
 
204
+ // Models
205
  const MODELS = {
206
  speecht5: "Xenova/speecht5_tts",
207
  speecht5_hifi: "Xenova/speecht5_tts_vctk_hifi",
 
209
  };
210
 
211
  let tts = null;
212
+ let speakerEncoder = null;
213
+ let defaultEmbedding = null;
214
+ let customEmbedding = null;
215
  let currentModelId = null;
216
 
217
+ // Load speaker encoder for voice cloning
218
/**
 * Load the speaker-encoder pipeline and publish it in the module-level
 * `speakerEncoder` variable, reflecting progress on the `#encoder` chip.
 *
 * Failures are logged and shown on the chip rather than thrown.
 *
 * NOTE(review): the `quantized` option and the "feature-extraction" task for this
 * audio checkpoint should be confirmed against the pinned Transformers.js
 * version; newer releases may expect `dtype` and a processor/model pair instead.
 *
 * @returns {Promise<boolean>} true when the encoder loaded, false otherwise.
 */
async function loadSpeakerEncoder() {
  const paintChip = (state, label) => {
    $("#encoder").className = `chip ${state}`;
    $("#encoder").textContent = label;
  };

  paintChip("warning", "Loading...");
  log("Loading speaker encoder (WavLM)...");

  let loaded = false;
  try {
    speakerEncoder = await transformers.pipeline(
      "feature-extraction",
      "Xenova/wavlm-base-plus-sv",
      { quantized: false },
    );
    paintChip("success", "Encoder Ready");
    log("Speaker encoder loaded", 'success');
    loaded = true;
  } catch (err) {
    log(`Encoder error: ${err.message}`, 'error');
    paintChip("danger", "Failed");
  }
  return loaded;
}
242
+
243
+ // Load TTS model
244
  async function loadModel(modelKey) {
245
  const modelId = MODELS[modelKey];
 
246
  $("#model").className = "chip warning";
247
  $("#model").textContent = "Loading...";
248
  $("#currentModel").textContent = "Loading...";
249
  $("#go").disabled = true;
250
+ log(`Loading TTS model: ${modelId}...`);
251
 
252
  try {
 
253
  tts = await transformers.pipeline("text-to-speech", modelId, {
254
+ progress_callback: (p) => {
255
+ if (p?.status === 'progress' && p.file) {
256
+ log(`Downloading: ${p.file}`);
257
  }
258
  }
259
  });
260
 
261
+ // Load default embeddings for SpeechT5
262
  if (modelId.includes("speecht5")) {
263
+ log("Loading default speaker embeddings...");
264
+ const response = await fetch(
265
  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin"
266
  );
267
+ const buffer = await response.arrayBuffer();
268
+ defaultEmbedding = new Float32Array(buffer);
269
+ log("Default embeddings loaded (512-dim)", 'success');
270
  } else {
271
+ defaultEmbedding = null;
272
  }
273
 
274
  currentModelId = modelId;
 
276
  $("#model").textContent = "Ready";
277
  $("#currentModel").textContent = modelId.split('/')[1];
278
  $("#go").disabled = false;
279
+ log(`TTS model ready`, 'success');
 
280
  return true;
281
  } catch (err) {
282
+ log(`TTS load error: ${err.message}`, 'error');
283
  $("#model").className = "chip danger";
284
  $("#model").textContent = "Failed";
285
  $("#go").disabled = true;
286
+ showStatus(`Error: ${err.message}`, 'error');
287
  return false;
288
  }
289
  }
290
 
291
+ // Process uploaded audio for voice cloning
292
/**
 * Largest absolute sample value in `samples` (0 for an empty array).
 *
 * A plain loop is used on purpose: spreading a multi-second recording into
 * Math.max() passes hundreds of thousands of arguments and can throw RangeError.
 */
function peakAmplitude(samples) {
  let peak = 0;
  for (let i = 0; i < samples.length; i++) {
    const magnitude = Math.abs(samples[i]);
    if (magnitude > peak) peak = magnitude;
  }
  return peak;
}

/**
 * Stretch `source` to `targetDim` entries by repeating neighbours: output slot i
 * reads source[floor(i / (targetDim / sourceDim))]. Missing or falsy entries become 0.
 */
function stretchEmbedding(source, sourceDim, targetDim) {
  const stretched = new Float32Array(targetDim);
  const ratio = targetDim / sourceDim;
  for (let i = 0; i < targetDim; i++) {
    stretched[i] = source[Math.floor(i / ratio)] || 0;
  }
  return stretched;
}

/** Shift and scale `vector` in place to zero mean and approximately unit standard deviation. */
function standardizeInPlace(vector) {
  const mean = vector.reduce((sum, v) => sum + v, 0) / vector.length;
  const variance = vector.reduce((sum, v) => sum + (v - mean) ** 2, 0) / vector.length;
  const std = Math.sqrt(variance);
  for (let i = 0; i < vector.length; i++) {
    // The epsilon avoids division by zero for a constant vector.
    vector[i] = (vector[i] - mean) / (std + 1e-8);
  }
  return vector;
}

/**
 * Turn an uploaded audio file into a speaker embedding for the TTS model.
 *
 * Steps: decode the file at 16 kHz, peak-normalise the first channel, run the
 * module-level `speakerEncoder`, keep the first 192 values, stretch them to
 * 512 entries, standardise, and (when a full-length `defaultEmbedding` exists)
 * blend 70% custom / 30% default. On success the result is stored in the
 * module-level `customEmbedding` and the upload is shown in the preview player.
 * On any failure `customEmbedding` is reset to null and the error is reported
 * through the status chip, the log and the status banner; nothing is thrown.
 *
 * NOTE(review): the 192-value slice and nearest-neighbour stretch are a heuristic,
 * not a learned projection; the encoder's real output size should be confirmed.
 *
 * @param {Blob} audioFile - Uploaded audio file (a File; `name` is used for logging only).
 * @returns {Promise<void>}
 */
async function processVoiceCloning(audioFile) {
  const wavlmDim = 192;
  const speecht5Dim = 512;
  const blendRatio = 0.7; // 70% custom, 30% default

  $("#voiceStatus").innerHTML = '<span class="chip warning">Processing...</span>';
  log(`Processing voice sample: ${audioFile.name}`);

  let audioContext = null;
  try {
    // Decoding through a 16 kHz context resamples the audio to the encoder's rate.
    const arrayBuffer = await audioFile.arrayBuffer();
    audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 });
    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);

    // First channel only; scale so the loudest sample has magnitude 1.
    const channel = audioBuffer.getChannelData(0);
    const peak = peakAmplitude(channel);
    const audioData = peak > 0 ? channel.map((x) => x / peak) : channel;

    log(`Audio: ${audioData.length} samples, ${audioBuffer.sampleRate}Hz`);

    log("Extracting speaker features...");
    const embeddings = await speakerEncoder(audioData, {
      sampling_rate: 16000,
      pooling: 'mean',
      normalize: true
    });

    // Validate the result before touching its fields so an unexpected format
    // surfaces as a clear error instead of a TypeError from the shape log.
    let values;
    if (embeddings?.data) {
      values = Array.from(embeddings.data);
    } else if (Array.isArray(embeddings)) {
      values = embeddings;
    } else {
      throw new Error("Unexpected embedding format");
    }
    log(`Raw embedding shape: ${embeddings.dims || values.length}`);

    const projected = standardizeInPlace(
      stretchEmbedding(values.slice(0, wavlmDim), wavlmDim, speecht5Dim)
    );

    // Blend with the default voice for stability; skipped when the default is
    // absent or too short, which would otherwise inject NaN values.
    if (defaultEmbedding && defaultEmbedding.length >= speecht5Dim) {
      for (let i = 0; i < speecht5Dim; i++) {
        projected[i] = projected[i] * blendRatio + defaultEmbedding[i] * (1 - blendRatio);
      }
    }
    customEmbedding = projected;

    $("#voiceStatus").innerHTML = '<span class="chip success">✅ Voice captured!</span>';
    log(`Voice cloning ready! Embedding: 512-dim`, 'success');
    showStatus("✅ Voice captured! Now generate speech with cloned voice.", 'success');

    // Show the preview, releasing the object URL of any previous upload first.
    const previewAudio = $("#voiceAudio");
    if (typeof previewAudio.src === 'string' && previewAudio.src.startsWith('blob:')) {
      URL.revokeObjectURL(previewAudio.src);
    }
    previewAudio.src = URL.createObjectURL(audioFile);
    $("#voicePreview").classList.remove("hidden");
  } catch (err) {
    $("#voiceStatus").innerHTML = '<span class="chip danger">❌ Processing failed</span>';
    log(`Voice cloning error: ${err.message}`, 'error');
    showStatus(`Voice processing error: ${err.message}`, 'error');
    customEmbedding = null;
  } finally {
    // Browsers cap the number of live AudioContexts, so release this one
    // whether or not processing succeeded.
    if (audioContext && typeof audioContext.close === 'function') {
      try {
        await audioContext.close();
      } catch (closeErr) {
        console.warn('AudioContext close failed', closeErr);
      }
    }
  }
}
382
+
383
+ // Voice file upload handler
384
// Voice file upload handler: hand the first selected file to processVoiceCloning
// once the speaker encoder is available.
$("#voiceFile").addEventListener("change", async (e) => {
  const file = e.target.files[0];
  if (!file) return;

  if (!speakerEncoder) {
    showStatus("Speaker encoder not ready. Please wait...", 'error');
    // Clear the selection so choosing the same file again fires `change`;
    // otherwise the rejected file can never be retried without picking another one.
    e.target.value = '';
    return;
  }

  await processVoiceCloning(file);
});
395
 
396
  // Generate speech
397
  $("#go").addEventListener("click", async () => {
398
  const text = $("#txt").value.trim();
399
  if (!text) {
400
+ showStatus("Please enter text!", 'error');
401
  return;
402
  }
403
 
404
  if (!tts) {
405
+ showStatus("Model not loaded!", 'error');
406
+ return;
407
+ }
408
+
409
+ const useClone = document.querySelector('input[name="voiceMode"]:checked').value === 'clone';
410
+
411
+ if (useClone && !customEmbedding) {
412
+ showStatus("Please upload voice sample first!", 'error');
413
  return;
414
  }
415
 
 
417
  btn.disabled = true;
418
  $("#status").className = "chip warning";
419
  $("#status").textContent = "Generating...";
420
+ showStatus(`πŸŽ™οΈ Generating ${useClone ? 'with cloned voice' : 'with default voice'}...`, 'info');
421
+ log(`Generating: "${text.substring(0, 30)}..." (${useClone ? 'CLONED' : 'DEFAULT'})`);
422
 
423
  try {
424
  let output;
425
+ const embedding = useClone ? customEmbedding : defaultEmbedding;
426
 
427
+ if (embedding) {
428
+ output = await tts(text, { speaker_embeddings: embedding });
 
 
 
 
429
  } else {
 
430
  output = await tts(text);
431
  }
432
 
433
+ log(`Generated! ${output.audio.length} samples @ ${output.sampling_rate}Hz`, 'success');
434
 
435
+ // Encode WAV
436
  const wav = transformers.utils.encodeWAV(output.audio, output.sampling_rate);
437
  const blob = new Blob([wav], { type: "audio/wav" });
438
  const url = URL.createObjectURL(blob);
439
 
440
+ // Player
441
  const player = $("#player");
442
  player.src = url;
443
  player.playbackRate = parseFloat($("#spd").value);
444
  player.classList.remove("hidden");
445
 
446
+ // Download
447
+ $("#download").href = url;
448
+ $("#download").download = `tts-${useClone ? 'cloned' : 'default'}-${Date.now()}.wav`;
 
449
  $("#downloadBox").classList.remove("hidden");
450
 
451
  $("#status").className = "chip success";
452
  $("#status").textContent = "Success";
453
+ showStatus(`βœ… Audio generated with ${useClone ? 'CLONED VOICE' : 'default voice'}!`, 'success');
454
 
455
  } catch (err) {
456
+ log(`Generation error: ${err.message}`, 'error');
457
  console.error(err);
458
  $("#status").className = "chip danger";
459
  $("#status").textContent = "Error";
 
463
  }
464
  });
465
 
466
+ // Clear
467
  $("#free").addEventListener("click", () => {
468
  const player = $("#player");
469
  if (player.src) {
 
471
  player.removeAttribute("src");
472
  player.classList.add("hidden");
473
  }
 
474
  $("#downloadBox").classList.add("hidden");
475
  hideStatus();
476
+ log("Cleared", 'success');
477
  });
478
 
479
+ // Speed control
480
  $("#spd").addEventListener("input", () => {
481
  const player = $("#player");
482
+ if (player.src) player.playbackRate = parseFloat($("#spd").value);
483
+ });
484
+
485
+ // Load models
486
+ log("Starting initialization...");
487
+ await Promise.all([
488
+ loadModel("speecht5"),
489
+ loadSpeakerEncoder()
490
+ ]);
491
+
492
+ // Model selector
493
+ $("#modelSelect").addEventListener("change", async (e) => {
494
+ if (MODELS[e.target.value] !== currentModelId) {
495
+ await loadModel(e.target.value);
496
  }
497
  });
498
 
499
+ log("πŸŽ‰ Application ready! Upload voice or use default.", 'success');
500
  </script>
501
  </body>
502
  </html>