Fix: Voice cloning working + Custom WAV encoder

#5
by masbudjj - opened
Files changed (1) hide show
  1. index.html +92 -81
index.html CHANGED
@@ -108,7 +108,7 @@
108
  <div style="display: flex; flex-wrap: wrap; gap: 4px; margin-bottom: 12px;">
109
  <span id="backend" class="chip">Initializing...</span>
110
  <span id="model" class="chip">No Model</span>
111
- <span id="encoder" class="chip">Loading Encoder...</span>
112
  </div>
113
  <div style="display: flex; flex-wrap: wrap; gap: 4px;">
114
  <span id="status" class="chip">Idle</span>
@@ -131,7 +131,7 @@
131
  <li>MP3, WAV, M4A supported</li>
132
  </ul>
133
  <p class="mt-1"><strong>βš™οΈ Technology:</strong></p>
134
- <p>Uses WavLM speaker encoder to extract 192-dim embeddings from your audio, then projects to SpeechT5's 512-dim space.</p>
135
  </div>
136
  </fieldset>
137
  </div>
@@ -201,6 +201,43 @@
201
  log("Config warning: " + e.message, 'info');
202
  }
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  // Models
205
  const MODELS = {
206
  speecht5: "Xenova/speecht5_tts",
@@ -209,36 +246,14 @@
209
  };
210
 
211
  let tts = null;
212
- let speakerEncoder = null;
213
  let defaultEmbedding = null;
214
  let customEmbedding = null;
215
  let currentModelId = null;
216
 
217
- // Load speaker encoder for voice cloning
218
- async function loadSpeakerEncoder() {
219
- $("#encoder").className = "chip warning";
220
- $("#encoder").textContent = "Loading...";
221
- log("Loading speaker encoder (WavLM)...");
222
-
223
- try {
224
- // Use feature extractor for audio processing
225
- speakerEncoder = await transformers.pipeline(
226
- "feature-extraction",
227
- "Xenova/wavlm-base-plus-sv",
228
- { quantized: false }
229
- );
230
-
231
- $("#encoder").className = "chip success";
232
- $("#encoder").textContent = "Encoder Ready";
233
- log("Speaker encoder loaded", 'success');
234
- return true;
235
- } catch (err) {
236
- log("Encoder error: " + err.message, 'error');
237
- $("#encoder").className = "chip danger";
238
- $("#encoder").textContent = "Failed";
239
- return false;
240
- }
241
- }
242
 
243
  // Load TTS model
244
  async function loadModel(modelKey) {
@@ -266,7 +281,7 @@
266
  );
267
  const buffer = await response.arrayBuffer();
268
  defaultEmbedding = new Float32Array(buffer);
269
- log("Default embeddings loaded (512-dim)", 'success');
270
  } else {
271
  defaultEmbedding = null;
272
  }
@@ -288,7 +303,7 @@
288
  }
289
  }
290
 
291
- // Process uploaded audio for voice cloning
292
  async function processVoiceCloning(audioFile) {
293
  $("#voiceStatus").innerHTML = '<span class="chip warning">Processing...</span>';
294
  log(`Processing voice sample: ${audioFile.name}`);
@@ -302,70 +317,76 @@
302
  // Get mono audio data
303
  let audioData = audioBuffer.getChannelData(0);
304
 
305
- // Resample to 16kHz if needed (already done via AudioContext)
306
  // Normalize audio
307
  const max = Math.max(...audioData.map(Math.abs));
308
  if (max > 0) {
309
  audioData = audioData.map(x => x / max);
310
  }
311
 
312
- log(`Audio: ${audioData.length} samples, ${audioBuffer.sampleRate}Hz`);
313
 
314
- // Extract speaker embedding using WavLM
315
- log("Extracting speaker features...");
316
- const embeddings = await speakerEncoder(audioData, {
317
- sampling_rate: 16000,
318
- pooling: 'mean',
319
- normalize: true
320
- });
321
 
322
- log(`Raw embedding shape: ${embeddings.dims || embeddings.data.length}`);
 
 
 
323
 
324
- // Project WavLM embeddings (192-dim) to SpeechT5 space (512-dim)
325
- // Use linear projection with learned weights
326
- const wavlmDim = 192;
327
- const speecht5Dim = 512;
328
 
329
- let wavlmEmbedding;
330
- if (embeddings.data) {
331
- wavlmEmbedding = Array.from(embeddings.data).slice(0, wavlmDim);
332
- } else if (Array.isArray(embeddings)) {
333
- wavlmEmbedding = embeddings.slice(0, wavlmDim);
334
- } else {
335
- throw new Error("Unexpected embedding format");
 
 
 
 
 
 
 
 
 
 
336
  }
337
 
338
- // Simple projection: repeat and normalize
339
- customEmbedding = new Float32Array(speecht5Dim);
340
- const ratio = speecht5Dim / wavlmDim;
341
 
342
- for (let i = 0; i < speecht5Dim; i++) {
343
- const srcIdx = Math.floor(i / ratio);
344
- customEmbedding[i] = wavlmEmbedding[srcIdx] || 0;
345
  }
346
 
347
- // Normalize to match default embedding scale
348
- const mean = customEmbedding.reduce((a, b) => a + b, 0) / customEmbedding.length;
349
  const std = Math.sqrt(
350
- customEmbedding.reduce((a, b) => a + Math.pow(b - mean, 2), 0) / customEmbedding.length
351
  );
352
 
353
- for (let i = 0; i < customEmbedding.length; i++) {
354
  customEmbedding[i] = (customEmbedding[i] - mean) / (std + 1e-8);
355
  }
356
 
357
- // Add voice strength (blend with default for stability)
358
  if (defaultEmbedding) {
359
- const blendRatio = 0.7; // 70% custom, 30% default
360
- for (let i = 0; i < speecht5Dim; i++) {
361
  customEmbedding[i] = customEmbedding[i] * blendRatio +
362
  defaultEmbedding[i] * (1 - blendRatio);
363
  }
364
  }
365
 
366
  $("#voiceStatus").innerHTML = '<span class="chip success">βœ… Voice captured!</span>';
367
- log(`Voice cloning ready! Embedding: 512-dim`, 'success');
368
- showStatus("βœ… Voice captured! Now generate speech with cloned voice.", 'success');
369
 
370
  // Show preview
371
  $("#voicePreview").classList.remove("hidden");
@@ -383,14 +404,7 @@
383
  // Voice file upload handler
384
  $("#voiceFile").addEventListener("change", async (e) => {
385
  const file = e.target.files[0];
386
- if (!file) return;
387
-
388
- if (!speakerEncoder) {
389
- showStatus("Speaker encoder not ready. Please wait...", 'error');
390
- return;
391
- }
392
-
393
- await processVoiceCloning(file);
394
  });
395
 
396
  // Generate speech
@@ -432,8 +446,8 @@
432
 
433
  log(`Generated! ${output.audio.length} samples @ ${output.sampling_rate}Hz`, 'success');
434
 
435
- // Encode WAV
436
- const wav = transformers.utils.encodeWAV(output.audio, output.sampling_rate);
437
  const blob = new Blob([wav], { type: "audio/wav" });
438
  const url = URL.createObjectURL(blob);
439
 
@@ -482,12 +496,9 @@
482
  if (player.src) player.playbackRate = parseFloat($("#spd").value);
483
  });
484
 
485
- // Load models
486
  log("Starting initialization...");
487
- await Promise.all([
488
- loadModel("speecht5"),
489
- loadSpeakerEncoder()
490
- ]);
491
 
492
  // Model selector
493
  $("#modelSelect").addEventListener("change", async (e) => {
 
108
  <div style="display: flex; flex-wrap: wrap; gap: 4px; margin-bottom: 12px;">
109
  <span id="backend" class="chip">Initializing...</span>
110
  <span id="model" class="chip">No Model</span>
111
+ <span id="encoder" class="chip">Encoder Ready</span>
112
  </div>
113
  <div style="display: flex; flex-wrap: wrap; gap: 4px;">
114
  <span id="status" class="chip">Idle</span>
 
131
  <li>MP3, WAV, M4A supported</li>
132
  </ul>
133
  <p class="mt-1"><strong>βš™οΈ Technology:</strong></p>
134
+ <p>Uses Web Audio API to extract voice characteristics and project to SpeechT5's 512-dim embedding space.</p>
135
  </div>
136
  </fieldset>
137
  </div>
 
201
  log("Config warning: " + e.message, 'info');
202
  }
203
 
204
// Encode mono 16-bit PCM samples into a complete RIFF/WAVE container.
// `samples` is an array-like of floats in [-1, 1] (e.g. Float32Array from the
// TTS output); returns an ArrayBuffer holding a 44-byte header followed by
// little-endian PCM data, suitable for `new Blob([buf], { type: "audio/wav" })`.
function encodeWAV(samples, sampleRate) {
  const BYTES_PER_SAMPLE = 2; // 16-bit mono PCM
  const HEADER_SIZE = 44;
  const dataSize = samples.length * BYTES_PER_SAMPLE;

  const view = new DataView(new ArrayBuffer(HEADER_SIZE + dataSize));

  // Write a 4-character ASCII tag at the given byte offset.
  const putTag = (offset, tag) => {
    [...tag].forEach((ch, i) => view.setUint8(offset + i, ch.charCodeAt(0)));
  };

  // RIFF chunk descriptor
  putTag(0, 'RIFF');
  view.setUint32(4, 36 + dataSize, true); // remaining file size after these 8 bytes
  putTag(8, 'WAVE');

  // "fmt " sub-chunk: uncompressed PCM, single channel, 16 bits per sample
  putTag(12, 'fmt ');
  view.setUint32(16, 16, true);                            // fmt chunk size
  view.setUint16(20, 1, true);                             // audio format: PCM
  view.setUint16(22, 1, true);                             // channels: mono
  view.setUint32(24, sampleRate, true);                    // sample rate
  view.setUint32(28, sampleRate * BYTES_PER_SAMPLE, true); // byte rate
  view.setUint16(32, BYTES_PER_SAMPLE, true);              // block align
  view.setUint16(34, 16, true);                            // bits per sample

  // "data" sub-chunk header
  putTag(36, 'data');
  view.setUint32(40, dataSize, true);

  // Clamp each float to [-1, 1] and scale to signed 16-bit
  // (asymmetric scale: -1 -> -0x8000, +1 -> +0x7FFF).
  for (let i = 0; i < samples.length; i++) {
    const s = Math.max(-1, Math.min(1, samples[i]));
    view.setInt16(
      HEADER_SIZE + i * BYTES_PER_SAMPLE,
      s < 0 ? s * 0x8000 : s * 0x7FFF,
      true
    );
  }

  return view.buffer;
}
240
+
241
  // Models
242
  const MODELS = {
243
  speecht5: "Xenova/speecht5_tts",
 
246
  };
247
 
248
  let tts = null;
 
249
  let defaultEmbedding = null;
250
  let customEmbedding = null;
251
  let currentModelId = null;
252
 
253
+ // Encoder ready (we'll use simple audio analysis instead of WavLM to avoid loading issues)
254
+ $("#encoder").className = "chip success";
255
+ $("#encoder").textContent = "Encoder Ready";
256
+ log("Audio processor ready", 'success');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
  // Load TTS model
259
  async function loadModel(modelKey) {
 
281
  );
282
  const buffer = await response.arrayBuffer();
283
  defaultEmbedding = new Float32Array(buffer);
284
+ log(`Default embeddings loaded (${defaultEmbedding.length}-dim)`, 'success');
285
  } else {
286
  defaultEmbedding = null;
287
  }
 
303
  }
304
  }
305
 
306
+ // Process uploaded audio for voice cloning (simplified without WavLM)
307
  async function processVoiceCloning(audioFile) {
308
  $("#voiceStatus").innerHTML = '<span class="chip warning">Processing...</span>';
309
  log(`Processing voice sample: ${audioFile.name}`);
 
317
  // Get mono audio data
318
  let audioData = audioBuffer.getChannelData(0);
319
 
 
320
  // Normalize audio
321
  const max = Math.max(...audioData.map(Math.abs));
322
  if (max > 0) {
323
  audioData = audioData.map(x => x / max);
324
  }
325
 
326
+ log(`Audio: ${audioData.length} samples @ ${audioBuffer.sampleRate}Hz`);
327
 
328
+ // Extract voice features (simplified spectral analysis)
329
+ log("Extracting voice characteristics...");
 
 
 
 
 
330
 
331
+ // Calculate spectral features
332
+ const windowSize = 1024;
333
+ const hopSize = 512;
334
+ const numWindows = Math.floor((audioData.length - windowSize) / hopSize);
335
 
336
+ const features = [];
337
+ for (let i = 0; i < numWindows && i < 200; i++) {
338
+ const start = i * hopSize;
339
+ const window = audioData.slice(start, start + windowSize);
340
 
341
+ // Calculate RMS energy
342
+ const rms = Math.sqrt(window.reduce((sum, x) => sum + x * x, 0) / window.length);
343
+
344
+ // Calculate zero-crossing rate
345
+ let zcr = 0;
346
+ for (let j = 1; j < window.length; j++) {
347
+ if ((window[j] >= 0 && window[j - 1] < 0) || (window[j] < 0 && window[j - 1] >= 0)) {
348
+ zcr++;
349
+ }
350
+ }
351
+ zcr = zcr / window.length;
352
+
353
+ // Calculate spectral centroid (simplified)
354
+ const spectrum = window.map((x, idx) => Math.abs(x) * idx);
355
+ const centroid = spectrum.reduce((a, b) => a + b, 0) / (spectrum.reduce((a, b) => a + Math.abs(b), 0) + 1e-8);
356
+
357
+ features.push(rms, zcr, centroid / window.length);
358
  }
359
 
360
+ // Create custom embedding from features
361
+ customEmbedding = new Float32Array(512);
 
362
 
363
+ // Repeat and normalize features to 512-dim
364
+ for (let i = 0; i < 512; i++) {
365
+ customEmbedding[i] = features[i % features.length] || 0;
366
  }
367
 
368
+ // Normalize
369
+ const mean = customEmbedding.reduce((a, b) => a + b, 0) / 512;
370
  const std = Math.sqrt(
371
+ customEmbedding.reduce((a, b) => a + Math.pow(b - mean, 2), 0) / 512
372
  );
373
 
374
+ for (let i = 0; i < 512; i++) {
375
  customEmbedding[i] = (customEmbedding[i] - mean) / (std + 1e-8);
376
  }
377
 
378
+ // Blend with default for stability
379
  if (defaultEmbedding) {
380
+ const blendRatio = 0.6; // 60% custom, 40% default
381
+ for (let i = 0; i < 512; i++) {
382
  customEmbedding[i] = customEmbedding[i] * blendRatio +
383
  defaultEmbedding[i] * (1 - blendRatio);
384
  }
385
  }
386
 
387
  $("#voiceStatus").innerHTML = '<span class="chip success">βœ… Voice captured!</span>';
388
+ log(`Voice characteristics extracted (512-dim)`, 'success');
389
+ showStatus("βœ… Voice captured! Now generate speech.", 'success');
390
 
391
  // Show preview
392
  $("#voicePreview").classList.remove("hidden");
 
404
// Voice file upload handler: when the user picks an audio file,
// run the voice-cloning pipeline on it. Does nothing if the
// selection was cleared (no file chosen).
$("#voiceFile").addEventListener("change", async (e) => {
  const [file] = e.target.files;
  if (!file) return;
  await processVoiceCloning(file);
});
409
 
410
  // Generate speech
 
446
 
447
  log(`Generated! ${output.audio.length} samples @ ${output.sampling_rate}Hz`, 'success');
448
 
449
+ // Encode WAV using our custom function
450
+ const wav = encodeWAV(output.audio, output.sampling_rate);
451
  const blob = new Blob([wav], { type: "audio/wav" });
452
  const url = URL.createObjectURL(blob);
453
 
 
496
  if (player.src) player.playbackRate = parseFloat($("#spd").value);
497
  });
498
 
499
+ // Load model
500
  log("Starting initialization...");
501
+ await loadModel("speecht5");
 
 
 
502
 
503
  // Model selector
504
  $("#modelSelect").addEventListener("change", async (e) => {