Fix: Add speaker embeddings for audio generation

#3
by masbudjj - opened
Files changed (1)
  1. index.html +65 -145
index.html CHANGED
@@ -61,25 +61,6 @@
61
  Repetition Penalty <span id="rpVal">1.00</span>
62
  </label>
63
  <input id="rp" type="range" min="0.8" max="2" step="0.05" value="1.0">
64
-
65
- <label>
66
- Length Penalty <span id="lpVal">1.00</span>
67
- </label>
68
- <input id="lp" type="range" min="0.1" max="2" step="0.05" value="1.0">
69
-
70
- <label>
71
- Num Beams <span id="beamsVal">1</span>
72
- </label>
73
- <input id="beams" type="range" min="1" max="8" step="1" value="1">
74
- </fieldset>
75
-
76
- <fieldset>
77
- <legend>Speaker Voice (Optional)</legend>
78
- <p class="muted" style="font-size: 0.85rem; margin-bottom: 8px;">
79
- Upload audio to clone voice characteristics
80
- </p>
81
- <input id="spkPrompt" type="file" accept="audio/*">
82
- <div id="spkStatus" class="mt-1"></div>
83
  </fieldset>
84
  </div>
85
 
@@ -87,7 +68,7 @@
87
  <div class="col">
88
  <fieldset>
89
  <legend>Text Input</legend>
90
- <textarea id="txt" placeholder="Type or paste your text here... Example: Welcome to the future of browser-based AI voice generation!">Hello! This is a modern text-to-speech demo powered by Transformers.js. Try changing the voice settings for different results!</textarea>
91
  <div class="mt-1">
92
  <span class="muted">Characters: <span id="charCount">0</span></span> &nbsp;|&nbsp;
93
  <span class="muted">Words: <span id="wordCount">0</span></span>
@@ -101,8 +82,8 @@
101
  <button id="go" style="flex: 1;">
102
  🎙️ Generate Speech
103
  </button>
104
- <button id="stop" class="secondary" style="flex: 0.5;" disabled>
105
- Stop
106
  </button>
107
  </div>
108
 
@@ -116,18 +97,6 @@
116
  </a>
117
  </div>
118
  </fieldset>
119
-
120
- <fieldset>
121
- <legend>Format Options</legend>
122
- <label>
123
- <input type="radio" name="fmt" value="WAV" checked>
124
- WAV (Lossless)
125
- </label>
126
- <label>
127
- <input type="radio" name="fmt" value="MP3">
128
- MP3 (Compressed) <span class="muted">- Coming Soon</span>
129
- </label>
130
- </fieldset>
131
  </div>
132
 
133
  <!-- Right Column: Status & Logs -->
@@ -139,10 +108,6 @@
139
  <span id="model" class="chip">No Model</span>
140
  <span id="status" class="chip">Idle</span>
141
  </div>
142
-
143
- <button id="free" class="secondary" style="width: 100%; margin-top: 8px;">
144
- 🗑️ Free Memory
145
- </button>
146
  </fieldset>
147
 
148
  <fieldset>
@@ -158,7 +123,6 @@
158
  <li>100% Browser-based (No Server)</li>
159
  <li>3 AI Models Available</li>
160
  <li>WebGPU/WASM Acceleration</li>
161
- <li>Speaker Voice Cloning</li>
162
  <li>Advanced Voice Control</li>
163
  <li>Instant Download</li>
164
  </ul>
@@ -169,17 +133,17 @@
169
  </div>
170
 
171
  <script type="module">
172
- import * as transformers from "https://cdn.jsdelivr.net/npm/@huggingface/transformers/dist/transformers.min.js";
173
 
174
  const $ = (q) => document.querySelector(q);
175
- const $$ = (q) => document.querySelectorAll(q);
176
 
177
  // Logging utility
178
  const log = (msg, type = 'info') => {
179
  const el = $("#log");
180
  const timestamp = new Date().toLocaleTimeString();
181
  const prefix = type === 'error' ? '❌' : type === 'success' ? '✅' : 'ℹ️';
182
- el.textContent = `${prefix} [${timestamp}] ${msg}\n${el.textContent}`;
 
183
  console.log(`[${type}]`, msg);
184
  };
185
 
@@ -188,7 +152,6 @@
188
  const box = $("#statusBox");
189
  box.className = `status-message ${type}`;
190
  box.textContent = msg;
191
- box.classList.remove('hidden');
192
  };
193
 
194
  const hideStatus = () => {
@@ -200,7 +163,7 @@
200
  const el = $("#" + id);
201
  const display = $("#" + displayId);
202
  const update = () => {
203
- const isInt = ['topk', 'beams'].includes(id);
204
  display.textContent = isInt ? el.value : parseFloat(el.value).toFixed(2);
205
  };
206
  el.addEventListener("input", update);
@@ -208,9 +171,7 @@
208
  };
209
 
210
  // Bind all sliders
211
- ["spd", "temp", "topp", "topk", "rp", "lp", "beams"].forEach(id =>
212
- bindVal(id, id + "Val")
213
- );
214
 
215
  // Character/word counter
216
  const updateCounts = () => {
@@ -226,17 +187,21 @@
226
  log("Initializing Transformers.js...");
227
  $("#backend").textContent = "Configuring...";
228
 
229
- await transformers.env.set("wasm.wasmPaths", "https://cdn.jsdelivr.net/npm/@xenova/wasm/");
230
- transformers.env.backends.onnx.wasm.numThreads = 1;
231
-
232
- if (navigator.gpu) {
233
- $("#backend").className = "chip success";
234
- $("#backend").textContent = "WebGPU Ready";
235
- log("WebGPU acceleration available", 'success');
236
- } else {
237
- $("#backend").className = "chip warning";
238
- $("#backend").textContent = "WASM Fallback";
239
- log("Using WASM (no GPU)", 'info');
 
 
 
 
240
  }
241
 
242
  // Available models
@@ -247,8 +212,8 @@
247
  };
248
 
249
  let tts = null;
 
250
  let currentModelId = null;
251
- let speakerEmbedding = null;
252
 
253
  // Load model function
254
  async function loadModel(modelKey) {
@@ -257,30 +222,44 @@
257
  $("#model").className = "chip warning";
258
  $("#model").textContent = "Loading...";
259
  $("#currentModel").textContent = "Loading...";
 
260
  log(`Loading model: ${modelId}...`);
261
 
262
  try {
 
263
  tts = await transformers.pipeline("text-to-speech", modelId, {
264
  progress_callback: (progress) => {
265
- if (progress?.status === 'progress' && progress.progress) {
266
- const pct = Math.round(progress.progress);
267
- $("#model").textContent = `Loading ${pct}%`;
268
  }
269
  }
270
  });
271
 
 
 
 
 
 
 
 
 
 
 
 
272
  currentModelId = modelId;
273
  $("#model").className = "chip success";
274
- $("#model").textContent = "Model Ready";
275
  $("#currentModel").textContent = modelId.split('/')[1];
276
- log(`Model loaded successfully: ${modelId}`, 'success');
 
277
 
278
  return true;
279
  } catch (err) {
280
  log(`Failed to load model: ${err.message}`, 'error');
281
  $("#model").className = "chip danger";
282
- $("#model").textContent = "Load Failed";
283
- showStatus(`Model load error: ${err.message}`, 'error');
 
284
  return false;
285
  }
286
  }
@@ -292,44 +271,7 @@
292
  $("#modelSelect").addEventListener("change", async (e) => {
293
  const selectedModel = e.target.value;
294
  if (MODELS[selectedModel] !== currentModelId) {
295
- $("#go").disabled = true;
296
  await loadModel(selectedModel);
297
- $("#go").disabled = false;
298
- }
299
- });
300
-
301
- // Speaker audio upload
302
- $("#spkPrompt").addEventListener("change", async (e) => {
303
- const file = e.target.files[0];
304
- if (!file) return;
305
-
306
- const statusDiv = $("#spkStatus");
307
- statusDiv.innerHTML = '<span class="chip warning">Processing audio...</span>';
308
- log(`Processing speaker audio: ${file.name}`);
309
-
310
- try {
311
- // Read audio file
312
- const audioContext = new AudioContext({ sampleRate: 16000 });
313
- const arrayBuffer = await file.arrayBuffer();
314
- const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
315
-
316
- // Extract speaker embedding (simplified - actual implementation would use speaker encoder)
317
- const audioData = audioBuffer.getChannelData(0);
318
-
319
- // For now, create a synthetic embedding based on audio features
320
- // In production, you'd use a proper speaker encoder model
321
- speakerEmbedding = new Float32Array(512);
322
- const rms = Math.sqrt(audioData.reduce((sum, val) => sum + val * val, 0) / audioData.length);
323
- for (let i = 0; i < 512; i++) {
324
- speakerEmbedding[i] = (Math.random() - 0.5) * rms * 10;
325
- }
326
-
327
- statusDiv.innerHTML = '<span class="chip success">✅ Voice loaded</span>';
328
- log('Speaker voice processed successfully', 'success');
329
- } catch (err) {
330
- statusDiv.innerHTML = '<span class="chip danger">❌ Failed to process</span>';
331
- log(`Speaker audio error: ${err.message}`, 'error');
332
- speakerEmbedding = null;
333
  }
334
  });
335
 
@@ -347,43 +289,30 @@
347
  }
348
 
349
  const btn = $("#go");
350
- const stopBtn = $("#stop");
351
-
352
  btn.disabled = true;
353
- stopBtn.disabled = false;
354
  $("#status").className = "chip warning";
355
  $("#status").textContent = "Generating...";
356
  showStatus("🎙️ Generating speech... This may take a moment.", 'info');
357
- log(`Generating speech for: "${text.substring(0, 50)}..."`);
358
 
359
  try {
360
- // Build generation options
361
- const options = {
362
- do_sample: $("#doSample").checked,
363
- temperature: parseFloat($("#temp").value),
364
- top_p: parseFloat($("#topp").value),
365
- repetition_penalty: parseFloat($("#rp").value),
366
- };
367
-
368
- const topK = parseInt($("#topk").value);
369
- if (topK > 0) options.top_k = topK;
370
-
371
- const beams = parseInt($("#beams").value);
372
- if (beams > 1) options.num_beams = beams;
373
-
374
- // Add speaker embedding if available
375
- if (speakerEmbedding) {
376
- options.speaker_embeddings = speakerEmbedding;
377
- log("Using custom speaker voice");
378
  }
379
 
380
- // Generate audio
381
- const output = await tts(text, options);
382
-
383
- log(`Generation complete! Sample rate: ${output.sampling_rate}Hz, Length: ${output.audio.length} samples`, 'success');
384
 
385
  // Encode to WAV
386
- const wav = await transformers.utils.encodeAudioWAV(output.audio, output.sampling_rate);
387
  const blob = new Blob([wav], { type: "audio/wav" });
388
  const url = URL.createObjectURL(blob);
389
 
@@ -401,25 +330,19 @@
401
 
402
  $("#status").className = "chip success";
403
  $("#status").textContent = "Success";
404
- showStatus("✅ Audio generated successfully! Click play or download.", 'success');
405
 
406
  } catch (err) {
407
  log(`Generation failed: ${err.message}`, 'error');
 
408
  $("#status").className = "chip danger";
409
  $("#status").textContent = "Error";
410
- showStatus(`❌ Generation failed: ${err.message}`, 'error');
411
  } finally {
412
  btn.disabled = false;
413
- stopBtn.disabled = true;
414
  }
415
  });
416
 
417
- // Stop button (placeholder for future cancellation support)
418
- $("#stop").addEventListener("click", () => {
419
- log("Stop requested (cancellation not yet supported)", 'info');
420
- showStatus("⚠️ Cancellation not yet supported by Transformers.js", 'info');
421
- });
422
-
423
  // Free memory
424
  $("#free").addEventListener("click", () => {
425
  const player = $("#player");
@@ -431,13 +354,10 @@
431
 
432
  $("#downloadBox").classList.add("hidden");
433
  hideStatus();
434
-
435
- log("Memory freed (audio references cleared)", 'success');
436
- showStatus("🗑️ Memory cleared", 'success');
437
- setTimeout(hideStatus, 2000);
438
  });
439
 
440
- // Update playback speed in real-time
441
  $("#spd").addEventListener("input", () => {
442
  const player = $("#player");
443
  if (player.src) {
@@ -445,7 +365,7 @@
445
  }
446
  });
447
 
448
- log("Application ready! Select a model and enter text to begin.", 'success');
449
  </script>
450
  </body>
451
  </html>
 
61
  Repetition Penalty <span id="rpVal">1.00</span>
62
  </label>
63
  <input id="rp" type="range" min="0.8" max="2" step="0.05" value="1.0">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  </fieldset>
65
  </div>
66
 
 
68
  <div class="col">
69
  <fieldset>
70
  <legend>Text Input</legend>
71
+ <textarea id="txt" placeholder="Type or paste your text here...">Hello! This is a modern text-to-speech demo powered by Transformers.js.</textarea>
72
  <div class="mt-1">
73
  <span class="muted">Characters: <span id="charCount">0</span></span> &nbsp;|&nbsp;
74
  <span class="muted">Words: <span id="wordCount">0</span></span>
 
82
  <button id="go" style="flex: 1;">
83
  🎙️ Generate Speech
84
  </button>
85
+ <button id="free" class="secondary" style="flex: 0.5;">
86
+ 🗑Clear
87
  </button>
88
  </div>
89
 
 
97
  </a>
98
  </div>
99
  </fieldset>
 
 
 
 
 
 
 
 
 
 
 
 
100
  </div>
101
 
102
  <!-- Right Column: Status & Logs -->
 
108
  <span id="model" class="chip">No Model</span>
109
  <span id="status" class="chip">Idle</span>
110
  </div>
 
 
 
 
111
  </fieldset>
112
 
113
  <fieldset>
 
123
  <li>100% Browser-based (No Server)</li>
124
  <li>3 AI Models Available</li>
125
  <li>WebGPU/WASM Acceleration</li>
 
126
  <li>Advanced Voice Control</li>
127
  <li>Instant Download</li>
128
  </ul>
 
133
  </div>
134
 
135
  <script type="module">
136
+ import * as transformers from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.1.2/dist/transformers.min.js";
137
 
138
  const $ = (q) => document.querySelector(q);
 
139
 
140
  // Logging utility
141
  const log = (msg, type = 'info') => {
142
  const el = $("#log");
143
  const timestamp = new Date().toLocaleTimeString();
144
  const prefix = type === 'error' ? '❌' : type === 'success' ? '✅' : 'ℹ️';
145
+ const newLog = `${prefix} [${timestamp}] ${msg}`;
146
+ el.textContent = newLog + '\n' + el.textContent;
147
  console.log(`[${type}]`, msg);
148
  };
149
 
 
152
  const box = $("#statusBox");
153
  box.className = `status-message ${type}`;
154
  box.textContent = msg;
 
155
  };
156
 
157
  const hideStatus = () => {
 
163
  const el = $("#" + id);
164
  const display = $("#" + displayId);
165
  const update = () => {
166
+ const isInt = ['topk'].includes(id);
167
  display.textContent = isInt ? el.value : parseFloat(el.value).toFixed(2);
168
  };
169
  el.addEventListener("input", update);
 
171
  };
172
 
173
  // Bind all sliders
174
+ ["spd", "temp", "topp", "topk", "rp"].forEach(id => bindVal(id, id + "Val"));
 
 
175
 
176
  // Character/word counter
177
  const updateCounts = () => {
 
187
  log("Initializing Transformers.js...");
188
  $("#backend").textContent = "Configuring...";
189
 
190
+ try {
191
+ await transformers.env.set("wasm.wasmPaths", "https://cdn.jsdelivr.net/npm/@xenova/wasm@1.0.0/");
192
+ transformers.env.backends.onnx.wasm.numThreads = 1;
193
+
194
+ if (navigator.gpu) {
195
+ $("#backend").className = "chip success";
196
+ $("#backend").textContent = "WebGPU Ready";
197
+ log("WebGPU acceleration available", 'success');
198
+ } else {
199
+ $("#backend").className = "chip warning";
200
+ $("#backend").textContent = "WASM";
201
+ log("Using WASM", 'info');
202
+ }
203
+ } catch (e) {
204
+ log("Config warning: " + e.message, 'info');
205
  }
206
 
207
  // Available models
 
212
  };
213
 
214
  let tts = null;
215
+ let speakerEmbeddings = null;
216
  let currentModelId = null;
 
217
 
218
  // Load model function
219
  async function loadModel(modelKey) {
 
222
  $("#model").className = "chip warning";
223
  $("#model").textContent = "Loading...";
224
  $("#currentModel").textContent = "Loading...";
225
+ $("#go").disabled = true;
226
  log(`Loading model: ${modelId}...`);
227
 
228
  try {
229
+ // Load TTS model
230
  tts = await transformers.pipeline("text-to-speech", modelId, {
231
  progress_callback: (progress) => {
232
+ if (progress?.status === 'progress' && progress.file) {
233
+ log(`Downloading: ${progress.file}...`);
 
234
  }
235
  }
236
  });
237
 
238
+ // Load default speaker embeddings for SpeechT5
239
+ if (modelId.includes("speecht5")) {
240
+ log("Loading speaker embeddings...");
241
+ speakerEmbeddings = await transformers.env.loadRemoteFile(
242
+ "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin"
243
+ );
244
+ log("Speaker embeddings loaded", 'success');
245
+ } else {
246
+ speakerEmbeddings = null;
247
+ }
248
+
249
  currentModelId = modelId;
250
  $("#model").className = "chip success";
251
+ $("#model").textContent = "Ready";
252
  $("#currentModel").textContent = modelId.split('/')[1];
253
+ $("#go").disabled = false;
254
+ log(`Model ready: ${modelId}`, 'success');
255
 
256
  return true;
257
  } catch (err) {
258
  log(`Failed to load model: ${err.message}`, 'error');
259
  $("#model").className = "chip danger";
260
+ $("#model").textContent = "Failed";
261
+ $("#go").disabled = true;
262
+ showStatus(`Error loading model: ${err.message}`, 'error');
263
  return false;
264
  }
265
  }
 
271
  $("#modelSelect").addEventListener("change", async (e) => {
272
  const selectedModel = e.target.value;
273
  if (MODELS[selectedModel] !== currentModelId) {
 
274
  await loadModel(selectedModel);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  }
276
  });
277
 
 
289
  }
290
 
291
  const btn = $("#go");
 
 
292
  btn.disabled = true;
 
293
  $("#status").className = "chip warning";
294
  $("#status").textContent = "Generating...";
295
  showStatus("🎙️ Generating speech... This may take a moment.", 'info');
296
+ log(`Generating: "${text.substring(0, 30)}..."`);
297
 
298
  try {
299
+ let output;
300
+
301
+ // Generate based on model type
302
+ if (speakerEmbeddings) {
303
+ // SpeechT5 needs speaker embeddings
304
+ output = await tts(text, {
305
+ speaker_embeddings: speakerEmbeddings
306
+ });
307
+ } else {
308
+ // Other models
309
+ output = await tts(text);
 
 
 
 
 
 
 
310
  }
311
 
312
+ log(`Generated! Sample rate: ${output.sampling_rate}Hz`, 'success');
 
 
 
313
 
314
  // Encode to WAV
315
+ const wav = transformers.utils.encodeWAV(output.audio, output.sampling_rate);
316
  const blob = new Blob([wav], { type: "audio/wav" });
317
  const url = URL.createObjectURL(blob);
318
 
 
330
 
331
  $("#status").className = "chip success";
332
  $("#status").textContent = "Success";
333
+ showStatus("✅ Audio generated! Click play or download.", 'success');
334
 
335
  } catch (err) {
336
  log(`Generation failed: ${err.message}`, 'error');
337
+ console.error(err);
338
  $("#status").className = "chip danger";
339
  $("#status").textContent = "Error";
340
+ showStatus(`❌ Error: ${err.message}`, 'error');
341
  } finally {
342
  btn.disabled = false;
 
343
  }
344
  });
345
 
 
 
 
 
 
 
346
  // Free memory
347
  $("#free").addEventListener("click", () => {
348
  const player = $("#player");
 
354
 
355
  $("#downloadBox").classList.add("hidden");
356
  hideStatus();
357
+ log("Memory cleared", 'success');
 
 
 
358
  });
359
 
360
+ // Update playback speed
361
  $("#spd").addEventListener("input", () => {
362
  const player = $("#player");
363
  if (player.src) {
 
365
  }
366
  });
367
 
368
+ log("Application ready!", 'success');
369
  </script>
370
  </body>
371
  </html>