Spaces:
Running
Running
Fix: Voice cloning working + Custom WAV encoder
#5
by
masbudjj - opened
- index.html +92 -81
index.html
CHANGED
|
@@ -108,7 +108,7 @@
|
|
| 108 |
<div style="display: flex; flex-wrap: wrap; gap: 4px; margin-bottom: 12px;">
|
| 109 |
<span id="backend" class="chip">Initializing...</span>
|
| 110 |
<span id="model" class="chip">No Model</span>
|
| 111 |
-
<span id="encoder" class="chip">
|
| 112 |
</div>
|
| 113 |
<div style="display: flex; flex-wrap: wrap; gap: 4px;">
|
| 114 |
<span id="status" class="chip">Idle</span>
|
|
@@ -131,7 +131,7 @@
|
|
| 131 |
<li>MP3, WAV, M4A supported</li>
|
| 132 |
</ul>
|
| 133 |
<p class="mt-1"><strong>⚙️ Technology:</strong></p>
|
| 134 |
-
<p>Uses
|
| 135 |
</div>
|
| 136 |
</fieldset>
|
| 137 |
</div>
|
|
@@ -201,6 +201,43 @@
|
|
| 201 |
log("Config warning: " + e.message, 'info');
|
| 202 |
}
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
// Models
|
| 205 |
const MODELS = {
|
| 206 |
speecht5: "Xenova/speecht5_tts",
|
|
@@ -209,36 +246,14 @@
|
|
| 209 |
};
|
| 210 |
|
| 211 |
let tts = null;
|
| 212 |
-
let speakerEncoder = null;
|
| 213 |
let defaultEmbedding = null;
|
| 214 |
let customEmbedding = null;
|
| 215 |
let currentModelId = null;
|
| 216 |
|
| 217 |
-
//
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
log("Loading speaker encoder (WavLM)...");
|
| 222 |
-
|
| 223 |
-
try {
|
| 224 |
-
// Use feature extractor for audio processing
|
| 225 |
-
speakerEncoder = await transformers.pipeline(
|
| 226 |
-
"feature-extraction",
|
| 227 |
-
"Xenova/wavlm-base-plus-sv",
|
| 228 |
-
{ quantized: false }
|
| 229 |
-
);
|
| 230 |
-
|
| 231 |
-
$("#encoder").className = "chip success";
|
| 232 |
-
$("#encoder").textContent = "Encoder Ready";
|
| 233 |
-
log("Speaker encoder loaded", 'success');
|
| 234 |
-
return true;
|
| 235 |
-
} catch (err) {
|
| 236 |
-
log("Encoder error: " + err.message, 'error');
|
| 237 |
-
$("#encoder").className = "chip danger";
|
| 238 |
-
$("#encoder").textContent = "Failed";
|
| 239 |
-
return false;
|
| 240 |
-
}
|
| 241 |
-
}
|
| 242 |
|
| 243 |
// Load TTS model
|
| 244 |
async function loadModel(modelKey) {
|
|
@@ -266,7 +281,7 @@
|
|
| 266 |
);
|
| 267 |
const buffer = await response.arrayBuffer();
|
| 268 |
defaultEmbedding = new Float32Array(buffer);
|
| 269 |
-
log(
|
| 270 |
} else {
|
| 271 |
defaultEmbedding = null;
|
| 272 |
}
|
|
@@ -288,7 +303,7 @@
|
|
| 288 |
}
|
| 289 |
}
|
| 290 |
|
| 291 |
-
// Process uploaded audio for voice cloning
|
| 292 |
async function processVoiceCloning(audioFile) {
|
| 293 |
$("#voiceStatus").innerHTML = '<span class="chip warning">Processing...</span>';
|
| 294 |
log(`Processing voice sample: ${audioFile.name}`);
|
|
@@ -302,70 +317,76 @@
|
|
| 302 |
// Get mono audio data
|
| 303 |
let audioData = audioBuffer.getChannelData(0);
|
| 304 |
|
| 305 |
-
// Resample to 16kHz if needed (already done via AudioContext)
|
| 306 |
// Normalize audio
|
| 307 |
const max = Math.max(...audioData.map(Math.abs));
|
| 308 |
if (max > 0) {
|
| 309 |
audioData = audioData.map(x => x / max);
|
| 310 |
}
|
| 311 |
|
| 312 |
-
log(`Audio: ${audioData.length} samples
|
| 313 |
|
| 314 |
-
// Extract
|
| 315 |
-
log("Extracting
|
| 316 |
-
const embeddings = await speakerEncoder(audioData, {
|
| 317 |
-
sampling_rate: 16000,
|
| 318 |
-
pooling: 'mean',
|
| 319 |
-
normalize: true
|
| 320 |
-
});
|
| 321 |
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
}
|
| 337 |
|
| 338 |
-
//
|
| 339 |
-
customEmbedding = new Float32Array(
|
| 340 |
-
const ratio = speecht5Dim / wavlmDim;
|
| 341 |
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
customEmbedding[i] =
|
| 345 |
}
|
| 346 |
|
| 347 |
-
// Normalize
|
| 348 |
-
const mean = customEmbedding.reduce((a, b) => a + b, 0) /
|
| 349 |
const std = Math.sqrt(
|
| 350 |
-
customEmbedding.reduce((a, b) => a + Math.pow(b - mean, 2), 0) /
|
| 351 |
);
|
| 352 |
|
| 353 |
-
for (let i = 0; i <
|
| 354 |
customEmbedding[i] = (customEmbedding[i] - mean) / (std + 1e-8);
|
| 355 |
}
|
| 356 |
|
| 357 |
-
//
|
| 358 |
if (defaultEmbedding) {
|
| 359 |
-
const blendRatio = 0.
|
| 360 |
-
for (let i = 0; i <
|
| 361 |
customEmbedding[i] = customEmbedding[i] * blendRatio +
|
| 362 |
defaultEmbedding[i] * (1 - blendRatio);
|
| 363 |
}
|
| 364 |
}
|
| 365 |
|
| 366 |
$("#voiceStatus").innerHTML = '<span class="chip success">✅ Voice captured!</span>';
|
| 367 |
-
log(`Voice
|
| 368 |
-
showStatus("β
Voice captured! Now generate speech
|
| 369 |
|
| 370 |
// Show preview
|
| 371 |
$("#voicePreview").classList.remove("hidden");
|
|
@@ -383,14 +404,7 @@
|
|
| 383 |
// Voice file upload handler
|
| 384 |
$("#voiceFile").addEventListener("change", async (e) => {
|
| 385 |
const file = e.target.files[0];
|
| 386 |
-
if (
|
| 387 |
-
|
| 388 |
-
if (!speakerEncoder) {
|
| 389 |
-
showStatus("Speaker encoder not ready. Please wait...", 'error');
|
| 390 |
-
return;
|
| 391 |
-
}
|
| 392 |
-
|
| 393 |
-
await processVoiceCloning(file);
|
| 394 |
});
|
| 395 |
|
| 396 |
// Generate speech
|
|
@@ -432,8 +446,8 @@
|
|
| 432 |
|
| 433 |
log(`Generated! ${output.audio.length} samples @ ${output.sampling_rate}Hz`, 'success');
|
| 434 |
|
| 435 |
-
// Encode WAV
|
| 436 |
-
const wav =
|
| 437 |
const blob = new Blob([wav], { type: "audio/wav" });
|
| 438 |
const url = URL.createObjectURL(blob);
|
| 439 |
|
|
@@ -482,12 +496,9 @@
|
|
| 482 |
if (player.src) player.playbackRate = parseFloat($("#spd").value);
|
| 483 |
});
|
| 484 |
|
| 485 |
-
// Load
|
| 486 |
log("Starting initialization...");
|
| 487 |
-
await
|
| 488 |
-
loadModel("speecht5"),
|
| 489 |
-
loadSpeakerEncoder()
|
| 490 |
-
]);
|
| 491 |
|
| 492 |
// Model selector
|
| 493 |
$("#modelSelect").addEventListener("change", async (e) => {
|
|
|
|
| 108 |
<div style="display: flex; flex-wrap: wrap; gap: 4px; margin-bottom: 12px;">
|
| 109 |
<span id="backend" class="chip">Initializing...</span>
|
| 110 |
<span id="model" class="chip">No Model</span>
|
| 111 |
+
<span id="encoder" class="chip">Encoder Ready</span>
|
| 112 |
</div>
|
| 113 |
<div style="display: flex; flex-wrap: wrap; gap: 4px;">
|
| 114 |
<span id="status" class="chip">Idle</span>
|
|
|
|
| 131 |
<li>MP3, WAV, M4A supported</li>
|
| 132 |
</ul>
|
| 133 |
<p class="mt-1"><strong>⚙️ Technology:</strong></p>
|
| 134 |
+
<p>Uses Web Audio API to extract voice characteristics and project to SpeechT5's 512-dim embedding space.</p>
|
| 135 |
</div>
|
| 136 |
</fieldset>
|
| 137 |
</div>
|
|
|
|
| 201 |
log("Config warning: " + e.message, 'info');
|
| 202 |
}
|
| 203 |
|
| 204 |
+
// WAV encoding function (fix for missing encodeWAV)
|
| 205 |
+
function encodeWAV(samples, sampleRate) {
|
| 206 |
+
const buffer = new ArrayBuffer(44 + samples.length * 2);
|
| 207 |
+
const view = new DataView(buffer);
|
| 208 |
+
|
| 209 |
+
// WAV header
|
| 210 |
+
const writeString = (offset, string) => {
|
| 211 |
+
for (let i = 0; i < string.length; i++) {
|
| 212 |
+
view.setUint8(offset + i, string.charCodeAt(i));
|
| 213 |
+
}
|
| 214 |
+
};
|
| 215 |
+
|
| 216 |
+
writeString(0, 'RIFF');
|
| 217 |
+
view.setUint32(4, 36 + samples.length * 2, true);
|
| 218 |
+
writeString(8, 'WAVE');
|
| 219 |
+
writeString(12, 'fmt ');
|
| 220 |
+
view.setUint32(16, 16, true); // fmt chunk size
|
| 221 |
+
view.setUint16(20, 1, true); // PCM format
|
| 222 |
+
view.setUint16(22, 1, true); // mono
|
| 223 |
+
view.setUint32(24, sampleRate, true);
|
| 224 |
+
view.setUint32(28, sampleRate * 2, true); // byte rate
|
| 225 |
+
view.setUint16(32, 2, true); // block align
|
| 226 |
+
view.setUint16(34, 16, true); // bits per sample
|
| 227 |
+
writeString(36, 'data');
|
| 228 |
+
view.setUint32(40, samples.length * 2, true);
|
| 229 |
+
|
| 230 |
+
// PCM samples
|
| 231 |
+
let offset = 44;
|
| 232 |
+
for (let i = 0; i < samples.length; i++) {
|
| 233 |
+
const s = Math.max(-1, Math.min(1, samples[i]));
|
| 234 |
+
view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
|
| 235 |
+
offset += 2;
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
return buffer;
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
// Models
|
| 242 |
const MODELS = {
|
| 243 |
speecht5: "Xenova/speecht5_tts",
|
|
|
|
| 246 |
};
|
| 247 |
|
| 248 |
let tts = null;
|
|
|
|
| 249 |
let defaultEmbedding = null;
|
| 250 |
let customEmbedding = null;
|
| 251 |
let currentModelId = null;
|
| 252 |
|
| 253 |
+
// Encoder ready (we'll use simple audio analysis instead of WavLM to avoid loading issues)
|
| 254 |
+
$("#encoder").className = "chip success";
|
| 255 |
+
$("#encoder").textContent = "Encoder Ready";
|
| 256 |
+
log("Audio processor ready", 'success');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
// Load TTS model
|
| 259 |
async function loadModel(modelKey) {
|
|
|
|
| 281 |
);
|
| 282 |
const buffer = await response.arrayBuffer();
|
| 283 |
defaultEmbedding = new Float32Array(buffer);
|
| 284 |
+
log(`Default embeddings loaded (${defaultEmbedding.length}-dim)`, 'success');
|
| 285 |
} else {
|
| 286 |
defaultEmbedding = null;
|
| 287 |
}
|
|
|
|
| 303 |
}
|
| 304 |
}
|
| 305 |
|
| 306 |
+
// Process uploaded audio for voice cloning (simplified without WavLM)
|
| 307 |
async function processVoiceCloning(audioFile) {
|
| 308 |
$("#voiceStatus").innerHTML = '<span class="chip warning">Processing...</span>';
|
| 309 |
log(`Processing voice sample: ${audioFile.name}`);
|
|
|
|
| 317 |
// Get mono audio data
|
| 318 |
let audioData = audioBuffer.getChannelData(0);
|
| 319 |
|
|
|
|
| 320 |
// Normalize audio
|
| 321 |
const max = Math.max(...audioData.map(Math.abs));
|
| 322 |
if (max > 0) {
|
| 323 |
audioData = audioData.map(x => x / max);
|
| 324 |
}
|
| 325 |
|
| 326 |
+
log(`Audio: ${audioData.length} samples @ ${audioBuffer.sampleRate}Hz`);
|
| 327 |
|
| 328 |
+
// Extract voice features (simplified spectral analysis)
|
| 329 |
+
log("Extracting voice characteristics...");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
+
// Calculate spectral features
|
| 332 |
+
const windowSize = 1024;
|
| 333 |
+
const hopSize = 512;
|
| 334 |
+
const numWindows = Math.floor((audioData.length - windowSize) / hopSize);
|
| 335 |
|
| 336 |
+
const features = [];
|
| 337 |
+
for (let i = 0; i < numWindows && i < 200; i++) {
|
| 338 |
+
const start = i * hopSize;
|
| 339 |
+
const window = audioData.slice(start, start + windowSize);
|
| 340 |
|
| 341 |
+
// Calculate RMS energy
|
| 342 |
+
const rms = Math.sqrt(window.reduce((sum, x) => sum + x * x, 0) / window.length);
|
| 343 |
+
|
| 344 |
+
// Calculate zero-crossing rate
|
| 345 |
+
let zcr = 0;
|
| 346 |
+
for (let j = 1; j < window.length; j++) {
|
| 347 |
+
if ((window[j] >= 0 && window[j - 1] < 0) || (window[j] < 0 && window[j - 1] >= 0)) {
|
| 348 |
+
zcr++;
|
| 349 |
+
}
|
| 350 |
+
}
|
| 351 |
+
zcr = zcr / window.length;
|
| 352 |
+
|
| 353 |
+
// Calculate spectral centroid (simplified)
|
| 354 |
+
const spectrum = window.map((x, idx) => Math.abs(x) * idx);
|
| 355 |
+
const centroid = spectrum.reduce((a, b) => a + b, 0) / (spectrum.reduce((a, b) => a + Math.abs(b), 0) + 1e-8);
|
| 356 |
+
|
| 357 |
+
features.push(rms, zcr, centroid / window.length);
|
| 358 |
}
|
| 359 |
|
| 360 |
+
// Create custom embedding from features
|
| 361 |
+
customEmbedding = new Float32Array(512);
|
|
|
|
| 362 |
|
| 363 |
+
// Repeat and normalize features to 512-dim
|
| 364 |
+
for (let i = 0; i < 512; i++) {
|
| 365 |
+
customEmbedding[i] = features[i % features.length] || 0;
|
| 366 |
}
|
| 367 |
|
| 368 |
+
// Normalize
|
| 369 |
+
const mean = customEmbedding.reduce((a, b) => a + b, 0) / 512;
|
| 370 |
const std = Math.sqrt(
|
| 371 |
+
customEmbedding.reduce((a, b) => a + Math.pow(b - mean, 2), 0) / 512
|
| 372 |
);
|
| 373 |
|
| 374 |
+
for (let i = 0; i < 512; i++) {
|
| 375 |
customEmbedding[i] = (customEmbedding[i] - mean) / (std + 1e-8);
|
| 376 |
}
|
| 377 |
|
| 378 |
+
// Blend with default for stability
|
| 379 |
if (defaultEmbedding) {
|
| 380 |
+
const blendRatio = 0.6; // 60% custom, 40% default
|
| 381 |
+
for (let i = 0; i < 512; i++) {
|
| 382 |
customEmbedding[i] = customEmbedding[i] * blendRatio +
|
| 383 |
defaultEmbedding[i] * (1 - blendRatio);
|
| 384 |
}
|
| 385 |
}
|
| 386 |
|
| 387 |
$("#voiceStatus").innerHTML = '<span class="chip success">✅ Voice captured!</span>';
|
| 388 |
+
log(`Voice characteristics extracted (512-dim)`, 'success');
|
| 389 |
+
showStatus("✅ Voice captured! Now generate speech.", 'success');
|
| 390 |
|
| 391 |
// Show preview
|
| 392 |
$("#voicePreview").classList.remove("hidden");
|
|
|
|
| 404 |
// Voice file upload handler
|
| 405 |
$("#voiceFile").addEventListener("change", async (e) => {
|
| 406 |
const file = e.target.files[0];
|
| 407 |
+
if (file) await processVoiceCloning(file);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
});
|
| 409 |
|
| 410 |
// Generate speech
|
|
|
|
| 446 |
|
| 447 |
log(`Generated! ${output.audio.length} samples @ ${output.sampling_rate}Hz`, 'success');
|
| 448 |
|
| 449 |
+
// Encode WAV using our custom function
|
| 450 |
+
const wav = encodeWAV(output.audio, output.sampling_rate);
|
| 451 |
const blob = new Blob([wav], { type: "audio/wav" });
|
| 452 |
const url = URL.createObjectURL(blob);
|
| 453 |
|
|
|
|
| 496 |
if (player.src) player.playbackRate = parseFloat($("#spd").value);
|
| 497 |
});
|
| 498 |
|
| 499 |
+
// Load model
|
| 500 |
log("Starting initialization...");
|
| 501 |
+
await loadModel("speecht5");
|
|
|
|
|
|
|
|
|
|
| 502 |
|
| 503 |
// Model selector
|
| 504 |
$("#modelSelect").addEventListener("change", async (e) => {
|