| """ | |
| VoiceFixer + FreeVC: Clean & Convert Voice | |
| All code consolidated into single file for cleaner deployment. | |
| """ | |
| import os | |
| import math | |
| import json | |
| import struct | |
| import tempfile | |
| import logging | |
| from pathlib import Path | |
| from typing import Optional, Union, List | |
| from time import perf_counter as timer | |
| import gradio as gr | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from torch.nn import Conv1d, ConvTranspose1d | |
| from torch.nn.utils import weight_norm, remove_weight_norm | |
| import numpy as np | |
| import librosa | |
| import webrtcvad | |
| from scipy.io.wavfile import write as wav_write | |
| from scipy.ndimage import binary_dilation | |
| from transformers import WavLMModel | |
| from huggingface_hub import hf_hub_download | |
# Force CPU
os.environ["CUDA_VISIBLE_DEVICES"] = ""
device = torch.device("cpu")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ============================================================
# CONFIG (instead of a JSON file)
# ============================================================
FREEVC_CONFIG = {
    "data": {
        "sampling_rate": 16000,
        "filter_length": 1280,
        "hop_length": 320,
    },
    "train": {
        "segment_size": 8640,
    },
    "model": {
        "inter_channels": 192,
        "hidden_channels": 192,
        "filter_channels": 768,
        "n_heads": 2,
        "n_layers": 6,
        "kernel_size": 3,
        "p_dropout": 0.1,
        "resblock": "1",
        "resblock_kernel_sizes": [3, 7, 11],
        "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        "upsample_rates": [10, 6, 4, 2],
        "upsample_initial_channel": 512,
        "upsample_kernel_sizes": [16, 16, 4, 4],
        "gin_channels": 256,
        "ssl_dim": 1024,
        "use_spk": True,
    },
}
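
# Sanity check on the numbers above: hop_length=320 at 16 kHz means one content
# frame every 20 ms, while the decoder's upsample_rates multiply to
# 10 * 6 * 4 * 2 = 480 output samples per frame. 480 / 320 * 16000 = 24000,
# which is why the converted audio is written out at 24 kHz further below.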
# Speaker encoder params
SE_SAMPLING_RATE = 16000
SE_MEL_WINDOW_LENGTH = 25       # ms
SE_MEL_WINDOW_STEP = 10         # ms
SE_MEL_N_CHANNELS = 40
SE_PARTIALS_N_FRAMES = 160      # mel frames per partial utterance (1.6 s)
SE_VAD_WINDOW_LENGTH = 30       # ms
SE_VAD_MOVING_AVG_WIDTH = 8     # VAD windows
SE_VAD_MAX_SILENCE_LENGTH = 6   # VAD windows
SE_AUDIO_NORM_TARGET_DBFS = -30
SE_MODEL_HIDDEN_SIZE = 256
SE_MODEL_EMBEDDING_SIZE = 256
SE_MODEL_NUM_LAYERS = 3

# ============================================================
# UTILS
# ============================================================
class HParams:
    """Hyperparameters container."""
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            if isinstance(v, dict):
                v = HParams(**v)
            setattr(self, k, v)

    def __getitem__(self, key):
        return getattr(self, key)

def get_hparams():
    """Get hyperparameters from the config dict."""
    return HParams(**FREEVC_CONFIG)
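
# Illustrative usage of the helpers above (not executed here):
#   hps = get_hparams()
#   hps.data.sampling_rate  # -> 16000, nested dicts become attributes
#   hps["model"].ssl_dim    # -> 1024, __getitem__ mirrors getattr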
def load_checkpoint(checkpoint_path, model):
    """Load model checkpoint."""
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    saved_state_dict = checkpoint_dict['model']
    state_dict = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        if k in saved_state_dict:
            new_state_dict[k] = saved_state_dict[k]
        else:
            new_state_dict[k] = v
    model.load_state_dict(new_state_dict)
    return model
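
# Note: loading is deliberately lenient. Keys absent from the checkpoint keep
# the model's freshly initialized weights rather than failing the load, so
# minor drift between this file and the published checkpoint is tolerated.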
# ============================================================
# COMMONS (helper functions)
# ============================================================
def init_weights(m, mean=0.0, std=0.01):
    if m.__class__.__name__.find("Conv") != -1:
        m.weight.data.normal_(mean, std)

def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)
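
# get_padding computes "same" padding for an odd kernel: a kernel of size k
# with dilation d spans (k - 1) * d + 1 samples, so (k * d - d) // 2 per side
# preserves the sequence length, e.g. get_padding(3, dilation=5) == 5.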
def sequence_mask(length, max_length=None):
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)
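
# Example: sequence_mask(torch.tensor([2, 3]), 4) returns
#   [[True, True, False, False],
#    [True, True, True,  False]]
# i.e. one boolean row per sequence, True up to each sequence's length.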
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size + 1
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str

def slice_segments(x, ids_str, segment_size=4):
    ret = torch.zeros_like(x[:, :, :segment_size])
    for i in range(x.size(0)):
        idx_str = ids_str[i]
        idx_end = idx_str + segment_size
        ret[i] = x[i, :, idx_str:idx_end]
    return ret

def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    return t_act * s_act
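
# This is the WaveNet-style gated activation: the summed pre-activations are
# split channel-wise into two halves, and tanh(first half) * sigmoid(second
# half) lets the sigmoid gate how much of each tanh channel passes through.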
# ============================================================
# MODULES (neural network building blocks)
# ============================================================
LRELU_SLOPE = 0.1

class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        x = x.transpose(1, -1)
        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)

class WN(nn.Module):
    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.in_layers = nn.ModuleList()
        self.res_skip_layers = nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)
        if gin_channels != 0:
            cond_layer = nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
            self.cond_layer = weight_norm(cond_layer, name='weight')
        for i in range(n_layers):
            dilation = dilation_rate ** i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = weight_norm(nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
                                             dilation=dilation, padding=padding), name='weight')
            self.in_layers.append(in_layer)
            res_skip_channels = 2 * hidden_channels if i < n_layers - 1 else hidden_channels
            res_skip_layer = weight_norm(nn.Conv1d(hidden_channels, res_skip_channels, 1), name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])
        if g is not None:
            g = self.cond_layer(g)
        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            g_l = g[:, i * 2 * self.hidden_channels:(i + 1) * 2 * self.hidden_channels, :] if g is not None else torch.zeros_like(x_in)
            acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
            acts = self.drop(acts)
            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                x = (x + res_skip_acts[:, :self.hidden_channels, :]) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

class ResBlock1(nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=d, padding=get_padding(kernel_size, d)))
            for d in dilation
        ])
        self.convs1.apply(init_weights)
        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)))
            for _ in dilation
        ])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c2(xt)
            x = xt + x
        return x * x_mask if x_mask is not None else x

class ResBlock2(nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=d, padding=get_padding(kernel_size, d)))
            for d in dilation
        ])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c(xt)
            x = xt + x
        return x * x_mask if x_mask is not None else x

class Flip(nn.Module):
    def forward(self, x, *args, reverse=False, **kwargs):
        x = torch.flip(x, [1])
        return x if reverse else (x, torch.zeros(x.size(0)).to(x))

class ResidualCouplingLayer(nn.Module):
    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, mean_only=False):
        super().__init__()
        self.half_channels = channels // 2
        self.mean_only = mean_only
        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        m = stats if self.mean_only else stats[:, :self.half_channels]
        logs = torch.zeros_like(m) if self.mean_only else stats[:, self.half_channels:]
        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            return torch.cat([x0, x1], 1), torch.sum(logs, [1, 2])
        else:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)
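
# The affine coupling above is invertible by construction: the forward pass
# maps x1 -> m + x1 * exp(logs) while passing x0 through unchanged, and the
# reverse pass recovers x1 = (x1' - m) * exp(-logs) from the same (m, logs),
# which are recomputed deterministically from x0.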
# ============================================================
# MODELS (FreeVC architecture)
# ============================================================
class ResidualCouplingBlock(nn.Module):
    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0):
        super().__init__()
        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
            self.flows.append(Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x

class Encoder(nn.Module):
    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0):
        super().__init__()
        self.out_channels = out_channels
        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

class Generator(nn.Module):
    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes,
                 upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        resblock_cls = ResBlock1 if resblock == '1' else ResBlock2
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(weight_norm(ConvTranspose1d(
                upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)),
                k, u, padding=(k - u) // 2)))
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(resblock_cls(ch, k, d))
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            xs = sum(self.resblocks[i * self.num_kernels + j](x) for j in range(self.num_kernels))
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

class SynthesizerTrn(nn.Module):
    """Main FreeVC voice conversion model."""
    def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels,
                 n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes,
                 resblock_dilation_sizes, upsample_rates, upsample_initial_channel,
                 upsample_kernel_sizes, gin_channels, ssl_dim, use_spk, **kwargs):
        super().__init__()
        self.segment_size = segment_size
        self.use_spk = use_spk
        self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16)
        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes,
                             upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
        self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)

    def infer(self, c, g=None, c_lengths=None):
        if c_lengths is None:
            c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
        g = g.unsqueeze(-1)
        z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths)
        z = self.flow(z_p, c_mask, g=g, reverse=True)
        return self.dec(z * c_mask, g=g)
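
# Inference data flow, assuming this file's config: c is a WavLM feature map
# of shape [B, ssl_dim=1024, T] and g a speaker embedding of shape [B, 256]
# (unsqueezed to [B, 256, 1] for conditioning). enc_p encodes the content into
# a latent prior, the flow runs in reverse conditioned on the speaker, and the
# HiFi-GAN-style Generator decodes the waveform.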
# ============================================================
# SPEAKER ENCODER
# ============================================================
def se_normalize_volume(wav, target_dBFS, increase_only=False):
    rms = np.sqrt(np.mean(wav ** 2))
    if rms < 1e-10:
        return wav
    dBFS_change = target_dBFS - 20 * np.log10(rms)
    if dBFS_change < 0 and increase_only:
        return wav
    return wav * (10 ** (dBFS_change / 20))

def se_trim_long_silences(wav):
    samples_per_window = (SE_VAD_WINDOW_LENGTH * SE_SAMPLING_RATE) // 1000
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
    int16_max = (2 ** 15) - 1
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], sample_rate=SE_SAMPLING_RATE))
    voice_flags = np.array(voice_flags)

    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = np.round(moving_average(voice_flags, SE_VAD_MOVING_AVG_WIDTH)).astype(bool)
    audio_mask = binary_dilation(audio_mask, np.ones(SE_VAD_MAX_SILENCE_LENGTH + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    return wav[audio_mask] if len(wav[audio_mask]) > 0 else wav
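
# VAD bookkeeping in se_trim_long_silences: webrtcvad classifies 30 ms windows
# of 16-bit PCM (hence the * 2 byte offsets into pcm_wave), the boolean flags
# are smoothed with an 8-window moving average, and binary_dilation re-admits
# up to 6 windows of silence around speech so word edges are not clipped.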
def se_wav_to_mel(wav):
    frames = librosa.feature.melspectrogram(
        y=wav, sr=SE_SAMPLING_RATE,
        n_fft=int(SE_SAMPLING_RATE * SE_MEL_WINDOW_LENGTH / 1000),
        hop_length=int(SE_SAMPLING_RATE * SE_MEL_WINDOW_STEP / 1000),
        n_mels=SE_MEL_N_CHANNELS
    )
    return frames.astype(np.float32).T

class SpeakerEncoder(nn.Module):
    """Speaker encoder for extracting voice embeddings."""
    def __init__(self, weights_fpath, device=None):
        super().__init__()
        self.lstm = nn.LSTM(SE_MEL_N_CHANNELS, SE_MODEL_HIDDEN_SIZE, SE_MODEL_NUM_LAYERS, batch_first=True)
        self.linear = nn.Linear(SE_MODEL_HIDDEN_SIZE, SE_MODEL_EMBEDDING_SIZE)
        self.relu = nn.ReLU()
        self.device = device or torch.device("cpu")
        checkpoint = torch.load(weights_fpath, map_location="cpu")
        self.load_state_dict(checkpoint["model_state"], strict=False)
        self.to(self.device)

    def forward(self, mels):
        _, (hidden, _) = self.lstm(mels)
        embeds_raw = self.relu(self.linear(hidden[-1]))
        return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)

    def embed_utterance(self, wav, rate=1.3, min_coverage=0.75):
        # Preprocess: normalize volume, then remove long stretches of silence.
        if len(wav) == 0:
            return np.zeros(SE_MODEL_EMBEDDING_SIZE)
        wav = se_normalize_volume(wav, SE_AUDIO_NORM_TARGET_DBFS, increase_only=True)
        wav = se_trim_long_silences(wav)
        if len(wav) == 0:
            return np.zeros(SE_MODEL_EMBEDDING_SIZE)
        # Compute partial slices: overlapping 160-frame windows, ~`rate` windows per second.
        samples_per_frame = int((SE_SAMPLING_RATE * SE_MEL_WINDOW_STEP / 1000))
        n_frames = int(np.ceil((len(wav) + 1) / samples_per_frame))
        frame_step = int(np.round((SE_SAMPLING_RATE / rate) / samples_per_frame))
        wav_slices, mel_slices = [], []
        for i in range(0, max(1, n_frames - SE_PARTIALS_N_FRAMES + frame_step + 1), frame_step):
            mel_range = np.array([i, i + SE_PARTIALS_N_FRAMES])
            wav_range = mel_range * samples_per_frame
            mel_slices.append(slice(*mel_range))
            wav_slices.append(slice(*wav_range))
        if len(wav_slices) == 0:
            mel_slices = [slice(0, SE_PARTIALS_N_FRAMES)]
            wav_slices = [slice(0, SE_PARTIALS_N_FRAMES * samples_per_frame)]
        max_wave_length = wav_slices[-1].stop
        if max_wave_length >= len(wav):
            wav = np.pad(wav, (0, max_wave_length - len(wav) + 1), "constant")
        mel = se_wav_to_mel(wav)
        mels = np.array([mel[s] if s.stop <= len(mel) else np.pad(mel, ((0, s.stop - len(mel)), (0, 0)))[s] for s in mel_slices])
        # Embed each partial, then average and re-normalize to unit length.
        with torch.no_grad():
            mels = torch.from_numpy(mels).to(self.device)
            partial_embeds = self(mels).cpu().numpy()
        raw_embed = np.mean(partial_embeds, axis=0)
        return raw_embed / np.linalg.norm(raw_embed, 2)
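
# Usage sketch (illustrative path; assumes a mono float waveform in [-1, 1] at 16 kHz):
#   wav, _ = librosa.load("target_voice.wav", sr=SE_SAMPLING_RATE)
#   embed = speaker_encoder.embed_utterance(wav)  # (256,) unit-norm np array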
# ============================================================
# MODEL LOADING
# ============================================================
_voicefixer_model = None
_freevc_models = {}

def load_voicefixer():
    global _voicefixer_model
    if _voicefixer_model is None:
        print("Loading VoiceFixer...")
        from voicefixer import VoiceFixer
        _voicefixer_model = VoiceFixer()
        _voicefixer_model.eval()
        print("VoiceFixer loaded!")
    return _voicefixer_model

def load_freevc():
    global _freevc_models
    if not _freevc_models:
        print("Loading FreeVC...")
        os.makedirs("checkpoints", exist_ok=True)
        os.makedirs("speaker_encoder/ckpt", exist_ok=True)
        if not os.path.exists("checkpoints/freevc-24.pth"):
            hf_hub_download(repo_id="jn-jairo/freevc", filename="freevc_24.pth",
                            local_dir="checkpoints", local_dir_use_symlinks=False)
            os.rename("checkpoints/freevc_24.pth", "checkpoints/freevc-24.pth")
        if not os.path.exists("speaker_encoder/ckpt/pretrained_bak_5805000.pt"):
            hf_hub_download(repo_id="jn-jairo/freevc", filename="speaker_encoder.pt",
                            local_dir="speaker_encoder/ckpt", local_dir_use_symlinks=False)
            os.rename("speaker_encoder/ckpt/speaker_encoder.pt", "speaker_encoder/ckpt/pretrained_bak_5805000.pt")
        hps = get_hparams()
        freevc = SynthesizerTrn(
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            **FREEVC_CONFIG["model"]
        ).to(device)
        freevc.eval()
        load_checkpoint("checkpoints/freevc-24.pth", freevc)
        smodel = SpeakerEncoder("speaker_encoder/ckpt/pretrained_bak_5805000.pt", device=device)
        cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
        cmodel.eval()
        _freevc_models = {'freevc': freevc, 'speaker_encoder': smodel, 'content_encoder': cmodel, 'hps': hps}
        print("FreeVC loaded!")
    return _freevc_models
# ============================================================
# PROCESSING FUNCTIONS
# ============================================================
def run_voicefixer(audio_path, mode=0):
    model = load_voicefixer()
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    with torch.no_grad():
        model.restore(input=audio_path, output=output_path, cuda=False, mode=mode)
    return output_path

def run_freevc(source_path, target_path):
    models = load_freevc()
    hps = models['hps']
    with torch.no_grad():
        wav_tgt, _ = librosa.load(target_path, sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
        g_tgt = models['speaker_encoder'].embed_utterance(wav_tgt)
        g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
        wav_src, _ = librosa.load(source_path, sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
        c = models['content_encoder'](wav_src).last_hidden_state.transpose(1, 2).to(device)
        audio = models['freevc'].infer(c, g=g_tgt)
        audio = audio[0][0].data.cpu().float().numpy()
    # NamedTemporaryFile instead of the deprecated, race-prone tempfile.mktemp.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    wav_write(output_path, 24000, audio)
    return output_path
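
# Pipeline summary: the target clip contributes only a 256-d speaker
# embedding, and the source clip only WavLM content features; the synthesizer
# recombines them and, per the upsample arithmetic noted near the config,
# emits 24 kHz audio even though analysis runs at 16 kHz.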
def process_audio(source_audio, target_audio, mode, voicefixer_mode="General", progress=gr.Progress()):
    """
    Process audio with VoiceFixer and/or FreeVC.

    Args:
        source_audio: Path to the source audio (the speech to process).
        target_audio: Path to the target audio (the voice to mimic).
        mode: "VoiceFixer + FreeVC", "VoiceFixer only", or "FreeVC only".
        voicefixer_mode: "General", "Speech 44.1kHz", or "Speech 48kHz".

    Returns:
        Tuple of (output_path, status_message).
    """
    if source_audio is None:
        return None, "Please upload source audio"
    if mode in ["VoiceFixer + FreeVC", "FreeVC only"] and target_audio is None:
        return None, "Please upload target voice audio"
    vf_mode_map = {"General": 0, "Speech 44.1kHz": 1, "Speech 48kHz": 2}
    vf_mode = vf_mode_map.get(voicefixer_mode, 0)
    try:
        if mode == "VoiceFixer only":
            progress(0.2, desc="Running VoiceFixer...")
            return run_voicefixer(source_audio, vf_mode), "Audio restored!"
        elif mode == "FreeVC only":
            progress(0.2, desc="Running FreeVC...")
            return run_freevc(source_audio, target_audio), "Voice converted!"
        else:
            progress(0.2, desc="Step 1/2: VoiceFixer...")
            cleaned = run_voicefixer(source_audio, vf_mode)
            progress(0.6, desc="Step 2/2: FreeVC...")
            output = run_freevc(cleaned, target_audio)
            os.unlink(cleaned)
            return output, "Cleaned and converted!"
    except Exception as e:
        return None, f"Error: {str(e)}"

# ============================================================
# GRADIO UI
# ============================================================
with gr.Blocks(title="VoiceFixer + FreeVC") as demo:
    gr.Markdown("# 🎤 VoiceFixer + FreeVC\n**Clean & Convert Voice**")
    with gr.Row():
        with gr.Column():
            mode = gr.Dropdown(["VoiceFixer + FreeVC", "VoiceFixer only", "FreeVC only"],
                               value="VoiceFixer + FreeVC", label="Mode")
            source_audio = gr.Audio(label="Source Audio", type="filepath", sources=["upload", "microphone"])
            target_audio = gr.Audio(label="Target Voice", type="filepath", sources=["upload", "microphone"])
            with gr.Accordion("Settings", open=False):
                voicefixer_mode = gr.Dropdown(["General", "Speech 44.1kHz", "Speech 48kHz"],
                                              value="General", label="VoiceFixer Mode")
            process_btn = gr.Button("Process", variant="primary", size="lg")
        with gr.Column():
            output_audio = gr.Audio(label="Output", type="filepath")
            status = gr.Textbox(label="Status")
    mode.change(lambda m: gr.update(visible=m != "VoiceFixer only"), [mode], [target_audio])
    process_btn.click(process_audio, [source_audio, target_audio, mode, voicefixer_mode],
                      [output_audio, status], api_name="process")
    gr.Examples(
        examples=[["examples/source_example.wav", "examples/target_example.mp3", "VoiceFixer + FreeVC", "General"]],
        inputs=[source_audio, target_audio, mode, voicefixer_mode],
        outputs=[output_audio, status],
        fn=process_audio,
        cache_examples=True,
        cache_mode="lazy",
        label="Examples"
    )

if __name__ == "__main__":
    demo.launch(mcp_server=True, show_error=True)