Files changed (1) hide show
  1. app.py +322 -159
app.py CHANGED
@@ -3,74 +3,128 @@ import gradio as gr
3
  import torch
4
  import torchaudio
5
  import librosa
 
6
  from modules.commons import build_model, load_checkpoint, recursive_munch
7
  import yaml
8
  from hf_utils import load_custom_model_from_hf
9
  import numpy as np
10
  from pydub import AudioSegment
11
 
12
- # Load model and configuration
 
 
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
 
15
- dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
16
- "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
17
- "config_dit_mel_seed_uvit_whisper_small_wavenet.yml")
18
- # dit_checkpoint_path = "E:/DiT_epoch_00018_step_801000.pth"
19
- # dit_config_path = "configs/config_dit_mel_seed_uvit_whisper_small_encoder_wavenet.yml"
 
 
 
 
20
  config = yaml.safe_load(open(dit_config_path, 'r'))
21
  model_params = recursive_munch(config['model_params'])
22
  model = build_model(model_params, stage='DiT')
23
  hop_length = config['preprocess_params']['spect_params']['hop_length']
24
  sr = config['preprocess_params']['sr']
25
 
26
- # Load checkpoints
27
- model, _, _, _ = load_checkpoint(model, None, dit_checkpoint_path,
28
- load_only_params=True, ignore_modules=[], is_distributed=False)
 
 
29
  for key in model:
30
  model[key].eval()
31
  model[key].to(device)
32
- model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
33
 
34
- # Load additional modules
35
- from modules.campplus.DTDNN import CAMPPlus
36
-
37
- campplus_ckpt_path = load_custom_model_from_hf("funasr/campplus", "campplus_cn_common.bin", config_filename=None)
38
- campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
39
- campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
40
- campplus_model.eval()
41
- campplus_model.to(device)
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  from modules.bigvgan import bigvgan
44
 
45
- bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_22khz_80band_256x', use_cuda_kernel=False)
46
-
47
- # remove weight norm in the model and set to eval mode
 
48
  bigvgan_model.remove_weight_norm()
49
  bigvgan_model = bigvgan_model.eval().to(device)
50
 
 
 
 
51
  ckpt_path, config_path = load_custom_model_from_hf("Plachta/FAcodec", 'pytorch_model.bin', 'config.yml')
52
-
53
  codec_config = yaml.safe_load(open(config_path))
54
  codec_model_params = recursive_munch(codec_config['model_params'])
55
  codec_encoder = build_model(codec_model_params, stage="codec")
56
 
57
  ckpt_params = torch.load(ckpt_path, map_location="cpu")
58
-
59
  for key in codec_encoder:
60
  codec_encoder[key].load_state_dict(ckpt_params[key], strict=False)
61
  _ = [codec_encoder[key].eval() for key in codec_encoder]
62
  _ = [codec_encoder[key].to(device) for key in codec_encoder]
63
 
64
- # whisper
 
 
65
  from transformers import AutoFeatureExtractor, WhisperModel
66
 
67
- whisper_name = model_params.speech_tokenizer.whisper_name if hasattr(model_params.speech_tokenizer,
68
- 'whisper_name') else "openai/whisper-small"
 
 
 
69
  whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float16).to(device)
70
  del whisper_model.decoder
71
  whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
72
 
73
- # Generate mel spectrograms
 
 
74
  mel_fn_args = {
75
  "n_fft": config['preprocess_params']['spect_params']['n_fft'],
76
  "win_size": config['preprocess_params']['spect_params']['win_length'],
@@ -82,51 +136,62 @@ mel_fn_args = {
82
  "center": False
83
  }
84
  from modules.audio import mel_spectrogram
85
-
86
  to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
87
 
88
- # f0 conditioned model
89
- dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
90
- "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth",
91
- "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml")
92
-
93
- config = yaml.safe_load(open(dit_config_path, 'r'))
94
- model_params = recursive_munch(config['model_params'])
95
- model_f0 = build_model(model_params, stage='DiT')
96
- hop_length = config['preprocess_params']['spect_params']['hop_length']
97
- sr = config['preprocess_params']['sr']
98
-
99
- # Load checkpoints
100
- model_f0, _, _, _ = load_checkpoint(model_f0, None, dit_checkpoint_path,
101
- load_only_params=True, ignore_modules=[], is_distributed=False)
 
 
 
 
 
 
102
  for key in model_f0:
103
  model_f0[key].eval()
104
  model_f0[key].to(device)
 
105
  model_f0.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
106
 
107
- # f0 extractor
108
  from modules.rmvpe import RMVPE
109
 
110
  model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
111
  rmvpe = RMVPE(model_path, is_half=False, device=device)
112
 
113
  mel_fn_args_f0 = {
114
- "n_fft": config['preprocess_params']['spect_params']['n_fft'],
115
- "win_size": config['preprocess_params']['spect_params']['win_length'],
116
- "hop_size": config['preprocess_params']['spect_params']['hop_length'],
117
- "num_mels": config['preprocess_params']['spect_params']['n_mels'],
118
- "sampling_rate": sr,
119
  "fmin": 0,
120
  "fmax": None,
121
  "center": False
122
  }
123
  to_mel_f0 = lambda x: mel_spectrogram(x, **mel_fn_args_f0)
124
- bigvgan_44k_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_44khz_128band_512x', use_cuda_kernel=False)
125
 
126
- # remove weight norm in the model and set to eval mode
 
 
 
127
  bigvgan_44k_model.remove_weight_norm()
128
  bigvgan_44k_model = bigvgan_44k_model.eval().to(device)
129
 
 
 
 
130
  def adjust_f0_semitones(f0_sequence, n_semitones):
131
  factor = 2 ** (n_semitones / 12)
132
  return f0_sequence * factor
@@ -137,39 +202,86 @@ def crossfade(chunk1, chunk2, overlap):
137
  chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
138
  return chunk2
139
 
140
- # streaming and chunk processing related params
141
  bitrate = "320k"
142
  overlap_frame_len = 16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  @spaces.GPU
144
  @torch.no_grad()
145
  @torch.inference_mode()
146
- def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, f0_condition, auto_f0_adjust, pitch_shift):
 
 
147
  inference_module = model if not f0_condition else model_f0
148
  mel_fn = to_mel if not f0_condition else to_mel_f0
149
  bigvgan_fn = bigvgan_model if not f0_condition else bigvgan_44k_model
150
- sr = 22050 if not f0_condition else 44100
151
- hop_length = 256 if not f0_condition else 512
152
- max_context_window = sr // hop_length * 30
153
- overlap_wave_len = overlap_frame_len * hop_length
 
 
154
  # Load audio
155
- source_audio = librosa.load(source, sr=sr)[0]
156
- ref_audio = librosa.load(target, sr=sr)[0]
157
 
158
- # Process audio
159
  source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
160
- ref_audio = torch.tensor(ref_audio[:sr * 25]).unsqueeze(0).float().to(device)
 
 
 
 
161
 
162
- # Resample
163
- ref_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
164
- converted_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
165
- # if source audio less than 30 seconds, whisper can handle in one forward
166
  if converted_waves_16k.size(-1) <= 16000 * 30:
167
- alt_inputs = whisper_feature_extractor([converted_waves_16k.squeeze(0).cpu().numpy()],
168
- return_tensors="pt",
169
- return_attention_mask=True,
170
- sampling_rate=16000)
 
 
171
  alt_input_features = whisper_model._mask_input_features(
172
- alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
 
 
173
  alt_outputs = whisper_model.encoder(
174
  alt_input_features.to(whisper_model.encoder.dtype),
175
  head_mask=None,
@@ -180,21 +292,28 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
180
  S_alt = alt_outputs.last_hidden_state.to(torch.float32)
181
  S_alt = S_alt[:, :converted_waves_16k.size(-1) // 320 + 1]
182
  else:
183
- overlapping_time = 5 # 5 seconds
184
  S_alt_list = []
185
  buffer = None
186
  traversed_time = 0
187
  while traversed_time < converted_waves_16k.size(-1):
188
- if buffer is None: # first chunk
189
  chunk = converted_waves_16k[:, traversed_time:traversed_time + 16000 * 30]
190
  else:
191
- chunk = torch.cat([buffer, converted_waves_16k[:, traversed_time:traversed_time + 16000 * (30 - overlapping_time)]], dim=-1)
192
- alt_inputs = whisper_feature_extractor([chunk.squeeze(0).cpu().numpy()],
193
- return_tensors="pt",
194
- return_attention_mask=True,
195
- sampling_rate=16000)
 
 
 
 
 
196
  alt_input_features = whisper_model._mask_input_features(
197
- alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
 
 
198
  alt_outputs = whisper_model.encoder(
199
  alt_input_features.to(whisper_model.encoder.dtype),
200
  head_mask=None,
@@ -202,30 +321,35 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
202
  output_hidden_states=False,
203
  return_dict=True,
204
  )
205
- S_alt = alt_outputs.last_hidden_state.to(torch.float32)
206
- S_alt = S_alt[:, :chunk.size(-1) // 320 + 1]
207
  if traversed_time == 0:
208
- S_alt_list.append(S_alt)
209
  else:
210
- S_alt_list.append(S_alt[:, 50 * overlapping_time:])
211
  buffer = chunk[:, -16000 * overlapping_time:]
212
  traversed_time += 30 * 16000 if traversed_time == 0 else chunk.size(-1) - 16000 * overlapping_time
 
213
  S_alt = torch.cat(S_alt_list, dim=1)
214
 
215
- ori_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
216
- ori_inputs = whisper_feature_extractor([ori_waves_16k.squeeze(0).cpu().numpy()],
217
- return_tensors="pt",
218
- return_attention_mask=True)
 
 
 
219
  ori_input_features = whisper_model._mask_input_features(
220
- ori_inputs.input_features, attention_mask=ori_inputs.attention_mask).to(device)
221
- with torch.no_grad():
222
- ori_outputs = whisper_model.encoder(
223
- ori_input_features.to(whisper_model.encoder.dtype),
224
- head_mask=None,
225
- output_attentions=False,
226
- output_hidden_states=False,
227
- return_dict=True,
228
- )
 
229
  S_ori = ori_outputs.last_hidden_state.to(torch.float32)
230
  S_ori = S_ori[:, :ori_waves_16k.size(-1) // 320 + 1]
231
 
@@ -235,15 +359,12 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
235
  target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
236
  target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
237
 
238
- feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k,
239
- num_mel_bins=80,
240
- dither=0,
241
- sample_frequency=16000)
242
- feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
243
- style2 = campplus_model(feat2.unsqueeze(0))
244
 
 
245
  if f0_condition:
246
- F0_ori = rmvpe.infer_from_audio(ref_waves_16k[0], thred=0.5)
247
  F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.5)
248
 
249
  F0_ori = torch.from_numpy(F0_ori).to(device)[None]
@@ -258,118 +379,160 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
258
  median_log_f0_ori = torch.median(voiced_log_f0_ori)
259
  median_log_f0_alt = torch.median(voiced_log_f0_alt)
260
 
261
- # shift alt log f0 level to ori log f0 level
262
  shifted_log_f0_alt = log_f0_alt.clone()
263
- if auto_f0_adjust:
264
  shifted_log_f0_alt[F0_alt > 1] = log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
 
265
  shifted_f0_alt = torch.exp(shifted_log_f0_alt)
266
  if pitch_shift != 0:
267
  shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1], pitch_shift)
268
  else:
269
  F0_ori = None
270
- F0_alt = None
271
  shifted_f0_alt = None
272
 
273
  # Length regulation
274
- cond, _, codes, commitment_loss, codebook_loss = inference_module.length_regulator(S_alt, ylens=target_lengths, n_quantizers=3, f0=shifted_f0_alt)
275
- prompt_condition, _, codes, commitment_loss, codebook_loss = inference_module.length_regulator(S_ori, ylens=target2_lengths, n_quantizers=3, f0=F0_ori)
 
 
 
 
276
 
277
  max_source_window = max_context_window - mel2.size(2)
278
- # split source condition (cond) into chunks
279
  processed_frames = 0
280
  generated_wave_chunks = []
281
- # generate chunk by chunk and stream the output
 
282
  while processed_frames < cond.size(1):
283
  chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
284
  is_last_chunk = processed_frames + max_source_window >= cond.size(1)
 
285
  cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
286
- with torch.autocast(device_type='cuda', dtype=torch.float16):
287
- # Voice Conversion
288
- vc_target = inference_module.cfm.inference(cat_condition,
289
- torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
290
- mel2, style2, None, diffusion_steps,
291
- inference_cfg_rate=inference_cfg_rate)
 
 
292
  vc_target = vc_target[:, :, mel2.size(-1):]
 
293
  vc_wave = bigvgan_fn(vc_target.float())[0]
 
294
  if processed_frames == 0:
295
  if is_last_chunk:
296
  output_wave = vc_wave[0].cpu().numpy()
297
  generated_wave_chunks.append(output_wave)
298
- output_wave = (output_wave * 32768.0).astype(np.int16)
 
299
  mp3_bytes = AudioSegment(
300
- output_wave.tobytes(), frame_rate=sr,
301
- sample_width=output_wave.dtype.itemsize, channels=1
 
 
302
  ).export(format="mp3", bitrate=bitrate).read()
303
- yield mp3_bytes, (sr, np.concatenate(generated_wave_chunks))
304
  break
 
305
  output_wave = vc_wave[0, :-overlap_wave_len].cpu().numpy()
306
  generated_wave_chunks.append(output_wave)
307
  previous_chunk = vc_wave[0, -overlap_wave_len:]
308
  processed_frames += vc_target.size(2) - overlap_frame_len
309
- output_wave = (output_wave * 32768.0).astype(np.int16)
 
310
  mp3_bytes = AudioSegment(
311
- output_wave.tobytes(), frame_rate=sr,
312
- sample_width=output_wave.dtype.itemsize, channels=1
 
 
313
  ).export(format="mp3", bitrate=bitrate).read()
314
  yield mp3_bytes, None
 
315
  elif is_last_chunk:
316
  output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0].cpu().numpy(), overlap_wave_len)
317
  generated_wave_chunks.append(output_wave)
318
  processed_frames += vc_target.size(2) - overlap_frame_len
319
- output_wave = (output_wave * 32768.0).astype(np.int16)
 
320
  mp3_bytes = AudioSegment(
321
- output_wave.tobytes(), frame_rate=sr,
322
- sample_width=output_wave.dtype.itemsize, channels=1
 
 
323
  ).export(format="mp3", bitrate=bitrate).read()
324
- yield mp3_bytes, (sr, np.concatenate(generated_wave_chunks))
325
  break
 
326
  else:
327
  output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0, :-overlap_wave_len].cpu().numpy(), overlap_wave_len)
328
  generated_wave_chunks.append(output_wave)
329
  previous_chunk = vc_wave[0, -overlap_wave_len:]
330
  processed_frames += vc_target.size(2) - overlap_frame_len
331
- output_wave = (output_wave * 32768.0).astype(np.int16)
 
332
  mp3_bytes = AudioSegment(
333
- output_wave.tobytes(), frame_rate=sr,
334
- sample_width=output_wave.dtype.itemsize, channels=1
 
 
335
  ).export(format="mp3", bitrate=bitrate).read()
336
  yield mp3_bytes, None
337
 
338
-
 
 
339
  if __name__ == "__main__":
340
- description = ("State-of-the-Art zero-shot voice conversion/singing voice conversion. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
341
- "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
342
- "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
343
- "无需训练的 zero-shot 语音/歌声转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
344
- "请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。")
 
 
 
 
 
345
  inputs = [
346
  gr.Audio(type="filepath", label="Source Audio / 源音频"),
347
  gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
348
- gr.Slider(minimum=1, maximum=200, value=25, step=1, label="Diffusion Steps / 扩散步数", info="25 by default, 50~100 for best quality / 默认为 25,50~100 为最佳质量"),
349
- gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整", info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
350
- gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="has subtle influence / 有微小影响"),
351
- gr.Checkbox(label="Use F0 conditioned model / 启用F0输入", value=False, info="Must set to true for singing voice conversion / 歌声转换时必须勾选"),
 
 
 
 
 
 
352
  gr.Checkbox(label="Auto F0 adjust / 自动F0调整", value=True,
353
- info="Roughly adjust F0 to match target voice. Only works when F0 conditioned model is used. / 粗略调整 F0 以匹配目标音色,仅在勾选 '启用F0输入' 时生效"),
354
- gr.Slider(label='Pitch shift / 音调变换', minimum=-24, maximum=24, step=1, value=0, info="Pitch shift in semitones, only works when F0 conditioned model is used / 半音数的音高变换,仅在勾选 '启用F0输入' 时生效"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  ]
356
 
357
- examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
358
- ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, False, True, 0],
359
- ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
360
- "examples/reference/kobe_0.wav", 50, 1.0, 0.7, True, False, -6],
361
- ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
362
- "examples/reference/trump_0.wav", 50, 1.0, 0.7, True, False, -12],
363
- ]
364
-
365
- outputs = [gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
366
- gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')]
367
-
368
- gr.Interface(fn=voice_conversion,
369
- description=description,
370
- inputs=inputs,
371
- outputs=outputs,
372
- title="Seed Voice Conversion",
373
- examples=examples,
374
- cache_examples=False,
375
- ).launch()
 
3
  import torch
4
  import torchaudio
5
  import librosa
6
+ import torch.nn as nn
7
  from modules.commons import build_model, load_checkpoint, recursive_munch
8
  import yaml
9
  from hf_utils import load_custom_model_from_hf
10
  import numpy as np
11
  from pydub import AudioSegment
12
 
13
+ # =========================================================
14
+ # Device
15
+ # =========================================================
16
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
 
18
+ # =========================================================
19
+ # Load Seed-VC DiT model (non-f0)
20
+ # =========================================================
21
+ dit_checkpoint_path, dit_config_path = load_custom_model_from_hf(
22
+ "Plachta/Seed-VC",
23
+ "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
24
+ "config_dit_mel_seed_uvit_whisper_small_wavenet.yml"
25
+ )
26
+
27
  config = yaml.safe_load(open(dit_config_path, 'r'))
28
  model_params = recursive_munch(config['model_params'])
29
  model = build_model(model_params, stage='DiT')
30
  hop_length = config['preprocess_params']['spect_params']['hop_length']
31
  sr = config['preprocess_params']['sr']
32
 
33
+ model, _, _, _ = load_checkpoint(
34
+ model, None, dit_checkpoint_path,
35
+ load_only_params=True, ignore_modules=[],
36
+ is_distributed=False
37
+ )
38
  for key in model:
39
  model[key].eval()
40
  model[key].to(device)
 
41
 
42
+ # Cache setup
43
+ model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
 
 
 
 
 
 
44
 
45
+ # =========================================================
46
+ # Speaker embedding: ECAPA (SpeechBrain) replacement
47
+ # - This reduces CN accent bias vs campplus_cn_common
48
+ # - Fallback to original CAMPPlus if SpeechBrain not available
49
+ # =========================================================
50
+ USE_ECAPA = True
51
+ spk_encoder = None
52
+
53
+ try:
54
+ from speechbrain.pretrained import EncoderClassifier
55
+ spk_encoder = EncoderClassifier.from_hparams(
56
+ source="speechbrain/spkrec-ecapa-voxceleb",
57
+ run_opts={"device": str(device)}
58
+ )
59
+ spk_encoder.eval()
60
+ except Exception as e:
61
+ # If SpeechBrain isn't installed/available, fallback to CAMPPlus
62
+ USE_ECAPA = False
63
+ spk_encoder = None
64
+ print("[WARN] SpeechBrain ECAPA not available. Falling back to CAMPPlus. Error:", str(e))
65
+
66
+ # CAMPPlus fallback (original)
67
+ campplus_model = None
68
+ if not USE_ECAPA:
69
+ from modules.campplus.DTDNN import CAMPPlus
70
+ campplus_ckpt_path = load_custom_model_from_hf(
71
+ "funasr/campplus",
72
+ "campplus_cn_common.bin",
73
+ config_filename=None
74
+ )
75
+ campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
76
+ campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
77
+ campplus_model.eval()
78
+ campplus_model.to(device)
79
+
80
+ # A small projection to map ECAPA embedding dim -> expected style dim
81
+ # We build it lazily at first inference once we know ECAPA dim.
82
+ style_proj = None
83
+ STYLE_DIM_EXPECTED = 192 # CAMPPlus embedding_size used originally in this app
84
+
85
+ # =========================================================
86
+ # Vocoder (BigVGAN)
87
+ # =========================================================
88
  from modules.bigvgan import bigvgan
89
 
90
+ bigvgan_model = bigvgan.BigVGAN.from_pretrained(
91
+ 'nvidia/bigvgan_v2_22khz_80band_256x',
92
+ use_cuda_kernel=False
93
+ )
94
  bigvgan_model.remove_weight_norm()
95
  bigvgan_model = bigvgan_model.eval().to(device)
96
 
97
+ # =========================================================
98
+ # Codec (FAcodec)
99
+ # =========================================================
100
  ckpt_path, config_path = load_custom_model_from_hf("Plachta/FAcodec", 'pytorch_model.bin', 'config.yml')
 
101
  codec_config = yaml.safe_load(open(config_path))
102
  codec_model_params = recursive_munch(codec_config['model_params'])
103
  codec_encoder = build_model(codec_model_params, stage="codec")
104
 
105
  ckpt_params = torch.load(ckpt_path, map_location="cpu")
 
106
  for key in codec_encoder:
107
  codec_encoder[key].load_state_dict(ckpt_params[key], strict=False)
108
  _ = [codec_encoder[key].eval() for key in codec_encoder]
109
  _ = [codec_encoder[key].to(device) for key in codec_encoder]
110
 
111
+ # =========================================================
112
+ # Whisper encoder (content)
113
+ # =========================================================
114
  from transformers import AutoFeatureExtractor, WhisperModel
115
 
116
+ whisper_name = (
117
+ model_params.speech_tokenizer.whisper_name
118
+ if hasattr(model_params.speech_tokenizer, 'whisper_name')
119
+ else "openai/whisper-small"
120
+ )
121
  whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float16).to(device)
122
  del whisper_model.decoder
123
  whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
124
 
125
+ # =========================================================
126
+ # Mel Spectrogram
127
+ # =========================================================
128
  mel_fn_args = {
129
  "n_fft": config['preprocess_params']['spect_params']['n_fft'],
130
  "win_size": config['preprocess_params']['spect_params']['win_length'],
 
136
  "center": False
137
  }
138
  from modules.audio import mel_spectrogram
 
139
  to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
140
 
141
+ # =========================================================
142
+ # Load Seed-VC DiT model (f0 conditioned)
143
+ # =========================================================
144
+ dit_checkpoint_path, dit_config_path = load_custom_model_from_hf(
145
+ "Plachta/Seed-VC",
146
+ "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth",
147
+ "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml"
148
+ )
149
+
150
+ config_f0 = yaml.safe_load(open(dit_config_path, 'r'))
151
+ model_params_f0 = recursive_munch(config_f0['model_params'])
152
+ model_f0 = build_model(model_params_f0, stage='DiT')
153
+ hop_length_f0 = config_f0['preprocess_params']['spect_params']['hop_length']
154
+ sr_f0 = config_f0['preprocess_params']['sr']
155
+
156
+ model_f0, _, _, _ = load_checkpoint(
157
+ model_f0, None, dit_checkpoint_path,
158
+ load_only_params=True, ignore_modules=[],
159
+ is_distributed=False
160
+ )
161
  for key in model_f0:
162
  model_f0[key].eval()
163
  model_f0[key].to(device)
164
+
165
  model_f0.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
166
 
167
+ # F0 extractor
168
  from modules.rmvpe import RMVPE
169
 
170
  model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
171
  rmvpe = RMVPE(model_path, is_half=False, device=device)
172
 
173
  mel_fn_args_f0 = {
174
+ "n_fft": config_f0['preprocess_params']['spect_params']['n_fft'],
175
+ "win_size": config_f0['preprocess_params']['spect_params']['win_length'],
176
+ "hop_size": config_f0['preprocess_params']['spect_params']['hop_length'],
177
+ "num_mels": config_f0['preprocess_params']['spect_params']['n_mels'],
178
+ "sampling_rate": sr_f0,
179
  "fmin": 0,
180
  "fmax": None,
181
  "center": False
182
  }
183
  to_mel_f0 = lambda x: mel_spectrogram(x, **mel_fn_args_f0)
 
184
 
185
+ bigvgan_44k_model = bigvgan.BigVGAN.from_pretrained(
186
+ 'nvidia/bigvgan_v2_44khz_128band_512x',
187
+ use_cuda_kernel=False
188
+ )
189
  bigvgan_44k_model.remove_weight_norm()
190
  bigvgan_44k_model = bigvgan_44k_model.eval().to(device)
191
 
192
+ # =========================================================
193
+ # Helpers
194
+ # =========================================================
195
  def adjust_f0_semitones(f0_sequence, n_semitones):
196
  factor = 2 ** (n_semitones / 12)
197
  return f0_sequence * factor
 
202
  chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
203
  return chunk2
204
 
205
+ # Streaming and chunk params
206
  bitrate = "320k"
207
  overlap_frame_len = 16
208
+
209
def get_style_embedding(ref_waves_16k: torch.Tensor) -> torch.Tensor:
    """Compute a speaker-style embedding for a reference waveform.

    Args:
        ref_waves_16k: (B, T) float waveform tensor sampled at 16 kHz.
            (assumed from the callers' resample-to-16k — TODO confirm)

    Returns:
        A (B, STYLE_DIM_EXPECTED) speaker embedding tensor on `device`.

    Uses the SpeechBrain ECAPA encoder when available; otherwise falls
    back to the original CAMPPlus model.
    """
    global style_proj

    if USE_ECAPA and spk_encoder is not None:
        with torch.no_grad():
            # SpeechBrain ECAPA returns (B, 1, D) or (B, D) depending on
            # the library version; normalise to (B, D).
            emb = spk_encoder.encode_batch(ref_waves_16k)
            if emb.dim() == 3:
                emb = emb.squeeze(1)
            style2 = emb.to(device)

            # Project to the style dim the DiT was trained with, if needed.
            if style2.size(-1) != STYLE_DIM_EXPECTED:
                if style_proj is None:
                    # NOTE(review): this projection is *untrained* — a random
                    # linear map scrambles speaker information and should be
                    # trained or replaced. At minimum, seed its initialisation
                    # so results are reproducible across runs. fork_rng()
                    # restores the global CPU/CUDA RNG state afterwards so
                    # unrelated sampling (e.g. diffusion noise) is unaffected.
                    with torch.random.fork_rng():
                        torch.manual_seed(0)
                        style_proj = nn.Linear(
                            style2.size(-1), STYLE_DIM_EXPECTED
                        ).to(device)
                    style_proj.eval()
                style2 = style_proj(style2)
        return style2

    # Fallback: CAMPPlus on 80-dim kaldi fbank features (original behaviour).
    feat2 = torchaudio.compliance.kaldi.fbank(
        ref_waves_16k,
        num_mel_bins=80,
        dither=0,
        sample_frequency=16000,
    )
    # Per-utterance cepstral mean normalisation over the time axis.
    feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
    return campplus_model(feat2.unsqueeze(0))
243
+
244
+ # =========================================================
245
+ # Voice Conversion
246
+ # =========================================================
247
  @spaces.GPU
248
  @torch.no_grad()
249
  @torch.inference_mode()
250
+ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate,
251
+ f0_condition, auto_f0_adjust, pitch_shift):
252
+
253
  inference_module = model if not f0_condition else model_f0
254
  mel_fn = to_mel if not f0_condition else to_mel_f0
255
  bigvgan_fn = bigvgan_model if not f0_condition else bigvgan_44k_model
256
+ sr_local = 22050 if not f0_condition else 44100
257
+ hop_local = 256 if not f0_condition else 512
258
+
259
+ max_context_window = sr_local // hop_local * 30
260
+ overlap_wave_len = overlap_frame_len * hop_local
261
+
262
  # Load audio
263
+ source_audio = librosa.load(source, sr=sr_local)[0]
264
+ ref_audio = librosa.load(target, sr=sr_local)[0]
265
 
 
266
  source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
267
+ ref_audio = torch.tensor(ref_audio[:sr_local * 25]).unsqueeze(0).float().to(device)
268
+
269
+ # Resample for whisper and speaker embedding
270
+ ref_waves_16k = torchaudio.functional.resample(ref_audio, sr_local, 16000)
271
+ converted_waves_16k = torchaudio.functional.resample(source_audio, sr_local, 16000)
272
 
273
+ # Whisper content encoding (S_alt)
 
 
 
274
  if converted_waves_16k.size(-1) <= 16000 * 30:
275
+ alt_inputs = whisper_feature_extractor(
276
+ [converted_waves_16k.squeeze(0).cpu().numpy()],
277
+ return_tensors="pt",
278
+ return_attention_mask=True,
279
+ sampling_rate=16000
280
+ )
281
  alt_input_features = whisper_model._mask_input_features(
282
+ alt_inputs.input_features, attention_mask=alt_inputs.attention_mask
283
+ ).to(device)
284
+
285
  alt_outputs = whisper_model.encoder(
286
  alt_input_features.to(whisper_model.encoder.dtype),
287
  head_mask=None,
 
292
  S_alt = alt_outputs.last_hidden_state.to(torch.float32)
293
  S_alt = S_alt[:, :converted_waves_16k.size(-1) // 320 + 1]
294
  else:
295
+ overlapping_time = 5
296
  S_alt_list = []
297
  buffer = None
298
  traversed_time = 0
299
  while traversed_time < converted_waves_16k.size(-1):
300
+ if buffer is None:
301
  chunk = converted_waves_16k[:, traversed_time:traversed_time + 16000 * 30]
302
  else:
303
+ chunk = torch.cat(
304
+ [buffer, converted_waves_16k[:, traversed_time:traversed_time + 16000 * (30 - overlapping_time)]],
305
+ dim=-1
306
+ )
307
+ alt_inputs = whisper_feature_extractor(
308
+ [chunk.squeeze(0).cpu().numpy()],
309
+ return_tensors="pt",
310
+ return_attention_mask=True,
311
+ sampling_rate=16000
312
+ )
313
  alt_input_features = whisper_model._mask_input_features(
314
+ alt_inputs.input_features, attention_mask=alt_inputs.attention_mask
315
+ ).to(device)
316
+
317
  alt_outputs = whisper_model.encoder(
318
  alt_input_features.to(whisper_model.encoder.dtype),
319
  head_mask=None,
 
321
  output_hidden_states=False,
322
  return_dict=True,
323
  )
324
+ S_alt_chunk = alt_outputs.last_hidden_state.to(torch.float32)
325
+ S_alt_chunk = S_alt_chunk[:, :chunk.size(-1) // 320 + 1]
326
  if traversed_time == 0:
327
+ S_alt_list.append(S_alt_chunk)
328
  else:
329
+ S_alt_list.append(S_alt_chunk[:, 50 * overlapping_time:])
330
  buffer = chunk[:, -16000 * overlapping_time:]
331
  traversed_time += 30 * 16000 if traversed_time == 0 else chunk.size(-1) - 16000 * overlapping_time
332
+
333
  S_alt = torch.cat(S_alt_list, dim=1)
334
 
335
+ # Whisper prompt (S_ori)
336
+ ori_waves_16k = torchaudio.functional.resample(ref_audio, sr_local, 16000)
337
+ ori_inputs = whisper_feature_extractor(
338
+ [ori_waves_16k.squeeze(0).cpu().numpy()],
339
+ return_tensors="pt",
340
+ return_attention_mask=True
341
+ )
342
  ori_input_features = whisper_model._mask_input_features(
343
+ ori_inputs.input_features, attention_mask=ori_inputs.attention_mask
344
+ ).to(device)
345
+
346
+ ori_outputs = whisper_model.encoder(
347
+ ori_input_features.to(whisper_model.encoder.dtype),
348
+ head_mask=None,
349
+ output_attentions=False,
350
+ output_hidden_states=False,
351
+ return_dict=True,
352
+ )
353
  S_ori = ori_outputs.last_hidden_state.to(torch.float32)
354
  S_ori = S_ori[:, :ori_waves_16k.size(-1) // 320 + 1]
355
 
 
359
  target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
360
  target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
361
 
362
+ # Speaker embedding (ECAPA or fallback)
363
+ style2 = get_style_embedding(ref_waves_16k)
 
 
 
 
364
 
365
+ # f0 handling
366
  if f0_condition:
367
+ F0_ori = rmvpe.infer_from_audio(ori_waves_16k[0], thred=0.5)
368
  F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.5)
369
 
370
  F0_ori = torch.from_numpy(F0_ori).to(device)[None]
 
379
  median_log_f0_ori = torch.median(voiced_log_f0_ori)
380
  median_log_f0_alt = torch.median(voiced_log_f0_alt)
381
 
 
382
  shifted_log_f0_alt = log_f0_alt.clone()
383
+ if auto_f0_adjust and voiced_F0_alt.numel() > 0 and voiced_F0_ori.numel() > 0:
384
  shifted_log_f0_alt[F0_alt > 1] = log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
385
+
386
  shifted_f0_alt = torch.exp(shifted_log_f0_alt)
387
  if pitch_shift != 0:
388
  shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1], pitch_shift)
389
  else:
390
  F0_ori = None
 
391
  shifted_f0_alt = None
392
 
393
  # Length regulation
394
+ cond, _, _, _, _ = inference_module.length_regulator(
395
+ S_alt, ylens=target_lengths, n_quantizers=3, f0=shifted_f0_alt
396
+ )
397
+ prompt_condition, _, _, _, _ = inference_module.length_regulator(
398
+ S_ori, ylens=target2_lengths, n_quantizers=3, f0=F0_ori
399
+ )
400
 
401
  max_source_window = max_context_window - mel2.size(2)
402
+
403
  processed_frames = 0
404
  generated_wave_chunks = []
405
+ previous_chunk = None
406
+
407
  while processed_frames < cond.size(1):
408
  chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
409
  is_last_chunk = processed_frames + max_source_window >= cond.size(1)
410
+
411
  cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
412
+
413
+ with torch.autocast(device_type='cuda', dtype=torch.float16) if device.type == "cuda" else torch.no_grad():
414
+ vc_target = inference_module.cfm.inference(
415
+ cat_condition,
416
+ torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
417
+ mel2, style2, None, diffusion_steps,
418
+ inference_cfg_rate=inference_cfg_rate
419
+ )
420
  vc_target = vc_target[:, :, mel2.size(-1):]
421
+
422
  vc_wave = bigvgan_fn(vc_target.float())[0]
423
+
424
  if processed_frames == 0:
425
  if is_last_chunk:
426
  output_wave = vc_wave[0].cpu().numpy()
427
  generated_wave_chunks.append(output_wave)
428
+ output_i16 = (output_wave * 32768.0).astype(np.int16)
429
+
430
  mp3_bytes = AudioSegment(
431
+ output_i16.tobytes(),
432
+ frame_rate=sr_local,
433
+ sample_width=output_i16.dtype.itemsize,
434
+ channels=1
435
  ).export(format="mp3", bitrate=bitrate).read()
436
+ yield mp3_bytes, (sr_local, np.concatenate(generated_wave_chunks))
437
  break
438
+
439
  output_wave = vc_wave[0, :-overlap_wave_len].cpu().numpy()
440
  generated_wave_chunks.append(output_wave)
441
  previous_chunk = vc_wave[0, -overlap_wave_len:]
442
  processed_frames += vc_target.size(2) - overlap_frame_len
443
+
444
+ output_i16 = (output_wave * 32768.0).astype(np.int16)
445
  mp3_bytes = AudioSegment(
446
+ output_i16.tobytes(),
447
+ frame_rate=sr_local,
448
+ sample_width=output_i16.dtype.itemsize,
449
+ channels=1
450
  ).export(format="mp3", bitrate=bitrate).read()
451
  yield mp3_bytes, None
452
+
453
  elif is_last_chunk:
454
  output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0].cpu().numpy(), overlap_wave_len)
455
  generated_wave_chunks.append(output_wave)
456
  processed_frames += vc_target.size(2) - overlap_frame_len
457
+
458
+ output_i16 = (output_wave * 32768.0).astype(np.int16)
459
  mp3_bytes = AudioSegment(
460
+ output_i16.tobytes(),
461
+ frame_rate=sr_local,
462
+ sample_width=output_i16.dtype.itemsize,
463
+ channels=1
464
  ).export(format="mp3", bitrate=bitrate).read()
465
+ yield mp3_bytes, (sr_local, np.concatenate(generated_wave_chunks))
466
  break
467
+
468
  else:
469
  output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0, :-overlap_wave_len].cpu().numpy(), overlap_wave_len)
470
  generated_wave_chunks.append(output_wave)
471
  previous_chunk = vc_wave[0, -overlap_wave_len:]
472
  processed_frames += vc_target.size(2) - overlap_frame_len
473
+
474
+ output_i16 = (output_wave * 32768.0).astype(np.int16)
475
  mp3_bytes = AudioSegment(
476
+ output_i16.tobytes(),
477
+ frame_rate=sr_local,
478
+ sample_width=output_i16.dtype.itemsize,
479
+ channels=1
480
  ).export(format="mp3", bitrate=bitrate).read()
481
  yield mp3_bytes, None
482
 
483
+ # =========================================================
484
+ # Gradio UI
485
+ # =========================================================
486
if __name__ == "__main__":
    # HTML/Markdown blurb rendered above the interface.
    app_description = (
        "State-of-the-Art zero-shot voice conversion/singing voice conversion. "
        "For local deployment please check GitHub repository for details and updates.<br>"
        "Note: reference audio will be clipped to 25s if longer.<br>"
        "If total duration exceeds 30s, source audio will be processed in chunks.<br>"
        "<br>"
        "Hindi tip: Use Hindi SOURCE + Hindi REFERENCE for best Hindi output. "
        "This app converts voice (audio→audio), it does not do text-to-speech."
    )

    # Widgets, in the positional order that voice_conversion expects its arguments.
    ui_inputs = [
        gr.Audio(type="filepath", label="Source Audio / 源音频"),
        gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
        gr.Slider(
            label="Diffusion Steps / 扩散步数",
            minimum=1,
            maximum=200,
            step=1,
            value=25,
            info="25 by default, 50~100 for best quality / 默认为 25,50~100 为最佳质量",
        ),
        gr.Slider(
            label="Length Adjust / 长度调整",
            minimum=0.5,
            maximum=2.0,
            step=0.1,
            value=1.0,
            info="<1.0 speed-up, >1.0 slow-down / <1.0 加速,>1.0 减速",
        ),
        gr.Slider(
            label="Inference CFG Rate",
            minimum=0.0,
            maximum=1.0,
            step=0.1,
            value=0.7,
            info="subtle influence / 有微小影响",
        ),
        gr.Checkbox(
            label="Use F0 conditioned model / 启用F0输入",
            value=False,
            info="Must set to true for singing voice conversion / 歌声转换时必须勾选",
        ),
        gr.Checkbox(
            label="Auto F0 adjust / 自动F0调整",
            value=True,
            info="Roughly adjust F0 to match target voice. Only when F0 model is used.",
        ),
        gr.Slider(
            label='Pitch shift / 音调变换',
            minimum=-24,
            maximum=24,
            step=1,
            value=0,
            info="Semitones. Only when F0 model is used / 半音,仅F0模型生效",
        ),
    ]

    # Pre-filled example rows: (source, reference, steps, length, cfg, f0, auto-f0, shift).
    ui_examples = [
        ["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
        ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, False, True, 0],
        ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
         "examples/reference/kobe_0.wav", 50, 1.0, 0.7, True, False, -6],
        ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
         "examples/reference/trump_0.wav", 50, 1.0, 0.7, True, False, -12],
    ]

    # Two audio outputs: a streaming mp3 preview plus the final full wav.
    ui_outputs = [
        gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
        gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav'),
    ]

    demo = gr.Interface(
        fn=voice_conversion,
        title="Seed Voice Conversion (ECAPA speaker embedding)",
        description=app_description,
        inputs=ui_inputs,
        outputs=ui_outputs,
        examples=ui_examples,
        cache_examples=False,
    )
    demo.launch()