import spaces
import gradio as gr
import torch
import torchaudio
import librosa
import torch.nn as nn
from modules.commons import build_model, load_checkpoint, recursive_munch
import yaml
from hf_utils import load_custom_model_from_hf
import numpy as np
from pydub import AudioSegment
import contextlib

# =========================================================
# Device
# =========================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# =========================================================
# Load Seed-VC DiT model (non-f0)
# =========================================================
dit_checkpoint_path, dit_config_path = load_custom_model_from_hf(
    "Plachta/Seed-VC",
    "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
    "config_dit_mel_seed_uvit_whisper_small_wavenet.yml"
)

with open(dit_config_path, 'r') as f:
    config = yaml.safe_load(f)
model_params = recursive_munch(config['model_params'])
model = build_model(model_params, stage='DiT')
hop_length = config['preprocess_params']['spect_params']['hop_length']
sr = config['preprocess_params']['sr']

model, _, _, _ = load_checkpoint(
    model, None, dit_checkpoint_path,
    load_only_params=True, ignore_modules=[],
    is_distributed=False
)
for key in model:
    model[key].eval()
    model[key].to(device)

# Cache setup
model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
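# max_seq_length=8192 mel frames is roughly 95 s for either configuration used here
# (22.05 kHz / hop 256 and 44.1 kHz / hop 512), comfortably covering the 30 s
# processing window plus the 25 s reference prompt used below.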

# =========================================================
# Speaker embedding: ECAPA (SpeechBrain) replacement
# - This reduces CN accent bias vs campplus_cn_common
# - Fallback to original CAMPPlus if SpeechBrain not available
# =========================================================
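# SpeechBrain is an optional dependency here; assumption: install it with
# `pip install speechbrain` if it is not already in this Space's requirements.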
USE_ECAPA = True
spk_encoder = None

try:
    # SpeechBrain >= 1.0 moved the pretrained interfaces to speechbrain.inference;
    # try the new path first and fall back to the legacy one.
    try:
        from speechbrain.inference.speaker import EncoderClassifier
    except ImportError:
        from speechbrain.pretrained import EncoderClassifier
    spk_encoder = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb",
        run_opts={"device": str(device)}
    )
    spk_encoder.eval()
except Exception as e:
    # If SpeechBrain isn't installed or the model can't be fetched, fall back to CAMPPlus
    USE_ECAPA = False
    spk_encoder = None
    print("[WARN] SpeechBrain ECAPA not available. Falling back to CAMPPlus. Error:", str(e))

# CAMPPlus fallback (original)
campplus_model = None
if not USE_ECAPA:
    from modules.campplus.DTDNN import CAMPPlus
    campplus_ckpt_path = load_custom_model_from_hf(
        "funasr/campplus",
        "campplus_cn_common.bin",
        config_filename=None
    )
    campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
    campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
    campplus_model.eval()
    campplus_model.to(device)

# A small projection to map ECAPA embedding dim -> expected style dim
# We build it lazily at first inference once we know ECAPA dim.
style_proj = None
STYLE_DIM_EXPECTED = 192  # CAMPPlus embedding_size used originally in this app
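# Note: speechbrain/spkrec-ecapa-voxceleb should itself emit 192-dim embeddings,
# so this projection is normally a no-op. If a different encoder dim ever triggers
# it, be aware the lazily built nn.Linear below is randomly initialized (untrained)
# and will distort speaker identity; train or replace it before relying on that path.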

# =========================================================
# Vocoder (BigVGAN)
# =========================================================
from modules.bigvgan import bigvgan

bigvgan_model = bigvgan.BigVGAN.from_pretrained(
    'nvidia/bigvgan_v2_22khz_80band_256x',
    use_cuda_kernel=False
)
bigvgan_model.remove_weight_norm()
bigvgan_model = bigvgan_model.eval().to(device)

# =========================================================
# Codec (FAcodec)
# =========================================================
ckpt_path, config_path = load_custom_model_from_hf("Plachta/FAcodec", 'pytorch_model.bin', 'config.yml')
with open(config_path, 'r') as f:
    codec_config = yaml.safe_load(f)
codec_model_params = recursive_munch(codec_config['model_params'])
codec_encoder = build_model(codec_model_params, stage="codec")

ckpt_params = torch.load(ckpt_path, map_location="cpu")
for key in codec_encoder:
    codec_encoder[key].load_state_dict(ckpt_params[key], strict=False)
_ = [codec_encoder[key].eval() for key in codec_encoder]
_ = [codec_encoder[key].to(device) for key in codec_encoder]

# =========================================================
# Whisper encoder (content)
# =========================================================
from transformers import AutoFeatureExtractor, WhisperModel

whisper_name = (
    model_params.speech_tokenizer.whisper_name
    if hasattr(model_params.speech_tokenizer, 'whisper_name')
    else "openai/whisper-small"
)
whisper_dtype = torch.float16 if device.type == "cuda" else torch.float32
whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=whisper_dtype).to(device)
del whisper_model.decoder
whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)

# =========================================================
# Mel Spectrogram
# =========================================================
mel_fn_args = {
    "n_fft": config['preprocess_params']['spect_params']['n_fft'],
    "win_size": config['preprocess_params']['spect_params']['win_length'],
    "hop_size": config['preprocess_params']['spect_params']['hop_length'],
    "num_mels": config['preprocess_params']['spect_params']['n_mels'],
    "sampling_rate": sr,
    "fmin": 0,
    "fmax": None,
    "center": False
}
from modules.audio import mel_spectrogram
to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)

# =========================================================
# Load Seed-VC DiT model (f0 conditioned)
# =========================================================
dit_checkpoint_path, dit_config_path = load_custom_model_from_hf(
    "Plachta/Seed-VC",
    "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth",
    "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml"
)

with open(dit_config_path, 'r') as f:
    config_f0 = yaml.safe_load(f)
model_params_f0 = recursive_munch(config_f0['model_params'])
model_f0 = build_model(model_params_f0, stage='DiT')
hop_length_f0 = config_f0['preprocess_params']['spect_params']['hop_length']
sr_f0 = config_f0['preprocess_params']['sr']

model_f0, _, _, _ = load_checkpoint(
    model_f0, None, dit_checkpoint_path,
    load_only_params=True, ignore_modules=[],
    is_distributed=False
)
for key in model_f0:
    model_f0[key].eval()
    model_f0[key].to(device)

model_f0.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)

# F0 extractor
from modules.rmvpe import RMVPE

model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
rmvpe = RMVPE(model_path, is_half=False, device=device)

mel_fn_args_f0 = {
    "n_fft": config_f0['preprocess_params']['spect_params']['n_fft'],
    "win_size": config_f0['preprocess_params']['spect_params']['win_length'],
    "hop_size": config_f0['preprocess_params']['spect_params']['hop_length'],
    "num_mels": config_f0['preprocess_params']['spect_params']['n_mels'],
    "sampling_rate": sr_f0,
    "fmin": 0,
    "fmax": None,
    "center": False
}
to_mel_f0 = lambda x: mel_spectrogram(x, **mel_fn_args_f0)

bigvgan_44k_model = bigvgan.BigVGAN.from_pretrained(
    'nvidia/bigvgan_v2_44khz_128band_512x',
    use_cuda_kernel=False
)
bigvgan_44k_model.remove_weight_norm()
bigvgan_44k_model = bigvgan_44k_model.eval().to(device)

# =========================================================
# Helpers
# =========================================================
def adjust_f0_semitones(f0_sequence, n_semitones):
    factor = 2 ** (n_semitones / 12)
    return f0_sequence * factor
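# e.g. adjust_f0_semitones(f0, 12) doubles every F0 value (one octave up);
# adjust_f0_semitones(f0, -12) halves it.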

def crossfade(chunk1, chunk2, overlap):
    fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
    fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
    chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
    return chunk2
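# The two cosine-squared windows are complementary (cos²θ + sin²θ = 1), so the
# overlapped region keeps constant total gain. Illustrative check (note that
# chunk2 is modified in place and also returned):
#   a, b = np.ones(8, np.float32), np.ones(8, np.float32)
#   crossfade(a, b, 4)  # b[:4] stays ≈ 1.0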

# Streaming and chunk params
bitrate = "320k"
overlap_frame_len = 16

def get_style_embedding(ref_waves_16k: torch.Tensor) -> torch.Tensor:
    """
    ref_waves_16k: (B, T) float tensor @ 16k
    returns: style2 (B, STYLE_DIM_EXPECTED)
    """
    global style_proj

    if USE_ECAPA and spk_encoder is not None:
        with torch.no_grad():
            # SpeechBrain ECAPA returns (B, 1, D) or (B, D) depending on version
            emb = spk_encoder.encode_batch(ref_waves_16k)
            if emb.dim() == 3:
                emb = emb.squeeze(1)  # (B, D)
            style2 = emb.to(device)

        # Project to expected style dim if needed
        if style2.size(-1) != STYLE_DIM_EXPECTED:
            if style_proj is None:
                style_proj = nn.Linear(style2.size(-1), STYLE_DIM_EXPECTED).to(device)
                style_proj.eval()
            with torch.no_grad():
                style2 = style_proj(style2)
        return style2

    # Fallback: CAMPPlus
    feat2 = torchaudio.compliance.kaldi.fbank(
        ref_waves_16k,
        num_mel_bins=80,
        dither=0,
        sample_frequency=16000
    )
    feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
    style2 = campplus_model(feat2.unsqueeze(0))
    return style2
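# Illustrative usage (shapes only, with a hypothetical random reference):
#   style2 = get_style_embedding(torch.randn(1, 16000 * 3, device=device))
#   # -> style2.shape == (1, STYLE_DIM_EXPECTED) on both the ECAPA and CAMPPlus paths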

# =========================================================
# Voice Conversion
# =========================================================
@spaces.GPU
@torch.inference_mode()  # subsumes torch.no_grad(); stacking both is redundant
def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate,
                     f0_condition, auto_f0_adjust, pitch_shift):

    inference_module = model if not f0_condition else model_f0
    mel_fn = to_mel if not f0_condition else to_mel_f0
    bigvgan_fn = bigvgan_model if not f0_condition else bigvgan_44k_model
    sr_local = 22050 if not f0_condition else 44100
    hop_local = 256 if not f0_condition else 512

    max_context_window = sr_local // hop_local * 30
    overlap_wave_len = overlap_frame_len * hop_local
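    # i.e. a 30 s context window expressed in mel frames (sr/hop ≈ frames per second),
    # and the crossfade overlap (16 mel frames) expressed in waveform samples.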

    # Load audio
    source_audio = librosa.load(source, sr=sr_local)[0]
    ref_audio = librosa.load(target, sr=sr_local)[0]

    source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
    ref_audio = torch.tensor(ref_audio[:sr_local * 25]).unsqueeze(0).float().to(device)

    # Resample for whisper and speaker embedding
    ref_waves_16k = torchaudio.functional.resample(ref_audio, sr_local, 16000)
    converted_waves_16k = torchaudio.functional.resample(source_audio, sr_local, 16000)

    # Whisper content encoding (S_alt)
    if converted_waves_16k.size(-1) <= 16000 * 30:
        alt_inputs = whisper_feature_extractor(
            [converted_waves_16k.squeeze(0).cpu().numpy()],
            return_tensors="pt",
            return_attention_mask=True,
            sampling_rate=16000
        )
        alt_input_features = whisper_model._mask_input_features(
            alt_inputs.input_features, attention_mask=alt_inputs.attention_mask
        ).to(device)

        alt_outputs = whisper_model.encoder(
            alt_input_features.to(whisper_model.encoder.dtype),
            head_mask=None,
            output_attentions=False,
            output_hidden_states=False,
            return_dict=True,
        )
        S_alt = alt_outputs.last_hidden_state.to(torch.float32)
        S_alt = S_alt[:, :converted_waves_16k.size(-1) // 320 + 1]
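        # Whisper's encoder emits one frame per 320 input samples (20 ms at 16 kHz,
        # i.e. 50 frames/s), hence the //320 crop above and the 50 * overlapping_time
        # trim in the chunked branch below.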
    else:
        overlapping_time = 5
        S_alt_list = []
        buffer = None
        traversed_time = 0
        while traversed_time < converted_waves_16k.size(-1):
            if buffer is None:
                chunk = converted_waves_16k[:, traversed_time:traversed_time + 16000 * 30]
            else:
                chunk = torch.cat(
                    [buffer, converted_waves_16k[:, traversed_time:traversed_time + 16000 * (30 - overlapping_time)]],
                    dim=-1
                )
            alt_inputs = whisper_feature_extractor(
                [chunk.squeeze(0).cpu().numpy()],
                return_tensors="pt",
                return_attention_mask=True,
                sampling_rate=16000
            )
            alt_input_features = whisper_model._mask_input_features(
                alt_inputs.input_features, attention_mask=alt_inputs.attention_mask
            ).to(device)

            alt_outputs = whisper_model.encoder(
                alt_input_features.to(whisper_model.encoder.dtype),
                head_mask=None,
                output_attentions=False,
                output_hidden_states=False,
                return_dict=True,
            )
            S_alt_chunk = alt_outputs.last_hidden_state.to(torch.float32)
            S_alt_chunk = S_alt_chunk[:, :chunk.size(-1) // 320 + 1]
            if traversed_time == 0:
                S_alt_list.append(S_alt_chunk)
            else:
                S_alt_list.append(S_alt_chunk[:, 50 * overlapping_time:])
            buffer = chunk[:, -16000 * overlapping_time:]
            traversed_time += 30 * 16000 if traversed_time == 0 else chunk.size(-1) - 16000 * overlapping_time

        S_alt = torch.cat(S_alt_list, dim=1)

    # Whisper prompt (S_ori); ref_audio was already resampled to 16 kHz above
    ori_waves_16k = ref_waves_16k
    ori_inputs = whisper_feature_extractor(
        [ori_waves_16k.squeeze(0).cpu().numpy()],
        return_tensors="pt",
        return_attention_mask=True,
        sampling_rate=16000
    )
    ori_input_features = whisper_model._mask_input_features(
        ori_inputs.input_features, attention_mask=ori_inputs.attention_mask
    ).to(device)

    ori_outputs = whisper_model.encoder(
        ori_input_features.to(whisper_model.encoder.dtype),
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    )
    S_ori = ori_outputs.last_hidden_state.to(torch.float32)
    S_ori = S_ori[:, :ori_waves_16k.size(-1) // 320 + 1]

    mel = mel_fn(source_audio.to(device).float())
    mel2 = mel_fn(ref_audio.to(device).float())

    target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
    target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)

    # Speaker embedding (ECAPA or fallback)
    style2 = get_style_embedding(ref_waves_16k)

    # f0 handling
    if f0_condition:
        F0_ori = rmvpe.infer_from_audio(ori_waves_16k[0], thred=0.5)
        F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.5)

        F0_ori = torch.from_numpy(F0_ori).to(device)[None]
        F0_alt = torch.from_numpy(F0_alt).to(device)[None]

        voiced_F0_ori = F0_ori[F0_ori > 1]
        voiced_F0_alt = F0_alt[F0_alt > 1]

        log_f0_alt = torch.log(F0_alt + 1e-5)

        # Shift the source's log-F0 so its voiced median matches the reference's;
        # working in log space preserves the relative intonation contour. Compute the
        # medians only when both signals contain voiced frames, since torch.median
        # raises on an empty tensor (e.g. fully unvoiced audio).
        shifted_log_f0_alt = log_f0_alt.clone()
        if auto_f0_adjust and voiced_F0_alt.numel() > 0 and voiced_F0_ori.numel() > 0:
            median_log_f0_ori = torch.median(torch.log(voiced_F0_ori + 1e-5))
            median_log_f0_alt = torch.median(torch.log(voiced_F0_alt + 1e-5))
            shifted_log_f0_alt[F0_alt > 1] = log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori

        shifted_f0_alt = torch.exp(shifted_log_f0_alt)
        if pitch_shift != 0:
            shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1], pitch_shift)
    else:
        F0_ori = None
        shifted_f0_alt = None

    # Length regulation
    cond, _, _, _, _ = inference_module.length_regulator(
        S_alt, ylens=target_lengths, n_quantizers=3, f0=shifted_f0_alt
    )
    prompt_condition, _, _, _, _ = inference_module.length_regulator(
        S_ori, ylens=target2_lengths, n_quantizers=3, f0=F0_ori
    )
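    # The reference's regulated features (prompt_condition) are prepended to every
    # source chunk so the diffusion model sees the target speaker as an in-context
    # prompt; the prompt region is cropped from the output after inference.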

    max_source_window = max_context_window - mel2.size(2)

    processed_frames = 0
    generated_wave_chunks = []
    previous_chunk = None
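    # Chunked streaming loop: vocode each chunk, cosine-crossfade it with the tail
    # of the previous chunk, and yield it immediately as MP3 bytes (first output).
    # The second output stays None until the final chunk, which also yields the
    # full concatenated waveform.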

    while processed_frames < cond.size(1):
        chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
        is_last_chunk = processed_frames + max_source_window >= cond.size(1)

        cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)

        # fp16 autocast on CUDA; on CPU just run as-is (inference_mode already disables autograd)
        amp_ctx = (torch.autocast(device_type="cuda", dtype=torch.float16)
                   if device.type == "cuda" else contextlib.nullcontext())
        with amp_ctx:
            vc_target = inference_module.cfm.inference(
                cat_condition,
                torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
                mel2, style2, None, diffusion_steps,
                inference_cfg_rate=inference_cfg_rate
            )
            vc_target = vc_target[:, :, mel2.size(-1):]

        vc_wave = bigvgan_fn(vc_target.float())[0]

        if processed_frames == 0:
            if is_last_chunk:
                output_wave = vc_wave[0].cpu().numpy()
                generated_wave_chunks.append(output_wave)
                output_i16 = (output_wave * 32768.0).astype(np.int16)

                mp3_bytes = AudioSegment(
                    output_i16.tobytes(),
                    frame_rate=sr_local,
                    sample_width=output_i16.dtype.itemsize,
                    channels=1
                ).export(format="mp3", bitrate=bitrate).read()
                yield mp3_bytes, (sr_local, np.concatenate(generated_wave_chunks))
                break

            output_wave = vc_wave[0, :-overlap_wave_len].cpu().numpy()
            generated_wave_chunks.append(output_wave)
            previous_chunk = vc_wave[0, -overlap_wave_len:]
            processed_frames += vc_target.size(2) - overlap_frame_len

            output_i16 = (output_wave * 32768.0).astype(np.int16)
            mp3_bytes = AudioSegment(
                output_i16.tobytes(),
                frame_rate=sr_local,
                sample_width=output_i16.dtype.itemsize,
                channels=1
            ).export(format="mp3", bitrate=bitrate).read()
            yield mp3_bytes, None

        elif is_last_chunk:
            output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0].cpu().numpy(), overlap_wave_len)
            generated_wave_chunks.append(output_wave)
            processed_frames += vc_target.size(2) - overlap_frame_len

            output_i16 = (output_wave * 32768.0).astype(np.int16)
            mp3_bytes = AudioSegment(
                output_i16.tobytes(),
                frame_rate=sr_local,
                sample_width=output_i16.dtype.itemsize,
                channels=1
            ).export(format="mp3", bitrate=bitrate).read()
            yield mp3_bytes, (sr_local, np.concatenate(generated_wave_chunks))
            break

        else:
            output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0, :-overlap_wave_len].cpu().numpy(), overlap_wave_len)
            generated_wave_chunks.append(output_wave)
            previous_chunk = vc_wave[0, -overlap_wave_len:]
            processed_frames += vc_target.size(2) - overlap_frame_len

            output_i16 = (output_wave * 32768.0).astype(np.int16)
            mp3_bytes = AudioSegment(
                output_i16.tobytes(),
                frame_rate=sr_local,
                sample_width=output_i16.dtype.itemsize,
                channels=1
            ).export(format="mp3", bitrate=bitrate).read()
            yield mp3_bytes, None

# =========================================================
# Gradio UI
# =========================================================
if __name__ == "__main__":
    description = (
        "State-of-the-art zero-shot voice conversion and singing voice conversion. "
        "For local deployment, please check the GitHub repository for details and updates.<br>"
        "Note: reference audio longer than 25 s will be clipped to 25 s.<br>"
        "If the total duration exceeds 30 s, the source audio will be processed in chunks.<br>"
        "<br>"
        "Hindi tip: use a Hindi SOURCE and a Hindi REFERENCE for the best Hindi output. "
        "This app converts voice (audio→audio); it does not do text-to-speech."
    )

    inputs = [
        gr.Audio(type="filepath", label="Source Audio / 源音频"),
        gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
        gr.Slider(minimum=1, maximum=200, value=25, step=1,
                  label="Diffusion Steps / 扩散步数",
                  info="25 by default, 50~100 for best quality / 默认为 25,50~100 为最佳质量"),
        gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0,
                  label="Length Adjust / 长度调整",
                  info="<1.0 speed-up, >1.0 slow-down / <1.0 加速,>1.0 减速"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7,
                  label="Inference CFG Rate", info="subtle influence / 有微小影响"),
        gr.Checkbox(label="Use F0 conditioned model / 启用F0输入", value=False,
                    info="Must set to true for singing voice conversion / 歌声转换时必须勾选"),
        gr.Checkbox(label="Auto F0 adjust / 自动F0调整", value=True,
                    info="Roughly adjust F0 to match target voice. Only when F0 model is used."),
        gr.Slider(label='Pitch shift / 音调变换', minimum=-24, maximum=24, step=1, value=0,
                  info="Semitones. Only when F0 model is used / 半音,仅F0模型生效"),
    ]

    examples = [
        ["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
        ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, False, True, 0],
        ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
         "examples/reference/kobe_0.wav", 50, 1.0, 0.7, True, False, -6],
        ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
         "examples/reference/trump_0.wav", 50, 1.0, 0.7, True, False, -12],
    ]

    outputs = [
        gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
        gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')
    ]

    gr.Interface(
        fn=voice_conversion,
        description=description,
        inputs=inputs,
        outputs=outputs,
        title="Seed Voice Conversion (ECAPA speaker embedding)",
        examples=examples,
        cache_examples=False
    ).launch()