File size: 2,626 Bytes
b3fe0c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# rvc_advanced.py
import torch
import torchaudio
import librosa
import sounddevice as sd
import streamlit as st
# transformers ships no `HubertProcessor`; HuBERT checkpoints prepare raw
# audio with the Wav2Vec2 feature extractor (the original import would
# raise ImportError at startup).
from transformers import HubertModel, Wav2Vec2FeatureExtractor
from speechbrain.inference import HIFIGAN

# ----------------------------
# Load Pretrained Models
# ----------------------------
# NOTE: these downloads/loads run at import time on every Streamlit rerun;
# consider wrapping in st.cache_resource if reload cost becomes a problem.
st.write("Loading models, please wait...")
processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-large-ls960-ft")
hubert_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
hubert_model.eval()  # inference only — disable dropout for deterministic embeddings
vocoder = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmp_hifigan")

# ----------------------------
# Audio Processing Functions
# ----------------------------
def extract_speaker_embedding(audio_path):
    """Return an utterance-level speaker embedding for the audio file.

    The file is loaded at 16 kHz, passed through HuBERT, and the per-frame
    hidden states are mean-pooled over the time axis, yielding a single
    embedding tensor of shape (1, hidden_size).
    """
    waveform, rate = librosa.load(audio_path, sr=16000)
    batch = processor(waveform, sampling_rate=rate, return_tensors="pt", padding=True)
    with torch.no_grad():  # pure feature extraction — no gradients needed
        frames = hubert_model(**batch).last_hidden_state
    return frames.mean(dim=1)

def audio_to_mel(audio_path):
    """Load an audio file and return its mel spectrogram.

    Output shape is (channels, n_mels, time) using torchaudio's default
    MelSpectrogram parameters at the file's native sample rate.
    """
    signal, rate = torchaudio.load(audio_path)
    to_mel = torchaudio.transforms.MelSpectrogram(rate)
    return to_mel(signal)

def mel_to_audio(mel_spec):
    """Vocode a mel spectrogram back into a time-domain waveform via HiFi-GAN."""
    return vocoder.decode_batch(mel_spec)

def convert_voice(input_path, reference_path):
    """Convert the voice in *input_path* toward the speaker of *reference_path*.

    Extracts a speaker embedding from the reference audio, shifts the input's
    mel spectrogram by it, and vocodes the result back to a waveform.

    NOTE(review): an additive mel shift is a crude placeholder for real voice
    conversion — it biases the spectrogram rather than remapping timbre.
    Kept as-is, but the original code broadcast a (1, hidden_size) HuBERT
    embedding (hidden_size=1024 for hubert-large) directly against the mel
    frequency axis (n_mels, typically 128), which raises a shape error at
    runtime; the embedding is now resized to the mel axis first.
    """
    ref_emb = extract_speaker_embedding(reference_path)  # (1, hidden_size)
    mel_spec = audio_to_mel(input_path)                  # (channels, n_mels, time)
    # Resize the embedding to n_mels so the broadcast add is shape-compatible.
    n_mels = mel_spec.shape[-2]
    ref_vec = torch.nn.functional.interpolate(
        ref_emb.unsqueeze(1),  # (1, 1, hidden_size) — interpolate needs a channel dim
        size=n_mels,
        mode="linear",
        align_corners=False,
    ).squeeze(1)                                         # (1, n_mels)
    converted_mel = mel_spec + ref_vec.unsqueeze(-1)     # broadcast over time axis
    waveform = mel_to_audio(converted_mel)
    return waveform

def play_audio(waveform, sample_rate=16000):
    """Play a waveform tensor on the default audio device, blocking until done."""
    samples = waveform.squeeze().cpu().numpy()
    sd.play(samples, sample_rate)
    sd.wait()

# ----------------------------
# Streamlit GUI
# ----------------------------
st.title("Advanced RVC Voice Converter")

input_audio = st.file_uploader("Upload Input Audio", type=["wav", "mp3"])
reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3"])

if input_audio and reference_audio:
    if st.button("Convert Voice"):
        # Persist the uploads to disk — the processing functions take file paths.
        # Fixed names mean concurrent sessions overwrite each other; acceptable
        # for a single-user demo.
        input_path = "temp_input.wav"
        reference_path = "temp_reference.wav"
        with open(input_path, "wb") as f:
            f.write(input_audio.read())
        with open(reference_path, "wb") as f:
            f.write(reference_audio.read())

        st.write("Converting voice...")
        waveform = convert_voice(input_path, reference_path)
        # NOTE(review): playback happens on the *server's* sound device, which
        # only works when Streamlit runs locally.
        st.write("Playing converted audio...")
        play_audio(waveform)
        # st.audio requires sample_rate for raw numpy input (the original call
        # omitted it). 16 kHz matches play_audio's default; the LJSpeech
        # HiFi-GAN vocoder's native rate may differ — confirm against its config.
        st.audio(waveform.squeeze().cpu().numpy(), sample_rate=16000)