# rvc_advanced.py
"""Toy RVC-style voice converter: a Streamlit front-end around HuBERT + HiFi-GAN.

Pipeline: extract an utterance-level speaker embedding from a reference clip
with HuBERT, compute a log-mel spectrogram of the input clip, nudge the mel
with a bias derived from the embedding, and vocode back to audio with the
LJSpeech HiFi-GAN.

NOTE(review): the "conversion" step is a crude placeholder (a scalar bias
added to the mel), not a trained RVC model — kept for demo purposes only.
"""

import torch
import torchaudio
import librosa
import sounddevice as sd
import streamlit as st
# BUGFIX: transformers has no `HubertProcessor`; the ls960-ft checkpoint ships
# a Wav2Vec2-style feature extractor that handles raw-waveform preprocessing.
from transformers import HubertModel, Wav2Vec2FeatureExtractor
from speechbrain.inference import HIFIGAN

# ----------------------------
# Constants
# ----------------------------
HUBERT_SR = 16_000    # HuBERT checkpoints are trained on 16 kHz audio
VOCODER_SR = 22_050   # LJSpeech HiFi-GAN produces 22.05 kHz audio
N_MELS = 80           # LJSpeech HiFi-GAN expects 80 mel bins

# ----------------------------
# Load Pretrained Models
# ----------------------------
st.write("Loading models, please wait...")
processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-large-ls960-ft")
hubert_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
hubert_model.eval()  # inference only — disable dropout
vocoder = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech", savedir="tmp_hifigan"
)


# ----------------------------
# Audio Processing Functions
# ----------------------------
def extract_speaker_embedding(audio_path):
    """Return a (1, hidden_size) utterance-level HuBERT embedding.

    The audio is loaded at 16 kHz (HuBERT's training rate) and the hidden
    states are mean-pooled over time.
    """
    y, sr = librosa.load(audio_path, sr=HUBERT_SR)
    inputs = processor(y, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        embeddings = hubert_model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings


def audio_to_mel(audio_path):
    """Compute a (channels, N_MELS, time) log-mel spectrogram for the vocoder.

    BUGFIX: the original used torchaudio defaults (128 mel bins, the file's
    native sample rate, linear power mels); the LJSpeech HiFi-GAN expects
    80-bin log-compressed mels at 22.05 kHz, so anything else either crashes
    in `decode_batch` or produces noise.
    NOTE(review): exact mel parameters (fft/hop/normalization) should be
    matched to the vocoder's training config — confirm against the
    speechbrain hparams.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != VOCODER_SR:
        waveform = torchaudio.functional.resample(waveform, sample_rate, VOCODER_SR)
    mel_spec = torchaudio.transforms.MelSpectrogram(
        sample_rate=VOCODER_SR, n_mels=N_MELS
    )(waveform)
    # Log-compress; the clamp avoids log(0) on silent frames.
    return torch.log(torch.clamp(mel_spec, min=1e-5))


def mel_to_audio(mel_spec):
    """Vocode a (batch, N_MELS, time) log-mel spectrogram to a waveform."""
    with torch.no_grad():
        waveform = vocoder.decode_batch(mel_spec)
    return waveform


def convert_voice(input_path, reference_path):
    """Convert `input_path`'s voice toward `reference_path`'s speaker.

    BUGFIX: the original added `ref_emb.unsqueeze(-1)` — shape
    (1, hidden=1024, 1) — to a (1, n_mels, T) mel, which cannot broadcast
    (1024 != n_mels) and raised at runtime. The embedding is collapsed to a
    scalar bias instead so the arithmetic is well-defined.
    NOTE(review): a scalar bias is a placeholder conditioning signal, not
    real voice conversion.
    """
    ref_emb = extract_speaker_embedding(reference_path)  # (1, hidden)
    mel_spec = audio_to_mel(input_path)                  # (1, N_MELS, T)
    speaker_bias = ref_emb.mean()
    converted_mel = mel_spec + speaker_bias
    return mel_to_audio(converted_mel)


def play_audio(waveform, sample_rate=VOCODER_SR):
    """Play a waveform tensor through the default audio output device.

    BUGFIX: default rate was 16 kHz, but the LJSpeech vocoder emits
    22.05 kHz audio — playing at the wrong rate pitch-shifts everything.
    """
    sd.play(waveform.squeeze().cpu().numpy(), sample_rate)
    sd.wait()


# ----------------------------
# Streamlit GUI
# ----------------------------
st.title("Advanced RVC Voice Converter")

input_audio = st.file_uploader("Upload Input Audio", type=["wav", "mp3"])
reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3"])

if input_audio and reference_audio:
    if st.button("Convert Voice"):
        # Save uploads to temp files so librosa/torchaudio can open them by path.
        input_path = "temp_input.wav"
        reference_path = "temp_reference.wav"
        with open(input_path, "wb") as f:
            f.write(input_audio.read())
        with open(reference_path, "wb") as f:
            f.write(reference_audio.read())

        st.write("Converting voice...")
        waveform = convert_voice(input_path, reference_path)

        st.write("Playing converted audio...")
        play_audio(waveform)
        # BUGFIX: st.audio requires `sample_rate` when given a raw numpy array.
        st.audio(
            waveform.squeeze().cpu().numpy(),
            format="audio/wav",
            sample_rate=VOCODER_SR,
        )