# Source: Hugging Face Space "RVC / app.py" by Ai128474, commit b3fe0c0 (verified).
# rvc_advanced.py
import torch
import torchaudio
import librosa
import sounddevice as sd
import streamlit as st
from transformers import HubertModel, Wav2Vec2FeatureExtractor
from speechbrain.inference import HIFIGAN

# ----------------------------
# Load Pretrained Models
# ----------------------------
st.write("Loading models, please wait...")
# BUG FIX: `HubertProcessor` does not exist in transformers. HuBERT checkpoints
# use the Wav2Vec2 feature extractor for audio preprocessing; it accepts the
# same call signature (array, sampling_rate=..., return_tensors=..., padding=...).
processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-large-ls960-ft")
hubert_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
# HiFi-GAN vocoder (trained on LJSpeech) used to turn mel spectrograms back
# into waveforms; weights are cached under ./tmp_hifigan.
vocoder = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmp_hifigan")
# ----------------------------
# Audio Processing Functions
# ----------------------------
def extract_speaker_embedding(audio_path):
    """Return an utterance-level HuBERT embedding for the file at *audio_path*.

    The audio is loaded at 16 kHz, preprocessed with the feature extractor,
    passed through the pretrained HuBERT encoder, and the hidden states are
    mean-pooled over the time axis to yield one vector per utterance.
    """
    signal, rate = librosa.load(audio_path, sr=16000)
    features = processor(signal, sampling_rate=rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        hidden = hubert_model(**features).last_hidden_state
    return hidden.mean(dim=1)
def audio_to_mel(audio_path):
    """Load the audio file at *audio_path* and return its mel spectrogram tensor."""
    signal, rate = torchaudio.load(audio_path)
    to_mel = torchaudio.transforms.MelSpectrogram(rate)
    return to_mel(signal)
def mel_to_audio(mel_spec):
    """Synthesize a waveform from *mel_spec* using the pretrained HiFi-GAN vocoder."""
    # NOTE(review): the LJSpeech HiFi-GAN presumably expects its training-time
    # mel layout (bin count / sample rate) — confirm the spectrogram produced
    # upstream matches before relying on output quality.
    return vocoder.decode_batch(mel_spec)
def convert_voice(input_path, reference_path):
    """Convert the voice in *input_path* toward the speaker of *reference_path*.

    Returns the synthesized waveform tensor produced by the vocoder.

    NOTE(review): this is a crude placeholder for real voice conversion — it
    only biases the input's mel spectrogram by a statistic of the reference
    speaker embedding before re-synthesis; a trained conversion model would be
    needed for actual RVC quality.
    """
    ref_emb = extract_speaker_embedding(reference_path)  # shape (1, hidden_dim)
    mel_spec = audio_to_mel(input_path)                  # shape (channels, n_mels, frames)
    # BUG FIX: the original did `mel_spec + ref_emb.unsqueeze(-1)`, adding a
    # (1, hidden_dim, 1) tensor to a (channels, n_mels, frames) spectrogram.
    # hidden_dim (1024 for hubert-large) != n_mels, so broadcasting raised a
    # RuntimeError. Collapsing the embedding to a scalar bias broadcasts
    # safely over any mel shape while keeping the "apply speaker info" intent.
    converted_mel = mel_spec + ref_emb.mean()
    waveform = mel_to_audio(converted_mel)
    return waveform
def play_audio(waveform, sample_rate=16000):
    """Play *waveform* (a torch tensor) on the default output device.

    Blocks until playback finishes.
    """
    samples = waveform.squeeze().cpu().numpy()
    sd.play(samples, sample_rate)
    sd.wait()
# ----------------------------
# Streamlit GUI
# ----------------------------
# ----------------------------
# Streamlit GUI
# ----------------------------
st.title("Advanced RVC Voice Converter")
input_audio = st.file_uploader("Upload Input Audio", type=["wav", "mp3"])
reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3"])

if input_audio and reference_audio:
    if st.button("Convert Voice"):
        # Persist the uploads to disk so librosa/torchaudio can open them
        # by path. NOTE(review): mp3 uploads are written with a .wav suffix;
        # loaders that sniff content will cope, but suffix-driven ones may
        # not — consider preserving the original extension.
        input_path = "temp_input.wav"
        reference_path = "temp_reference.wav"
        with open(input_path, "wb") as f:
            f.write(input_audio.read())
        with open(reference_path, "wb") as f:
            f.write(reference_audio.read())

        st.write("Converting voice...")
        waveform = convert_voice(input_path, reference_path)

        st.write("Playing converted audio...")
        play_audio(waveform)
        # BUG FIX: st.audio needs an explicit sample_rate when given a raw
        # numpy array (it raises without one); 16 kHz matches play_audio's
        # default used above.
        st.audio(waveform.squeeze().cpu().numpy(), format="audio/wav", sample_rate=16000)