Update app.py
Browse files
app.py
CHANGED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# rvc_advanced.py
import torch
import torchaudio
import librosa
import sounddevice as sd
import streamlit as st
# BUG FIX: `HubertProcessor` is not a class exported by transformers, so the
# original `from transformers import HubertModel, HubertProcessor` raised an
# ImportError before anything ran.  HuBERT checkpoints use the Wav2Vec2
# feature extractor for audio preprocessing.
from transformers import HubertModel, Wav2Vec2FeatureExtractor
from speechbrain.inference import HIFIGAN

# ----------------------------
# Load Pretrained Models
# ----------------------------
st.write("Loading models, please wait...")


@st.cache_resource
def _load_models():
    """Load the feature extractor, HuBERT encoder and HiFi-GAN vocoder.

    Cached with st.cache_resource so Streamlit's script reruns (triggered by
    every widget interaction) don't re-download / re-instantiate the models.
    """
    proc = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-large-ls960-ft")
    hubert = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
    hubert.eval()  # inference only — disable dropout etc.
    voc = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmp_hifigan")
    return proc, hubert, voc


# Module-level names kept identical to the original so the functions below
# (which reference these globals) are unaffected.
processor, hubert_model, vocoder = _load_models()
|
| 17 |
+
|
| 18 |
+
# ----------------------------
|
| 19 |
+
# Audio Processing Functions
|
| 20 |
+
# ----------------------------
|
| 21 |
+
def extract_speaker_embedding(audio_path):
    """Return an utterance-level HuBERT embedding for the audio at *audio_path*.

    The file is resampled to 16 kHz (HuBERT's training rate), encoded by the
    model, and the hidden states are mean-pooled over the time axis, yielding
    a single (1, hidden_dim) tensor.
    """
    signal, rate = librosa.load(audio_path, sr=16000)
    features = processor(signal, sampling_rate=rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        hidden_states = hubert_model(**features).last_hidden_state
    # Average across time to collapse the sequence into one speaker vector.
    return hidden_states.mean(dim=1)
|
| 27 |
+
|
| 28 |
+
def audio_to_mel(audio_path):
    """Load an audio file and return its mel spectrogram.

    Uses torchaudio's MelSpectrogram transform at the file's native sample
    rate; the result has shape (channels, n_mels, time).
    """
    signal, native_rate = torchaudio.load(audio_path)
    to_mel = torchaudio.transforms.MelSpectrogram(native_rate)
    return to_mel(signal)
|
| 32 |
+
|
| 33 |
+
def mel_to_audio(mel_spec):
    """Synthesize a waveform from *mel_spec* using the HiFi-GAN vocoder."""
    # decode_batch runs the vocoder over the batched spectrogram and returns
    # the corresponding audio tensor.
    return vocoder.decode_batch(mel_spec)
|
| 37 |
+
|
| 38 |
+
def convert_voice(input_path, reference_path):
    """Convert the voice in *input_path* toward the speaker of *reference_path*.

    Returns the synthesized waveform tensor.  NOTE(review): this is a very
    naive "conversion" — it simply biases the input's mel spectrogram with the
    reference speaker embedding; a real RVC pipeline would condition a trained
    decoder on the embedding instead.
    """
    ref_emb = extract_speaker_embedding(reference_path)  # (1, hidden_dim)
    mel_spec = audio_to_mel(input_path)                  # (channels, n_mels, time)
    n_mels = mel_spec.shape[-2]
    # BUG FIX: the original did `mel_spec + ref_emb.unsqueeze(-1)`, which tries
    # to broadcast (channels, n_mels, time) against (1, hidden_dim, 1).  With
    # hubert-large hidden_dim is 1024 while n_mels is typically 128, so that
    # addition raised a RuntimeError.  Pool the embedding down to n_mels first
    # so the per-mel-bin bias broadcasts cleanly over channels and time.
    ref_bias = torch.nn.functional.adaptive_avg_pool1d(
        ref_emb.unsqueeze(0), n_mels
    ).squeeze(0)                                         # (1, n_mels)
    converted_mel = mel_spec + ref_bias.unsqueeze(-1)    # (channels, n_mels, time)
    waveform = mel_to_audio(converted_mel)
    return waveform
|
| 45 |
+
|
| 46 |
+
def play_audio(waveform, sample_rate=16000):
    """Play *waveform* on the default output device, blocking until finished.

    Accepts a torch tensor (possibly with batch/channel dims of size 1); it is
    squeezed and moved to CPU before being handed to sounddevice.
    """
    samples = waveform.squeeze().cpu().numpy()
    sd.play(samples, sample_rate)
    sd.wait()  # block until playback completes
|
| 49 |
+
|
| 50 |
+
# ----------------------------
# Streamlit GUI
# ----------------------------
st.title("Advanced RVC Voice Converter")

input_audio = st.file_uploader("Upload Input Audio", type=["wav", "mp3"])
reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3"])

if input_audio and reference_audio:
    if st.button("Convert Voice"):
        # Persist the uploads to disk so librosa / torchaudio can open them
        # by path.  NOTE(review): an uploaded mp3 is saved under a .wav name;
        # the loaders sniff content rather than extension, but renaming by
        # real type would be cleaner.
        input_path = "temp_input.wav"
        reference_path = "temp_reference.wav"
        with open(input_path, "wb") as f:
            f.write(input_audio.read())
        with open(reference_path, "wb") as f:
            f.write(reference_audio.read())

        st.write("Converting voice...")
        waveform = convert_voice(input_path, reference_path)
        st.write("Playing converted audio...")
        play_audio(waveform)
        # BUG FIX: st.audio was given a raw numpy array without a sample rate.
        # Streamlit needs the rate for raw arrays, so the 16 kHz output was
        # rendered incorrectly; pass it explicitly.
        st.audio(waveform.squeeze().cpu().numpy(), sample_rate=16000)
|