Ai128474 committed on
Commit
b3fe0c0
·
verified ·
1 Parent(s): 5ba2108

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -0
app.py CHANGED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rvc_advanced.py
2
import tempfile
from pathlib import Path

import librosa
import sounddevice as sd
import streamlit as st
import torch
import torchaudio
from speechbrain.inference import HIFIGAN
from transformers import HubertModel

# transformers does not ship a `HubertProcessor` class; HuBERT checkpoints are
# preprocessed with the Wav2Vec2 processor. The fallback keeps the
# `HubertProcessor` name bound so the rest of the file works unchanged.
try:
    from transformers import HubertProcessor
except ImportError:
    from transformers import Wav2Vec2Processor as HubertProcessor
9
+
10
+ # ----------------------------
11
+ # Load Pretrained Models
12
+ # ----------------------------
13
@st.cache_resource(show_spinner="Loading models, please wait...")
def _load_models():
    """Load the HuBERT processor/encoder and the HiFi-GAN vocoder once.

    Streamlit re-executes the whole script on every widget interaction.
    Without caching, all three checkpoints would be re-instantiated on each
    rerun; ``st.cache_resource`` keeps a single shared copy instead.

    Returns:
        A ``(processor, hubert_model, vocoder)`` tuple.
    """
    hubert_checkpoint = "facebook/hubert-large-ls960-ft"
    loaded_processor = HubertProcessor.from_pretrained(hubert_checkpoint)
    loaded_hubert = HubertModel.from_pretrained(hubert_checkpoint)
    loaded_vocoder = HIFIGAN.from_hparams(
        source="speechbrain/tts-hifigan-ljspeech",
        savedir="tmp_hifigan",
    )
    return loaded_processor, loaded_hubert, loaded_vocoder


# Module-level names are kept because the functions below read them as globals.
processor, hubert_model, vocoder = _load_models()
17
+
18
+ # ----------------------------
19
+ # Audio Processing Functions
20
+ # ----------------------------
21
def extract_speaker_embedding(audio_path):
    """Return a mean-pooled HuBERT representation of the audio at *audio_path*.

    The file is loaded with librosa at 16 kHz, preprocessed by the
    module-level ``processor`` and encoded by ``hubert_model`` with gradients
    disabled. The encoder's last hidden state is then averaged over dim 1.

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        The pooled tensor (presumably ``(batch, hidden)`` -- TODO confirm).
    """
    samples, rate = librosa.load(audio_path, sr=16000)
    model_inputs = processor(
        samples,
        sampling_rate=rate,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        hidden_states = hubert_model(**model_inputs).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled
27
+
28
def audio_to_mel(audio_path):
    """Compute a log-mel spectrogram in the format the HiFi-GAN vocoder expects.

    The ``speechbrain/tts-hifigan-ljspeech`` vocoder was trained on 80-bin
    log-compressed mel spectrograms of 22.05 kHz mono audio (n_fft=1024,
    hop=256, 0-8000 Hz, slaney scale). The torchaudio defaults (128 bins,
    power spectrogram, file's native rate) do not match that, so the
    parameters are set explicitly here.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        Tensor of shape ``(1, 80, frames)`` holding natural-log mel magnitudes.
    """
    vocoder_sample_rate = 22050

    waveform, sample_rate = torchaudio.load(audio_path)
    # Down-mix to mono: the vocoder treats dim 0 as a batch, so stereo input
    # would otherwise be decoded as two unrelated utterances.
    waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != vocoder_sample_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, vocoder_sample_rate
        )

    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=vocoder_sample_rate,
        n_fft=1024,
        hop_length=256,
        f_min=0.0,
        f_max=8000.0,
        n_mels=80,
        power=1,
        norm="slaney",
        mel_scale="slaney",
    )
    mel_spec = mel_transform(waveform)
    # Dynamic-range compression; the clamp avoids log(0) on silent frames.
    return torch.log(torch.clamp(mel_spec, min=1e-5))
32
+
33
def mel_to_audio(mel_spec):
    """Turn *mel_spec* into a waveform with the module-level HiFi-GAN vocoder.

    Args:
        mel_spec: Mel spectrogram passed straight to ``vocoder.decode_batch``.

    Returns:
        Whatever ``vocoder.decode_batch`` returns for the given spectrogram.
    """
    return vocoder.decode_batch(mel_spec)
37
+
38
def convert_voice(input_path, reference_path):
    """Bias the input's mel spectrogram with the reference embedding and vocode it.

    The reference embedding is mean-pooled over time, so its last dimension is
    the encoder's hidden size, while the spectrogram's second-to-last
    dimension is the number of mel bins. Adding them directly cannot
    broadcast, so the embedding is first average-pooled down to one value per
    mel bin.

    NOTE(review): this is a heuristic spectral offset, not a trained
    voice-conversion model; the HuBERT features and the mel bins do not share
    a learned space. Confirm the intended algorithm.

    Args:
        input_path: Path of the audio whose content should be kept.
        reference_path: Path of the audio supplying the target-voice embedding.

    Returns:
        The waveform produced by ``mel_to_audio`` for the biased spectrogram.
    """
    ref_emb = extract_speaker_embedding(reference_path)
    mel_spec = audio_to_mel(input_path)

    n_mels = mel_spec.shape[-2]
    # (batch, hidden) -> (batch, 1, hidden) -> (batch, 1, n_mels)
    speaker_bias = torch.nn.functional.adaptive_avg_pool1d(
        ref_emb.unsqueeze(1), n_mels
    )
    # (batch, 1, n_mels) -> (batch, n_mels, 1) so it broadcasts over frames.
    speaker_bias = speaker_bias.transpose(1, 2).to(mel_spec.dtype)

    converted_mel = mel_spec + speaker_bias
    return mel_to_audio(converted_mel)
45
+
46
def play_audio(waveform, sample_rate=16000):
    """Play *waveform* through sounddevice and wait for it to finish.

    Args:
        waveform: Tensor to play; it is squeezed, moved to the CPU and
            converted to a NumPy array before playback.
        sample_rate: Playback rate handed to ``sd.play``.
    """
    samples = waveform.squeeze().cpu().numpy()
    sd.play(samples, sample_rate)
    sd.wait()
49
+
50
+ # ----------------------------
51
+ # Streamlit GUI
52
+ # ----------------------------
53
st.title("Advanced RVC Voice Converter")

input_audio = st.file_uploader("Upload Input Audio", type=["wav", "mp3"])
reference_audio = st.file_uploader("Upload Reference Audio", type=["wav", "mp3"])

if input_audio and reference_audio and st.button("Convert Voice"):
    # The LJSpeech HiFi-GAN vocoder synthesises 22.05 kHz audio; playing it at
    # any other rate changes its pitch and speed.
    output_sample_rate = 22050

    # A per-request temporary directory stops concurrent sessions from
    # overwriting each other's uploads and removes the files afterwards.
    with tempfile.TemporaryDirectory() as work_dir:
        # Keep the uploaded extension so mp3 data is not labelled as .wav.
        input_path = str(
            Path(work_dir) / f"input{Path(input_audio.name).suffix}"
        )
        reference_path = str(
            Path(work_dir) / f"reference{Path(reference_audio.name).suffix}"
        )
        # getvalue() returns the full buffer regardless of the read position.
        with open(input_path, "wb") as f:
            f.write(input_audio.getvalue())
        with open(reference_path, "wb") as f:
            f.write(reference_audio.getvalue())

        st.write("Converting voice...")
        waveform = convert_voice(input_path, reference_path)

    samples = waveform.squeeze().cpu().numpy()

    st.write("Playing converted audio...")
    try:
        # Plays on the machine running the app, not in the visitor's browser.
        play_audio(waveform, output_sample_rate)
    except sd.PortAudioError:
        # Headless hosts have no output device; the browser player still works.
        st.warning("No local audio output device available; use the player below.")

    # st.audio needs an explicit sample_rate when given a NumPy array.
    st.audio(samples, format="audio/wav", sample_rate=output_sample_rate)