File size: 1,328 Bytes
d6c34b5
 
ec877d4
d6c34b5
ec877d4
d6c34b5
 
ec877d4
d6c34b5
 
a12f3e8
d6c34b5
 
a12f3e8
d6c34b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a12f3e8
d6c34b5
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import torch 
import torchaudio
import gradio as gr
import requests

# Select the compute device up front so the script also runs on CPU-only
# machines (hard-coding .cuda() crashes when no GPU is available).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Download and load the soft-speech-unit HuBERT content encoder.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).to(device)

# Acoustic model that maps HuBERT soft units to a mel-spectrogram.
acoustic_model = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True).to(device)

# HiFi-GAN vocoder that synthesizes a waveform from the mel-spectrogram.
vocoder = torch.hub.load("bshall/hifigan:main", "hifigan", trust_repo=True).to(device)

def voice_conversion(input_audio):
    """Convert the voice in *input_audio* via the soft-VC pipeline.

    Pipeline: HuBERT soft units -> acoustic model (mel-spectrogram) ->
    HiFi-GAN vocoder.

    Args:
        input_audio: Path to an audio file in any format torchaudio can read.

    Returns:
        Path to the converted audio file ("output.wav", 16 kHz mono).
    """
    # Run on whichever device the globally-loaded models live on, so this
    # function works regardless of how the models were placed.
    device = next(hubert.parameters()).device

    waveform, sample_rate = torchaudio.load(input_audio)

    # The soft-VC models operate on 16 kHz mono audio: downmix any
    # multi-channel input and resample before inference.
    waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

    # hubert.units expects a (batch, 1, time) tensor; add the batch dim.
    source = waveform.unsqueeze(0).to(device)

    with torch.inference_mode():
        # Extract soft speech units (hubert's forward returns training
        # outputs; .units() is the inference entry point).
        units = hubert.units(source)
        # generate() yields (batch, time, mels); HiFi-GAN expects
        # (batch, mels, time), hence the transpose.
        mel_spec = acoustic_model.generate(units).transpose(1, 2)
        audio_out = vocoder(mel_spec)

    # torchaudio.save needs a 2-D (channels, time) tensor — the vocoder
    # output is (batch, 1, time), so drop the batch dimension. The audio
    # is produced at the models' 16 kHz rate, not the input's rate.
    output_path = "output.wav"
    torchaudio.save(output_path, audio_out.squeeze(0).cpu(), 16000)

    return output_path

# Define the Gradio interface. The gr.inputs / gr.outputs namespaces were
# deprecated in Gradio 2.x and removed in 3.x; the unified gr.Audio
# component replaces both, and type="filepath" passes/returns file paths,
# matching what voice_conversion accepts and returns.
iface = gr.Interface(
    fn=voice_conversion,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(type="filepath"),
    title="Voice Conversion Demo",
    description="Upload an audio file to convert its voice using HuBERT and other models.",
)

# Launch the local web server for the demo.
iface.launch()