|
|
import torch, torchaudio
|
|
|
import requests
|
|
|
import IPython.display as display
|
|
|
import gradio as gr
|
|
|
import os
|
|
|
import sys
|
|
|
|
|
|
|
|
|
|
|
|
# Download (and import) the upstream repos for the three pipeline stages:
# content encoder (HuBERT-soft), acoustic model, and HiFi-GAN vocoder.
# pretrained=False: no checkpoint weights are fetched here — the actual
# models come from the local torch.load calls below. NOTE(review): these
# hub loads presumably exist so the repo modules are importable when the
# pickled full-model files are deserialized — confirm before removing.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True, pretrained=False)

acoustic = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True, pretrained=False)

hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, pretrained=False)
|
|
|
|
|
|
# Load the three pipeline stages as full pickled model objects, pinned to CPU.
# weights_only=False is required because these files contain whole nn.Module
# objects, not plain state_dicts; since torch 2.6 the default flipped to
# weights_only=True, which would make these loads fail. Making it explicit
# keeps the script working across torch versions (the parameter exists since
# torch 1.13). SECURITY NOTE: weights_only=False unpickles arbitrary code —
# only load checkpoint files from a trusted source.
hubert_loaded = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'), weights_only=False)

acoustic_loaded = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'), weights_only=False)

hifigan_loaded = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'), weights_only=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_speech(filename, progress=gr.Progress()):
    """Run the any-to-one voice-conversion pipeline on one audio file.

    Pipeline: load audio -> mono 16 kHz -> HuBERT soft units ->
    acoustic model mel-spectrogram -> HiFi-GAN waveform.

    Args:
        filename: Path to the input audio file (falsy values are rejected).
        progress: Gradio progress tracker, injected by Gradio at call time.

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is a 1-D numpy array,
        matching what ``gr.Audio(type="numpy")`` expects.

    Raises:
        ValueError: If no audio file was provided.
    """
    if not filename:
        raise ValueError("Please provide an audio")

    progress(0, desc="Starting conversion")

    progress(0.1, desc="Loading audio")
    waveform, sample_rate = torchaudio.load(filename)

    progress(0.3, desc="Preprocessing audio")
    # Down-mix to mono by keeping only the first channel.
    if waveform.shape[0] > 1:
        waveform = waveform[0, :].unsqueeze(0)
    # The models expect 16 kHz input with a leading batch dimension on CPU.
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    waveform = waveform.unsqueeze(0).to('cpu')

    progress(0.6, desc="Converting speech")
    with torch.inference_mode():
        # Content encoder: extract speaker-independent soft speech units.
        units = hubert_loaded.units(waveform)

        progress(0.7, desc="Generating target spectrogram")
        # Acoustic model emits (batch, time, mels); the vocoder wants
        # (batch, mels, time), hence the transpose.
        mel = acoustic_loaded.generate(units).transpose(1, 2)

        progress(0.8, desc="Generating audio waveform")
        target = hifigan_loaded(mel)

    progress(0.9, desc="Postprocessing audio")
    target = target.squeeze().cpu().numpy()

    progress(1.0, desc="Conversion complete")
    return 16000, target
|
|
|
|
|
|
"""Convert to the target speaker:"""
|
|
|
|
|
|
def enable_convert_button(audio):
    """Enable the convert button only when some audio is present.

    Returns a pair of updates for ``(convert_button, info)``: the button
    becomes clickable when *audio* is not None, and the info banner is
    cleared and hidden at the same time; otherwise the button is disabled
    and the info output is left untouched (None).
    """
    has_audio = audio is not None
    if not has_audio:
        return gr.update(interactive=False), None
    return gr.update(interactive=True), gr.update(value="", visible=False)
|
|
|
|
|
|
def clear_components():
    """Reset both the audio input and the converted-audio output."""
    return (None,) * 2
|
|
|
|
|
|
def stop_recording_info(audio):
    """Show a "please wait" banner while a recording is still uploading.

    While *audio* is None (recording not yet delivered) a yellow waiting
    message is made visible; once audio arrives the banner is hidden.
    """
    if audio is not None:
        return gr.update(value="", visible=False)
    return gr.update(value="### <i style='color:yellow'>Recording and uploading, please wait ...</i>", visible=True)
|
|
|
|
|
|
def stop():
    """Debug hook: confirm on stdout that the callback fired."""
    message = "this is working"
    print(message)
|
|
|
|
|
|
|
|
|
def gui():
    """Build the Gradio Blocks interface for the voice-conversion demo.

    Layout: a header, then one row with the input column (audio source,
    convert button, status banner) and the output column (converted audio).
    Event wiring connects the input component to the conversion pipeline.

    Returns:
        The assembled ``gr.Blocks`` app; the caller queues and launches it.
    """
    with gr.Blocks() as interface:
        gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
        gr.Markdown("Upload an audio file to convert it to the target speaker's voice using soft speech units. Or use your microphone.")

        with gr.Row():
            with gr.Column():
                # Input side: file upload or microphone; the convert button
                # starts disabled until audio is available.
                audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload Audio", editable=False)

                convert_button = gr.Button("Convert Speech", interactive=False)

                # Hidden status banner, toggled by the callbacks below.
                info = gr.Markdown("", visible=False)

            with gr.Column():
                converted_audio = gr.Audio(type="numpy", label="Converted Speech", show_share_button=False)

        # Show the "uploading, please wait" banner as soon as recording starts.
        audio_input.start_recording(stop_recording_info, inputs=[audio_input], outputs=[info])

        # Any change to the input enables/disables the convert button and
        # clears the banner.
        audio_input.change(enable_convert_button, inputs=[audio_input], outputs=[convert_button, info])

        # Run the conversion pipeline and put the result in the output player.
        convert_button.click(convert_speech, inputs=[audio_input], outputs=[converted_audio])

        # Clearing the input also discards the previous conversion result.
        audio_input.clear(clear_components, inputs=None, outputs=[audio_input, converted_audio])

    return interface
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
|
app = gui()
|
|
|
|
|
|
app.queue(default_concurrency_limit=40)
|
|
|
|
|
|
app.launch(
|
|
|
max_threads=40,
|
|
|
share=True,
|
|
|
show_error=True,
|
|
|
quiet=False,
|
|
|
debug=False,
|
|
|
)
|
|
|
|
|
|
|