| import torch, torchaudio
|
| import requests
|
| import IPython.display as display
|
| import gradio as gr
|
| import os
|
|
|
|
|
|
|
def get_file_size_in_mb(file_path):
    """Return the size of *file_path* in mebibytes (1 MiB = 1024 * 1024 bytes).

    NOTE(review): this helper is not referenced anywhere in this file —
    confirm whether it is still needed.
    """
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes / (1024 * 1024)
|
|
|
|
|
|
|
|
|
# Build the three stages of the soft-VC pipeline from torch.hub:
#   HuBERT-Soft content encoder -> acoustic model -> HiFi-GAN vocoder.
# pretrained=False for the acoustic model and vocoder because local CPU
# checkpoints are loaded below (the hub weights would be discarded anyway).
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
acoustic = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True, pretrained=False)
hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, pretrained=False)

# Load CPU checkpoints. NOTE(review): the paths are relative — the .pt files
# must sit in the working directory, and there is no error handling if they
# are missing.
hubert_state_dict = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'))
acoustic_state_dict = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'))
hifigan_state_dict = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'))

# strict=False silently ignores missing/unexpected keys, so a mismatched
# checkpoint would load without complaint — NOTE(review): confirm the
# checkpoints actually match the hub architectures.
hubert.load_state_dict(hubert_state_dict, strict=False)
acoustic.load_state_dict(acoustic_state_dict, strict=False)
hifigan.load_state_dict(hifigan_state_dict, strict=False)

# Move everything to CPU and switch to eval mode.  eval() is required for
# correct inference: torch.inference_mode() (used in convert_speech) only
# disables autograd — it does NOT disable dropout or switch batch-norm to
# running statistics; eval() does.
hubert = hubert.to('cpu').eval()
acoustic = acoustic.to('cpu').eval()
hifigan = hifigan.to('cpu').eval()
|
|
|
|
|
|
|
|
|
def convert_speech(filename, progress=gr.Progress()):
    """Convert the speech in *filename* to the target speaker's voice.

    Runs the soft-VC pipeline (HuBERT-Soft units -> acoustic model ->
    HiFi-GAN) on CPU and reports progress to the Gradio UI.

    Returns a ``(sample_rate, waveform)`` tuple (16 kHz, numpy array)
    suitable for a ``gr.Audio(type="numpy")`` output.
    """
    progress(0, desc="Starting conversion")

    progress(0.1, desc="Loading audio")
    waveform, sample_rate = torchaudio.load(filename)

    progress(0.3, desc="Preprocessing audio")
    # Multi-channel input: keep only the first channel.
    if waveform.shape[0] > 1:
        waveform = waveform[0, :].unsqueeze(0)
    # The content encoder expects 16 kHz audio with a leading batch dim.
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    waveform = waveform.unsqueeze(0).to('cpu')

    progress(0.6, desc="Converting speech")
    with torch.inference_mode():
        # Discrete-free "soft" speech units from the content encoder.
        units = hubert.units(waveform)
        # Acoustic model emits (batch, frames, mels); the vocoder wants
        # (batch, mels, frames), hence the transpose.
        mel = acoustic.generate(units).transpose(1, 2)
        converted = hifigan(mel)

    progress(0.9, desc="Postprocessing audio")
    converted = converted.squeeze().cpu().numpy()
    progress(1.0, desc="Conversion complete")
    return 16000, converted
|
|
|
| """Convert to the target speaker:"""
|
|
|
def enable_convert_button(audio):
    """Toggle the convert button: interactive only when audio is present."""
    return gr.update(interactive=audio is not None)
|
|
|
|
|
def clear_components():
    """Return blank values for the two components wired to ``.clear()``."""
    # One None per output: (audio_input, converted_audio).
    return (None, None)
|
|
|
|
|
| js = """
|
| document.addEventListener('DOMContentLoaded', function() {
|
| const audioInput = document.querySelector('input[type="file"]');
|
| const convertButton = document.querySelector('button');
|
|
|
| function updateButtonText() {
|
| if (audioInput.files.length > 0) {
|
| convertButton.innerText = "Uploading audio, please wait ...";
|
| }
|
| }
|
|
|
| audioInput.addEventListener('change', updateButtonText);
|
| });
|
| """
|
|
|
|
|
|
|
|
|
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as interface:
    gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
    gr.Markdown("Upload an audio file to convert it to the target speaker's voice using soft speech units. Or use your microphone.")

    with gr.Row():
        with gr.Column():
            # Input side: file upload or microphone recording, as a file path.
            audio_input = gr.Audio(type="filepath", label="Upload Audio", sources=["upload", "microphone"])
            # Starts disabled; enable_convert_button turns it on when audio arrives.
            convert_button = gr.Button("Convert Speech", interactive=False)
        with gr.Column():
            # Output side: convert_speech returns a (sample_rate, numpy array) pair.
            converted_audio = gr.Audio(type="numpy", label="Converted Speech")

    # Re-evaluate the button's interactivity whenever the input changes.
    audio_input.change(enable_convert_button, inputs=[audio_input], outputs=[convert_button])

    # Run the conversion pipeline on click.
    convert_button.click(convert_speech, inputs=[audio_input], outputs=[converted_audio])

    # Clearing the input also blanks both audio widgets.
    audio_input.clear(clear_components, inputs=None, outputs=[audio_input, converted_audio])

# Start the app (blocking call; debug output disabled).
interface.launch(debug=False)
|
|
|
|
|