# vc_demo/app.py
import os

import gradio as gr
import torch
import torchaudio

def get_file_size_in_mb(file_path):
    # Get the file size in bytes
    file_size_bytes = os.path.getsize(file_path)
    # Convert bytes to megabytes
    file_size_mb = file_size_bytes / (1024 * 1024)
    return file_size_mb
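
# Note: this helper is not referenced by the UI below; presumably it is kept
# around for checking upload sizes.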
# Load the models using torch.hub
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
acoustic = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True, pretrained=False)
hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, pretrained=False)
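# pretrained=False skips downloading the published weights; the state dicts
# are loaded from the local CPU checkpoints below instead.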
# Load the state dictionaries from the CPU-saved files
hubert_state_dict = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'))
acoustic_state_dict = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'))
hifigan_state_dict = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'))
# Set the state dictionaries to the models
hubert.load_state_dict(hubert_state_dict, strict=False)
acoustic.load_state_dict(acoustic_state_dict, strict=False)
hifigan.load_state_dict(hifigan_state_dict, strict=False)
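# strict=False ignores missing/unexpected keys, so a mismatched checkpoint
# loads silently instead of raising an error.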
# Move models to CPU (if not already on CPU)
hubert = hubert.to('cpu')
acoustic = acoustic.to('cpu')
hifigan = hifigan.to('cpu')
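
# Eval mode disables dropout and freezes normalization statistics for
# deterministic inference.
hubert.eval()
acoustic.eval()
hifigan.eval()
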
# Conversion function
def convert_speech(filename, progress=gr.Progress()):
    progress(0, desc="Starting conversion")
    progress(0.1, desc="Loading audio")
    source, sr = torchaudio.load(filename)
    progress(0.3, desc="Preprocessing audio")
    # Use the first channel if the audio is stereo
    if source.shape[0] > 1:
        source = source[0, :].unsqueeze(0)
    # HuBERT expects 16 kHz input
    source = torchaudio.functional.resample(source, sr, 16000)
    # Add a batch dimension: (1, 1, num_samples)
    source = source.unsqueeze(0).to('cpu')
    progress(0.6, desc="Converting speech")
    # Convert to the target speaker
    with torch.inference_mode():
        # Extract soft speech units
        units = hubert.units(source)
        # Generate the target mel-spectrogram
        mel = acoustic.generate(units).transpose(1, 2)
        # Vocode the spectrogram into a waveform
        target = hifigan(mel)
    progress(0.9, desc="Postprocessing audio")
    # Move the tensor to CPU and convert to NumPy for gr.Audio
    target = target.squeeze().cpu().numpy()
    progress(1.0, desc="Conversion complete")
    return 16000, target
"""Convert to the target speaker:"""

def enable_convert_button(audio):
    if audio is not None:
        return gr.update(interactive=True)
    return gr.update(interactive=False)


def clear_components():
    return None, None
js = """
document.addEventListener('DOMContentLoaded', function() {
const audioInput = document.querySelector('input[type="file"]');
const convertButton = document.querySelector('button');
function updateButtonText() {
if (audioInput.files.length > 0) {
convertButton.innerText = "Uploading audio, please wait ...";
}
}
audioInput.addEventListener('change', updateButtonText);
});
"""
# Gradio interface
# js= attaches the page script defined above.
with gr.Blocks(js=js) as interface:
    gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
    gr.Markdown(
        "Upload an audio file, or record with your microphone, to convert it "
        "to the target speaker's voice using soft speech units."
    )
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio", sources=["upload", "microphone"])
            convert_button = gr.Button("Convert Speech", interactive=False)
        with gr.Column():
            converted_audio = gr.Audio(type="numpy", label="Converted Speech")

    audio_input.change(enable_convert_button, inputs=[audio_input], outputs=[convert_button])
    convert_button.click(convert_speech, inputs=[audio_input], outputs=[converted_audio])
    audio_input.clear(clear_components, inputs=None, outputs=[audio_input, converted_audio])

interface.launch(debug=False)