# vc_demo/app.py — voice-conversion demo (author: SefyanKehail, commit ad61f31, status: solved)
import torch, torchaudio
import requests
import IPython.display as display
import gradio as gr
import os
import sys
# Fetch the soft-VC model *architectures* from torch.hub; pretrained=False
# because the weights come from the local CPU checkpoints loaded below.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True, pretrained=False)
acoustic = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True, pretrained=False)
hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, pretrained=False)
# Load fully pickled model objects (not state_dicts) saved for CPU inference;
# these *_loaded objects are what convert_speech() actually calls — the hub
# architectures above are currently unused at runtime.
# NOTE(review): torch.load unpickles arbitrary code — only ship trusted .pt files.
hubert_loaded = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'))
acoustic_loaded = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'))
hifigan_loaded = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'))
# Earlier approach (kept for reference): copy checkpoint weights into the
# hub-built architectures instead of using the pickled objects directly.
# model.load_state_dict(hubert_loaded.state_dict(), strict=False)
# acoustic.load_state_dict(acoustic_loaded.state_dict(), strict=False)
# hifigan.load_state_dict(hifigan_loaded.state_dict(), strict=False)
# print(hubert_loaded)
# print(model)
# sys.exit()
# Move models to CPU (if not already on CPU)
# hubert = hubert.to('cpu')
# acoustic = acoustic.to('cpu')
# hifigan = hifigan.to('cpu')
# Conversion function
def convert_speech(filename, progress=gr.Progress()):
    """Convert the recording at *filename* to the target speaker's voice.

    Pipeline: audio -> soft speech units (HuBERT) -> mel spectrogram
    (acoustic model) -> waveform (HiFi-GAN). Returns a
    ``(sample_rate, numpy_waveform)`` tuple for a gradio Audio output;
    the sample rate is always 16 kHz. Raises ValueError when no audio
    file was supplied.
    """
    if not filename:
        raise ValueError("Please provide an audio")
    progress(0, desc="Starting conversion")
    progress(0.1, desc="Loading audio")
    waveform, sample_rate = torchaudio.load(filename)
    progress(0.3, desc="Preprocessing audio")
    # Stereo (or multi-channel) input: keep only the first channel.
    if waveform.shape[0] > 1:
        waveform = waveform[0, :].unsqueeze(0)
    # The models expect 16 kHz mono audio with a leading batch dimension.
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    batch = waveform.unsqueeze(0).to('cpu')
    progress(0.6, desc="Converting speech")
    with torch.inference_mode():
        # Extract speech units from the source audio.
        units = hubert_loaded.units(batch)
        progress(0.7, desc="Generating target spectrogram")
        # Generate the target speaker's mel spectrogram.
        mel = acoustic_loaded.generate(units).transpose(1, 2)
        progress(0.8, desc="Generating audio waveform")
        # Vocode the spectrogram back to a waveform.
        converted = hifigan_loaded(mel)
    progress(0.9, desc="Postprocessing audio")
    # Drop the batch/channel dims and hand gradio a plain NumPy array.
    result = converted.squeeze().cpu().numpy()
    progress(1.0, desc="Conversion complete")
    return 16000, result
def enable_convert_button(audio):
    """Toggle the convert button and hide the info banner once audio exists.

    Returns updates for (convert_button, info).
    """
    if audio is None:
        # No audio yet: keep the button disabled and leave the banner alone.
        return gr.update(interactive=False), None
    return gr.update(interactive=True), gr.update(value="", visible=False)
def clear_components():
    """Reset both the audio input and the converted-audio output to empty."""
    cleared = None
    return cleared, cleared
def stop_recording_info(audio):
    """Show a 'please wait' banner while a fresh recording uploads.

    Returns an update for the info Markdown component.
    """
    if audio is not None:
        # Audio already present: keep the banner hidden.
        return gr.update(value="", visible=False)
    return gr.update(
        value="### <i style='color:yellow'>Recording and uploading, please wait ...</i>",
        visible=True,
    )
def stop():
    """Debug hook: print a marker so we can confirm the callback fired."""
    marker = "this is working"
    print(marker)
# Gradio interface
def gui():
    """Build and return the Gradio Blocks UI for the voice-conversion demo."""
    with gr.Blocks() as demo:
        gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
        gr.Markdown("Upload an audio file to convert it to the target speaker's voice using soft speech units. Or use your microphone.")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload Audio", editable=False)
                convert_button = gr.Button("Convert Speech", interactive=False)
                info = gr.Markdown("", visible=False)
            with gr.Column():
                converted_audio = gr.Audio(type="numpy", label="Converted Speech", show_share_button=False)
        # Event wiring: starting a recording shows the wait banner; any audio
        # change toggles the convert button; clicking runs the conversion;
        # clearing resets both audio widgets.
        audio_input.start_recording(stop_recording_info, inputs=[audio_input], outputs=[info])
        audio_input.change(enable_convert_button, inputs=[audio_input], outputs=[convert_button, info])
        convert_button.click(convert_speech, inputs=[audio_input], outputs=[converted_audio])
        audio_input.clear(clear_components, inputs=None, outputs=[audio_input, converted_audio])
    return demo
if __name__ == "__main__":
    demo = gui()
    # Allow up to 40 conversion jobs to run concurrently through the queue.
    demo.queue(default_concurrency_limit=40)
    # share=True publishes a temporary public gradio.live URL.
    demo.launch(max_threads=40, share=True, show_error=True, quiet=False, debug=False)