# NOTE(review): removed git-blame/byte-count artifacts (commit hashes and a
# line-number dump) that were accidentally pasted above the imports — they
# are not Python and would make this file a SyntaxError.
import torch, torchaudio
import requests
import IPython.display as display
import gradio as gr
import os
import sys
# Instantiate the three model architectures from torch.hub with
# pretrained=False — this registers the model classes so the pickled
# checkpoints below can be unpickled; the hub weights themselves are unused.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True, pretrained=False)
acoustic = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True, pretrained=False)
hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, pretrained=False)
# Load fully-pickled model objects (not state dicts) onto the CPU.
# NOTE(review): torch.load unpickles arbitrary Python objects — only ever
# load these *_cpu.pt files from a trusted source; prefer saving/loading
# state_dicts (or weights_only=True) if the checkpoints can be regenerated.
hubert_loaded = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'))
acoustic_loaded = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'))
hifigan_loaded = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'))
# Set the state dictionaries to the models
# model.load_state_dict(hubert_loaded.state_dict(), strict=False)
# acoustic.load_state_dict(acoustic_loaded.state_dict(), strict=False)
# hifigan.load_state_dict(hifigan_loaded.state_dict(), strict=False)
# print(hubert_loaded)
# print(model)
# sys.exit()
# Move models to CPU (if not already on CPU)
# hubert = hubert.to('cpu')
# acoustic = acoustic.to('cpu')
# hifigan = hifigan.to('cpu')
# Conversion function
# Conversion function
def convert_speech(filename, progress=gr.Progress()):
    """Convert the audio at *filename* to the target speaker's voice.

    Runs the three-stage soft-VC pipeline (HuBERT units -> acoustic model
    mel spectrogram -> HiFi-GAN vocoder) entirely on the CPU, reporting
    progress to the Gradio UI along the way.

    Returns a ``(sample_rate, waveform)`` pair for a Gradio ``Audio``
    output component; the sample rate is always 16000.

    Raises ``ValueError`` when no audio path was provided.
    """
    if not filename:
        raise ValueError("Please provide an audio")
    progress(0, desc="Starting conversion")
    progress(0.1, desc="Loading audio")
    waveform, sample_rate = torchaudio.load(filename)
    progress(0.3, desc="Preprocessing audio")
    # Multi-channel input: keep only the first channel, as mono (1, T).
    if waveform.shape[0] > 1:
        waveform = waveform[0, :].unsqueeze(0)
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    # Add a batch dimension: the models expect (batch, channel, time).
    waveform = waveform.unsqueeze(0).to('cpu')
    progress(0.6, desc="Converting speech")
    # Run the pipeline without autograd bookkeeping.
    with torch.inference_mode():
        # Extract speech units from the source audio.
        units = hubert_loaded.units(waveform)
        progress(0.7, desc="Generating target spectrogram")
        # Predict the target speaker's mel spectrogram.
        mel = acoustic_loaded.generate(units).transpose(1, 2)
        progress(0.8, desc="Generating audio waveform")
        # Vocode the spectrogram back into a waveform.
        converted = hifigan_loaded(mel)
    progress(0.9, desc="Postprocessing audio")
    # Drop the batch/channel axes and hand NumPy data to Gradio.
    converted = converted.squeeze().cpu().numpy()
    progress(1.0, desc="Conversion complete")
    return 16000, converted
"""Convert to the target speaker:"""
def enable_convert_button(audio):
    """Gate the Convert button on the presence of input audio.

    Returns updates for (convert_button, info): with audio present the
    button becomes clickable and the info banner is cleared and hidden;
    without audio the button is disabled and the banner is left untouched.
    """
    if audio is None:
        return gr.update(interactive=False), None
    return gr.update(interactive=True), gr.update(value="", visible=False)
def clear_components():
    """Reset both the input and the converted-output audio widgets."""
    return (None, None)
def stop_recording_info(audio):
    """Show a 'please wait' banner while a recording is still uploading.

    While the audio component has no data yet the banner becomes visible;
    once data arrives it is cleared and hidden again.
    """
    if audio is not None:
        return gr.update(value="", visible=False)
    return gr.update(value="### <i style='color:yellow'>Recording and uploading, please wait ...</i>", visible=True)
def stop():
    """Debug hook: log a fixed message to stdout (no other effect)."""
    print("this is working")
# Gradio interface
def gui():
    """Build and return the Gradio Blocks UI for the voice-conversion demo."""
    with gr.Blocks() as demo:
        gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
        gr.Markdown("Upload an audio file to convert it to the target speaker's voice using soft speech units. Or use your microphone.")
        with gr.Row():
            # Left column: source audio plus controls.
            with gr.Column():
                audio_in = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload Audio", editable=False)
                run_btn = gr.Button("Convert Speech", interactive=False)
                banner = gr.Markdown("", visible=False)
            # Right column: the converted result.
            with gr.Column():
                audio_out = gr.Audio(type="numpy", label="Converted Speech", show_share_button=False)
        # Event wiring: banner while recording, button gated on input data,
        # conversion on click, and a full reset when the input is cleared.
        audio_in.start_recording(stop_recording_info, inputs=[audio_in], outputs=[banner])
        audio_in.change(enable_convert_button, inputs=[audio_in], outputs=[run_btn, banner])
        run_btn.click(convert_speech, inputs=[audio_in], outputs=[audio_out])
        audio_in.clear(clear_components, inputs=None, outputs=[audio_in, audio_out])
    return demo
if __name__ == "__main__":
app = gui()
app.queue(default_concurrency_limit=40)
app.launch(
max_threads=40,
share=True,
show_error=True,
quiet=False,
debug=False,
)
# NOTE(review): removed a stray trailing "|" extraction artifact.