"""Gradio app for voice conversion with soft speech units.

Pipeline (CPU only): HuBERT-Soft content encoder -> acoustic model ->
HiFi-GAN vocoder, following https://github.com/bshall/soft-vc.
"""

import os
import sys

import gradio as gr
import IPython.display as display  # NOTE(review): unused outside a notebook
import requests  # NOTE(review): unused in this script
import torch
import torchaudio

# Architecture-only models from torch.hub (pretrained=False, so weights are
# random).  NOTE(review): these are never used for inference below -- the
# fully pickled models loaded with torch.load are.  They look like leftovers
# from an abandoned state_dict-loading approach (the original file carried
# commented-out load_state_dict calls); consider removing them to avoid the
# hub downloads at startup.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True, pretrained=False)
acoustic = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True, pretrained=False)
hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, pretrained=False)

# Fully serialized CPU checkpoints actually used for inference.  These are
# whole pickled model objects (not state_dicts), so they expose .units(),
# .generate(), and __call__ directly.
hubert_loaded = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'))
acoustic_loaded = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'))
hifigan_loaded = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'))


def convert_speech(filename, progress=gr.Progress()):
    """Run the full voice-conversion pipeline on one audio file.

    Args:
        filename: Path to the input audio file (Gradio ``type="filepath"``).
        progress: Gradio progress tracker (``gr.Progress()`` as the default
            is the Gradio idiom; Gradio injects the live tracker at call time).

    Returns:
        Tuple ``(sample_rate, waveform)`` -- 16000 and a NumPy array -- in
        the format expected by a ``gr.Audio(type="numpy")`` output.

    Raises:
        ValueError: If ``filename`` is empty or None.
    """
    if not filename:
        raise ValueError("Please provide an audio")

    progress(0, desc="Starting conversion")
    progress(0.1, desc="Loading audio")
    source, sr = torchaudio.load(filename)

    progress(0.3, desc="Preprocessing audio")
    # Use only the first channel if the audio is stereo.
    if source.shape[0] > 1:
        source = source[0, :].unsqueeze(0)
    # The models expect 16 kHz audio with a leading batch dimension.
    source = torchaudio.functional.resample(source, sr, 16000)
    source = source.unsqueeze(0).to('cpu')

    progress(0.6, desc="Converting speech")
    # Convert to the target speaker:
    with torch.inference_mode():
        # Content encoder: extract soft speech units.
        units = hubert_loaded.units(source)
        progress(0.7, desc="Generating target spectrogram")
        # Acoustic model: units -> mel spectrogram (transposed for HiFi-GAN).
        mel = acoustic_loaded.generate(units).transpose(1, 2)
        progress(0.8, desc="Generating audio waveform")
        # Vocoder: mel spectrogram -> waveform.
        target = hifigan_loaded(mel)

    progress(0.9, desc="Postprocessing audio")
    # Move the tensor to CPU and convert to NumPy for the Gradio audio widget.
    target = target.squeeze().cpu().numpy()
    progress(1.0, desc="Conversion complete")
    return 16000, target


def enable_convert_button(audio):
    """Enable the Convert button once audio is present; hide the info banner."""
    if audio is not None:
        return gr.update(interactive=True), gr.update(value="", visible=False)
    return gr.update(interactive=False), None


def clear_components():
    """Reset both the input and the converted-output audio widgets."""
    return None, None


def stop_recording_info(audio):
    """Show a 'please wait' banner while a fresh recording is uploading."""
    if audio is None:
        return gr.update(value="### Recording and uploading, please wait ...", visible=True)
    return gr.update(value="", visible=False)


def stop():
    """Debug stub (not wired to any event in the interface)."""
    print("this is working")


def gui():
    """Build and return the Gradio Blocks interface."""
    with gr.Blocks() as interface:
        gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
        gr.Markdown("Upload an audio file to convert it to the target speaker's voice using soft speech units. Or use your microphone.")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload Audio", editable=False)
                convert_button = gr.Button("Convert Speech", interactive=False)
                info = gr.Markdown("", visible=False)
            with gr.Column():
                converted_audio = gr.Audio(type="numpy", label="Converted Speech", show_share_button=False)

        # Banner while a microphone recording is uploading; cleared by the
        # subsequent .change event once the file is available.
        audio_input.start_recording(stop_recording_info, inputs=[audio_input], outputs=[info])
        audio_input.change(enable_convert_button, inputs=[audio_input], outputs=[convert_button, info])
        convert_button.click(convert_speech, inputs=[audio_input], outputs=[converted_audio])
        audio_input.clear(clear_components, inputs=None, outputs=[audio_input, converted_audio])
    return interface


if __name__ == "__main__":
    app = gui()
    app.queue(default_concurrency_limit=40)
    app.launch(
        max_threads=40,
        share=True,
        show_error=True,
        quiet=False,
        debug=False,
    )