"""Gradio demo for OpenVoice tone-color conversion.

Takes a reference (target-speaker) recording and a "transform" recording,
and re-renders the transform speech in the reference speaker's tone color.
"""

import argparse
import os

import gradio as gr
import torch

parser = argparse.ArgumentParser()
parser.add_argument(
    "--share", action="store_true", default=False, help="make link public"
)
args = parser.parse_args()

print("Starting OpenVoice")

# Imported after argument parsing so `--help` and argument errors surface
# before the heavyweight model modules are loaded.
from openvoice import se_extractor
from openvoice.api import ToneColorConverter

ckpt_converter = "checkpoints/converter"
device = "cuda" if torch.cuda.is_available() else "cpu"
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)

tone_color_converter = ToneColorConverter(
    f"{ckpt_converter}/config.json", device=device
)
tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")


def predict(speaker_wav, transform_wav):
    """Re-voice `transform_wav` using the tone color of `speaker_wav`.

    Args:
        speaker_wav: path to the reference (target-speaker) audio file.
        transform_wav: path to the audio whose speech will be converted.

    Returns:
        A 3-tuple `(info_text, output_path, reference_path)`; the audio
        entries are None when a tone-color embedding cannot be extracted.
    """
    # Accumulates status / error messages shown in the "Info" textbox.
    text_hint = ""

    # Extract the tone-color embedding of the audio to be transformed.
    # Wrapped in try/except for consistency with the target extraction
    # below: a bad upload should produce an error message, not a crash.
    try:
        source_se = se_extractor.get_se(
            transform_wav,
            tone_color_converter,
        )
    except Exception as e:
        text_hint += f"[ERROR] Get source tone color error {str(e)} \n"
        gr.Warning(f"[ERROR] Get source tone color error {str(e)} \n")
        return (
            text_hint,
            None,
            None,
        )

    # Extract the tone-color embedding of the reference speaker.
    try:
        target_se = se_extractor.get_se(
            speaker_wav,
            tone_color_converter,
        )
    except Exception as e:
        text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
        # BUG FIX: the original call lacked the f-prefix, so the literal
        # text "{str(e)}" was shown to the user instead of the exception.
        gr.Warning(f"[ERROR] Get target tone color error {str(e)} \n")
        return (
            text_hint,
            None,
            None,
        )

    save_path = f"{output_dir}/output.wav"

    # Run the tone color converter.
    tone_color_converter.convert(
        audio_src_path=transform_wav,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
    )

    text_hint += "Get response successfully \n"
    return (
        text_hint,
        save_path,
        speaker_wav,
    )


with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Row():
        with gr.Column():
            ref_gr = gr.Audio(
                label="Reference Audio",
                type="filepath",
                value="examples/speaker0.mp3",
            )
            tra_gr = gr.Audio(
                label="Transform Audio",
                type="filepath",
                value=None,
            )
            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
        with gr.Column():
            out_text_gr = gr.Text(label="Info")
            audio_gr = gr.Audio(label="Synthesized Audio", autoplay=True)
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

    tts_button.click(
        predict,
        [ref_gr, tra_gr],
        outputs=[out_text_gr, audio_gr, ref_audio_gr],
    )

demo.queue()
demo.launch(debug=True, show_api=True, share=args.share)