| import torch |
| import torchaudio |
| import gradio as gr |
|
|
| from zonos.model import Zonos |
| from zonos.conditioning import make_cond_dict, supported_language_codes |
|
|
# Device string handed to model loading and tensor creation throughout this module.
device = "cuda"

# Single-slot cache for the most recently loaded model and the UI choice
# ("Transformer" / "Hybrid") it was loaded for. Both start empty.
CURRENT_MODEL = None
CURRENT_MODEL_TYPE = None
|
|
|
|
def load_model_if_needed(model_choice: str):
    """Return the model for ``model_choice``, loading it only on a change.

    The most recently loaded model is cached in the module globals
    ``CURRENT_MODEL`` / ``CURRENT_MODEL_TYPE``. When ``model_choice`` differs
    from the cached type, the cached model is released, the requested
    checkpoint is loaded, moved to ``device``, cast to bfloat16 and switched
    to eval mode.

    Args:
        model_choice: ``"Transformer"`` selects the transformer checkpoint;
            any other value selects the hybrid checkpoint.

    Returns:
        The cached or freshly loaded model.

    Raises:
        Whatever ``Zonos.from_pretrained`` or the subsequent model setup
        raises. In that case the cache is left empty (both globals ``None``)
        so a later call can retry cleanly.
    """
    global CURRENT_MODEL_TYPE, CURRENT_MODEL

    if CURRENT_MODEL_TYPE == model_choice:
        print(f"{model_choice} model is already loaded.")
        return CURRENT_MODEL

    # Clear the model and its type tag together by rebinding to None rather
    # than ``del``-ing the global: if loading fails below, the cache is then
    # simply empty instead of leaving an unbound name next to a stale type.
    had_model = CURRENT_MODEL is not None
    CURRENT_MODEL = None
    CURRENT_MODEL_TYPE = None
    if had_model:
        # Hand the freed model's cached CUDA blocks back before loading the next one.
        torch.cuda.empty_cache()

    print(f"Loading {model_choice} model...")
    if model_choice == "Transformer":
        repo_id = "Zyphra/Zonos-v0.1-transformer"
    else:
        repo_id = "Zyphra/Zonos-v0.1-hybrid"

    # Configure a local first; the globals are only updated once the model
    # is completely ready to use.
    model = Zonos.from_pretrained(repo_id, device=device)
    model.to(device)
    model.bfloat16()
    model.eval()

    CURRENT_MODEL = model
    CURRENT_MODEL_TYPE = model_choice
    print(f"{model_choice} model loaded successfully!")
    return CURRENT_MODEL
|
|
|
|
def update_ui(model_choice):
    """Compute visibility updates for the UI from the model's conditioners.

    Loads (or reuses) the model for ``model_choice``, reads the names of its
    prefix conditioners and returns one ``gr.update(visible=...)`` per UI
    component. No update is produced for ``language_id`` or ``ctc_loss``;
    those conditioners are never exposed in the UI even if the model has them.

    Returns:
        A 26-tuple of updates; the trailing comments name the component each
        position is intended for.
    """
    model = load_model_if_needed(model_choice)
    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
    print("Conditioners in this model:", cond_names)

    def shown_if(conditioner):
        # Visible only when the loaded model has this conditioner.
        return gr.update(visible=(conditioner in cond_names))

    return (
        shown_if("espeak"),  # text
        shown_if("espeak"),  # language
        shown_if("speaker"),  # speaker_audio
        gr.update(visible=True),  # prefix_audio is always shown
        shown_if("speaker"),  # skip_speaker
        shown_if("emotion"),  # skip_emotion
        shown_if("emotion"),  # emotion1
        shown_if("emotion"),  # emotion2
        shown_if("emotion"),  # emotion3
        shown_if("emotion"),  # emotion4
        shown_if("emotion"),  # emotion5
        shown_if("emotion"),  # emotion6
        shown_if("emotion"),  # emotion7
        shown_if("emotion"),  # emotion8
        shown_if("vqscore_8"),  # skip_vqscore_8
        shown_if("vqscore_8"),  # vq_single_slider
        shown_if("fmax"),  # fmax_slider
        shown_if("fmax"),  # skip_fmax
        shown_if("pitch_std"),  # pitch_std_slider
        shown_if("pitch_std"),  # skip_pitch_std
        shown_if("speaking_rate"),  # speaking_rate_slider
        shown_if("speaking_rate"),  # skip_speaking_rate
        shown_if("dnsmos_ovrl"),  # dnsmos_slider
        shown_if("dnsmos_ovrl"),  # skip_dnsmos_ovrl
        shown_if("speaker_noised"),  # speaker_noised_checkbox
        shown_if("speaker_noised"),  # skip_speaker_noised
    )
|
|
|
|
def generate_audio(
    model_choice,
    text,
    language,
    speaker_audio,
    prefix_audio,
    skip_speaker,
    skip_emotion,
    e1,
    e2,
    e3,
    e4,
    e5,
    e6,
    e7,
    e8,
    skip_vqscore_8,
    vq_single,
    fmax,
    skip_fmax,
    pitch_std,
    skip_pitch_std,
    speaking_rate,
    skip_speaking_rate,
    dnsmos_ovrl,
    skip_dnsmos_ovrl,
    speaker_noised,
    skip_speaker_noised,
    cfg_scale,
    min_p,
    seed,
):
    """Synthesize audio from the current UI values.

    The ``skip_*`` flags select which conditioners are passed to
    ``make_cond_dict`` as unconditional. ``language_id`` and ``ctc_loss`` are
    never supplied, even if the model has those conditioners.

    Args:
        model_choice: Model variant, forwarded to ``load_model_if_needed``.
        text, language: Forwarded unchanged to ``make_cond_dict``.
        speaker_audio: Path loadable by ``torchaudio.load`` or ``None``; used
            for a speaker embedding unless ``skip_speaker`` is set.
        prefix_audio: Path loadable by ``torchaudio.load`` or ``None``; when
            given it is encoded and passed as ``audio_prefix_codes``.
        e1..e8: The eight emotion values, packed into a 1x8 tensor.
        vq_single: Single value repeated eight times for ``vqscore_8``.
        fmax, pitch_std, speaking_rate, dnsmos_ovrl: Converted to ``float``.
        speaker_noised: Converted to ``bool``.
        cfg_scale, min_p: Generation settings, converted to ``float``.
        seed: Converted to ``int`` and given to ``torch.manual_seed``.

    Returns:
        ``(sample_rate, waveform)`` where ``waveform`` is the decoded audio
        after ``squeeze()`` as a NumPy array.
    """
    selected_model = load_model_if_needed(model_choice)

    # Conditioner key paired with the checkbox that disables it.
    skip_table = (
        ("speaker", skip_speaker),
        ("emotion", skip_emotion),
        ("vqscore_8", skip_vqscore_8),
        ("fmax", skip_fmax),
        ("pitch_std", skip_pitch_std),
        ("speaking_rate", skip_speaking_rate),
        ("dnsmos_ovrl", skip_dnsmos_ovrl),
        ("speaker_noised", skip_speaker_noised),
    )
    uncond_keys = [key for key, skipped in skip_table if skipped]

    # Normalize the raw widget values to plain Python scalars.
    speaker_noised_bool = bool(speaker_noised)
    fmax, pitch_std, speaking_rate, dnsmos_ovrl, cfg_scale, min_p = (
        float(value)
        for value in (fmax, pitch_std, speaking_rate, dnsmos_ovrl, cfg_scale, min_p)
    )
    seed = int(seed)
    # Fixed generation budget.
    # NOTE(review): presumably ~86 codec frames per second x 30 s — confirm.
    max_new_tokens = 86 * 30

    torch.manual_seed(seed)

    # Speaker embedding is only computed when a clip was supplied and the
    # speaker conditioner is not being skipped.
    speaker_embedding = None
    if not skip_speaker and speaker_audio is not None:
        wav, sr = torchaudio.load(speaker_audio)
        speaker_embedding = selected_model.make_speaker_embedding(wav, sr).to(
            device, dtype=torch.bfloat16
        )

    # Optional prefix: average the channels, resample to the autoencoder's
    # rate and encode to codes.
    audio_prefix_codes = None
    if prefix_audio is not None:
        wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
        wav_prefix = wav_prefix.mean(0, keepdim=True)
        target_sr = selected_model.autoencoder.sampling_rate
        wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, target_sr)
        wav_prefix = wav_prefix.to(device, dtype=torch.float32)
        with torch.autocast(device, dtype=torch.float32):
            audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))

    # 1x8 tensors for the emotion vector and the repeated VQ score.
    emotion_values = [float(e) for e in (e1, e2, e3, e4, e5, e6, e7, e8)]
    emotion_tensor = torch.tensor([emotion_values], device=device)
    vq_tensor = torch.tensor([[float(vq_single)] * 8], device=device)

    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker_embedding,
        emotion=emotion_tensor,
        vqscore_8=vq_tensor,
        fmax=fmax,
        pitch_std=pitch_std,
        speaking_rate=speaking_rate,
        dnsmos_ovrl=dnsmos_ovrl,
        speaker_noised=speaker_noised_bool,
        device=device,
        unconditional_keys=uncond_keys,
    )
    conditioning = selected_model.prepare_conditioning(cond_dict)

    codes = selected_model.generate(
        prefix_conditioning=conditioning,
        audio_prefix_codes=audio_prefix_codes,
        max_new_tokens=max_new_tokens,
        cfg_scale=cfg_scale,
        batch_size=1,
        sampling_params={"min_p": min_p},
    )

    sr_out = selected_model.autoencoder.sampling_rate
    wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
    # A 2-D result with several rows is reduced to its first row.
    has_extra_rows = wav_out.dim() == 2 and wav_out.size(0) > 1
    if has_extra_rows:
        wav_out = wav_out[:1]
    return sr_out, wav_out.squeeze().numpy()
|
|
|
|
def build_interface():
    """Assemble the Gradio Blocks app and wire up its callbacks.

    Creates the model/text/audio inputs, the conditioning and emotion
    sliders, the "skip" checkboxes and the generation controls, then
    registers three events:

    * ``model_choice.change`` and ``demo.load`` both call ``update_ui`` with
      the selected model and write its results to the 26 conditioning
      components.
    * ``generate_button.click`` calls ``generate_audio`` with the selected
      model, those same 26 components and the three generation controls,
      and writes the result to ``output_audio``.

    Returns:
        The constructed ``gr.Blocks`` instance (not launched).
    """
    # NOTE(review): the Row/Column nesting below was reconstructed from a copy
    # of this function whose indentation had been lost — confirm the layout
    # against the running UI.
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                model_choice = gr.Dropdown(
                    label="Zonos Model Type",
                    info="Select the model variant to use.",
                    choices=["Hybrid", "Transformer"],
                    value="Transformer",
                )
                text = gr.Textbox(
                    label="Text to Synthesize",
                    value="Zonos uses eSpeak for text to phoneme conversion!",
                    lines=4,
                )
                language = gr.Dropdown(
                    label="Language Code",
                    info="Select a language code.",
                    choices=supported_language_codes,
                    value="en-us",
                )
            prefix_audio = gr.Audio(
                label="Optional Prefix Audio (continue from this audio)",
                value="assets/silence_100ms.wav",
                type="filepath",
            )
            with gr.Column():
                speaker_audio = gr.Audio(
                    label="Optional Speaker Audio (for cloning)",
                    type="filepath",
                )
                speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)

        with gr.Column():
            gr.Markdown("## Conditioning Parameters")

            with gr.Row():
                dnsmos_slider = gr.Slider(
                    minimum=1.0, maximum=5.0, value=4.0, step=0.1, label="DNSMOS Overall"
                )
                fmax_slider = gr.Slider(
                    minimum=0, maximum=24000, value=22050, step=1, label="Fmax (Hz)"
                )
                vq_single_slider = gr.Slider(
                    minimum=0.5, maximum=0.8, value=0.78, step=0.01, label="VQ Score"
                )
                pitch_std_slider = gr.Slider(
                    minimum=0.0, maximum=400.0, value=20.0, step=1, label="Pitch Std"
                )
                speaking_rate_slider = gr.Slider(
                    minimum=0.0, maximum=40.0, value=15.0, step=1, label="Speaking Rate"
                )

            gr.Markdown("### Emotion Sliders")
            with gr.Row():
                emotion1 = gr.Slider(minimum=0.0, maximum=1.0, value=0.6, step=0.05, label="Happiness")
                emotion2 = gr.Slider(minimum=0.0, maximum=1.0, value=0.05, step=0.05, label="Sadness")
                emotion3 = gr.Slider(minimum=0.0, maximum=1.0, value=0.05, step=0.05, label="Disgust")
                emotion4 = gr.Slider(minimum=0.0, maximum=1.0, value=0.05, step=0.05, label="Fear")
            with gr.Row():
                emotion5 = gr.Slider(minimum=0.0, maximum=1.0, value=0.05, step=0.05, label="Surprise")
                emotion6 = gr.Slider(minimum=0.0, maximum=1.0, value=0.05, step=0.05, label="Anger")
                emotion7 = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05, label="Other")
                emotion8 = gr.Slider(minimum=0.0, maximum=1.0, value=0.6, step=0.05, label="Neutral")

            gr.Markdown("### Unconditional Toggles")
            with gr.Row():
                skip_speaker = gr.Checkbox(label="Skip Speaker", value=False)
                skip_emotion = gr.Checkbox(label="Skip Emotion", value=False)
                skip_vqscore_8 = gr.Checkbox(label="Skip VQ Score", value=True)
                skip_fmax = gr.Checkbox(label="Skip Fmax", value=False)
                skip_pitch_std = gr.Checkbox(label="Skip Pitch Std", value=False)
                skip_speaking_rate = gr.Checkbox(label="Skip Speaking Rate", value=False)
                skip_dnsmos_ovrl = gr.Checkbox(label="Skip DNSMOS", value=True)
                skip_speaker_noised = gr.Checkbox(label="Skip Noised Speaker", value=False)

        with gr.Column():
            gr.Markdown("## Generation Parameters")
            with gr.Row():
                cfg_scale_slider = gr.Slider(
                    minimum=1.0, maximum=5.0, value=2.0, step=0.1, label="CFG Scale"
                )
                min_p_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="Min P"
                )
                seed_number = gr.Number(label="Seed", value=420, precision=0)

            generate_button = gr.Button("Generate Audio")
            output_audio = gr.Audio(label="Generated Audio", type="numpy")

        # The order here must match both the tuple returned by update_ui and
        # the parameters of generate_audio that follow model_choice.
        conditioning_components = (
            text,
            language,
            speaker_audio,
            prefix_audio,
            skip_speaker,
            skip_emotion,
            emotion1,
            emotion2,
            emotion3,
            emotion4,
            emotion5,
            emotion6,
            emotion7,
            emotion8,
            skip_vqscore_8,
            vq_single_slider,
            fmax_slider,
            skip_fmax,
            pitch_std_slider,
            skip_pitch_std,
            speaking_rate_slider,
            skip_speaking_rate,
            dnsmos_slider,
            skip_dnsmos_ovrl,
            speaker_noised_checkbox,
            skip_speaker_noised,
        )

        # Refresh component visibility whenever the model selection changes...
        model_choice.change(
            fn=update_ui,
            inputs=[model_choice],
            outputs=list(conditioning_components),
        )

        # ...and once when the page first loads.
        demo.load(
            fn=update_ui,
            inputs=[model_choice],
            outputs=list(conditioning_components),
        )

        # Run synthesis on button click.
        generate_button.click(
            fn=generate_audio,
            inputs=[
                model_choice,
                *conditioning_components,
                cfg_scale_slider,
                min_p_slider,
                seed_number,
            ],
            outputs=[output_audio],
        )

    return demo
|
|
|
|
if __name__ == "__main__":
    # Build the UI and serve it on all network interfaces at port 7860;
    # share=True additionally requests a public Gradio share link.
    demo = build_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
    )