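# Zonos TTS demo (Hugging Face Space).
# flash-attn, mamba-ssm, and causal-conv1d are installed at startup rather than via
# requirements.txt: flash-attn with FLASH_ATTENTION_SKIP_CUDA_BUILD so its CUDA
# kernels are not compiled, and the other two from prebuilt wheels pinned to
# CUDA 12 / torch 2.4 / CPython 3.10 -- a common workaround when the Space's build
# environment cannot compile these extensions itself.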
import os
import shlex
import subprocess

subprocess.run(
    shlex.split("pip install flash-attn --no-build-isolation"),
    env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=True,
)
subprocess.run(
    shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
    check=True,
)
subprocess.run(
    shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
    check=True,
)
import spaces
import torch
import torchaudio
import gradio as gr
from os import getenv
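
# Some CUDA device-property objects are missing fields that code such as
# Triton / torch.compile may read; patch in plausible defaults (65536 registers
# and 2048 threads per SM are typical of recent NVIDIA GPUs) so those lookups
# do not fail.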
def patch_cuda():
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            p = torch.cuda.get_device_properties(i)
            if not hasattr(p, "regs_per_multiprocessor"):
                setattr(p, "regs_per_multiprocessor", 65536)
            if not hasattr(p, "max_threads_per_multi_processor"):
                setattr(p, "max_threads_per_multi_processor", 2048)


patch_cuda()
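
# zonos is imported only after the runtime installs above, since its hybrid
# backbone depends on mamba-ssm and causal-conv1d. Both model variants are
# loaded once at startup and frozen for inference.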
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes

device = "cuda"
MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
for model in MODELS.values():
    model.requires_grad_(False).eval()
def update_ui(model_choice):
    """
    Dynamically show/hide UI elements based on the model's conditioners.
    We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
    """
    model = MODELS[model_choice]
    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
    print("Conditioners in this model:", cond_names)

    text_update = gr.update(visible=("espeak" in cond_names))
    language_update = gr.update(visible=("espeak" in cond_names))
    speaker_audio_update = gr.update(visible=("speaker" in cond_names))
    prefix_audio_update = gr.update(visible=True)
    emotion1_update = gr.update(visible=("emotion" in cond_names))
    emotion2_update = gr.update(visible=("emotion" in cond_names))
    emotion3_update = gr.update(visible=("emotion" in cond_names))
    emotion4_update = gr.update(visible=("emotion" in cond_names))
    emotion5_update = gr.update(visible=("emotion" in cond_names))
    emotion6_update = gr.update(visible=("emotion" in cond_names))
    emotion7_update = gr.update(visible=("emotion" in cond_names))
    emotion8_update = gr.update(visible=("emotion" in cond_names))
    vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
    fmax_slider_update = gr.update(visible=("fmax" in cond_names))
    pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
    speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
    dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
    speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
    unconditional_keys_update = gr.update(
        choices=[name for name in cond_names if name not in ("espeak", "language_id")]
    )
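    # This tuple's order must match the `outputs` lists wired up in
    # build_interface() (model_choice.change and demo.load).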
    return (
        text_update,
        language_update,
        speaker_audio_update,
        prefix_audio_update,
        emotion1_update,
        emotion2_update,
        emotion3_update,
        emotion4_update,
        emotion5_update,
        emotion6_update,
        emotion7_update,
        emotion8_update,
        vq_single_slider_update,
        fmax_slider_update,
        pitch_std_slider_update,
        speaking_rate_slider_update,
        dnsmos_slider_update,
        speaker_noised_checkbox_update,
        unconditional_keys_update,
    )
def generate_audio(
    model_choice,
    text,
    language,
    speaker_audio,
    prefix_audio,
    e1,
    e2,
    e3,
    e4,
    e5,
    e6,
    e7,
    e8,
    vq_single,
    fmax,
    pitch_std,
    speaking_rate,
    dnsmos_ovrl,
    speaker_noised,
    cfg_scale,
    min_p,
    seed,
    randomize_seed,
    unconditional_keys,
    progress=gr.Progress(),
):
    """
    Generates audio based on the provided UI parameters.
    We do NOT use language_id or ctc_loss even if the model has them.
    """
    selected_model = MODELS[model_choice]

    speaker_noised_bool = bool(speaker_noised)
    fmax = float(fmax)
    pitch_std = float(pitch_std)
    speaking_rate = float(speaking_rate)
    dnsmos_ovrl = float(dnsmos_ovrl)
    cfg_scale = float(cfg_scale)
    min_p = float(min_p)
    seed = int(seed)
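    # Zonos emits roughly 86 codec frames per second, so this caps generation
    # at about 30 seconds of audio.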
    max_new_tokens = 86 * 30

    if randomize_seed:
        seed = torch.randint(0, 2**32 - 1, (1,)).item()
    torch.manual_seed(seed)
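
    # Voice cloning: embed the reference audio unless "speaker" was marked
    # unconditional in the advanced settings.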
    speaker_embedding = None
    if speaker_audio is not None and "speaker" not in unconditional_keys:
        wav, sr = torchaudio.load(speaker_audio)
        speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
        speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
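
    # Optional audio prefix: mix down to mono, resample to the autoencoder's
    # sample rate, and encode to codes so generation continues from that audio.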
    audio_prefix_codes = None
    if prefix_audio is not None:
        wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
        wav_prefix = wav_prefix.mean(0, keepdim=True)
        wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
        wav_prefix = wav_prefix.to(device, dtype=torch.float32)
        with torch.autocast(device, dtype=torch.float32):
            audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
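
    # Pack the eight emotion sliders into one tensor, and broadcast the single
    # "clarity" slider across the 8 VQ-score slots the conditioner expects.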
    emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)

    vq_val = float(vq_single)
    vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)
    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker_embedding,
        emotion=emotion_tensor,
        vqscore_8=vq_tensor,
        fmax=fmax,
        pitch_std=pitch_std,
        speaking_rate=speaking_rate,
        dnsmos_ovrl=dnsmos_ovrl,
        speaker_noised=speaker_noised_bool,
        device=device,
        unconditional_keys=unconditional_keys,
    )
    conditioning = selected_model.prepare_conditioning(cond_dict)
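
    # Rough progress estimate: assume ~400 characters of text per 30 s of
    # speech, at ~86 generation steps per second of audio.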
    estimated_generation_duration = 30 * len(text) / 400
    estimated_total_steps = int(estimated_generation_duration * 86)

    def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
        progress((step, estimated_total_steps))
        return True

    codes = selected_model.generate(
        prefix_conditioning=conditioning,
        audio_prefix_codes=audio_prefix_codes,
        max_new_tokens=max_new_tokens,
        cfg_scale=cfg_scale,
        batch_size=1,
        sampling_params=dict(min_p=min_p),
        callback=update_progress,
    )
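
    # Decode the codes back to a waveform, keeping only the first channel if
    # the decoder returns multi-channel audio, and hand Gradio (sr, ndarray).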
    wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
    sr_out = selected_model.autoencoder.sampling_rate
    if wav_out.dim() == 2 and wav_out.size(0) > 1:
        wav_out = wav_out[0:1, :]
    return (sr_out, wav_out.squeeze().numpy()), seed
# Custom CSS for pastel gradient background and enhanced UI
custom_css = """
.gradio-container {
    background: linear-gradient(135deg, #f3e7ff, #e6f0ff, #ffe6f2, #e6fff9);
    background-size: 400% 400%;
    animation: gradient 15s ease infinite;
}

@keyframes gradient {
    0% {
        background-position: 0% 50%;
    }
    50% {
        background-position: 100% 50%;
    }
    100% {
        background-position: 0% 50%;
    }
}

.container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
}

.panel {
    background-color: rgba(255, 255, 255, 0.7);
    border-radius: 16px;
    padding: 20px;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
    margin-bottom: 16px;
    backdrop-filter: blur(5px);
    transition: all 0.3s ease;
}

.panel:hover {
    box-shadow: 0 6px 16px rgba(0, 0, 0, 0.12);
    transform: translateY(-2px);
}

.title {
    font-size: 1.2em;
    font-weight: 600;
    margin-bottom: 12px;
    color: #6a3ea1;
    border-bottom: 2px solid #f0e6ff;
    padding-bottom: 8px;
}

.slider-container {
    background-color: rgba(255, 255, 255, 0.5);
    border-radius: 10px;
    padding: 10px;
    margin: 5px 0;
}

/* Make sliders more appealing */
input[type=range] {
    height: 5px;
    appearance: none;
    width: 100%;
    border-radius: 3px;
    background: linear-gradient(90deg, #9c83e0, #83b1e0);
}

.generate-button {
    background: linear-gradient(90deg, #a673ff, #7c4dff);
    color: white;
    border: none;
    border-radius: 8px;
    padding: 12px 24px;
    font-size: 16px;
    font-weight: 500;
    cursor: pointer;
    transition: all 0.3s ease;
    box-shadow: 0 4px 10px rgba(124, 77, 255, 0.2);
    display: block;
    width: 100%;
    margin: 20px 0;
}

.generate-button:hover {
    background: linear-gradient(90deg, #9c5eff, #6a3aff);
    box-shadow: 0 6px 15px rgba(124, 77, 255, 0.3);
    transform: translateY(-2px);
}

/* Tabs styling */
.tabs {
    display: flex;
    border-bottom: 1px solid #e0e0e0;
    margin-bottom: 20px;
}

.tab {
    padding: 10px 20px;
    cursor: pointer;
    transition: all 0.3s ease;
    background-color: transparent;
    border: none;
    color: #666;
}

.tab.active {
    color: #7c4dff;
    border-bottom: 3px solid #7c4dff;
    font-weight: 600;
}

/* Emotion sliders container */
.emotion-grid {
    display: grid;
    grid-template-columns: repeat(4, 1fr);
    gap: 12px;
}

/* Header styling */
.app-header {
    text-align: center;
    margin-bottom: 25px;
}

.app-header h1 {
    font-size: 2.5em;
    color: #6a3ea1;
    margin-bottom: 8px;
    font-weight: 700;
}

.app-header p {
    font-size: 1.1em;
    color: #666;
    margin-bottom: 20px;
}

/* Audio player styling */
.audio-output {
    margin-top: 20px;
}

/* Make output area more prominent */
.output-container {
    background-color: rgba(255, 255, 255, 0.85);
    border-radius: 16px;
    padding: 24px;
    box-shadow: 0 8px 18px rgba(0, 0, 0, 0.1);
    margin-top: 20px;
}
"""
def build_interface():
    # Build interface with enhanced visual elements and layout
    with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
        # Header section
        with gr.Column(elem_classes="app-header"):
            gr.Markdown("# ✨ Zonos Text-to-Speech Generator ✨")
            gr.Markdown("Create natural-sounding speech with customizable voice characteristics")

        # Main content container
        with gr.Column(elem_classes="container"):
            # First panel - Text & Model Selection
            with gr.Column(elem_classes="panel"):
                gr.Markdown('<div class="title">💬 Text & Model Configuration</div>')
                with gr.Row():
                    with gr.Column(scale=2):
                        model_choice = gr.Dropdown(
                            choices=MODEL_NAMES,
                            value="Zyphra/Zonos-v0.1-transformer",
                            label="Zonos Model Type",
                            info="Select the model variant to use.",
                        )
                        text = gr.Textbox(
                            label="Text to Synthesize",
                            value="Zonos uses eSpeak for text to phoneme conversion!",
                            lines=4,
                            max_length=500,
                        )
                        language = gr.Dropdown(
                            choices=supported_language_codes,
                            value="en-us",
                            label="Language Code",
                            info="Select a language code.",
                        )
                    with gr.Column(scale=1):
                        prefix_audio = gr.Audio(
                            value="assets/silence_100ms.wav",
                            label="Optional Prefix Audio (continue from this audio)",
                            type="filepath",
                        )
            # Second panel - Voice Characteristics
            with gr.Column(elem_classes="panel"):
                gr.Markdown('<div class="title">🎤 Voice Characteristics</div>')
                with gr.Row():
                    with gr.Column(scale=1):
                        speaker_audio = gr.Audio(
                            label="Optional Speaker Audio (for voice cloning)",
                            type="filepath",
                        )
                        speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)
                    with gr.Column(scale=2):
                        with gr.Row():
                            with gr.Column():
                                dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="Voice Quality", elem_classes="slider-container")
                                fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Frequency Max (Hz)", elem_classes="slider-container")
                                vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="Voice Clarity", elem_classes="slider-container")
                            with gr.Column():
                                pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Variation", elem_classes="slider-container")
                                speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", elem_classes="slider-container")
            # Third panel - Generation Parameters
            with gr.Column(elem_classes="panel"):
                gr.Markdown('<div class="title">⚙️ Generation Parameters</div>')
                with gr.Row():
                    with gr.Column():
                        cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="Guidance Scale", elem_classes="slider-container")
                        min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P (Randomness)", elem_classes="slider-container")
                    with gr.Column():
                        seed_number = gr.Number(label="Seed", value=420, precision=0)
                        randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
            # Emotion panel (collapsible accordion)
            with gr.Accordion("🎭 Emotion Settings", open=False, elem_classes="panel"):
                gr.Markdown(
                    "Adjust these sliders to control the emotional tone of the generated speech.\n"
                    "For a neutral voice, keep 'Neutral' high and other emotions low."
                )
                with gr.Row(elem_classes="emotion-grid"):
                    emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness", elem_classes="slider-container")
                    emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness", elem_classes="slider-container")
                    emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust", elem_classes="slider-container")
                    emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear", elem_classes="slider-container")
                with gr.Row(elem_classes="emotion-grid"):
                    emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise", elem_classes="slider-container")
                    emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger", elem_classes="slider-container")
                    emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other", elem_classes="slider-container")
                    emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral", elem_classes="slider-container")
            # Advanced Settings Panel
            with gr.Accordion("⚡ Advanced Settings", open=False, elem_classes="panel"):
                gr.Markdown(
                    "### Unconditional Toggles\n"
                    "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
                    'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
                )
                unconditional_keys = gr.CheckboxGroup(
                    [
                        "speaker",
                        "emotion",
                        "vqscore_8",
                        "fmax",
                        "pitch_std",
                        "speaking_rate",
                        "dnsmos_ovrl",
                        "speaker_noised",
                    ],
                    value=["emotion"],
                    label="Unconditional Keys",
                )
            # Generate Button and Output Area
            with gr.Column(elem_classes="panel output-container"):
                gr.Markdown('<div class="title">🔊 Generate & Output</div>')
                generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
                output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")
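        # Event wiring: each `outputs` list below must mirror update_ui's return
        # order, and generate_audio's `inputs` list must match its parameter order.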
        model_choice.change(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                unconditional_keys,
            ],
        )
        # On page load, trigger the same UI refresh
        demo.load(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                unconditional_keys,
            ],
        )
        # Generate audio on button click
        generate_button.click(
            fn=generate_audio,
            inputs=[
                model_choice,
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                cfg_scale_slider,
                min_p_slider,
                seed_number,
                randomize_seed_toggle,
                unconditional_keys,
            ],
            outputs=[output_audio, seed_number],
        )

    return demo
if __name__ == "__main__":
    demo = build_interface()
    demo.launch()