# NOTE: HuggingFace Spaces page header ("Spaces: Sleeping") removed — it was
# scraping residue, not part of the program.
import os

import gradio as gr
from gradio_client import Client

# Local helper module (same repo): extracts the upstream endpoint's example
# parameter values and applies per-space overrides.
from test_overrides import _get_param_examples, _override_params

# Markdown rendered at the top of the page; explains why this Space proxies
# xVASynth instead of running it directly.
_DESCRIPTION = """
Proxy to [xVASynth Gradio Space](https://huggingface.co/spaces/Pendrokar/xVASynth-TTS), as that space uses a Python version that MCP does not support. You can add this TTS as a tool using the "Use via API or MCP" instructions within the footer of this page.
"""
# Voice models listed by default in the UI.
# Each entry: (display label, xVASynth voice-model file name).
# _M/_F suffix in the model name marks a male/female voice.
voice_models = [
    ("👩 #ex04", "x_ex04"),
    ("🧑 #ex01", "x_ex01"),
    ("👱♀️ 🇬🇧 #92", "ccby_nvidia_hifi_92_F"),
    ("👨🦳 #6671", "ccby_nvidia_hifi_6671_M"),
]
# Additional voice models beyond the short default list.
voice_models_more = [
    ("👸 #ex02", "x_ex02"),
    ("👨🦱 #ex03", "x_ex03"),
    ("🧔 #6670", "ccby_nvidia_hifi_6670_M"),
    ("👨🦲 #9017", "ccby_nvidia_hifi_9017_M"),
    ("🧑 #6097", "ccby_nvidia_hifi_6097_M"),
    ("👩🦱 #12787", "ccby_nvidia_hifi_12787_F"),
    ("👵 #11614", "ccby_nv_hifi_11614_F"),
    ("👩🦰 #8051", "ccby_nvidia_hifi_8051_F"),
    ("👩🦳 #11697", "ccby_nvidia_hifi_11697_F"),
    ("👩🦲 #9136", "ccby_nvidia_hifi_9136_F"),
    ("♟ Lojban", "x_selpahi"),  # v2 model for Lojban, pre- the multilingual capabilities of xVASynth v3
]
# Languages listed by default in the UI.
# Each entry: (display label, language code passed upstream).
# Order ranked by similarity to English due to xVASynth's use of ARPAbet
# instead of IPA.
languages = [
    ("🇺🇸 EN", "en"),
    ("🇩🇪 DE", "de"),
    ("🇪🇸 ES", "es"),
    ("🇮🇳 HI", "hi"),
    ("🇨🇳 ZH", "zh"),
]
# Remaining supported languages.
# NOTE(review): some codes deviate from ISO 639-1 ('jp' instead of 'ja',
# 'uk' labelled UA) — these match what the upstream Space expects; confirm
# against the upstream API before changing.
languages_more = [
    ("🇳🇱 NL", "nl"),
    ("🇧🇷 PT", "pt"),
    ("🇮🇹 IT", "it"),
    ("🇵🇱 PL", "pl"),
    ("🇷🇴 RO", "ro"),
    ("🇸🇪 SV", "sv"),
    ("🇩🇰 DA", "da"),
    ("🇫🇮 FI", "fi"),
    ("🇭🇺 HU", "hu"),
    ("🇬🇷 EL", "el"),
    ("🇫🇷 FR", "fr"),
    ("🇷🇺 RU", "ru"),
    ("🇺🇦 UA", "uk"),
    ("🇹🇷 TR", "tr"),
    ("🇸🇦 AR", "ar"),
    ("🇯🇵 JP", "jp"),
    ("🇰🇷 KO", "ko"),
    ("🇻🇳 VI", "vi"),
    ("🇻🇦 LA", "la"),
    ("🇳🇬 YO", "yo"),
    ("Swahili", "sw"),
    ("Hausa", "ha"),
    ("Wolof", "wo"),
]
lojban_lang = [
    # There is no ISO 639-1 code for Lojban, but 'jb' is valid here
    ('♟ Lojban', 'jb')
]
# Default sample sentence per language code (keys match the codes used in
# `languages` / `languages_more` / `lojban_lang` above).
# Translated from English by DeepMind's Gemini Pro.
default_text = {
    "ar": "هذا هو صوتي.",
    "da": "Sådan lyder min stemme.",
    "de": "So klingt meine Stimme.",
    "el": "Έτσι ακούγεται η φωνή μου.",
    "en": "This is what my voice sounds like.",
    "es": "Así suena mi voz.",
    "fi": "Näin ääneni kuulostaa.",
    "fr": "Voici à quoi ressemble ma voix.",
    "ha": "Wannan ne muryata ke.",
    "hi": "यह मेरी आवाज़ कैसी लगती है।",
    "hu": "Így hangzik a hangom.",
    "it": "Così suona la mia voce.",
    "jb": ".i ca'e gusni",
    "jp": "これが私の声です。",
    "ko": "여기 제 목소리가 어떤지 들어보세요.",
    "la": "Haec est vox mea sonans.",
    "nl": "Dit is hoe mijn stem klinkt.",
    "pl": "Tak brzmi mój głos.",
    "pt": "É assim que minha voz soa.",
    "ro": "Așa sună vocea mea.",
    "ru": "Вот как звучит мой голос.",
    "sv": "Såhär låter min röst.",
    "sw": "Baba, yetu, yetu, uliye. Mbinguni, yetu, yetu. Amiiinaa!!",  # civ4
    "tr": "Benim sesimin sesi böyle.",
    "uk": "Ось як звучить мій голос.",
    "vi": "Đây là giọng nói của tôi.",
    "wo": "Ndox li neen xewnaal ma.",
    "yo": "Ìyí ni ohùn mi ńlá.",
    "zh": "这是我的声音。",
}
# Keyword presets for the Gradio components built in BlocksDemo.create_interface.
# Every control starts non-interactive: this Space only proxies requests, so
# the widgets mainly document the upstream API surface.
input_textbox_init = dict(
    label="Input Text",
    value="This is what my voice sounds like.",
    info="Also accepts ARPAbet symbols placed within {} brackets.",
    lines=1,
    max_lines=5,
    autofocus=True,
    interactive=False,
)
# Speech duration multiplier; 1.0 is the neutral pace.
pacing_slider_init = dict(
    value=1.0,
    minimum=0.5,
    maximum=2.0,
    step=0.1,
    label="Duration",
    interactive=False,
)
# Pitch control (hidden by default).
pitch_slider_init = dict(
    minimum=0,
    maximum=1.0,
    value=0.5,
    step=0.05,
    label="Pitch",
    visible=False,
    interactive=False,
)
# Energy control (hidden by default).
energy_slider_init = dict(
    minimum=0.1,
    maximum=1.0,
    value=1.0,
    step=0.05,
    label="Energy",
    visible=False,
    interactive=False,
)
# Emotion sliders: all default to 0 (neutral); the info strings carry the
# upstream guidance about useful value ranges.
anger_slider_init = dict(
    minimum=0,
    maximum=1.0,
    value=0,
    step=0.05,
    label="😠 Anger",
    info="Tread lightly beyond 0.9",
    interactive=False,
)
happy_slider_init = dict(
    minimum=0,
    maximum=1.0,
    value=0,
    step=0.05,
    label="😃 Happiness",
    info="Tread lightly beyond 0.7",
    interactive=False,
)
sad_slider_init = dict(
    minimum=0,
    maximum=1.0,
    value=0,
    step=0.05,
    label="😭 Sadness",
    info="Duration increased when beyond 0.2",
    interactive=False,
)
surprise_slider_init = dict(
    minimum=0,
    maximum=1.0,
    value=0,
    step=0.05,
    label="😮 Surprise",
    info="Oversaturates Happiness when beyond 0.3",
    interactive=False,
)
# Voice picker preset: the short list plus a "+N" pseudo-choice whose value
# is 'more' — presumably a trigger to reveal voice_models_more.
# NOTE(review): create_interface below builds its own choices list and does
# not use this dict — confirm whether it is still needed.
voice_radio_init = {
    'choices': [*voice_models, (f'+{len(voice_models_more)}', 'more')],
    'value': "ccby_nvidia_hifi_6671_M",
    'label': "Voice",
    'info': "Fine-tuned voice model",
    'interactive': False
}
# DeepMoji toggle: enabled by default, non-interactive like the other controls.
deepmoji_checkbox_init = {
    'label': "Use DeepMoji",
    'info': "Auto adjust emotional values for English",
    'value': True,
    'interactive': False
}
class BlocksDemo:
    """Gradio Blocks app that proxies TTS requests to the xVASynth Space.

    The upstream Space runs a Python version that MCP does not support, so
    this app re-exposes its ``/predict`` endpoint (see ``_DESCRIPTION``).
    """

    # Upstream Space id; also the model key passed to _override_params.
    _SPACE = "Pendrokar/xVASynth-TTS"
    # Named endpoint on the upstream Space.
    _API_NAME = '/predict'

    def __init__(self):
        # Build the UI once; the caller launches it via self.block.launch().
        self.block = self.create_interface()

    def create_interface(self):
        """Assemble the Blocks layout and wire the hidden buttons to the
        proxy methods.

        Returns:
            The gr.Blocks instance.
        """
        with gr.Blocks(css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}") as demo:
            gr.Markdown("# xVASynth TTS - MCP Proxy")
            gr.Markdown(value=_DESCRIPTION)
            with gr.Row():  # Main row for inputs and language selection
                with gr.Column():  # Input column
                    input_textbox = gr.Textbox(**input_textbox_init)
                    language_radio = gr.Radio([*languages, *languages_more, *lojban_lang], interactive=False)
                    with gr.Row():
                        with gr.Column():
                            en_examples_dropdown = gr.Dropdown(interactive=False)
                        with gr.Column():
                            pacing_slider = gr.Slider(**pacing_slider_init)
                with gr.Column():  # Control column
                    voice_radio = gr.Radio([*voice_models, *voice_models_more], interactive=False)
                    pitch_slider = gr.Slider(**pitch_slider_init)
                    energy_slider = gr.Slider(**energy_slider_init)
            with gr.Row():  # Emotion sliders
                with gr.Column():
                    anger_slider = gr.Slider(**anger_slider_init)
                    sad_slider = gr.Slider(**sad_slider_init)
                with gr.Column():
                    happy_slider = gr.Slider(**happy_slider_init)
                    surprise_slider = gr.Slider(**surprise_slider_init)
            deepmoji_checkbox = gr.Checkbox(**deepmoji_checkbox_init)
            # Hidden buttons: invisible in the UI, but their click bindings
            # expose generate/lojban through the API (and MCP) surface.
            gen_audio = gr.Button("generate", "primary", visible=False)
            gen_lojban = gr.Button("generate_lojban", "primary", visible=False)
            output_wav = gr.Audio(
                label="22kHz audio output",
                type="filepath",
                editable=False,
                autoplay=True,
                visible=False
            )
            output_arpabet = gr.HTML(label="ARPAbet")

            # Both endpoints take the same components as inputs.
            event_inputs = [
                input_textbox,
                voice_radio,
                language_radio,
                pacing_slider,
                anger_slider,
                happy_slider,
                sad_slider,
                surprise_slider,
                deepmoji_checkbox
            ]
            gen_audio.click(
                fn=self.generate,
                inputs=event_inputs,
                outputs=[
                    output_wav,
                    output_arpabet,
                    anger_slider,
                    happy_slider,
                    sad_slider,
                    surprise_slider,
                    # raw xVAServer response (hidden)
                    gr.Textbox(visible=False)
                ]
            )
            gen_lojban.click(
                fn=self.lojban,
                inputs=event_inputs,
                outputs=[
                    output_wav,
                    output_arpabet,
                    anger_slider,
                    happy_slider,
                    sad_slider,
                    surprise_slider,
                    # raw xVAServer response (hidden)
                    gr.Textbox(visible=False)
                ]
            )
        return demo

    def _call_space(self, overrides):
        """Proxy one synthesis request to the upstream Space.

        Args:
            overrides: dict mapping positional parameter index -> value for
                the upstream /predict call; every other position keeps the
                example value reported by the endpoint schema.
        Returns:
            Whatever client.predict returns for the upstream endpoint.
        """
        client = Client(self._SPACE, hf_token=os.getenv('HF_TOKEN'))
        endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
        # Start from the endpoint's published example values...
        end_parameters = _get_param_examples(
            endpoints['named_endpoints'][self._API_NAME]['parameters']
        )
        print(end_parameters)
        # ...apply the space-level defaults, then the caller's values.
        space_inputs = _override_params(end_parameters, self._SPACE)
        for position, value in overrides.items():
            space_inputs[position] = value
        print(space_inputs)
        return client.predict(
            *space_inputs,
            api_name=self._API_NAME
        )

    def generate(
        self,
        input_text: str = "This is what my voice sounds like.",
        voice: str = "ccby_nvidia_hifi_6670_M",
        lang: str = "en",
        pacing: float = 1.0,
        anger: float = 0.0,
        happy: float = 0.0,
        sad: float = 0.0,
        surprise: float = 0.0,
        deepmoji_checked = 1
    ):
        """
        Convert the text to speech using xVASynth (v3) xVAPitch models. Sensitive to maxed out emotional values
        Args:
            input_text: string; from which to create the audio
            voice: Literal['x_ex04', 'x_ex01', 'ccby_nvidia_hifi_92_F', 'ccby_nvidia_hifi_6671_M', 'x_ex02', 'x_ex03', 'ccby_nvidia_hifi_6670_M', 'ccby_nvidia_hifi_9017_M', 'ccby_nvidia_hifi_6097_M', 'ccby_nvidia_hifi_12787_F', 'ccby_nv_hifi_11614_F', 'ccby_nvidia_hifi_8051_F', 'ccby_nvidia_hifi_11697_F', 'ccby_nvidia_hifi_9136_F']; _M/_F means a male/female voice; x_ex04/x_ex02 are American female voices; x_ex03/x_ex01 are American male voices
            lang: Literal['en', 'de', 'es', 'it', 'fr', 'ru', 'tr', 'la', 'ro', 'da', 'vi', 'ha', 'nl', 'zh', 'ar', 'uk', 'hi', 'ko', 'pl', 'sw', 'fi', 'hu', 'pt', 'yo', 'sv', 'el', 'wo', 'jp']; the language of input_text
            pacing: float (numeric value between 0.5 and 2.0); Duration; 1.0 is default
            anger: float (numeric value between 0 and 1.0); 😠 Anger
            happy: float (numeric value between 0 and 1.0); 😃 Happiness
            sad: float (numeric value between 0 and 1.0); 😭 Sadness
            surprise: float (numeric value between 0 and 1.0); 😮 Surprise
            deepmoji_checked: bool; use the DeepMoji model to parse English text and further amplify the emotional values
        Returns:
            Tuple of (output_audio_path, arpabet_html, final_anger_ratio, final_happiness_ratio, final_sadness_ratio, final_surprise_ratio, response) where output_audio_path is the filepath of output audio
        """
        # Positions 4/5 (pitch/energy in the upstream schema — TODO confirm)
        # deliberately keep their upstream defaults, as in the original code.
        return self._call_space({
            0: input_text,
            1: voice,
            2: lang,
            3: pacing,
            6: anger,
            7: happy,
            8: sad,
            9: surprise,
            10: deepmoji_checked,
        })

    def lojban(
        self,
        input_text: str = "coi rodo",
        voice: str = "x_selpahi",
        lang: str = "jb",
        pacing: float = 1.0,
        anger: float = 0.0,
        happy: float = 0.0,
        sad: float = 0.0,
        surprise: float = 0.0,
        deepmoji_checked = 1
    ):
        """
        Convert the Lojban text to speech using xVASynth (v2) FastPitch 1.1 models.
        Args:
            input_text: string; from which to create the audio
            voice: Literal['x_selpahi']; the only viable Voice model filenames
            lang: Literal['jb']; the language of input_text
            pacing: float (numeric value between 0.5 and 2.0); Duration; 1.0 is default
        Returns:
            Tuple of (output_audio_path, arpabet_html, response) where output_audio_path is the filepath of output audio
        """
        # The upstream call always uses 'jbo' for the language; the 'jb' lang
        # argument is accepted for interface compatibility but not forwarded.
        # Emotion/deepmoji arguments are likewise not forwarded (the v2 path
        # never sent them in the original code either).
        return self._call_space({
            0: input_text,
            1: voice,
            2: 'jbo',
            3: pacing,
        })
# Module-level entry point: HF Spaces executes this file directly.
# mcp_server=True starts the MCP server alongside the web UI (see _DESCRIPTION);
# show_api=True keeps the "Use via API or MCP" footer visible.
demo = BlocksDemo()
demo.block.launch(show_api=True, show_error=True, mcp_server=True)