# Pendrokar's xVASynth MCP proxy — commit d38cf13 (verified)
# Note: Lojban language code mapping: "jbo" => "jb"
import os
import gradio as gr
from gradio_client import Client
from test_overrides import _get_param_examples, _override_params
# Markdown blurb shown at the top of the UI; also explains why this proxy exists.
_DESCRIPTION = """
Proxy to [xVASynth Gradio Space](https://huggingface.co/spaces/Pendrokar/xVASynth-TTS), as that space uses a Python version that MCP does not support. You can add this TTS as a tool using the "Use via API or MCP" instructions within the footer of this page.
"""

# Voice models shown by default in the UI: (display label, upstream model id).
# _M/_F suffixes denote male/female voices.
voice_models = [
    ("👩 #ex04", "x_ex04"),
    ("🧑 #ex01", "x_ex01"),
    ("👱‍♀️ 🇬🇧 #92", "ccby_nvidia_hifi_92_F"),
    ("👨‍🦳 #6671", "ccby_nvidia_hifi_6671_M"),
]

# Additional voice models revealed behind the "+N" choice.
voice_models_more = [
    ("👸 #ex02", "x_ex02"),
    ("👨‍🦱 #ex03", "x_ex03"),
    ("🧔 #6670", "ccby_nvidia_hifi_6670_M"),
    ("👨‍🦲 #9017", "ccby_nvidia_hifi_9017_M"),
    ("🧑 #6097", "ccby_nvidia_hifi_6097_M"),
    ("👩‍🦱 #12787", "ccby_nvidia_hifi_12787_F"),
    ("👵 #11614", "ccby_nv_hifi_11614_F"),
    ("👩‍🦰 #8051", "ccby_nvidia_hifi_8051_F"),
    ("👩‍🦳 #11697", "ccby_nvidia_hifi_11697_F"),
    ("👩‍🦲 #9136", "ccby_nvidia_hifi_9136_F"),
    ("♟ Lojban", "x_selpahi"), # v2 model for Lojban, pre- the multilingual capabilities of xVASynth v3
]

# order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
languages = [
    ("🇺🇸 EN", "en"),
    ("🇩🇪 DE", "de"),
    ("🇪🇸 ES", "es"),
    ("🇮🇳 HI", "hi"),
    ("🇨🇳 ZH", "zh"),
]

# Remaining supported languages, in the same similarity-ranked order.
languages_more = [
    ("🇳🇱 NL", "nl"),
    ("🇧🇷 PT", "pt"),
    ("🇮🇹 IT", "it"),
    ("🇵🇱 PL", "pl"),
    ("🇷🇴 RO", "ro"),
    ("🇸🇪 SV", "sv"),
    ("🇩🇰 DA", "da"),
    ("🇫🇮 FI", "fi"),
    ("🇭🇺 HU", "hu"),
    ("🇬🇷 EL", "el"),
    ("🇫🇷 FR", "fr"),
    ("🇷🇺 RU", "ru"),
    ("🇺🇦 UA", "uk"),
    ("🇹🇷 TR", "tr"),
    ("🇸🇦 AR", "ar"),
    ("🇯🇵 JP", "jp"),
    ("🇰🇷 KO", "ko"),
    ("🇻🇳 VI", "vi"),
    ("🇻🇦 LA", "la"),
    ("🇳🇬 YO", "yo"),
    ("Swahili", "sw"),
    ("Hausa", "ha"),
    ("Wolof", "wo"),
]

lojban_lang = [
    # There is no ISO 639-1 for Lojban, but jb is valid
    ('♟ Lojban', 'jb')
]

# Per-language sample sentence, keyed by the language codes above.
# Translated from English by DeepMind's Gemini Pro
default_text = {
    "ar": "هذا هو صوتي.",
    "da": "Sådan lyder min stemme.",
    "de": "So klingt meine Stimme.",
    "el": "Έτσι ακούγεται η φωνή μου.",
    "en": "This is what my voice sounds like.",
    "es": "Así suena mi voz.",
    "fi": "Näin ääneni kuulostaa.",
    "fr": "Voici à quoi ressemble ma voix.",
    "ha": "Wannan ne muryata ke.",
    "hi": "यह मेरी आवाज़ कैसी लगती है।",
    "hu": "Így hangzik a hangom.",
    "it": "Così suona la mia voce.",
    "jb": ".i ca'e gusni",  # NOTE(review): not the same sentence as the others — verify intent
    "jp": "これが私の声です。",
    "ko": "여기 제 목소리가 어떤지 들어보세요.",
    "la": "Haec est vox mea sonans.",
    "nl": "Dit is hoe mijn stem klinkt.",
    "pl": "Tak brzmi mój głos.",
    "pt": "É assim que minha voz soa.",
    "ro": "Așa sună vocea mea.",
    "ru": "Вот как звучит мой голос.",
    "sv": "Såhär låter min röst.",
    "sw": "Baba, yetu, yetu, uliye. Mbinguni, yetu, yetu. Amiiinaa!!", #civ4
    "tr": "Benim sesimin sesi böyle.",
    "uk": "Ось як звучить мій голос.",
    "vi": "Đây là giọng nói của tôi.",
    "wo": "Ndox li neen xewnaal ma.",
    "yo": "Ìyí ni ohùn mi ńlá.",
    "zh": "这是我的声音。",
}

# Component defaults — kwargs for the Gradio components built in
# BlocksDemo.create_interface(). Everything is interactive=False because this
# space is an API/MCP proxy, not an interactive demo (see _DESCRIPTION).
input_textbox_init = {
    'label': "Input Text",
    # Single source of truth: reuse the English sample sentence defined above.
    'value': default_text["en"],
    'info': "Also accepts ARPAbet symbols placed within {} brackets.",
    'lines': 1,
    'max_lines': 5,
    'autofocus': True,
    'interactive': False
}

pacing_slider_init = {
    'value': 1.0,
    'minimum': 0.5,
    'maximum': 2.0,
    'step': 0.1,
    'label': "Duration",
    'interactive': False
}

pitch_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0.5,
    'step': 0.05,
    'label': "Pitch",
    'visible': False,
    'interactive': False
}

energy_slider_init = {
    'minimum': 0.1,
    'maximum': 1.0,
    'value': 1.0,
    'step': 0.05,
    'label': "Energy",
    'visible': False,
    'interactive': False
}

anger_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😠 Anger",
    'info': "Tread lightly beyond 0.9",
    'interactive': False
}

happy_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😃 Happiness",
    'info': "Tread lightly beyond 0.7",
    'interactive': False
}

sad_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😭 Sadness",
    'info': "Duration increased when beyond 0.2",
    'interactive': False
}

surprise_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😮 Surprise",
    'info': "Oversaturates Happiness when beyond 0.3",
    'interactive': False
}

voice_radio_init = {
    # Default voices plus a "+N" pseudo-choice that reveals voice_models_more.
    'choices': [*voice_models, (f'+{len(voice_models_more)}', 'more')],
    'value': "ccby_nvidia_hifi_6671_M",
    'label': "Voice",
    'info': "Fine-tuned voice model",
    'interactive': False
}

deepmoji_checkbox_init = {
    'label': "Use DeepMoji",
    'info': "Auto adjust emotional values for English",
    'value': True,
    'interactive': False
}
class BlocksDemo:
    """Gradio Blocks app that proxies TTS requests to the xVASynth HF space.

    The actual synthesis happens in the upstream space (_SPACE); this app
    exists so the endpoints can be exposed over MCP (see _DESCRIPTION).
    All components are created non-interactive: they describe the API
    surface rather than provide a usable UI.
    """

    # Upstream Hugging Face space that performs the actual synthesis.
    _SPACE = "Pendrokar/xVASynth-TTS"

    def __init__(self):
        # Build the UI once; module-level code at the bottom of the file launches it.
        self.block = self.create_interface()

    def create_interface(self):
        """Assemble and return the gr.Blocks demo (components + event wiring)."""
        with gr.Blocks(css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}") as demo:
            gr.Markdown("# xVASynth TTS - MCP Proxy")
            gr.Markdown(value=_DESCRIPTION)
            with gr.Row():  # Main row for inputs and language selection
                with gr.Column():  # Input column
                    input_textbox = gr.Textbox(**input_textbox_init)
                    language_radio = gr.Radio([*languages, *languages_more, *lojban_lang], interactive=False)
                    with gr.Row():
                        with gr.Column():
                            en_examples_dropdown = gr.Dropdown(interactive=False)
                        with gr.Column():
                            pacing_slider = gr.Slider(**pacing_slider_init)
                with gr.Column():  # Control column
                    voice_radio = gr.Radio([*voice_models, *voice_models_more], interactive=False)
                    pitch_slider = gr.Slider(**pitch_slider_init)
                    energy_slider = gr.Slider(**energy_slider_init)
            with gr.Row():  # Emotion sliders, two per column
                with gr.Column():
                    anger_slider = gr.Slider(**anger_slider_init)
                    sad_slider = gr.Slider(**sad_slider_init)
                with gr.Column():
                    happy_slider = gr.Slider(**happy_slider_init)
                    surprise_slider = gr.Slider(**surprise_slider_init)
            deepmoji_checkbox = gr.Checkbox(**deepmoji_checkbox_init)
            # Hidden buttons: reachable via the API / MCP rather than clicked by users.
            gen_audio = gr.Button("generate", "primary", visible=False)
            gen_lojban = gr.Button("generate_lojban", "primary", visible=False)
            output_wav = gr.Audio(
                label="22kHz audio output",
                type="filepath",
                editable=False,
                autoplay=True,
                visible=False
            )
            output_arpabet = gr.HTML(label="ARPAbet")

            # Both endpoints consume the same input components; each click
            # handler gets its own hidden Textbox for the raw xVAServer response.
            shared_inputs = [
                input_textbox,
                voice_radio,
                language_radio,
                pacing_slider,
                anger_slider,
                happy_slider,
                sad_slider,
                surprise_slider,
                deepmoji_checkbox
            ]

            def _make_outputs():
                # One-line helper: output component list shared by both endpoints.
                return [
                    output_wav,
                    output_arpabet,
                    anger_slider,
                    happy_slider,
                    sad_slider,
                    surprise_slider,
                    # xVAServer response
                    gr.Textbox(visible=False)
                ]

            gen_audio.click(
                fn=self.generate,
                inputs=shared_inputs,
                outputs=_make_outputs()
            )
            gen_lojban.click(
                fn=self.lojban,
                inputs=shared_inputs,
                outputs=_make_outputs()
            )
        return demo

    def _predict_space(self, overrides):
        """Forward one synthesis request to the upstream space's /predict endpoint.

        Args:
            overrides: mapping of positional-parameter index -> value. Every
                other parameter keeps the example default advertised by the
                space's API schema.

        Returns:
            Whatever the upstream endpoint returns (filepath, ARPAbet HTML,
            final emotion ratios, raw response).
        """
        client = Client(self._SPACE, hf_token=os.getenv('HF_TOKEN'))
        endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
        api_name = '/predict'
        # Start from the example values the space advertises for /predict ...
        end_parameters = _get_param_examples(
            endpoints['named_endpoints'][api_name]['parameters']
        )
        print(end_parameters)
        # ... apply project-level overrides, then the per-call overrides.
        space_inputs = _override_params(end_parameters, self._SPACE)
        for index, value in overrides.items():
            space_inputs[index] = value
        print(space_inputs)
        return client.predict(
            *space_inputs,
            api_name=api_name
        )

    def generate(
        self,
        input_text: str = "This is what my voice sounds like.",
        voice: str = "ccby_nvidia_hifi_6670_M",
        lang: str = "en",
        pacing: float = 1.0,
        anger: float = 0.0,
        happy: float = 0.0,
        sad: float = 0.0,
        surprise: float = 0.0,
        deepmoji_checked = 1
    ):
        """
        Convert the text to speech using xVASynth (v3) xVAPitch models. Sensitive to maxed out emotional values

        Args:
            input_text: string; from which to create the audio
            voice: Literal['x_ex04', 'x_ex01', 'ccby_nvidia_hifi_92_F', 'ccby_nvidia_hifi_6671_M', 'x_ex02', 'x_ex03', 'ccby_nvidia_hifi_6670_M', 'ccby_nvidia_hifi_9017_M', 'ccby_nvidia_hifi_6097_M', 'ccby_nvidia_hifi_12787_F', 'ccby_nv_hifi_11614_F', 'ccby_nvidia_hifi_8051_F', 'ccby_nvidia_hifi_11697_F', 'ccby_nvidia_hifi_9136_F']; _M/_F means a male/female voice; x_ex04/x_ex02 are American female voices; x_ex03/x_ex01 are American male voices
            lang: Literal['en', 'de', 'es', 'it', 'fr', 'ru', 'tr', 'la', 'ro', 'da', 'vi', 'ha', 'nl', 'zh', 'ar', 'uk', 'hi', 'ko', 'pl', 'sw', 'fi', 'hu', 'pt', 'yo', 'sv', 'el', 'wo', 'jp']; the language of input_text
            pacing: float (numeric value between 0.5 and 2.0); Duration; 1.0 is default
            anger: float (numeric value between 0 and 1.0); 😠 Anger
            happy: float (numeric value between 0 and 1.0); 😃 Happiness
            sad: float (numeric value between 0 and 1.0); 😭 Sadness
            surprise: float (numeric value between 0 and 1.0); 😮 Surprise
            deepmoji_checked: bool; use the DeepMoji model to parse English text and further amplify the emotional values
        Returns:
            Tuple of (output_audio_path, arpabet_html, final_anger_ratio, final_happiness_ratio, final_sadness_ratio, final_surprise_ratio, response) where output_audio_path is the filepath of output audio
        """
        # Indices 4 (pitch) and 5 (energy) keep the upstream defaults.
        return self._predict_space({
            0: input_text,
            1: voice,
            2: lang,
            3: pacing,
            6: anger,
            7: happy,
            8: sad,
            9: surprise,
            10: deepmoji_checked,
        })

    def lojban(
        self,
        input_text: str = "coi rodo",
        voice: str = "x_selpahi",
        lang: str = "jb",
        pacing: float = 1.0,
        anger: float = 0.0,
        happy: float = 0.0,
        sad: float = 0.0,
        surprise: float = 0.0,
        deepmoji_checked = 1
    ):
        """
        Convert the Lojban text to speech using xVASynth (v2) FastPitch 1.1 models.

        Args:
            input_text: string; from which to create the audio
            voice: Literal['x_selpahi']; the only viable Voice model filenames
            lang: Literal['jb']; the language of input_text
            pacing: float (numeric value between 0.5 and 2.0); Duration; 1.0 is default
        Returns:
            Tuple of (output_audio_path, arpabet_html, response) where output_audio_path is the filepath of output audio
        """
        # The upstream space uses the legacy code 'jbo' for Lojban, so the
        # incoming 'jb' lang value is intentionally replaced. The emotion and
        # deepmoji parameters are accepted for signature parity with generate()
        # but are not forwarded (the v2 model does not support them).
        return self._predict_space({
            0: input_text,
            1: voice,
            2: 'jbo',
            3: pacing,
        })
# Build and launch at import time (no __main__ guard: Hugging Face Spaces may
# import this module and expects the server to come up either way).
demo = BlocksDemo()
# show_api exposes the REST docs; mcp_server=True additionally publishes the
# click endpoints (with their docstrings) as MCP tools.
demo.block.launch(show_api=True, show_error=True, mcp_server=True)