Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| import os | |
| import sys | |
| import time | |
| import requests | |
| import json | |
| from subprocess import Popen, PIPE | |
| import threading | |
| from huggingface_hub import hf_hub_download | |
| import gradio as gr | |
# Hugging Face Hub repo holding the xVAPitch voice models, and the local
# snapshot directory where the Space's startup downloaded them.
hf_model_name = "Pendrokar/xvapitch_nvidia"
hf_cache_models_path = '/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/61b10e60b22bc21c1e072f72f1108b9c2b21e94c/'
# Alias kept because the rest of the module reads `models_path`; previously
# this duplicated the same long literal, inviting drift between the two.
models_path = hf_cache_models_path

# Link the cached TorchMoji/DeepMoji snapshot into the path the xVASynth
# DeepMoji plugin expects to find its model under.
try:
    os.symlink(
        '/home/user/.cache/huggingface/hub/models--Pendrokar--TorchMoji/snapshots/58217568daaf64d3621245dd5c88c94e651a08d6',
        '/home/user/app/resources/app/plugins/deepmoji_plugings/model',
        target_is_directory=True,
    )
except OSError:
    # Narrowed from a bare `except:` — only filesystem errors are expected
    # here (e.g. FileExistsError when the link is already in place).
    print('Failed to create symlink to DeepMoji model, may already be there.')
# (display label, model file stem) pairs offered in the UI voice picker.
# The stems are the CC-BY NVIDIA HiFi speakers bundled in the HF model repo;
# each stem is resolved against `models_path` by load_model().
voice_models = [
    ("Male #6671", "ccby_nvidia_hifi_6671_M"),
    ("Male #6670", "ccby_nvidia_hifi_6670_M"),
    ("Male #9017", "ccby_nvidia_hifi_9017_M"),
    ("Male #6097", "ccby_nvidia_hifi_6097_M"),
    ("Female #92", "ccby_nvidia_hifi_92_F"),
    ("Female #11697", "ccby_nvidia_hifi_11697_F"),
    ("Female #12787", "ccby_nvidia_hifi_12787_F"),
    ("Female #11614", "ccby_nv_hifi_11614_F"),
    ("Female #8051", "ccby_nvidia_hifi_8051_F"),
    ("Female #9136", "ccby_nvidia_hifi_9136_F"),
]

# Module-level state shared between load_model() and predict():
# the model currently loaded in the xVASynth server and its base speaker
# embedding (filled in from the model's sidecar JSON by load_model()).
current_voice_model = None
base_speaker_emb = ''
# Languages selectable in the UI as (flag/label, language code) pairs; the
# codes index into `default_text` below and are sent to the server as
# `base_lang`. Order is ranked by similarity to English due to xVASynth's
# use of ARPAbet instead of IPA.
languages = [
    ("🇬🇧 EN", "en"),
    ("🇩🇪 DE", "de"),
    ("🇪🇸 ES", "es"),
    ("🇮🇹 IT", "it"),
    ("🇳🇱 NL", "nl"),
    ("🇵🇹 PT", "pt"),
    ("🇵🇱 PL", "pl"),
    ("🇷🇴 RO", "ro"),
    ("🇸🇪 SV", "sv"),
    ("🇩🇰 DA", "da"),
    ("🇫🇮 FI", "fi"),
    ("🇭🇺 HU", "hu"),
    ("🇬🇷 EL", "el"),
    ("🇫🇷 FR", "fr"),
    ("🇷🇺 RU", "ru"),
    ("🇺🇦 UK", "uk"),
    ("🇹🇷 TR", "tr"),
    ("🇸🇦 AR", "ar"),
    ("🇮🇳 HI", "hi"),
    ("🇯🇵 JP", "jp"),
    ("🇰🇷 KO", "ko"),
    ("🇨🇳 ZH", "zh"),
    ("🇻🇳 VI", "vi"),
    ("🇻🇦 LA", "la"),
    ("HA", "ha"),
    ("SW", "sw"),
    ("🇳🇬 YO", "yo"),
    ("WO", "wo"),
]
# Per-language sample sentence for the input textbox; keys match the codes
# declared in `languages` above (note "jp", not ISO "ja").
# Translated from English by DeepMind's Gemini Pro.
default_text = {
    "ar": "هذا هو صوتي.",
    "da": "Sådan lyder min stemme.",
    "de": "So klingt meine Stimme.",
    "el": "Έτσι ακούγεται η φωνή μου.",
    "en": "This is what my voice sounds like.",
    "es": "Así suena mi voz.",
    "fi": "Näin ääneni kuulostaa.",
    "fr": "Voici à quoi ressemble ma voix.",
    "ha": "Wannan ne muryata ke.",
    "hi": "यह मेरी आवाज़ कैसी लगती है।",
    "hu": "Így hangzik a hangom.",
    "it": "Così suona la mia voce.",
    "jp": "これが私の声です。",
    "ko": "여기 제 목소리가 어떤지 들어보세요.",
    "la": "Haec est vox mea sonans.",
    "nl": "Dit is hoe mijn stem klinkt.",
    "pl": "Tak brzmi mój głos.",
    "pt": "É assim que minha voz soa.",
    "ro": "Așa sună vocea mea.",
    "ru": "Вот как звучит мой голос.",
    "sv": "Såhär låter min röst.",
    "sw": "Sauti yangu inasikika hivi.",
    "tr": "Benim sesimin sesi böyle.",
    "uk": "Ось як звучить мій голос.",
    "vi": "Đây là giọng nói của tôi.",
    "wo": "Ndox li neen xewnaal ma.",
    "yo": "Ìyí ni ohùn mi ńlá.",
    "zh": "这是我的声音。",
}
def run_xvaserver():
    """Launch the bundled xVASynth server as a subprocess and block until it exits.

    Intended to run on a background thread (see __main__). After the server
    answers on port 8008, loads the default voice model so the first
    /synthesize request works. Returns early if the HTTP probe fails.
    """
    # Start the process without waiting for a response.
    print('Running xVAServer subprocess...\n')
    app_dir = f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/'
    xvaserver = Popen(
        ['python', f'{app_dir}server.py'],
        stdout=PIPE,
        stderr=PIPE,
        cwd=app_dir,
    )

    # Give the server a moment to start up.
    time.sleep(10)

    # poll() is non-None only if the process has already exited.
    if xvaserver.poll() is not None:
        print("Web server failed to start.")
        # Was sys.exit(0): a failed startup must not report success.
        sys.exit(1)

    # Probe the local xVASynth server.
    print('Attempting to connect to xVASynth...')
    try:
        response = requests.get('http://0.0.0.0:8008')
        response.raise_for_status()  # raise on HTTP error status codes
    except requests.exceptions.RequestException as err:
        # Previously the caught error was silently discarded.
        print(f'Failed to connect! {err}')
        return

    print('xVAServer running on port 8008')

    # Load the default voice model.
    load_model("ccby_nvidia_hifi_6671_M")

    # Block until the server process exits.
    xvaserver.wait()
def load_model(voice_model_name):
    """Ask the xVASynth server to load a voice model and return its embedding.

    Args:
        voice_model_name: file stem of the model under `models_path`.

    Returns:
        The model's base speaker embedding read from the sidecar
        `<model>.json`, or the current module-level `base_speaker_emb`
        unchanged if loading failed.
    """
    # Bug fix: this was a plain assignment, creating a *local* variable, so
    # the global tracking the loaded model never updated and predict()
    # reloaded the model on every call.
    global current_voice_model

    model_path = models_path + voice_model_name

    data = {
        'outputs': None,
        'version': '3.0',
        'model': model_path,
        'modelType': 'xVAPitch',
        'base_lang': 'en',
        'pluginsContext': '{}',
    }

    embs = base_speaker_emb
    try:
        response = requests.post('http://0.0.0.0:8008/loadModel', json=data, timeout=60)
        response.raise_for_status()  # raise on HTTP error status codes
        current_voice_model = voice_model_name

        # The base speaker embedding ships in the JSON next to the model file.
        with open(model_path + '.json', 'r', encoding='utf-8') as f:
            voice_model_json = json.load(f)
        embs = voice_model_json['games'][0]['base_speaker_emb']
    except requests.exceptions.RequestException:
        print('Failed to load voice model!')
    except (OSError, KeyError, IndexError, json.JSONDecodeError):
        # Server accepted the model but the sidecar JSON is missing or
        # malformed; keep the previous embedding rather than crashing.
        print('Failed to read base speaker embedding!')

    return embs
def _print_server_log():
    """Dump the xVASynth server log to stdout for debugging."""
    print('server.log contents:')
    try:
        with open('resources/app/server.log', 'r', encoding='utf-8') as f:
            print(f.read())
    except OSError:
        # Log file may not exist yet; don't mask the real failure.
        print('(server.log unavailable)')


def predict(
    input_text,
    voice,
    lang,
    pacing,
    pitch,
    energy,
    anger,
    happy,
    sad,
    surprise,
    use_deepmoji
):
    """Gradio handler: synthesize `input_text` with the selected voice.

    `pitch` and `energy` are accepted (their sliders are hidden) but are not
    forwarded to the server. Returns [audio_file_path, server_response_text];
    on failure the first element is '' and the second is the error.
    """
    # Bug fix: `base_speaker_emb` was assigned without `global`, making it
    # local to the whole function — the "model already loaded" path then hit
    # an UnboundLocalError when building the request payload below.
    global base_speaker_emb

    # Grab only the first 1000 characters.
    input_text = input_text[:1000]

    # (Re)load the voice model only when it differs from the active one.
    if current_voice_model != voice:
        base_speaker_emb = load_model(voice)

    pace = pacing if pacing else 1.0
    save_path = '/tmp/xvapitch_audio_sample.wav'

    # Mantella plugin context: emotion weights plus the DeepMoji auto-adjust
    # toggle. Negative slider values are clamped to 0.
    plugins_context = {
        "mantella_settings": {
            "emAngry": max(anger, 0),
            "emHappy": max(happy, 0),
            "emSad": max(sad, 0),
            "emSurprise": max(surprise, 0),
            "run_model": use_deepmoji,
        }
    }

    data = {
        'pluginsContext': json.dumps(plugins_context),
        'modelType': 'xVAPitch',
        # Pad with whitespaces as a workaround to avoid cutoffs.
        'sequence': input_text.center(len(input_text) + 2, ' '),
        'pace': pace,
        'outfile': save_path,
        'vocoder': 'n/a',
        'base_lang': lang,
        'base_emb': base_speaker_emb,
        'useSR': 0,
        'useCleanup': 0,
    }

    try:
        response = requests.post('http://0.0.0.0:8008/synthesize', json=data, timeout=60)
        response.raise_for_status()  # raise on HTTP error status codes
    except requests.exceptions.RequestException as err:
        print('Failed to synthesize!')
        _print_server_log()
        return ['', err]

    _print_server_log()
    return [save_path, response.text]
# --- Gradio input components ------------------------------------------------

# Free-text input; ARPAbet phoneme overrides may be embedded in {} brackets.
input_textbox = gr.Textbox(
    label="Input Text",
    value="This is what my voice sounds like.",
    info="Also accepts ARPAbet symbols placed within {} brackets.",
    lines=1,
    max_lines=5,
    autofocus=True
)
# Speech-duration multiplier, forwarded to the server as `pace` by predict().
pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
# Pitch/energy are accepted by predict() but hidden and currently unused.
pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
# Emotion sliders feeding the Mantella plugin context built in predict().
anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Does not play well with Happiness with either being beyond 0.3")
# Voice picker over the (label, model-stem) pairs declared above.
voice_radio = gr.Radio(
    voice_models,
    value="ccby_nvidia_hifi_6671_M",
    label="Voice",
    info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
)
def set_default_text(lang):
    """Gradio change-handler: swap the textbox to `lang`'s sample sentence.

    Bug fix: the original built a Textbox and assigned it to a *local*
    variable, so nothing ever reached the UI — a Gradio event handler must
    *return* the replacement component.

    Args:
        lang: language code, keyed into `default_text`.

    Returns:
        A gr.Textbox carrying the sample sentence for `lang`.
    """
    return gr.Textbox(
        label="Input Text",
        # Fall back to English if a language has no sample sentence.
        value=default_text.get(lang, default_text["en"]),
        lines=1,
        max_lines=5,
        autofocus=True
    )
# Language picker; the value is the two-letter code predict() forwards as
# `base_lang`.
language_radio = gr.Radio(
    languages,
    value="en",
    label="Language",
    info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
)

# Intended to swap the sample sentence when the language changes; left
# disabled by the author.
# language_radio.change(set_default_text)

# Toggles the DeepMoji-driven auto emotion adjustment (see predict()).
deepmoji_checkbox = gr.Checkbox(label="Use DeepMoji", info="Auto adjust emotional values")
# Wire the UI to predict(); the component order here must match predict()'s
# parameter order exactly.
gradio_app = gr.Interface(
    predict,
    [
        input_textbox,
        voice_radio,
        language_radio,
        pacing_slider,
        pitch_slider,
        energy_slider,
        anger_slider,
        happy_slider,
        sad_slider,
        surprise_slider,
        deepmoji_checkbox
    ],
    outputs=[
        # predict() returns [wav path, raw server response text].
        gr.Audio(label="22kHz audio output", type="filepath"),
        gr.Textbox(label="xVASynth Server Response")
    ],
    title="xVASynth (WIP)",
    # Hide the Clear button.
    clear_btn=gr.Button(visible=False)
    # examples=[
    #     ["Once, I headed in much deeper. But I doubt I'll ever do that again.", 1],
    #     ["You love hurting me, huh?", 1.5],
    #     ["Ah, I see. Well, I'm afraid I can't help with that.", 1],
    #     ["Embrace your demise!", 1],
    #     ["Never come back!", 1]
    # ],
    # cache_examples=None
)
if __name__ == "__main__":
    # Run the xVASynth server on a background thread so the blocking
    # Gradio launch() can own the main thread.
    print('Starting xVAServer thread')
    server_thread = threading.Thread(target=run_xvaserver)
    server_thread.start()

    print('running Gradio interface')
    gradio_app.launch()

    # Normally not reached: launch() blocks for the app's lifetime.
    server_thread.join()