| import numpy as np |
| import soundfile |
| import msinference |
|
|
|
|
def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='af_ZA_google-nwu_1919',
              speed=1.4,
              affect=True):
    '''Synthesise `text` with the requested voice and return the waveform.

    The result is documented by the original author as a 24 kHz np.array;
    it is peak-normalised in place before being returned.

    `voice` selects one of three code paths:

        voice : 'en_US/vctk_low#p276'    # contains 'en_US/' or 'en_UK/'
            English voices -> https://audeering.github.io/shift/
            Style wav is read from 'assets/wavs/style_vector/' when `affect`
            is truthy, otherwise from 'assets/wavs/style_vector/v2/'.

        voice : 'af_ZA_google-nwu_1919'  # any other name containing '_'
            English non-native accents ->
            https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
            Style wav is read from 'assets/wavs/mimic3_foreign_4x/'.

        voice : 'deu'                    # anything else
            Foreign langs ->
            https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
            `voice` is passed as `lang` to msinference.foreign.

    `speed` is forwarded only on the foreign-language path, and `affect` is
    consulted only for 'en_US/' / 'en_UK/' voices; both are ignored elsewhere.
    '''
    native_english = ('en_US/' in voice) or ('en_UK/' in voice)

    if native_english or '_' in voice:
        # Turn the voice identifier into the file stem used on disk.
        # The substitutions are applied in this exact order.
        wav_stem = voice
        substitutions = (('/', '_'),
                         ('#', '_'),
                         ('cmu-arctic', 'cmu_arctic'),
                         ('_low', ''))
        for old, new in substitutions:
            wav_stem = wav_stem.replace(old, new)

        if native_english:
            subdir = '' if affect else 'v2/'
            wav_dir = 'assets/wavs/style_vector/' + subdir
        else:
            wav_dir = 'assets/wavs/mimic3_foreign_4x/'

        style_vector = msinference.compute_style(wav_dir + wav_stem + '.wav')
        x = msinference.inference(text, style_vector)
    else:
        x = msinference.foreign(text=text,
                                lang=voice,
                                speed=speed)

    # Scale so the largest absolute sample is ~1; the epsilon avoids a
    # division by zero on an all-zero signal.
    peak = np.abs(x).max()
    x /= peak + 1e-7
    print(x.shape, 'TTS OK')
    return x
|
|
# Demo: synthesise the default sentence with the default voice and save it
# to demo.wav using a sample rate of 24000.
soundfile.write('demo.wav', tts_entry(), 24000)
|
|