Dionyssos committed
Commit b2b0a60 · 1 Parent(s): 8099af0
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. README.md +10 -6
  2. app.py +467 -0
  3. audionar.py +623 -0
  4. requirements.txt +14 -0
  5. textual.py +536 -0
  6. tts.py +847 -0
  7. wav/af_ZA_google-nwu_0184.wav +0 -0
  8. wav/af_ZA_google-nwu_1919.wav +0 -0
  9. wav/af_ZA_google-nwu_2418.wav +0 -0
  10. wav/af_ZA_google-nwu_6590.wav +0 -0
  11. wav/af_ZA_google-nwu_7130.wav +0 -0
  12. wav/af_ZA_google-nwu_7214.wav +0 -0
  13. wav/af_ZA_google-nwu_8148.wav +0 -0
  14. wav/af_ZA_google-nwu_8924.wav +0 -0
  15. wav/af_ZA_google-nwu_8963.wav +0 -0
  16. wav/bn_multi_00737.wav +0 -0
  17. wav/bn_multi_00779.wav +0 -0
  18. wav/bn_multi_01232.wav +0 -0
  19. wav/bn_multi_01701.wav +0 -0
  20. wav/bn_multi_03042.wav +0 -0
  21. wav/bn_multi_0834.wav +0 -0
  22. wav/bn_multi_1010.wav +0 -0
  23. wav/bn_multi_3108.wav +0 -0
  24. wav/bn_multi_3713.wav +0 -0
  25. wav/bn_multi_3958.wav +0 -0
  26. wav/bn_multi_4046.wav +0 -0
  27. wav/bn_multi_4811.wav +0 -0
  28. wav/bn_multi_5958.wav +0 -0
  29. wav/bn_multi_9169.wav +0 -0
  30. wav/bn_multi_rm.wav +0 -0
  31. wav/de_DE_m-ailabs_angela_merkel.wav +0 -0
  32. wav/de_DE_m-ailabs_eva_k.wav +0 -0
  33. wav/de_DE_m-ailabs_karlsson.wav +0 -0
  34. wav/de_DE_m-ailabs_ramona_deininger.wav +0 -0
  35. wav/de_DE_m-ailabs_rebecca_braunert_plunkett.wav +0 -0
  36. wav/de_DE_thorsten-emotion_amused.wav +0 -0
  37. wav/el_GR_rapunzelina.wav +0 -0
  38. wav/en_UK_apope.wav +0 -0
  39. wav/en_US_cmu_arctic_aew.wav +0 -0
  40. wav/en_US_cmu_arctic_aup.wav +0 -0
  41. wav/en_US_cmu_arctic_awb.wav +0 -0
  42. wav/en_US_cmu_arctic_awbrms.wav +0 -0
  43. wav/en_US_cmu_arctic_axb.wav +0 -0
  44. wav/en_US_cmu_arctic_bdl.wav +0 -0
  45. wav/en_US_cmu_arctic_clb.wav +0 -0
  46. wav/en_US_cmu_arctic_eey.wav +0 -0
  47. wav/en_US_cmu_arctic_fem.wav +0 -0
  48. wav/en_US_cmu_arctic_gka.wav +0 -0
  49. wav/en_US_cmu_arctic_jmk.wav +0 -0
  50. wav/en_US_cmu_arctic_ksp.wav +0 -0
README.md CHANGED
@@ -1,14 +1,18 @@
  ---
- title: SHIFT
- emoji: 🐨
- colorFrom: indigo
+ title: Speech analysis
+ emoji: 💤
+ colorFrom: gray
  colorTo: gray
  sdk: gradio
- sdk_version: 5.45.0
+ sdk_version: 5.41.1
  app_file: app.py
- pinned: false
+ short_description: TTS for CPU
  license: cc-by-nc-4.0
- short_description: https://shift-europe.eu/
+ tags:
+ - non-AR
+ - affective
+ - shift
+ - tts
  ---
 
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,467 @@
+ # -*- coding: utf-8 -*-
+ import typing
+ import gradio as gr
+ import numpy as np
+ import os
+ import torch
+ import torch.nn as nn
+ import audiofile
+ from tts import StyleTTS2
+ from textual import only_greek_or_only_latin, transliterate_number, fix_vocals
+ import textwrap
+ import nltk
+ from audionar import VitsModel, VitsTokenizer
+
+
+ nltk.download('punkt', download_dir='./')
+ nltk.download('punkt_tab', download_dir='./')
+ nltk.data.path.append('.')
+
+
+ language_names = ['Ancient greek',
+                   'English',
+                   'Deutsch',
+                   'French',
+                   'Hungarian',
+                   'Romanian',
+                   'Serbian (Approx.)']
+
+
+ def audionar_tts(text=None,
+                  lang='Romanian'):
+
+     # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
+
+     lang_map = {
+         'ancient greek': 'grc',
+         'english': 'eng',
+         'deutsch': 'deu',
+         'french': 'fra',
+         'hungarian': 'hun',
+         'romanian': 'ron',
+         'serbian (approx.)': 'rmc-script_latin',
+     }
+
+     if text is None or text.strip() == '':
+         text = 'No Audio or Txt Input'
+
+     if lang not in language_names:  # StyleTTS2 (lang holds a voice name, not a language)
+
+         text = only_greek_or_only_latin(text, lang='eng')
+
+         x = _tts.inference(text,
+                            ref_s='wav/' + lang + '.wav')[0, 0, :].numpy()  # 24 kHz
+
+     else:  # VITS
+
+         lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
+
+         global cached_lang_code, cached_net_g, cached_tokenizer
+
+         # keep one MMS model/tokenizer pair in memory, reloaded only on language switch
+         if 'cached_lang_code' not in globals() or cached_lang_code != lang_code:
+             cached_lang_code = lang_code
+             cached_net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval()
+             cached_tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
+
+         text = only_greek_or_only_latin(text, lang=lang_code)
+         text = transliterate_number(text, lang=lang_code)
+         text = fix_vocals(text, lang=lang_code)
+
+         sentences = textwrap.wrap(text, width=439)
+
+         total_audio_parts = []
+         for sentence in sentences:
+             inputs = cached_tokenizer(sentence, return_tensors="pt")
+             with torch.no_grad():
+                 audio_part = cached_net_g(
+                     input_ids=inputs.input_ids,
+                     attention_mask=inputs.attention_mask,
+                     lang_code=lang_code,
+                 )[0, :]
+             total_audio_parts.append(audio_part)
+
+         x = torch.cat(total_audio_parts).cpu().numpy()
+
+     x = x[None, :]
+     x = np.concatenate([0.49 * x, 0.51 * x], 0)
+
+     wavfile = '_vits_.wav'
+     audiofile.write(wavfile, x, 16000)
+     return wavfile  # same file serves the audio output & the state passed to the emotion recognition tab
+
+
+ # TTS
+ # VOICES = [f'wav/{vox}' for vox in os.listdir('wav')]
+ # add unidecode to parse non-Roman characters for StyleTTS2
+ # (for VITS it is better to skip unknown letters - don't use unidecode())
+ # at generation, fill the state of "last tts"
+ # at record, fill the state of "last record" and place it in the list of voices/langs for TTS
+ VOICES = ['jv_ID_google-gmu_04982.wav',
+           # 'it_IT_mls_1595.wav',
+           'en_US_vctk_p303.wav',
+           'en_US_vctk_p306.wav',
+           'it_IT_mls_8842.wav',
+           'en_US_cmu_arctic_ksp.wav',
+           'jv_ID_google-gmu_05970.wav',
+           'en_US_vctk_p318.wav',
+           'ha_NE_openbible.wav',
+           'ne_NP_ne-google_0883.wav',
+           'en_US_vctk_p280.wav',
+           'bn_multi_1010.wav',
+           'en_US_vctk_p259.wav',
+           'it_IT_mls_844.wav',
+           'en_US_vctk_p269.wav',
+           'en_US_vctk_p285.wav',
+           'de_DE_m-ailabs_angela_merkel.wav',
+           'en_US_vctk_p316.wav',
+           'en_US_vctk_p362.wav',
+           'jv_ID_google-gmu_06207.wav',
+           'tn_ZA_google-nwu_9061.wav',
+           'fr_FR_tom.wav',
+           'en_US_vctk_p233.wav',
+           'it_IT_mls_4975.wav',
+           'en_US_vctk_p236.wav',
+           'bn_multi_01232.wav',
+           'bn_multi_5958.wav',
+           'it_IT_mls_9185.wav',
+           'en_US_vctk_p248.wav',
+           'en_US_vctk_p287.wav',
+           'it_IT_mls_9772.wav',
+           'te_IN_cmu-indic_sk.wav',
+           'tn_ZA_google-nwu_8333.wav',
+           'en_US_vctk_p260.wav',
+           'en_US_vctk_p247.wav',
+           'en_US_vctk_p329.wav',
+           'en_US_cmu_arctic_fem.wav',
+           'en_US_cmu_arctic_rms.wav',
+           'en_US_vctk_p308.wav',
+           'jv_ID_google-gmu_08736.wav',
+           'en_US_vctk_p245.wav',
+           'fr_FR_m-ailabs_nadine_eckert_boulet.wav',
+           'jv_ID_google-gmu_03314.wav',
+           'en_US_vctk_p239.wav',
+           'jv_ID_google-gmu_05540.wav',
+           'it_IT_mls_7440.wav',
+           'en_US_vctk_p310.wav',
+           'en_US_vctk_p237.wav',
+           'en_US_hifi-tts_92.wav',
+           'en_US_cmu_arctic_aew.wav',
+           'ne_NP_ne-google_2099.wav',
+           'en_US_vctk_p226.wav',
+           'af_ZA_google-nwu_1919.wav',
+           'jv_ID_google-gmu_03727.wav',
+           'en_US_vctk_p317.wav',
+           'tn_ZA_google-nwu_0378.wav',
+           'nl_pmk.wav',
+           'en_US_vctk_p286.wav',
+           'tn_ZA_google-nwu_3342.wav',
+           # 'en_US_vctk_p343.wav',
+           'de_DE_m-ailabs_ramona_deininger.wav',
+           'jv_ID_google-gmu_03424.wav',
+           'en_US_vctk_p341.wav',
+           'jv_ID_google-gmu_03187.wav',
+           'ne_NP_ne-google_3960.wav',
+           'jv_ID_google-gmu_06080.wav',
+           'ne_NP_ne-google_3997.wav',
+           # 'en_US_vctk_p267.wav',
+           'en_US_vctk_p240.wav',
+           'ne_NP_ne-google_5687.wav',
+           'ne_NP_ne-google_9407.wav',
+           'jv_ID_google-gmu_05667.wav',
+           'jv_ID_google-gmu_01519.wav',
+           'ne_NP_ne-google_7957.wav',
+           'it_IT_mls_4705.wav',
+           'ne_NP_ne-google_6329.wav',
+           'it_IT_mls_1725.wav',
+           'tn_ZA_google-nwu_8914.wav',
+           'en_US_ljspeech.wav',
+           'tn_ZA_google-nwu_4850.wav',
+           'en_US_vctk_p238.wav',
+           'en_US_vctk_p302.wav',
+           'jv_ID_google-gmu_08178.wav',
+           'en_US_vctk_p313.wav',
+           'af_ZA_google-nwu_2418.wav',
+           'bn_multi_00737.wav',
+           'en_US_vctk_p275.wav',  # y
+           'af_ZA_google-nwu_0184.wav',
+           'jv_ID_google-gmu_07638.wav',
+           'ne_NP_ne-google_6587.wav',
+           'ne_NP_ne-google_0258.wav',
+           'en_US_vctk_p232.wav',
+           'en_US_vctk_p336.wav',
+           'jv_ID_google-gmu_09039.wav',
+           'en_US_vctk_p312.wav',
+           'af_ZA_google-nwu_8148.wav',
+           'en_US_vctk_p326.wav',
+           'en_US_vctk_p264.wav',
+           'en_US_vctk_p295.wav',
+           # 'en_US_vctk_p298.wav',
+           'es_ES_m-ailabs_victor_villarraza.wav',
+           'pl_PL_m-ailabs_nina_brown.wav',
+           'tn_ZA_google-nwu_9365.wav',
+           'en_US_vctk_p294.wav',
+           'jv_ID_google-gmu_00658.wav',
+           'jv_ID_google-gmu_08305.wav',
+           'en_US_vctk_p330.wav',
+           'gu_IN_cmu-indic_cmu_indic_guj_dp.wav',
+           'jv_ID_google-gmu_05219.wav',
+           'en_US_vctk_p284.wav',
+           'de_DE_m-ailabs_eva_k.wav',
+           # 'bn_multi_00779.wav',
+           'en_UK_apope.wav',
+           'en_US_vctk_p345.wav',
+           'it_IT_mls_6744.wav',
+           'en_US_vctk_p347.wav',
+           'en_US_m-ailabs_mary_ann.wav',
+           'en_US_m-ailabs_elliot_miller.wav',
+           'en_US_vctk_p279.wav',
+           'ru_RU_multi_nikolaev.wav',
+           'bn_multi_4811.wav',
+           'tn_ZA_google-nwu_7693.wav',
+           'bn_multi_01701.wav',
+           'en_US_vctk_p262.wav',
+           # 'en_US_vctk_p266.wav',
+           'en_US_vctk_p243.wav',
+           'en_US_vctk_p297.wav',
+           'en_US_vctk_p278.wav',
+           'jv_ID_google-gmu_02059.wav',
+           'en_US_vctk_p231.wav',
+           'te_IN_cmu-indic_kpn.wav',
+           'en_US_vctk_p250.wav',
+           'it_IT_mls_4974.wav',
+           'en_US_cmu_arctic_awbrms.wav',
+           # 'en_US_vctk_p263.wav',
+           'nl_femal.wav',
+           'tn_ZA_google-nwu_6116.wav',
+           'jv_ID_google-gmu_06383.wav',
+           'en_US_vctk_p225.wav',
+           'en_US_vctk_p228.wav',
+           'it_IT_mls_277.wav',
+           'tn_ZA_google-nwu_7866.wav',
+           'en_US_vctk_p300.wav',
+           'ne_NP_ne-google_0649.wav',
+           'es_ES_carlfm.wav',
+           'jv_ID_google-gmu_06510.wav',
+           'de_DE_m-ailabs_rebecca_braunert_plunkett.wav',
+           'en_US_vctk_p340.wav',
+           'en_US_cmu_arctic_gka.wav',
+           'ne_NP_ne-google_2027.wav',
+           'jv_ID_google-gmu_09724.wav',
+           'en_US_vctk_p361.wav',
+           'ne_NP_ne-google_6834.wav',
+           'jv_ID_google-gmu_02326.wav',
+           'fr_FR_m-ailabs_zeckou.wav',
+           'tn_ZA_google-nwu_1932.wav',
+           # 'female-20-happy.wav',
+           'tn_ZA_google-nwu_1483.wav',
+           'de_DE_thorsten-emotion_amused.wav',
+           'ru_RU_multi_minaev.wav',
+           'sw_lanfrica.wav',
+           'en_US_vctk_p271.wav',
+           'tn_ZA_google-nwu_0441.wav',
+           'it_IT_mls_6001.wav',
+           'en_US_vctk_p305.wav',
+           'it_IT_mls_8828.wav',
+           'jv_ID_google-gmu_08002.wav',
+           'it_IT_mls_2033.wav',
+           'tn_ZA_google-nwu_3629.wav',
+           'it_IT_mls_6348.wav',
+           'en_US_cmu_arctic_axb.wav',
+           'it_IT_mls_8181.wav',
+           'en_US_vctk_p230.wav',
+           'af_ZA_google-nwu_7214.wav',
+           'nl_nathalie.wav',
+           'it_IT_mls_8207.wav',
+           'ko_KO_kss.wav',
+           'af_ZA_google-nwu_6590.wav',
+           'jv_ID_google-gmu_00264.wav',
+           'tn_ZA_google-nwu_6234.wav',
+           'jv_ID_google-gmu_05522.wav',
+           'en_US_cmu_arctic_lnh.wav',
+           'en_US_vctk_p272.wav',
+           'en_US_cmu_arctic_slp.wav',
+           'en_US_vctk_p299.wav',
+           'en_US_hifi-tts_9017.wav',
+           'it_IT_mls_4998.wav',
+           'it_IT_mls_6299.wav',
+           'en_US_cmu_arctic_rxr.wav',
+           # 'female-46-neutral.wav',
+           'jv_ID_google-gmu_01392.wav',
+           'tn_ZA_google-nwu_8512.wav',
+           'en_US_vctk_p244.wav',
+           # 'bn_multi_3108.wav',
+           # 'it_IT_mls_7405.wav',
+           # 'bn_multi_3713.wav',
+           # 'yo_openbible.wav',
+           # 'jv_ID_google-gmu_01932.wav',
+           'en_US_vctk_p270.wav',
+           'tn_ZA_google-nwu_6459.wav',
+           'bn_multi_4046.wav',
+           'en_US_vctk_p288.wav',
+           'en_US_vctk_p251.wav',
+           'es_ES_m-ailabs_tux.wav',
+           'tn_ZA_google-nwu_6206.wav',
+           'bn_multi_9169.wav',
+           # 'en_US_vctk_p293.wav',
+           # 'en_US_vctk_p255.wav',
+           'af_ZA_google-nwu_8963.wav',
+           # 'en_US_vctk_p265.wav',
+           'gu_IN_cmu-indic_cmu_indic_guj_ad.wav',
+           'jv_ID_google-gmu_07335.wav',
+           'en_US_vctk_p323.wav',
+           'en_US_vctk_p281.wav',
+           'en_US_cmu_arctic_bdl.wav',
+           'en_US_m-ailabs_judy_bieber.wav',
+           'it_IT_mls_10446.wav',
+           'en_US_vctk_p261.wav',
+           'en_US_vctk_p292.wav',
+           'te_IN_cmu-indic_ss.wav',
+           'en_US_vctk_p311.wav',
+           'it_IT_mls_12428.wav',
+           'en_US_cmu_arctic_aup.wav',
+           'jv_ID_google-gmu_04679.wav',
+           'it_IT_mls_4971.wav',
+           'en_US_cmu_arctic_ljm.wav',
+           'fa_haaniye.wav',
+           'en_US_vctk_p339.wav',
+           'tn_ZA_google-nwu_7896.wav',
+           'en_US_vctk_p253.wav',
+           'it_IT_mls_5421.wav',
+           # 'ne_NP_ne-google_0546.wav',
+           'vi_VN_vais1000.wav',
+           'en_US_vctk_p229.wav',
+           'en_US_vctk_p254.wav',
+           'en_US_vctk_p258.wav',
+           'it_IT_mls_7936.wav',
+           'en_US_vctk_p301.wav',
+           'tn_ZA_google-nwu_0045.wav',
+           'it_IT_mls_659.wav',
+           'tn_ZA_google-nwu_7674.wav',
+           'it_IT_mls_12804.wav',
+           'el_GR_rapunzelina.wav',
+           'en_US_hifi-tts_6097.wav',
+           'en_US_vctk_p257.wav',
+           'jv_ID_google-gmu_07875.wav',
+           'it_IT_mls_1157.wav',
+           'it_IT_mls_643.wav',
+           'en_US_vctk_p304.wav',
+           'ru_RU_multi_hajdurova.wav',
+           'it_IT_mls_8461.wav',
+           'bn_multi_3958.wav',
+           'it_IT_mls_1989.wav',
+           'en_US_vctk_p249.wav',
+           # 'bn_multi_0834.wav',
+           'en_US_vctk_p307.wav',
+           'es_ES_m-ailabs_karen_savage.wav',
+           'fr_FR_m-ailabs_bernard.wav',
+           'en_US_vctk_p252.wav',
+           'en_US_cmu_arctic_jmk.wav',
+           'en_US_vctk_p333.wav',
+           'tn_ZA_google-nwu_4506.wav',
+           'ne_NP_ne-google_0283.wav',
+           'de_DE_m-ailabs_karlsson.wav',
+           'en_US_cmu_arctic_awb.wav',
+           'en_US_vctk_p246.wav',
+           'en_US_cmu_arctic_clb.wav',
+           'en_US_vctk_p364.wav',
+           'nl_flemishguy.wav',
+           'en_US_vctk_p276.wav',  # y
+           # 'en_US_vctk_p274.wav',
+           'fr_FR_m-ailabs_gilles_g_le_blanc.wav',
+           'it_IT_mls_7444.wav',
+           'style_o22050.wav',
+           'en_US_vctk_s5.wav',
+           'en_US_vctk_p268.wav',
+           'it_IT_mls_6807.wav',
+           'it_IT_mls_2019.wav',
+           # 'male-60-angry.wav',
+           'af_ZA_google-nwu_8924.wav',
+           'en_US_vctk_p374.wav',
+           'en_US_vctk_p363.wav',
+           'it_IT_mls_644.wav',
+           'ne_NP_ne-google_3614.wav',
+           'en_US_vctk_p241.wav',
+           'ne_NP_ne-google_3154.wav',
+           'en_US_vctk_p234.wav',
+           'it_IT_mls_8384.wav',
+           'fr_FR_m-ailabs_ezwa.wav',
+           'it_IT_mls_5010.wav',
+           'en_US_vctk_p351.wav',
+           'en_US_cmu_arctic_eey.wav',
+           'jv_ID_google-gmu_04285.wav',
+           'jv_ID_google-gmu_06941.wav',
+           'hu_HU_diana-majlinger.wav',
+           'tn_ZA_google-nwu_2839.wav',
+           'bn_multi_03042.wav',
+           'tn_ZA_google-nwu_5628.wav',
+           'it_IT_mls_4649.wav',
+           'af_ZA_google-nwu_7130.wav',
+           'en_US_cmu_arctic_slt.wav',
+           'jv_ID_google-gmu_04175.wav',
+           'gu_IN_cmu-indic_cmu_indic_guj_kt.wav',
+           'jv_ID_google-gmu_00027.wav',
+           'jv_ID_google-gmu_02884.wav',
+           'en_US_vctk_p360.wav',
+           'en_US_vctk_p334.wav',
+           # 'male-27-sad.wav',
+           'tn_ZA_google-nwu_1498.wav',
+           'fi_FI_harri-tapani-ylilammi.wav',
+           'bn_multi_rm.wav',
+           'ne_NP_ne-google_2139.wav',
+           'pl_PL_m-ailabs_piotr_nater.wav',
+           'fr_FR_siwis.wav',
+           'nl_bart-de-leeuw.wav',
+           'jv_ID_google-gmu_04715.wav',
+           'en_US_vctk_p283.wav',
+           'en_US_vctk_p314.wav',
+           'en_US_vctk_p335.wav',
+           'jv_ID_google-gmu_07765.wav',
+           'en_US_vctk_p273.wav'
+           ]
+ VOICES = [t[:-4] for t in VOICES]  # crop .wav for display in gr.Dropdown
+
+ _tts = StyleTTS2().to('cpu')
+
+
+ with gr.Blocks(theme='huggingface') as demo:
+     with gr.Row():
+         text_input = gr.Textbox(
+             label="Type text for TTS:",
+             placeholder="Type Text for TTS",
+             lines=4,
+             value='Η γρηγορη καφετι αλεπου πηδαει πανω απο τον τεμπελη σκυλο.',
+         )
+         choice_dropdown = gr.Dropdown(
+             choices=language_names + VOICES,
+             label="Vox",
+             value=language_names[0]
+         )
+         generate_button = gr.Button("Generate Audio", variant="primary")
+
+     output_audio = gr.Audio(label="TTS Output")
+
+     generate_button.click(
+         fn=audionar_tts,
+         inputs=[text_input, choice_dropdown],
+         outputs=[output_audio]
+     )
+ demo.launch(debug=True)
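
The VITS branch above caches one MMS checkpoint per language and synthesizes 439-character chunks. A minimal sketch of that path outside Gradio (an assumption for illustration, not part of the commit; it needs the audionar module from this diff and network access to the facebook/mms-tts-ron checkpoint):

    import torch
    from audionar import VitsModel, VitsTokenizer

    net_g = VitsModel.from_pretrained('facebook/mms-tts-ron').eval()
    tokenizer = VitsTokenizer.from_pretrained('facebook/mms-tts-ron')
    inputs = tokenizer('Buna ziua', return_tensors='pt')
    with torch.no_grad():
        wav = net_g(input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    lang_code='ron')  # 'ron' selects the OSCILLATION duration pattern
    print(wav.shape)  # (1, num_samples); app.py writes this out at 16 kHz
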
audionar.py ADDED
@@ -0,0 +1,623 @@
+ import math
+ import numpy as np
+ import torch
+ from torch import nn
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.configuration_utils import PretrainedConfig
+ import json
+ import os
+ import re
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.utils import is_phonemizer_available  # was missing; used in prepare_for_tokenization()
+ import phonemizer
+ import torch.nn.functional as F
+
+
+ OSCILLATION = {
+     'deu': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
+     'rmc-script_latin': [2, 2, 1, 2, 2],
+     'hun': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
+     'fra': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
+     'eng': [1, 2, 2, 1, 2, 2],
+     'grc': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
+     'ron': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2],
+ }
+
+
+ def has_non_roman_characters(input_string):
+     # Find any character outside the ASCII range
+     non_roman_pattern = re.compile(r"[^\x00-\x7F]")
+
+     # Search the input string for non-Roman characters
+     match = non_roman_pattern.search(input_string)
+     has_non_roman = match is not None
+     return has_non_roman
+
+
+ class VitsConfig(PretrainedConfig):
+
+     model_type = "vits"
+
+     def __init__(
+         self,
+         vocab_size=38,
+         hidden_size=192,
+         num_hidden_layers=6,
+         num_attention_heads=2,
+         use_bias=True,
+         ffn_dim=768,
+         ffn_kernel_size=3,
+         flow_size=192,
+         upsample_initial_channel=512,
+         upsample_rates=[8, 8, 2, 2],
+         upsample_kernel_sizes=[16, 16, 4, 4],
+         resblock_kernel_sizes=[3, 7, 11],
+         resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+         prior_encoder_num_flows=4,
+         prior_encoder_num_wavenet_layers=4,
+         wavenet_kernel_size=5,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.use_bias = use_bias
+         self.ffn_dim = ffn_dim
+         self.ffn_kernel_size = ffn_kernel_size
+         self.flow_size = flow_size
+         self.upsample_initial_channel = upsample_initial_channel
+         self.upsample_rates = upsample_rates
+         self.upsample_kernel_sizes = upsample_kernel_sizes
+         self.resblock_kernel_sizes = resblock_kernel_sizes
+         self.resblock_dilation_sizes = resblock_dilation_sizes
+         self.prior_encoder_num_flows = prior_encoder_num_flows
+         self.prior_encoder_num_wavenet_layers = prior_encoder_num_wavenet_layers
+         self.wavenet_kernel_size = wavenet_kernel_size
+         super().__init__(**kwargs)  # forward kwargs so e.g. pad_token_id from the pretrained config is kept
+
+
+ class VitsWaveNet(torch.nn.Module):
+     def __init__(self, config, num_layers):
+         super().__init__()
+         self.hidden_size = config.hidden_size
+         self.num_layers = num_layers
+         self.in_layers = torch.nn.ModuleList()
+         self.res_skip_layers = torch.nn.ModuleList()
+         weight_norm = nn.utils.parametrizations.weight_norm
+         for i in range(num_layers):
+             in_layer = torch.nn.Conv1d(
+                 in_channels=config.hidden_size,
+                 out_channels=2 * config.hidden_size,
+                 kernel_size=config.wavenet_kernel_size,
+                 dilation=1,
+                 padding=2,
+             )
+             in_layer = weight_norm(in_layer, name="weight")
+             self.in_layers.append(in_layer)
+
+             # last one is not necessary
+             if i < num_layers - 1:
+                 res_skip_channels = 2 * config.hidden_size
+             else:
+                 res_skip_channels = config.hidden_size
+             res_skip_layer = torch.nn.Conv1d(config.hidden_size, res_skip_channels, 1)
+             res_skip_layer = weight_norm(res_skip_layer, name="weight")
+             self.res_skip_layers.append(res_skip_layer)
+
+     def forward(self, inputs):
+         outputs = torch.zeros_like(inputs)
+         num_channels = torch.IntTensor([self.hidden_size])[0]
+         for i in range(self.num_layers):
+             in_act = self.in_layers[i](inputs)
+             # gated tanh/sigmoid activation as in WaveNet;
+             # the global (style) conditioning of the original model is omitted here
+             t_act = torch.tanh(in_act[:, :num_channels, :])
+             s_act = torch.sigmoid(in_act[:, num_channels:, :])
+             acts = t_act * s_act
+             res_skip_acts = self.res_skip_layers[i](acts)
+             if i < self.num_layers - 1:
+                 res_acts = res_skip_acts[:, : self.hidden_size, :]
+                 inputs = inputs + res_acts
+                 outputs = outputs + res_skip_acts[:, self.hidden_size:, :]
+             else:
+                 outputs = outputs + res_skip_acts
+         return outputs
+
+
+ # Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
+ class HifiGanResidualBlock(nn.Module):
+     def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
+         super().__init__()
+         self.leaky_relu_slope = leaky_relu_slope
+
+         self.convs1 = nn.ModuleList(
+             [
+                 nn.Conv1d(
+                     channels,
+                     channels,
+                     kernel_size,
+                     stride=1,
+                     dilation=dilation[i],
+                     padding=self.get_padding(kernel_size, dilation[i]),
+                 )
+                 for i in range(len(dilation))
+             ]
+         )
+         self.convs2 = nn.ModuleList(
+             [
+                 nn.Conv1d(
+                     channels,
+                     channels,
+                     kernel_size,
+                     stride=1,
+                     dilation=1,
+                     padding=self.get_padding(kernel_size, 1),
+                 )
+                 for _ in range(len(dilation))
+             ]
+         )
+
+     def get_padding(self, kernel_size, dilation=1):
+         # 1, 3, 5, 15
+         return (kernel_size * dilation - dilation) // 2
+
+     def forward(self, hidden_states):
+         for conv1, conv2 in zip(self.convs1, self.convs2):
+             residual = hidden_states
+             hidden_states = nn.functional.leaky_relu(hidden_states, negative_slope=self.leaky_relu_slope)
+             hidden_states = conv1(hidden_states)
+             hidden_states = nn.functional.leaky_relu(hidden_states, negative_slope=self.leaky_relu_slope)
+             hidden_states = conv2(hidden_states)
+             hidden_states = hidden_states + residual
+         return hidden_states
+
+
+ class VitsHifiGan(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.num_kernels = len(config.resblock_kernel_sizes)
+         self.num_upsamples = len(config.upsample_rates)
+         self.conv_pre = nn.Conv1d(
+             config.flow_size,
+             config.upsample_initial_channel,
+             kernel_size=7,
+             stride=1,
+             padding=3,
+         )
+
+         self.upsampler = nn.ModuleList()
+         for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
+             self.upsampler.append(
+                 nn.ConvTranspose1d(
+                     config.upsample_initial_channel // (2**i),
+                     config.upsample_initial_channel // (2 ** (i + 1)),
+                     kernel_size=kernel_size,
+                     stride=upsample_rate,
+                     padding=(kernel_size - upsample_rate) // 2,
+                 )
+             )
+
+         self.resblocks = nn.ModuleList()
+         for i in range(len(self.upsampler)):
+             channels = config.upsample_initial_channel // (2 ** (i + 1))
+             for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
+                 self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation))
+         self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3, bias=False)
+
+     def forward(self, spectrogram):
+         hidden_states = self.conv_pre(spectrogram)
+         for i in range(self.num_upsamples):
+             hidden_states = F.leaky_relu(hidden_states, negative_slope=.1, inplace=True)
+             hidden_states = self.upsampler[i](hidden_states)
+             res_state = self.resblocks[i * self.num_kernels](hidden_states)
+             for j in range(1, self.num_kernels):
+                 res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
+             hidden_states = res_state / self.num_kernels
+         hidden_states = F.leaky_relu(hidden_states, negative_slope=.01, inplace=True)
+         hidden_states = self.conv_post(hidden_states)
+         waveform = torch.tanh(hidden_states)
+         return waveform
+
+
+ class VitsResidualCouplingLayer(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.half_channels = config.flow_size // 2
+         self.conv_pre = nn.Conv1d(self.half_channels, config.hidden_size, 1)
+         self.wavenet = VitsWaveNet(config, num_layers=config.prior_encoder_num_wavenet_layers)
+         self.conv_post = nn.Conv1d(config.hidden_size, self.half_channels, 1)
+
+     def forward(self, x, reverse=False):
+         # only the reverse (inference) direction of the coupling is implemented
+         first_half, second_half = torch.split(x, [self.half_channels] * 2, dim=1)
+         hidden_states = self.conv_pre(first_half)
+         hidden_states = self.wavenet(hidden_states)
+         mean = self.conv_post(hidden_states)
+         second_half = (second_half - mean)
+         outputs = torch.cat([first_half, second_half], dim=1)
+         return outputs
+
+
+ class VitsResidualCouplingBlock(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.flows = nn.ModuleList()
+         for _ in range(config.prior_encoder_num_flows):
+             self.flows.append(VitsResidualCouplingLayer(config))
+
+     def forward(self, x, reverse=False):
+         # x e.g. [1, 192, 481]
+         for flow in reversed(self.flows):
+             x = torch.flip(x, [1])  # flip the channel dimension
+             x = flow(x, reverse=True)
+         return x
+
+
+ class VitsAttention(nn.Module):
+     """has no positional info"""
+
+     def __init__(self, config):
+         super().__init__()
+         self.embed_dim = config.hidden_size
+         self.num_heads = config.num_attention_heads
+         self.head_dim = self.embed_dim // self.num_heads
+         self.scaling = self.head_dim**-0.5
+         self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
+         self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
+         self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
+         self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
+
+     def _shape(self, tensor, seq_len, bsz):
+         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+     def forward(
+         self,
+         hidden_states,
+         layer_head_mask=None,
+         output_attentions=False,
+     ):
+         bsz, tgt_len, _ = hidden_states.size()
+
+         # Q
+         query_states = self.q_proj(hidden_states) * self.scaling
+
+         # K/V
+         hidden_states = hidden_states[:, :40, :]  # drop time-frames from k/v [bs*2, time, 96=ch]
+         key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+         value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+         proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+         query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+         key_states = key_states.view(*proj_shape)
+         value_states = value_states.view(*proj_shape)
+
+         attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+         attn_output = torch.bmm(attn_weights, value_states)
+         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+         attn_output = attn_output.transpose(1, 2)
+
+         # Use the `embed_dim` from the config (stored in the class) rather than `hidden_states`,
+         # because `attn_output` can be partitioned across GPUs when using tensor parallelism.
+         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+         attn_output = self.out_proj(attn_output)
+
+         return attn_output
+
+
+ class VitsFeedForward(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.conv_1 = nn.Conv1d(config.hidden_size, config.ffn_dim, config.ffn_kernel_size, padding=1)
+         self.conv_2 = nn.Conv1d(config.ffn_dim, config.hidden_size, config.ffn_kernel_size, padding=1)
+
+     def forward(self, hidden_states):
+         hidden_states = hidden_states.permute(0, 2, 1)
+         hidden_states = F.relu(self.conv_1(hidden_states))  # an in-place ReLU changes the sound
+         hidden_states = self.conv_2(hidden_states)
+         hidden_states = hidden_states.permute(0, 2, 1)
+         return hidden_states
+
+
+ class VitsEncoderLayer(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.attention = VitsAttention(config)
+         self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
+         self.feed_forward = VitsFeedForward(config)
+         self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
+
+     def forward(
+         self,
+         hidden_states,
+         output_attentions=False,
+     ):
+         residual = hidden_states
+         hidden_states = self.attention(
+             hidden_states=hidden_states,
+             output_attentions=output_attentions,
+         )
+
+         hidden_states = self.layer_norm(residual + hidden_states)
+
+         residual = hidden_states
+         hidden_states = self.feed_forward(hidden_states)
+
+         hidden_states = self.final_layer_norm(residual + hidden_states)
+
+         outputs = (hidden_states,)
+
+         return outputs
+
+
+ class VitsEncoder(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.layers = nn.ModuleList([VitsEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+
+     def forward(self, hidden_states):
+         for _layer in self.layers:
+             layer_outputs = _layer(hidden_states)
+             hidden_states = layer_outputs[0]
+         return hidden_states
+
+
+ class VitsTextEncoder(nn.Module):
+     """
+     Wraps a VitsEncoder.
+     """
+
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+         self.encoder = VitsEncoder(config)  # 6 layers of VitsAttention
+         self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1)
+
+     def forward(self, input_ids):
+         hidden_states = self.embed_tokens(input_ids) * 4  # actually 4 (or 4.856406460551018 at 845 input ids, deu)
+         stats = self.project(self.encoder(hidden_states=hidden_states).transpose(1, 2)).transpose(1, 2)
+         return stats[:, :, :self.config.flow_size]  # prior_means
+
+
+ class VitsPreTrainedModel(PreTrainedModel):
+     config_class = VitsConfig
+     base_model_prefix = "vits"
+     main_input_name = "input_ids"
+     supports_gradient_checkpointing = True
+
+
+ class VitsModel(VitsPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.config = config
+         self.text_encoder = VitsTextEncoder(config)  # holds a VitsEncoder with 6 layers of VitsAttention
+         self.flow = VitsResidualCouplingBlock(config)
+         self.decoder = VitsHifiGan(config)
+
+     def forward(
+         self,
+         input_ids=None,
+         attention_mask=None,
+         speaker_id=None,
+         output_attentions=None,
+         output_hidden_states=None,
+         return_dict=None,
+         labels=None,
+         speed=None,
+         lang_code='deu',  # selects the speed-oscillation pattern per voice/lang
+     ):
+         mask_dtype = self.text_encoder.embed_tokens.weight.dtype
+         if attention_mask is not None:
+             input_padding_mask = attention_mask.unsqueeze(-1).to(mask_dtype)
+         else:
+             raise ValueError("attention_mask is required")
+         prior_means = self.text_encoder(input_ids=input_ids)
+
+         input_padding_mask = input_padding_mask.transpose(1, 2)
+
+         bs, in_len, _ = prior_means.shape
+         # VITS duration oscillation: a fixed per-language pattern instead of a duration predictor
+         pattern = OSCILLATION.get(lang_code, [1, 2, 1])
+
+         duration = torch.tensor(pattern,
+                                 device=prior_means.device).repeat(int(in_len / len(pattern)) + 2)[None, None, :in_len]  # perhaps define [1, 2, 1] per voice or language
+         duration[:, :, 0] = 4
+         duration[:, :, -1] = 3
+         # ATTN
+         predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()
+         indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
+         output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
+         output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
+         attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
+         batch_size, _, output_length, input_length = attn_mask.shape
+         cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
+         indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
+         valid_indices = indices.unsqueeze(0) < cum_duration
+         valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
+         padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
+         attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
+         attn = attn[:, 0, :, :]
+
+         attn = attn + 1e-4 * torch.rand_like(attn)
+         attn /= attn.sum(2, keepdims=True)
+         # each prior mean is now replicated x its duration;
+         # attn could hold .5/.5 instead of 1/0 to smoothly interpolate repeated prior_means
+         prior_means = torch.matmul(attn, prior_means)
+
+         # prior_means = F.interpolate(prior_means.transpose(1, 2), int(1.74 * prior_means.shape[1]), mode='linear').transpose(1, 2)  # extend for slow speed
+
+         latents = self.flow(prior_means.transpose(1, 2),  # + torch.randn_like(prior_means) * .94,
+                             reverse=True)
+
+         waveform = self.decoder(latents)  # [bs, 1, 16000]
+
+         return waveform[:, 0, :]
+
+
+ class VitsTokenizer(PreTrainedTokenizer):
+     vocab_files_names = {"vocab_file": "vocab.json"}
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         vocab_file,
+         pad_token="<pad>",
+         unk_token="<unk>",
+         language=None,
+         add_blank=True,
+         normalize=True,
+         phonemize=True,
+         is_uroman=False,
+         **kwargs,
+     ):
+         with open(vocab_file, encoding="utf-8") as vocab_handle:
+             self.encoder = json.load(vocab_handle)
+
+         self.decoder = {v: k for k, v in self.encoder.items()}
+         self.language = language
+         self.add_blank = add_blank
+         self.normalize = normalize
+         self.phonemize = phonemize
+
+         self.is_uroman = is_uroman
+
+         super().__init__(
+             pad_token=pad_token,
+             unk_token=unk_token,
+             language=language,
+             add_blank=add_blank,
+             normalize=normalize,
+             phonemize=phonemize,
+             is_uroman=is_uroman,
+             **kwargs,
+         )
+
+     @property
+     def vocab_size(self):
+         return len(self.encoder)
+
+     def get_vocab(self):
+         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+     def normalize_text(self, input_string):
+         """Lowercase the input string, respecting any special token ids that may be partly or entirely upper-cased."""
+         all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
+         filtered_text = ""
+
+         i = 0
+         while i < len(input_string):
+             found_match = False
+             for word in all_vocabulary:
+                 if input_string[i: i + len(word)] == word:
+                     filtered_text += word
+                     i += len(word)
+                     found_match = True
+                     break
+
+             if not found_match:
+                 filtered_text += input_string[i].lower()
+                 i += 1
+
+         return filtered_text
+
+     def _preprocess_char(self, text):
+         """Special treatment of characters in certain languages"""
+         if self.language == "ron":
+             text = text.replace("ț", "ţ")
+         return text
+
+     def prepare_for_tokenization(
+             self, text: str, is_split_into_words: bool = False, normalize=None, **kwargs):
+
+         normalize = normalize if normalize is not None else self.normalize
+
+         if normalize:
+             # normalise for casing
+             text = self.normalize_text(text)
+
+         filtered_text = self._preprocess_char(text)
+
+         if has_non_roman_characters(filtered_text) and self.is_uroman:
+             # 7 langs - for now all characters are romanized in app.py
+             raise ValueError
+
+         if self.phonemize:
+             if not is_phonemizer_available():
+                 raise ImportError("Please install the `phonemizer` Python package to use this tokenizer.")
+
+             filtered_text = phonemizer.phonemize(
+                 filtered_text,
+                 language="en-us",
+                 backend="espeak",
+                 strip=True,
+                 preserve_punctuation=True,
+                 with_stress=True,
+             )
+             filtered_text = re.sub(r"\s+", " ", filtered_text)
+         elif normalize:
+             # strip any chars outside of the vocab (punctuation)
+             filtered_text = "".join(list(filter(lambda char: char in self.encoder, filtered_text))).strip()
+
+         return filtered_text, kwargs
+
+     def _tokenize(self, text):
+         """Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters."""
+         tokens = list(text)
+
+         if self.add_blank:
+             # sounds dyslexic with no space between letters,
+             # disconnected with more than 2 spaces between letters
+             interspersed = [self._convert_id_to_token(0)] * (len(tokens) * 2)  # len * 2 + 1 raises a slice-index error if tokens is odd
+             interspersed[::2] = tokens
+             tokens = interspersed + [self._convert_id_to_token(0)]  # append one last blank
+
+         return tokens
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) into an id using the vocab."""
+         return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) into a token (str) using the vocab."""
+         return self.decoder.get(index)
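
VitsModel.forward replaces the usual learned duration predictor with the fixed per-language OSCILLATION pattern: it is tiled to the input length, and the first and last tokens are held longer (4 and 3 frames) to pad the utterance boundaries. A small sketch of just that tiling step, mirroring the lines above with standalone numbers:

    import torch

    pattern = [2, 2, 1, 2, 2]  # OSCILLATION['rmc-script_latin']
    in_len = 8
    duration = torch.tensor(pattern).repeat(in_len // len(pattern) + 2)[None, None, :in_len]
    duration[:, :, 0] = 4   # hold the first token longer
    duration[:, :, -1] = 3  # and the last one
    print(duration)             # tensor([[[4, 2, 1, 2, 2, 2, 2, 3]]])
    print(int(duration.sum()))  # 18 output frames feed the flow and the HiFi-GAN decoder
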
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ omegaconf
+ nltk
+ librosa
+ phonemizer
+ audiofile
+ num2words
+ numpy<2.0.0
+ gradio==5.27.0
+ Numbers2Words-Greek
+ einops
+ torch
+ pydantic==2.10.6
+ transformers==4.49.0
+ sentencepiece
textual.py ADDED
@@ -0,0 +1,536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import unicodedata
3
+ from num2words import num2words
4
+ from num2word_greek.numbers2words import convert_numbers
5
+
6
+ def only_greek_or_only_latin(text, lang='grc'):
7
+ '''
8
+ str: The converted string in the specified target script.
9
+ Characters not found in any mapping are preserved as is.
10
+ Latin accented characters in the input (e.g., 'É', 'ü') will
11
+ be preserved in their lowercase form (e.g., 'é', 'ü') if
12
+ converting to Latin.
13
+ '''
14
+
15
+ # --- Mapping Dictionaries ---
16
+ # Keys are in lowercase as input text is case-folded.
17
+ # If the output needs to maintain original casing, additional logic is required.
18
+
19
+ latin_to_greek_map = {
20
+ 'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
21
+ 'ch': 'τσο', # Example of a multi-character Latin sequence
22
+ 'z': 'ζ', 'h': 'χ', 'i': 'ι', 'k': 'κ', 'l': 'λ',
23
+ 'm': 'μ', 'n': 'ν', 'x': 'ξ', 'o': 'ο', 'p': 'π',
24
+ 'v': 'β', 'sc': 'σκ', 'r': 'ρ', 's': 'σ', 't': 'τ',
25
+ 'u': 'ου', 'f': 'φ', 'c': 'σ', 'w': 'β', 'y': 'γ',
26
+ }
27
+
28
+ greek_to_latin_map = {
29
+ 'ου': 'ou', # Prioritize common diphthongs/digraphs
30
+ 'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e',
31
+ 'ζ': 'z', 'η': 'i', 'θ': 'th', 'ι': 'i', 'κ': 'k',
32
+ 'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o',
33
+ 'π': 'p', 'ρ': 'r', 'σ': 's', 'τ': 't', 'υ': 'y', # 'y' is a common transliteration for upsilon
34
+ 'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
35
+ 'ς': 's', # Final sigma
36
+ }
37
+
38
+ cyrillic_to_latin_map = {
39
+ 'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'yo', 'ж': 'zh',
40
+ 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o',
41
+ 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
42
+ 'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu',
43
+ 'я': 'ya',
44
+ }
45
+
46
+ # Direct Cyrillic to Greek mapping based on phonetic similarity.
47
+ # These are approximations and may not be universally accepted transliterations.
48
+ cyrillic_to_greek_map = {
49
+ 'а': 'α', 'б': 'β', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε', 'ё': 'ιο', 'ж': 'ζ',
50
+ 'з': 'ζ', 'и': 'ι', 'й': 'ι', 'κ': 'κ', 'λ': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο',
51
+ 'π': 'π', 'ρ': 'ρ', 'σ': 'σ', 'τ': 'τ', 'у': 'ου', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
52
+ 'ч': 'τσ', # or τζ depending on desired sound
53
+ 'ш': 'σ', 'щ': 'σ', # approximations
54
+ 'ъ': '', 'ы': 'ι', 'ь': '', 'э': 'ε', 'ю': 'ιου',
55
+ 'я': 'ια',
56
+ }
57
+
58
+ # Convert the input text to lowercase, preserving accents for Latin characters.
59
+ # casefold() is used for more robust caseless matching across Unicode characters.
60
+ lowercased_text = text.lower() #casefold()
61
+ output_chars = []
62
+ current_index = 0
63
+
64
+ if lang == 'grc':
65
+ # Combine all relevant maps for direct lookup to Greek
66
+ conversion_map = {**latin_to_greek_map, **cyrillic_to_greek_map}
67
+
68
+ # Sort keys by length in reverse order to handle multi-character sequences first
69
+ sorted_source_keys = sorted(
70
+ list(latin_to_greek_map.keys()) + list(cyrillic_to_greek_map.keys()),
71
+ key=len,
72
+ reverse=True
73
+ )
74
+
75
+ while current_index < len(lowercased_text):
76
+ found_conversion = False
77
+ for key in sorted_source_keys:
78
+ if lowercased_text.startswith(key, current_index):
79
+ output_chars.append(conversion_map[key])
80
+ current_index += len(key)
81
+ found_conversion = True
82
+ break
83
+ if not found_conversion:
84
+ # If no specific mapping found, append the character as is.
85
+ # This handles unmapped characters and already Greek characters.
86
+ output_chars.append(lowercased_text[current_index])
87
+ current_index += 1
88
+ return ''.join(output_chars)
89
+
90
+ else: # Default to 'lat' conversion
91
+ # Combine Greek to Latin and Cyrillic to Latin maps.
92
+ # Cyrillic map keys will take precedence in case of overlap if defined after Greek.
93
+ combined_to_latin_map = {**greek_to_latin_map, **cyrillic_to_latin_map}
94
+
95
+ # Sort all relevant source keys by length in reverse for replacement
96
+ sorted_source_keys = sorted(
97
+ list(greek_to_latin_map.keys()) + list(cyrillic_to_latin_map.keys()),
98
+ key=len,
99
+ reverse=True
100
+ )
101
+
102
+ while current_index < len(lowercased_text):
103
+ found_conversion = False
104
+ for key in sorted_source_keys:
105
+ if lowercased_text.startswith(key, current_index):
106
+ latin_equivalent = combined_to_latin_map[key]
107
+
108
+ # Strip accents ONLY if the source character was from the Greek map.
109
+ # This preserves accents on original Latin characters (like 'é')
110
+ # and allows for intentional accent stripping from Greek transliterations.
111
+ if key in greek_to_latin_map:
112
+ normalized_latin = unicodedata.normalize('NFD', latin_equivalent)
113
+ stripped_latin = ''.join(c for c in normalized_latin if not unicodedata.combining(c))
114
+ output_chars.append(stripped_latin)
115
+ else:
116
+ output_chars.append(latin_equivalent)
117
+
118
+ current_index += len(key)
119
+ found_conversion = True
120
+ break
121
+
122
+ if not found_conversion:
123
+ # If no conversion happened from Greek or Cyrillic, append the character as is.
124
+ # This preserves existing Latin characters (including accented ones from input),
125
+ # numbers, punctuation, and other symbols.
126
+ output_chars.append(lowercased_text[current_index])
127
+ current_index += 1
128
+
129
+ return ''.join(output_chars)
130
+
131
+
132
+ # =====================================================
133
+ #
134
+
135
+ def fix_vocals(text, lang='ron'):
136
+
137
+ # Longer phrases should come before shorter ones to prevent partial matches.
138
+
139
+ ron_replacements = {
140
+ 'ţ': 'ț',
141
+ 'ț': 'ts',
142
+ 'î': 'u',
143
+ 'â': 'a',
144
+ 'ş': 's',
145
+ 'w': 'oui',
146
+ 'k': 'c',
147
+ 'l': 'll',
148
+ # Math symbols
149
+ 'sqrt': ' rădăcina pătrată din ',
150
+ '^': ' la puterea ',
151
+ '+': ' plus ',
152
+ ' - ': ' minus ', # only replace if standalone so to not say minus if is a-b-c
153
+ '*': ' ori ', # times
154
+ '/': ' împărțit la ', # divided by
155
+ '=': ' egal cu ', # equals
156
+ 'pi': ' pi ',
157
+ '<': ' mai mic decât ',
158
+ '>': ' mai mare decât',
159
+ '%': ' la sută ', # percent (from previous)
160
+ '(': ' paranteză deschisă ',
161
+ ')': ' paranteză închisă ',
162
+ '[': ' paranteză pătrată deschisă ',
163
+ ']': ' paranteză pătrată închisă ',
164
+ '{': ' acoladă deschisă ',
165
+ '}': ' acoladă închisă ',
166
+ '≠': ' nu este egal cu ',
167
+ '≤': ' mai mic sau egal cu ',
168
+ '≥': ' mai mare sau egal cu ',
169
+ '≈': ' aproximativ ',
170
+ '∞': ' infinit ',
171
+ '€': ' euro ',
172
+ '$': ' dolar ',
173
+ '£': ' liră ',
174
+ '&': ' și ', # and
175
+ '@': ' la ', # at
176
+ '#': ' diez ', # hash
177
+ '∑': ' sumă ',
178
+ '∫': ' integrală ',
179
+ '√': ' rădăcina pătrată a ', # more generic square root
180
+ }
181
+
182
+ eng_replacements = {
183
+ 'wik': 'weaky',
184
+ 'sh': 'ss',
185
+ 'ch': 'ttss',
186
+ 'oo': 'oeo',
187
+ # Math symbols for English
188
+ 'sqrt': ' square root of ',
189
+ '^': ' to the power of ',
190
+ '+': ' plus ',
191
+ ' - ': ' minus ',
192
+ '*': ' times ',
193
+ ' / ': ' divided by ',
194
+ '=': ' equals ',
195
+ 'pi': ' pi ',
196
+ '<': ' less than ',
197
+ '>': ' greater than ',
198
+ # Additional common math symbols from previous list
199
+ '%': ' percent ',
200
+ '(': ' open parenthesis ',
201
+ ')': ' close parenthesis ',
202
+ '[': ' open bracket ',
203
+ ']': ' close bracket ',
204
+ '{': ' open curly brace ',
205
+ '}': ' close curly brace ',
206
+ '∑': ' sum ',
207
+ '∫': ' integral ',
208
+ '√': ' square root of ',
209
+ '≠': ' not equals ',
210
+ '≤': ' less than or equals ',
211
+ '≥': ' greater than or equals ',
212
+ '≈': ' approximately ',
213
+ '∞': ' infinity ',
214
+ '€': ' euro ',
215
+ '$': ' dollar ',
216
+ '£': ' pound ',
217
+ '&': ' and ',
218
+ '@': ' at ',
219
+ '#': ' hash ',
220
+ }
221
+
222
+ serbian_replacements = {
223
+ 'rn': 'rrn',
224
+ 'ć': 'č',
225
+ 'c': 'č',
226
+ 'đ': 'd',
227
+ 'j': 'i',
228
+ 'l': 'lll',
229
+ 'w': 'v',
230
+ # https://huggingface.co/facebook/mms-tts-rmc-script_latin
231
+ 'sqrt': 'kvadratni koren iz',
232
+ '^': ' na stepen ',
233
+ '+': ' plus ',
234
+ ' - ': ' minus ',
235
+ '*': ' puta ',
236
+ ' / ': ' podeljeno sa ',
237
+ '=': ' jednako ',
238
+ 'pi': ' pi ',
239
+ '<': ' manje od ',
240
+ '>': ' veće od ',
241
+ '%': ' procenat ',
242
+ '(': ' otvorena zagrada ',
243
+ ')': ' zatvorena zagrada ',
244
+ '[': ' otvorena uglasta zagrada ',
245
+ ']': ' zatvorena uglasta zagrada ',
246
+ '{': ' otvorena vitičasta zagrada ',
247
+ '}': ' zatvorena vitičasta zagrada ',
248
+ '∑': ' suma ',
249
+ '∫': ' integral ',
250
+ '√': ' kvadratni koren ',
251
+ '≠': ' nije jednako ',
252
+ '≤': ' manje ili jednako od ',
253
+ '≥': ' veće ili jednako od ',
254
+ '≈': ' približno ',
255
+ '∞': ' beskonačnost ',
256
+ '€': ' evro ',
257
+ '$': ' dolar ',
258
+ '£': ' funta ',
259
+ '&': ' i ',
260
+ '@': ' et ',
261
+ '#': ' taraba ',
262
+ # Others
263
+ # 'rn': 'rrn',
264
+ # 'ć': 'č',
265
+ # 'c': 'č',
266
+ # 'đ': 'd',
267
+ # 'l': 'le',
268
+ # 'ij': 'i',
269
+ # 'ji': 'i',
270
+ # 'j': 'i',
271
+ # 'služ': 'sloooozz', # 'službeno'
272
+ # 'suver': 'siuveeerra', # 'suverena'
273
+ # 'država': 'dirrezav', # 'država'
274
+ # 'iči': 'ici', # 'Graniči'
275
+ # 's ': 'se', # a s with space
276
+ # 'q': 'ku',
277
+ # 'w': 'aou',
278
+ # 'z': 's',
279
+ # "š": "s",
280
+ # 'th': 'ta',
281
+ # 'v': 'vv',
282
+ # "ć": "č",
283
+ # "đ": "ď",
284
+ # "lj": "ľ",
285
+ # "nj": "ň",
286
+ # "ž": "z",
287
+ # "c": "č"
288
+ }
289
+
290
+ deu_replacements = {
291
+ 'sch': 'sh',
292
+ 'ch': 'kh',
293
+ 'ie': 'ee',
294
+ 'ei': 'ai',
295
+ 'ä': 'ae',
296
+ 'ö': 'oe',
297
+ 'ü': 'ue',
298
+ 'ß': 'ss',
299
+ # Math symbols for German
300
+ 'sqrt': ' Quadratwurzel aus ',
301
+ '^': ' hoch ',
302
+ '+': ' plus ',
303
+ ' - ': ' minus ',
304
+ '*': ' mal ',
305
+ ' / ': ' geteilt durch ',
306
+ '=': ' gleich ',
307
+ 'pi': ' pi ',
308
+ '<': ' kleiner als ',
309
+ '>': ' größer als',
310
+ # Additional common math symbols from previous list
311
+ '%': ' prozent ',
312
+        '(': ' Klammer auf ',
+        ')': ' Klammer zu ',
+        '[': ' eckige Klammer auf ',
+        ']': ' eckige Klammer zu ',
+        '{': ' geschweifte Klammer auf ',
+        '}': ' geschweifte Klammer zu ',
+        '∑': ' Summe ',
+        '∫': ' Integral ',
+        '√': ' Quadratwurzel ',
+        '≠': ' ungleich ',
+        '≤': ' kleiner oder gleich ',
+        '≥': ' größer oder gleich ',
+        '≈': ' ungefähr ',
+        '∞': ' unendlich ',
+        '€': ' euro ',
+        '$': ' dollar ',
+        '£': ' pfund ',
+        '&': ' und ',
+        '@': ' at ',  # 'Klammeraffe' is also common, but 'at' is simpler
+        '#': ' raute ',
+    }
+
+    fra_replacements = {
+        # French-specific phonetic replacements (add as needed),
+        # e.g. 'ç': 's', 'é': 'e', etc.
+        'w': 'v',
+        # Math symbols for French
+        'sqrt': ' racine carrée de ',
+        '^': ' à la puissance ',
+        '+': ' plus ',
+        ' - ': ' moins ',  # 'tiret'
+        '*': ' fois ',
+        ' / ': ' divisé par ',
+        '=': ' égale ',
+        'pi': ' pi ',
+        '<': ' inférieur à ',
+        '>': ' supérieur à ',
+        # Add more common math symbols as needed for French
+        '%': ' pour cent ',
+        '(': ' parenthèse ouverte ',
+        ')': ' parenthèse fermée ',
+        '[': ' crochet ouvert ',
+        ']': ' crochet fermé ',
+        '{': ' accolade ouverte ',
+        '}': ' accolade fermée ',
+        '∑': ' somme ',
+        '∫': ' intégrale ',
+        '√': ' racine carrée ',
+        '≠': " n'égale pas ",
+        '≤': ' inférieur ou égal à ',
+        '≥': ' supérieur ou égal à ',
+        '≈': ' approximativement ',
+        '∞': ' infini ',
+        '€': ' euro ',
+        '$': ' dollar ',
+        '£': ' livre ',
+        '&': ' et ',
+        '@': ' arobase ',
+        '#': ' dièse ',
+    }
+
+    hun_replacements = {
+        # Hungarian-specific phonetic replacements (add as needed),
+        # e.g. 'á': 'a', 'é': 'e', etc.
+        'ch': 'ts',
+        'cs': 'tz',
+        'g': 'gk',
+        'w': 'v',
+        'z': 'zz',
+        # Math symbols for Hungarian
+        'sqrt': ' négyzetgyök ',
+        '^': ' hatvány ',
+        '+': ' plusz ',
+        ' - ': ' mínusz ',
+        '*': ' szorozva ',
+        ' / ': ' osztva ',
+        '=': ' egyenlő ',
+        'pi': ' pi ',
+        '<': ' kisebb mint ',
+        '>': ' nagyobb mint ',
+        # Add more common math symbols as needed for Hungarian
+        '%': ' százalék ',
+        '(': ' nyitó zárójel ',
+        ')': ' záró zárójel ',
+        '[': ' nyitó szögletes zárójel ',
+        ']': ' záró szögletes zárójel ',
+        '{': ' nyitó kapcsos zárójel ',
+        '}': ' záró kapcsos zárójel ',
+        '∑': ' szumma ',
+        '∫': ' integrál ',
+        '√': ' négyzetgyök ',
+        '≠': ' nem egyenlő ',
+        '≤': ' kisebb vagy egyenlő ',
+        '≥': ' nagyobb vagy egyenlő ',
+        '≈': ' körülbelül ',
+        '∞': ' végtelen ',
+        '€': ' euró ',
+        '$': ' dollár ',
+        '£': ' font ',
+        '&': ' és ',
+        '@': ' kukac ',
+        '#': ' kettőskereszt ',
+    }
+
+    grc_replacements = {
+        # Ancient Greek specific phonetic replacements (add as needed).
+        # These are more about transliterating Greek letters if they are in the input text.
+        # Math symbols for Ancient Greek (literal translations)
+        'sqrt': ' τετραγωνικὴ ῥίζα ',
+        '^': ' εἰς τὴν δύναμιν ',
+        '+': ' σὺν ',
+        ' - ': ' χωρὶς ',
+        '*': ' πολλάκις ',
+        ' / ': ' διαιρέω ',
+        '=': ' ἴσον ',
+        'pi': ' πῖ ',
+        '<': ' ἔλαττον ',
+        '>': ' μεῖζον ',
+        # Add more common math symbols as needed for Ancient Greek
+        '%': ' τοῖς ἑκατόν ',  # tois hekaton - 'of the hundred'
+        '(': ' ἀνοικτὴ παρένθεσις ',
+        ')': ' κλειστὴ παρένθεσις ',
+        '[': ' ἀνοικτὴ ἀγκύλη ',
+        ']': ' κλειστὴ ἀγκύλη ',
+        '{': ' ἀνοικτὴ σγουρὴ ἀγκύλη ',
+        '}': ' κλειστὴ σγουρὴ ἀγκύλη ',
+        '∑': ' ἄθροισμα ',
+        '∫': ' ὁλοκλήρωμα ',
+        '√': ' τετραγωνικὴ ῥίζα ',
+        '≠': ' οὐκ ἴσον ',
+        '≤': ' ἔλαττον ἢ ἴσον ',
+        '≥': ' μεῖζον ἢ ἴσον ',
+        '≈': ' περίπου ',
+        '∞': ' ἄπειρον ',
+        '€': ' εὐρώ ',
+        '$': ' δολάριον ',
+        '£': ' λίρα ',
+        '&': ' καὶ ',
+        '@': ' ἀτ ',  # at
+        '#': ' δίεση ',  # hash
+    }
+
+
+    # Select the appropriate replacement dictionary based on the language
+    replacements_map = {
+        'grc': grc_replacements,
+        'ron': ron_replacements,
+        'eng': eng_replacements,
+        'deu': deu_replacements,
+        'fra': fra_replacements,
+        'hun': hun_replacements,
+        'rmc-script_latin': serbian_replacements,
+    }
+
+    current_replacements = replacements_map.get(lang)
+    if current_replacements:
+        # Sort replacements by key length in descending order. This is crucial for
+        # replacing multi-character strings (like 'sqrt' or 'sch') before their
+        # shorter substrings ('s', 'ch', 'q', 'r', 't') - see the sketch below.
+        sorted_replacements = sorted(current_replacements.items(), key=lambda item: len(item[0]), reverse=True)
+        for old, new in sorted_replacements:
+            text = text.replace(old, new)
+        return text
+    else:
+        # If the language is not supported, return the original text
+        print(f"Warning: Language '{lang}' not supported for text replacement. Returning original text.")
+        return text
+
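A minimal sketch of why the longest-first ordering matters, using an illustrative two-entry table (not part of the committed code):

demo = {'sqrt': ' Quadratwurzel ', 's': 'ss'}
text = 'sqrt 4'
for old, new in sorted(demo.items(), key=lambda kv: len(kv[0]), reverse=True):
    text = text.replace(old, new)
print(text)  # ' Quadratwurzel  4'; replacing 's' first would turn 'sqrt' into 'ssqrt' and leave a stray 's'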
+def _num2words(text='01234', lang=None):
+    if lang == 'grc':
+        return convert_numbers(text)
+    return num2words(text, lang=lang)  # HAS TO BE kwarg lang=lang
+
+
+def transliterate_number(number_string,
+                         lang=None):
+    if lang == 'rmc-script_latin':
+        lang = 'sr'
+        exponential_pronoun = ' puta deset na stepen od '
+        comma = ' tačka '
+    elif lang == 'ron':
+        lang = 'ro'
+        exponential_pronoun = ' ori zece la puterea '
+        comma = ' virgulă '
+    elif lang == 'hun':
+        lang = 'hu'
+        exponential_pronoun = ' tízszer a erejéig '
+        comma = ' virgula '
+    elif lang == 'deu':
+        lang = 'de'  # num2words expects the ISO 639-1 code
+        exponential_pronoun = ' mal zehn hoch '
+        comma = ' komma '
+    elif lang == 'fra':
+        lang = 'fr'
+        exponential_pronoun = ' puissance '
+        comma = ' virgule '
+    elif lang == 'grc':
+        exponential_pronoun = ' εις την δυναμην του '
+        comma = ' κόμμα '
+    else:
+        lang = lang[:2]
+        exponential_pronoun = ' times ten to the power of '
+        comma = ' point '
+
+    def replace_number(match):
+        prefix = match.group(1) or ""
+        number_part = match.group(2)
+        suffix = match.group(5) or ""
+
+        try:
+            if 'e' in number_part.lower():
+                base, exponent = number_part.lower().split('e')
+                words = _num2words(base, lang=lang) + exponential_pronoun + _num2words(exponent, lang=lang)
+            elif '.' in number_part:
+                integer_part, decimal_part = number_part.split('.')
+                words = _num2words(integer_part, lang=lang) + comma + " ".join(
+                    [_num2words(digit, lang=lang) for digit in decimal_part])
+            else:
+                words = _num2words(number_part, lang=lang)
+            return prefix + words + suffix
+        except ValueError:
+            return match.group(0)  # Return original if conversion fails
+
+    pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
+    return re.sub(pattern, replace_number, number_string)
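A quick behavioural check (illustrative; the exact wording comes from whichever num2words version is installed):

print(transliterate_number('seite 3.5', lang='deu'))
# seite drei komma fünf
print(transliterate_number('room 3.5e2', lang='eng'))
# room three point five times ten to the power of two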
tts.py ADDED
@@ -0,0 +1,847 @@
+import torch
+import nltk
+nltk.download('punkt', download_dir='./')      # COMMENT IF DOWNLOADED
+nltk.download('punkt_tab', download_dir='./')  # COMMENT IF DOWNLOADED
+nltk.data.path.append('.')
+import librosa
+import audiofile
+import torch.nn.functional as F
+import math
+import numpy as np
+import torch.nn as nn
+import string
+import textwrap
+import phonemizer
+from espeak_util import set_espeak_library
+from transformers import AlbertConfig, AlbertModel
+from huggingface_hub import hf_hub_download
+from nltk.tokenize import word_tokenize
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils.parametrizations import weight_norm
+from torch.nn.utils import spectral_norm
+
+_pad = "$"
+_punctuation = ';:,.!?¡¿—…"«»“” '
+_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+MAX_PHONEMES = 424  # to avoid OOM: max length of a single (non-split) sentence for StyleTTS2 inference
+
+symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+
+dicts = {}
+for i in range(len(symbols)):
+    dicts[symbols[i]] = i
+
+
+class TextCleaner:
+    def __init__(self, dummy=None):
+        self.word_index_dictionary = dicts
+        print(len(dicts))
+
+    def __call__(self, text):
+        indexes = []
+        for char in text:
+            try:
+                indexes.append(self.word_index_dictionary[char])
+            except KeyError:
+                # char is not in `symbols` (control chars, digits, most ASCII punctuation) - drop it
+                # print(f'NonVOCAL {char}', end='\r')
+                pass
+        return indexes
+
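A small illustration of the cleaner's behaviour (index values follow from the `symbols` list above; the constructor prints the dictionary size):

cleaner = TextCleaner()
print(cleaner('ha!'))   # [50, 43, 5]  ('$'=0, punctuation starts at 1, so '!'=5; 'a'=43, 'h'=50)
print(cleaner('h1a'))   # [50, 43]     the digit is silently dropped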
+set_espeak_library()
+
+textclenaer = TextCleaner()
+
+global_phonemizer = phonemizer.backend.EspeakBackend(language="en-us", preserve_punctuation=True, with_stress=True)
+
+
+def _del_prefix(d):
+    # drop the leading 'module.' prefix (7 chars) from checkpoint keys
+    out = {}
+    for k, v in d.items():
+        out[k[7:]] = v
+    return out
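For example (illustrative key):

print(_del_prefix({'module.fc.weight': 0}))   # {'fc.weight': 0}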
+
+
+
+
+class StyleTTS2(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        albert_base_configuration = AlbertConfig(vocab_size=178,
+                                                 hidden_size=768,
+                                                 num_attention_heads=12,
+                                                 intermediate_size=2048,
+                                                 max_position_embeddings=512,
+                                                 num_hidden_layers=12,
+                                                 dropout=0.1)
+        self.bert = AlbertModel(albert_base_configuration)
+        state_dict = torch.load(hf_hub_download(repo_id='dkounadis/artificial-styletts2',
+                                                filename='Utils/PLBERT/step_1000000.pth'),
+                                map_location='cpu')['net']
+        new_state_dict = {}
+        for k, v in state_dict.items():
+            name = k[7:]  # remove `module.`
+            if name.startswith('encoder.'):
+                name = name[8:]  # remove `encoder.`
+            new_state_dict[name] = v
+        del new_state_dict["embeddings.position_ids"]
+        self.bert.load_state_dict(new_state_dict, strict=True)
+        self.decoder = Decoder(dim_in=512,
+                               style_dim=128,
+                               dim_out=80,  # n_mels
+                               resblock_kernel_sizes=[3, 7, 11],
+                               upsample_rates=[10, 5, 3, 2],
+                               upsample_initial_channel=512,
+                               resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                               upsample_kernel_sizes=[20, 10, 6, 4])
+        self.text_encoder = TextEncoder(channels=512,
+                                        kernel_size=5,
+                                        depth=3,  # args['model_params']['n_layer'],
+                                        n_symbols=178,  # args['model_params']['n_token']
+                                        )
+        self.predictor = ProsodyPredictor(style_dim=128,
+                                          d_hid=512,
+                                          nlayers=3,  # OFFICIAL config.nlayers=5;
+                                          max_dur=50)
+        self.style_encoder = StyleEncoder()
+        self.predictor_encoder = StyleEncoder()
+        self.bert_encoder = torch.nn.Linear(self.bert.config.hidden_size, 512)
+        self.mel_spec = MelSpec()
+        params = torch.load(hf_hub_download(repo_id='yl4579/StyleTTS2-LibriTTS',
+                                            filename='Models/LibriTTS/epochs_2nd_00020.pth'),
+                            map_location='cpu')['net']
+        self.bert.load_state_dict(_del_prefix(params['bert']), strict=True)
+        self.bert_encoder.load_state_dict(_del_prefix(params['bert_encoder']), strict=True)
+        self.predictor.load_state_dict(_del_prefix(params['predictor']), strict=True)
+        self.decoder.load_state_dict(_del_prefix(params['decoder']), strict=True)
+        self.text_encoder.load_state_dict(_del_prefix(params['text_encoder']), strict=True)
+        self.predictor_encoder.load_state_dict(_del_prefix(params['predictor_encoder']), strict=True)
+        self.style_encoder.load_state_dict(_del_prefix(params['style_encoder']), strict=True)
+
+        # FOR LSTM: freeze all parameters - inference only
+        for n, p in self.named_parameters():
+            p.requires_grad = False
+        self.eval()
+
+
+    def device(self):
+        return self.style_encoder.unshared.weight.device
+
+    def compute_style(self, wav_file=None):
+
+        x, sr = librosa.load(wav_file, sr=24000)
+        x, _ = librosa.effects.trim(x, top_db=30)
+        if sr != 24000:
+            x = librosa.resample(x, orig_sr=sr, target_sr=24000)
+        # LOGMEL - MelSpec has a ~16 kHz default basis but is called on the 24 kHz .wav
+        x = torch.from_numpy(x[None, :]).to(device=self.device(),
+                                            dtype=torch.float)
+        mel_tensor = (torch.log(1e-5 + self.mel_spec(x)) + 4) / 4
+        # mel_tensor = preprocess(audio).to(device)
+        ref_s = self.style_encoder(mel_tensor)      # acoustic style
+        ref_p = self.predictor_encoder(mel_tensor)  # prosodic style  [bs, 1, 1, 128]
+        s = torch.cat([ref_s, ref_p], dim=3)        # [bs, 1, 1, 256]
+        s = s[:, :, 0, :].transpose(1, 2)
+        return s  # [1, 256, 1] (time axis > 1 if the mean in StyleEncoder.forward is removed)
+
+    def inference(self,
+                  text,
+                  ref_s=None):
+        '''text may become too long when phonemized'''
+
+        if isinstance(ref_s, str):
+            ref_s = self.compute_style(ref_s)
+        else:
+            pass  # assume ref_s is a precomputed style vector
+
+        # text = transliterate_number(text, lang='en').strip()
+        # As we are in English, transliteration is already done by the text cleaner?
+        # Somehow we have phonemes in text that get re-phonemized;
+        # the dataset text should be ASCII only.
+
+        if isinstance(text, str):
+
+            _translator = str.maketrans('', '', string.punctuation)
+
+            text = [sub_sent.translate(_translator) + '.' for sub_sent in textwrap.wrap(text, 74)]
+
+            # text = nltk.sent_tokenize(text)
+            # text = [i for sent in sentences for i in textwrap.wrap(sent, width=120)]
+            # text = textwrap.wrap(text, width=MAX_PHONEMES)  # phonemes, thus sent_tokenize() can't split them into sentences
+
+        device = ref_s.device
+        total = []
+        for _t in text:
+
+            _t = global_phonemizer.phonemize([_t])
+            _t = word_tokenize(_t[0])
+            _t = ' '.join(_t)
+
+            # Truncate phonemes if len(phonemes) > MAX_PHONEMES (OOM during training), then
+            # append '.' to assure proper sound termination (pulse issue); textclenaer('.;?!') = [4, 1, 6, 5]
+            tokens = textclenaer(_t)[:MAX_PHONEMES] + [4]
+            tokens.insert(0, 0)
+            tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
+            with torch.no_grad():
+                hidden_states = self.text_encoder(tokens)
+                bert_dur = self.bert(tokens, attention_mask=torch.ones_like(tokens)
+                                     ).last_hidden_state
+                d_en = self.bert_encoder(bert_dur).transpose(-1, -2)
+                aln_trg, F0_pred, N_pred = self.predictor(d_en=d_en, s=ref_s[:, 128:, :])
+                asr = torch.bmm(aln_trg, hidden_states)
+                asr = asr.transpose(1, 2)
+                asr_new = torch.zeros_like(asr)
+                asr_new[:, :, 0] = asr[:, :, 0]
+                asr_new[:, :, 1:] = asr[:, :, 0:-1]
+                asr = asr_new
+                x = self.decoder(asr=asr,
+                                 F0_curve=F0_pred,
+                                 N=N_pred,
+                                 s=ref_s[:, :128, :])  # different part of ref_s
+            if x.shape[2] < 100:
+                x = torch.zeros(1, 1, 1000, device=self.device())  # silence if this sentence was empty
+
+            # Normalise / crop the scratch at the end (the ending-scratch sound is not
+            # solved even with nltk sentence split & punctuation)
+            x = x[..., 40:-4000]
+            # x /= x.abs().max() + 1e-7  # preserve as torch
+            if x.shape[2] == 0:
+                # nothing to vocode
+                x = torch.zeros(1, 1, 1000, device=self.device())
+            total.append(x)
+
+        # --
+        total = 1.94 * torch.cat(total, 2)  # 1.94x - perhaps exceeding [-1, 1] affects MIMI encode
+        total /= 1.02 * total.abs().max() + 1e-7
+        # --
+        return total
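A minimal end-to-end sketch of how the two entry points combine, assuming the checkpoints download successfully (the reference wav is one of the files added in this commit):

model = StyleTTS2()
style = model.compute_style('wav/en_US_cmu_arctic_aew.wav')     # reference voice
wav = model.inference('Hello world, this is a test.', ref_s=style)
audiofile.write('out.wav', wav[0].cpu().numpy(), 24000)         # decoder output is 24 kHz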
+
+
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
+
+
+def _tile(x,
+          length=None):
+    x = x.repeat(1, 1, int(length / x.shape[2]) + 1)[:, :, :length]
+    return x
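For instance, tiling a 1-step style code along time (illustrative shapes):

s = torch.ones(1, 256, 1)
print(_tile(s, length=7).shape)   # torch.Size([1, 256, 7]) - the style is repeated to cover all frames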
+
+
+class AdaIN1d(nn.Module):
+
+    # used by HiFiGan & ProsodyPredictor
+
+    def __init__(self, style_dim, num_features):
+        super().__init__()
+        self.norm = nn.InstanceNorm1d(num_features, affine=False)
+        self.fc = nn.Linear(style_dim, num_features * 2)
+
+    def forward(self, x, s):
+
+        # x = torch.Size([1, 512, 248]) - same as output
+        # s = torch.Size([1, 7, 1, 128])
+
+        s = self.fc(s.transpose(1, 2)).transpose(1, 2)
+
+        s = _tile(s, length=x.shape[2])
+
+        gamma, beta = torch.chunk(s, chunks=2, dim=1)
+        return (1 + gamma) * self.norm(x) + beta
+
+
+class AdaINResBlock1(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
+        super(AdaINResBlock1, self).__init__()
+        self.convs1 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                               padding=get_padding(kernel_size, dilation[2])))
+        ])
+        # self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1)))
+        ])
+        # self.convs2.apply(init_weights)
+
+        self.adain1 = nn.ModuleList([
+            AdaIN1d(style_dim, channels),
+            AdaIN1d(style_dim, channels),
+            AdaIN1d(style_dim, channels),
+        ])
+
+        self.adain2 = nn.ModuleList([
+            AdaIN1d(style_dim, channels),
+            AdaIN1d(style_dim, channels),
+            AdaIN1d(style_dim, channels),
+        ])
+
+        self.alpha1 = nn.ParameterList(
+            [nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
+        self.alpha2 = nn.ParameterList(
+            [nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
+
+    def forward(self, x, s):
+        for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
+            xt = n1(x, s)  # THIS IS ADAIN - EXPECTS conv1d dims
+            xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2)  # Snake1D
+            xt = c1(xt)
+            xt = n2(xt, s)  # THIS IS ADAIN - EXPECTS conv1d dims
+            xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2)  # Snake1D
+            xt = c2(xt)
+            x = xt + x
+        return x
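The `Snake1D` lines implement the snake activation, x + (1/α)·sin²(α·x), a periodic activation proposed for modelling periodic signals such as waveforms; a quick standalone check:

alpha = torch.ones(1, 4, 1)
x = torch.randn(1, 4, 10)
snake = x + (1 / alpha) * torch.sin(alpha * x) ** 2
print(snake.shape)   # torch.Size([1, 4, 10]) - elementwise, shape preserved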
+
+
+class SourceModuleHnNSF(torch.nn.Module):
+
+    def __init__(self):
+
+        super().__init__()
+        self.harmonic_num = 8
+        self.l_linear = torch.nn.Linear(self.harmonic_num + 1, 1)
+        self.upsample_scale = 300
+
+    def forward(self, x):
+        # --
+        x = torch.multiply(x, torch.FloatTensor(
+            [[range(1, self.harmonic_num + 2)]]).to(x.device))  # [1, 145200, 9]
+
+        # modulo of negative f0 values => -21 % 10 = 9, since -3*10 + 9 = -21; NOTICE THAT f0_values IS SIGNED
+        rad_values = x / 25647  # ).clamp(0, 1)
+        # rad_values = torch.where(torch.logical_or(rad_values < 0, rad_values > 1), 0.5, rad_values)
+        rad_values = rad_values % 1  # % also wraps negative values
+        rad_values = F.interpolate(rad_values.transpose(1, 2),
+                                   scale_factor=1 / self.upsample_scale,
+                                   mode='linear').transpose(1, 2)
+
+        # 1.89 also sounds nice, has woofer at punctuation
+        phase = torch.cumsum(rad_values, dim=1) * 1.84 * np.pi
+        phase = F.interpolate(phase.transpose(1, 2) * self.upsample_scale,
+                              scale_factor=self.upsample_scale, mode='linear').transpose(1, 2)
+        x = .009 * phase.sin()
+        # --
+        x = self.l_linear(x).tanh()
+        return x
+
+
+class Generator(torch.nn.Module):
+    def __init__(self,
+                 style_dim,
+                 resblock_kernel_sizes,
+                 upsample_rates,
+                 upsample_initial_channel,
+                 resblock_dilation_sizes,
+                 upsample_kernel_sizes):
+        super(Generator, self).__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.m_source = SourceModuleHnNSF()
+        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
+        self.noise_convs = nn.ModuleList()
+        self.ups = nn.ModuleList()
+        self.noise_res = nn.ModuleList()
+
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            c_cur = upsample_initial_channel // (2 ** (i + 1))
+
+            self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel // (2 ** i),
+                                                        upsample_initial_channel // (2 ** (i + 1)),
+                                                        k, u, padding=(u // 2 + u % 2), output_padding=u % 2)))
+
+            if i + 1 < len(upsample_rates):
+                stride_f0 = np.prod(upsample_rates[i + 1:])
+                self.noise_convs.append(Conv1d(
+                    1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0 + 1) // 2))
+                self.noise_res.append(AdaINResBlock1(
+                    c_cur, 7, [1, 3, 5], style_dim))
+            else:
+                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
+                self.noise_res.append(AdaINResBlock1(
+                    c_cur, 11, [1, 3, 5], style_dim))
+
+        self.resblocks = nn.ModuleList()
+
+        self.alphas = nn.ParameterList()
+        self.alphas.append(nn.Parameter(
+            torch.ones(1, upsample_initial_channel, 1)))
+
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            self.alphas.append(nn.Parameter(torch.ones(1, ch, 1)))
+
+            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(AdaINResBlock1(ch, k, d, style_dim))
+
+        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+
+    def forward(self, x, s, f0):
+
+        # x.shape=[1, 512, 484]  s.shape=[1, 1, 1, 128]  f0.shape before upsampling is one value per mel frame
+        f0 = self.f0_upsamp(f0).transpose(1, 2)
+
+        # f0 enters the harmonic source already upsampled to the full 24 kHz wav length, e.g. [1, 145200, 1]
+        har_source = self.m_source(f0)
+
+        har_source = har_source.transpose(1, 2)
+
+        for i in range(self.num_upsamples):
+
+            x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2)
+            x_source = self.noise_convs[i](har_source)
+            x_source = self.noise_res[i](x_source, s)
+
+            x = self.ups[i](x)
+
+            x = x + x_source
+
+            xs = None
+            for j in range(self.num_kernels):
+
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x, s)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x, s)
+            x = xs / self.num_kernels
+        # x = x + (1 / self.alphas[i+1]) * (torch.sin(self.alphas[i+1] * x) ** 2)  # noisy
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+
+        return x
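Note how the rates line up across the model: the generator upsamples by the product of upsample_rates [10, 5, 3, 2], which equals the MelSpec hop_length of 300 samples, so each mel frame becomes 300 waveform samples:

import numpy as np
print(int(np.prod([10, 5, 3, 2])))   # 300 - waveform samples generated per mel frame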
+
+
+class AdainResBlk1d(nn.Module):
+
+    # also used in ProsodyPredictor()
+
+    def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
+                 upsample='none', dropout_p=0.0):
+        super().__init__()
+        self.actv = actv
+        self.upsample_type = upsample
+        self.upsample = UpSample1d(upsample)
+        self.learned_sc = dim_in != dim_out
+        self._build_weights(dim_in, dim_out, style_dim)
+        if upsample == 'none':
+            self.pool = nn.Identity()
+        else:
+            self.pool = weight_norm(nn.ConvTranspose1d(
+                dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
+
+    def _build_weights(self, dim_in, dim_out, style_dim):
+        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
+        self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
+        self.norm1 = AdaIN1d(style_dim, dim_in)
+        self.norm2 = AdaIN1d(style_dim, dim_out)
+        if self.learned_sc:
+            self.conv1x1 = weight_norm(
+                nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
+
+    def _shortcut(self, x):
+        x = self.upsample(x)
+        if self.learned_sc:
+            x = self.conv1x1(x)
+        return x
+
+    def _residual(self, x, s):
+        x = self.norm1(x, s)
+        x = self.actv(x)
+        x = self.pool(x)
+        x = self.conv1(x)
+        x = self.norm2(x, s)
+        x = self.actv(x)
+        x = self.conv2(x)
+        return x
+
+    def forward(self, x, s):
+        out = self._residual(x, s)
+        out = (out + self._shortcut(x)) / math.sqrt(2)
+        return out
+
+
+class UpSample1d(nn.Module):
+    def __init__(self, layer_type):
+        super().__init__()
+        self.layer_type = layer_type
+
+    def forward(self, x):
+        if self.layer_type == 'none':
+            return x
+        else:
+            return F.interpolate(x, scale_factor=2, mode='nearest-exact')
+
+
+class Decoder(nn.Module):
+    def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80,
+                 resblock_kernel_sizes=[3, 7, 11],
+                 upsample_rates=[10, 5, 3, 2],
+                 upsample_initial_channel=512,
+                 resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                 upsample_kernel_sizes=[20, 10, 6, 4]):
+        super().__init__()
+
+        self.decode = nn.ModuleList()
+
+        self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
+
+        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
+        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
+        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
+        self.decode.append(AdainResBlk1d(
+            1024 + 2 + 64, 512, style_dim, upsample=True))
+
+        self.F0_conv = weight_norm(
+            nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))  # smooth
+
+        self.N_conv = weight_norm(
+            nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
+
+        self.asr_res = nn.Sequential(
+            weight_norm(nn.Conv1d(512, 64, kernel_size=1)),
+        )
+
+        self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates,
+                                   upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
+
+    def forward(self, asr=None, F0_curve=None, N=None, s=None):
+
+        F0 = self.F0_conv(F0_curve)
+        N = self.N_conv(N)
+
+        x = torch.cat([asr, F0, N], axis=1)
+
+        x = self.encode(x, s)
+
+        asr_res = self.asr_res(asr)
+
+        res = True
+        for block in self.decode:
+            if res:
+                x = torch.cat([x, asr_res, F0, N], axis=1)
+            x = block(x, s)
+            if block.upsample_type != "none":
+                res = False
+
+        x = self.generator(x, s, F0_curve)
+        return x
+
+
+class MelSpec(torch.nn.Module):
+
+    def __init__(self,
+                 sample_rate=17402,  # https://github.com/fakerybakery/styletts2-cli/blob/main/msinference.py = default 16000; however 17400 vocalises better, also for "en_US/vctk_p274"
+                 n_fft=2048,
+                 win_length=1200,
+                 hop_length=300,
+                 n_mels=80
+                 ):
+        '''avoids dependency on torchaudio'''
+        super().__init__()
+        self.n_fft = n_fft
+        self.win_length = win_length if win_length is not None else n_fft
+        self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+        # -- build an HTK-style mel filterbank
+        f_min = 0.0
+        f_max = float(sample_rate // 2)
+        all_freqs = torch.linspace(0, sample_rate // 2, n_fft // 2 + 1)
+        m_min = 2595.0 * math.log10(1.0 + (f_min / 700.0))
+        m_max = 2595.0 * math.log10(1.0 + (f_max / 700.0))
+        m_pts = torch.linspace(m_min, m_max, n_mels + 2)
+        f_pts = 700.0 * (10 ** (m_pts / 2595.0) - 1.0)
+        f_diff = f_pts[1:] - f_pts[:-1]  # (n_mels + 1)
+        slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1)
+        zero = torch.zeros(1)
+        down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1]  # (n_freqs, n_mels)
+        up_slopes = slopes[:, 2:] / f_diff[1:]  # (n_freqs, n_mels)
+        fb = torch.max(zero, torch.min(down_slopes, up_slopes))
+        # --
+        self.register_buffer('fb', fb, persistent=False)
+        window = torch.hann_window(self.win_length)
+        self.register_buffer('window', window, persistent=False)
+
+    def forward(self, x):
+        spec_f = torch.stft(x,
+                            self.n_fft,
+                            self.hop_length,
+                            self.win_length,
+                            self.window,
+                            center=True,
+                            pad_mode="reflect",
+                            normalized=False,
+                            onesided=True,
+                            return_complex=True)  # [bs, 1025, 56]
+        mel_specgram = torch.matmul(spec_f.abs().pow(2).transpose(1, 2), self.fb).transpose(1, 2)
+        return mel_specgram[:, None, :, :]  # [bs, 1, 80, time]
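The filterbank above uses the standard HTK mel mapping, m = 2595·log10(1 + f/700), and its inverse f = 700·(10^(m/2595) − 1); a quick round-trip check:

f = 1000.0
m = 2595.0 * math.log10(1.0 + f / 700.0)   # ~1000 mel - 1 kHz maps to roughly 1000 mel by construction
print(700.0 * (10 ** (m / 2595.0) - 1.0))  # 1000.0 (round trip)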
+
+
+class LearnedDownSample(nn.Module):
+    def __init__(self, dim_in):
+        super().__init__()
+        self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(
+            3, 3), stride=(2, 2), groups=dim_in, padding=1))
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+class ResBlk(nn.Module):
+    def __init__(self,
+                 dim_in, dim_out):
+        super().__init__()
+        self.actv = nn.LeakyReLU(0.2)  # .07 also nice
+        self.downsample_res = LearnedDownSample(dim_in)
+        self.learned_sc = dim_in != dim_out
+        self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
+        self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
+        if self.learned_sc:
+            self.conv1x1 = spectral_norm(
+                nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))
+
+    def _shortcut(self, x):
+        if self.learned_sc:
+            x = self.conv1x1(x)
+        if x.shape[3] % 2 != 0:  # [bs, 128, Freq, Time]
+            x = torch.cat([x, x[:, :, :, -1:]], dim=3)
+        return F.interpolate(x, scale_factor=.5, mode='nearest-exact')  # F.avg_pool2d(x, 2)
+
+    def _residual(self, x):
+        x = self.actv(x)
+        x = self.conv1(x)
+        x = self.downsample_res(x)
+        x = self.actv(x)
+        x = self.conv2(x)
+        return x
+
+    def forward(self, x):
+        x = self._shortcut(x) + self._residual(x)
+        return x / math.sqrt(2)  # unit variance
+
+
+class StyleEncoder(nn.Module):
+
+    # for both acoustic & prosodic ref_s/p
+
+    def __init__(self,
+                 dim_in=64,
+                 style_dim=128,
+                 max_conv_dim=512):
+        super().__init__()
+        blocks = [spectral_norm(nn.Conv2d(1, dim_in, 3, stride=1, padding=1))]
+        for _ in range(4):
+            dim_out = min(dim_in * 2,
+                          max_conv_dim)
+            blocks += [ResBlk(dim_in, dim_out)]
+            dim_in = dim_out
+        blocks += [nn.LeakyReLU(0.24),  # w/o this activation - produces no speech
+                   spectral_norm(nn.Conv2d(dim_out, dim_out, 5, stride=1, padding=0)),
+                   nn.LeakyReLU(0.2)  # 0.3 sounds nice
+                   ]
+        self.shared = nn.Sequential(*blocks)
+        self.unshared = nn.Linear(dim_out, style_dim)
+
+    def forward(self, x):
+        x = self.shared(x)
+        x = x.mean(3, keepdims=True)  # comment this line for a time-varying style vector
+        x = x.transpose(1, 3)
+        s = self.unshared(x)
+        return s
+
+
+class LinearNorm(torch.nn.Module):
+    def __init__(self, in_dim, out_dim, bias=True):
+        super().__init__()
+        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
+
+    def forward(self, x):
+        return self.linear_layer(x)
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        x = x.transpose(1, -1)
+        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)
+
+
+class TextEncoder(nn.Module):
+    def __init__(self, channels, kernel_size, depth, n_symbols):
+        super().__init__()
+        self.embedding = nn.Embedding(n_symbols, channels)
+        padding = (kernel_size - 1) // 2
+        self.cnn = nn.ModuleList()
+        for _ in range(depth):
+            self.cnn.append(nn.Sequential(
+                weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
+                LayerNorm(channels),
+                nn.LeakyReLU(0.24))
+            )
+        self.lstm = nn.LSTM(channels, channels // 2, 1,
+                            batch_first=True, bidirectional=True)
+
+    def forward(self, x):
+        x = self.embedding(x)  # [B, T, emb]
+        x = x.transpose(1, 2)
+        for c in self.cnn:
+            x = c(x)
+        x = x.transpose(1, 2)
+        x, _ = self.lstm(x)
+        return x
+
+
+class AdaLayerNorm(nn.Module):
+
+    def __init__(self, style_dim, channels=None, eps=1e-5):
+        super().__init__()
+        self.eps = eps
+        self.fc = nn.Linear(style_dim, 1024)  # hard-coded to 512 channels: 1024 = gamma(512) + beta(512)
+
+    def forward(self, x, s):
+        h = self.fc(s)
+        gamma = h[:, :, :512]
+        beta = h[:, :, 512:1024]
+        x = F.layer_norm(x, (512,), eps=self.eps)
+        x = (1 + gamma) * x + beta
+        return x  # [1, 75, 512]
+
+
+class ProsodyPredictor(nn.Module):
+
+    def __init__(self, style_dim, d_hid, nlayers, max_dur=50):
+        super().__init__()
+
+        self.text_encoder = DurationEncoder(sty_dim=style_dim,
+                                            d_model=d_hid,
+                                            nlayers=nlayers)  # called outside forward
+        self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2,
+                            1, batch_first=True, bidirectional=True)
+        self.duration_proj = LinearNorm(d_hid, max_dur)
+        self.shared = nn.LSTM(d_hid + style_dim, d_hid //
+                              2, 1, batch_first=True, bidirectional=True)
+        self.F0 = nn.ModuleList([
+            AdainResBlk1d(d_hid, d_hid, style_dim),
+            AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True),
+            AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim),
+        ])
+        self.N = nn.ModuleList([
+            AdainResBlk1d(d_hid, d_hid, style_dim),
+            AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True),
+            AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim)
+        ])
+        self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
+        self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
+
+    def F0Ntrain(self, x, s):
+
+        x, _ = self.shared(x)  # [bs, time, ch] LSTM
+
+        x = x.transpose(1, 2)  # [bs, ch, time]
+
+        F0 = x
+
+        for block in self.F0:
+            # AdainResBlk1d expects conv1d dimensions, e.g. F0=[1, 512, 147], s=[1, 128]
+            F0 = block(F0, s)
+        F0 = self.F0_proj(F0)
+
+        N = x
+
+        for block in self.N:
+            N = block(N, s)
+        N = self.N_proj(N)
+
+        return F0, N
+
+    def forward(self, d_en=None, s=None):
+        blend = self.text_encoder(d_en, s)
+        x, _ = self.lstm(blend)
+        dur = self.duration_proj(x)  # [bs, 150, 50]
+
+        _, input_length, classifier_50 = dur.shape
+
+        dur = dur[0, :, :]
+        dur = torch.sigmoid(dur).sum(1)
+        dur = dur.round().clamp(min=1).to(torch.int64)
+        aln_trg = torch.zeros(1,
+                              dur.sum(),
+                              input_length,
+                              device=s.device)
+        c_frame = 0
+        for i in range(input_length):
+            aln_trg[:, c_frame:c_frame + dur[i], i] = 1
+            c_frame += dur[i]
+        en = torch.bmm(aln_trg, blend)
+        F0_pred, N_pred = self.F0Ntrain(en, s)
+        return aln_trg, F0_pred, N_pred
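The alignment loop above expands per-token durations into a monotonic one-hot alignment; in miniature, assuming durations [2, 1, 3] for three tokens:

dur = torch.tensor([2, 1, 3])
aln = torch.zeros(1, int(dur.sum()), 3)
c = 0
for i in range(3):
    aln[:, c:c + dur[i], i] = 1
    c += dur[i]
print(aln[0])
# tensor([[1., 0., 0.],
#         [1., 0., 0.],
#         [0., 1., 0.],
#         [0., 0., 1.],
#         [0., 0., 1.],
#         [0., 0., 1.]])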
+
+
+class DurationEncoder(nn.Module):
+
+    def __init__(self, sty_dim=128, d_model=512, nlayers=3):
+        super().__init__()
+        self.lstms = nn.ModuleList()
+        for _ in range(nlayers):
+            self.lstms.append(nn.LSTM(d_model + sty_dim,
+                                      d_model // 2,
+                                      num_layers=1,
+                                      batch_first=True,
+                                      bidirectional=True
+                                      ))
+            self.lstms.append(AdaLayerNorm(sty_dim, d_model))
+
+    def forward(self, x, style):
+
+        _, _, input_lengths = x.shape  # [bs, 512, time]
+
+        style = _tile(style, length=x.shape[2]).transpose(1, 2)
+        x = x.transpose(1, 2)
+
+        for block in self.lstms:
+            if isinstance(block, AdaLayerNorm):
+
+                x = block(x, style)  # LSTM has transposed x
+
+            else:
+                x = torch.cat([x, style], axis=2)
+                # LSTM: expects [bs, time, chan], outputs [bs, time, 2*chan] (2x from bidirectional)
+                x, _ = block(x)
+
+        return torch.cat([x, style], axis=2)  # consumed by predictor.lstm()
wav/af_ZA_google-nwu_0184.wav ADDED
Binary file (92 kB). View file
 
wav/af_ZA_google-nwu_1919.wav ADDED
Binary file (92 kB). View file
 
wav/af_ZA_google-nwu_2418.wav ADDED
Binary file (92 kB). View file
 
wav/af_ZA_google-nwu_6590.wav ADDED
Binary file (92 kB). View file
 
wav/af_ZA_google-nwu_7130.wav ADDED
Binary file (92 kB). View file
 
wav/af_ZA_google-nwu_7214.wav ADDED
Binary file (92 kB). View file
 
wav/af_ZA_google-nwu_8148.wav ADDED
Binary file (92 kB). View file
 
wav/af_ZA_google-nwu_8924.wav ADDED
Binary file (92 kB). View file
 
wav/af_ZA_google-nwu_8963.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_00737.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_00779.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_01232.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_01701.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_03042.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_0834.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_1010.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_3108.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_3713.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_3958.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_4046.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_4811.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_5958.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_9169.wav ADDED
Binary file (92 kB). View file
 
wav/bn_multi_rm.wav ADDED
Binary file (92 kB). View file
 
wav/de_DE_m-ailabs_angela_merkel.wav ADDED
Binary file (90.7 kB). View file
 
wav/de_DE_m-ailabs_eva_k.wav ADDED
Binary file (92.7 kB). View file
 
wav/de_DE_m-ailabs_karlsson.wav ADDED
Binary file (92.7 kB). View file
 
wav/de_DE_m-ailabs_ramona_deininger.wav ADDED
Binary file (91.2 kB). View file
 
wav/de_DE_m-ailabs_rebecca_braunert_plunkett.wav ADDED
Binary file (91.2 kB). View file
 
wav/de_DE_thorsten-emotion_amused.wav ADDED
Binary file (92 kB). View file
 
wav/el_GR_rapunzelina.wav ADDED
Binary file (92 kB). View file
 
wav/en_UK_apope.wav ADDED
Binary file (92 kB). View file
 
wav/en_US_cmu_arctic_aew.wav ADDED
Binary file (92 kB). View file
 
wav/en_US_cmu_arctic_aup.wav ADDED
Binary file (94.3 kB). View file
 
wav/en_US_cmu_arctic_awb.wav ADDED
Binary file (92 kB). View file
 
wav/en_US_cmu_arctic_awbrms.wav ADDED
Binary file (92.7 kB). View file
 
wav/en_US_cmu_arctic_axb.wav ADDED
Binary file (92 kB). View file
 
wav/en_US_cmu_arctic_bdl.wav ADDED
Binary file (94.8 kB). View file
 
wav/en_US_cmu_arctic_clb.wav ADDED
Binary file (92 kB). View file
 
wav/en_US_cmu_arctic_eey.wav ADDED
Binary file (95.3 kB). View file
 
wav/en_US_cmu_arctic_fem.wav ADDED
Binary file (94.8 kB). View file
 
wav/en_US_cmu_arctic_gka.wav ADDED
Binary file (95.3 kB). View file
 
wav/en_US_cmu_arctic_jmk.wav ADDED
Binary file (93.2 kB). View file
 
wav/en_US_cmu_arctic_ksp.wav ADDED
Binary file (92 kB). View file