Spaces:
Running
Running
T5 shorter descriptions
Browse files
- app.py +7 -7
- textual.py +7 -7
app.py
CHANGED
|
@@ -63,20 +63,20 @@ def audionar_tts(text=None,
|
|
| 63 |
x = np.zeros(4 * 16000, dtype=np.float32) # If no txt 4s of audiogen
|
| 64 |
|
| 65 |
elif lang not in language_names: # text exists / StyleTTS2
|
| 66 |
-
|
| 67 |
text = only_greek_or_only_latin(text, lang='eng')
|
| 68 |
|
| 69 |
x = _tts.inference(text,
|
| 70 |
ref_s='wav/' + lang + '.wav')[0, 0, :].numpy() # 24 Khz
|
| 71 |
-
|
| 72 |
if x.shape[0] > 10:
|
| 73 |
|
| 74 |
x = audresample.resample(signal=x.astype(np.float32),
|
| 75 |
original_rate=24000,
|
| 76 |
target_rate=16000)[0, :] # 16 KHz
|
| 77 |
-
|
| 78 |
else: # VITS
|
| 79 |
-
|
| 80 |
lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
|
| 81 |
|
| 82 |
global cached_lang_code, cached_net_g, cached_tokenizer
|
|
@@ -111,19 +111,19 @@ def audionar_tts(text=None,
|
|
| 111 |
|
| 112 |
if soundscape and soundscape.strip():
|
| 113 |
|
| 114 |
-
|
| 115 |
speech_duration_secs = len(x) / 16000
|
| 116 |
target_duration = max(speech_duration_secs + 0.74, 2.0)
|
| 117 |
|
| 118 |
|
| 119 |
background_audio = audiogen.generate(
|
| 120 |
-
soundscape,
|
| 121 |
duration=target_duration,
|
| 122 |
max_tokens=min( max(7, int(max_tokens)), 288 ) # limit sounds tokens (clone beyond)
|
| 123 |
).numpy()
|
| 124 |
|
| 125 |
# PAD
|
| 126 |
-
|
| 127 |
len_speech = len(x)
|
| 128 |
len_background = len(background_audio)
|
| 129 |
|
|
|
|
| 63 |
x = np.zeros(4 * 16000, dtype=np.float32) # If no txt 4s of audiogen
|
| 64 |
|
| 65 |
elif lang not in language_names: # text exists / StyleTTS2
|
| 66 |
+
|
| 67 |
text = only_greek_or_only_latin(text, lang='eng')
|
| 68 |
|
| 69 |
x = _tts.inference(text,
|
| 70 |
ref_s='wav/' + lang + '.wav')[0, 0, :].numpy() # 24 Khz
|
| 71 |
+
|
| 72 |
if x.shape[0] > 10:
|
| 73 |
|
| 74 |
x = audresample.resample(signal=x.astype(np.float32),
|
| 75 |
original_rate=24000,
|
| 76 |
target_rate=16000)[0, :] # 16 KHz
|
| 77 |
+
|
| 78 |
else: # VITS
|
| 79 |
+
|
| 80 |
lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
|
| 81 |
|
| 82 |
global cached_lang_code, cached_net_g, cached_tokenizer
|
|
|
|
| 111 |
|
| 112 |
if soundscape and soundscape.strip():
|
| 113 |
|
| 114 |
+
|
| 115 |
speech_duration_secs = len(x) / 16000
|
| 116 |
target_duration = max(speech_duration_secs + 0.74, 2.0)
|
| 117 |
|
| 118 |
|
| 119 |
background_audio = audiogen.generate(
|
| 120 |
+
soundscape[:64],  # truncate the prompt to 64 characters so the T5 token count, and with it the cross-attention shape, stays small
|
| 121 |
duration=target_duration,
|
| 122 |
max_tokens=min( max(7, int(max_tokens)), 288 ) # limit sounds tokens (clone beyond)
|
| 123 |
).numpy()
|
| 124 |
|
| 125 |
# PAD
|
| 126 |
+
|
| 127 |
len_speech = len(x)
|
| 128 |
len_background = len(background_audio)
|
| 129 |
|
textual.py
CHANGED
|
@@ -41,12 +41,12 @@ def only_greek_or_only_latin(text, lang='grc'):
|
|
| 41 |
# 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
|
| 42 |
# 'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu',
|
| 43 |
# 'я': 'ya',
|
| 44 |
-
#
|
| 45 |
-
'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': '
|
| 46 |
'з': 'z', 'и': 'i', 'ј': 'j', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n',
|
| 47 |
-
'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': '
|
| 48 |
-
'х': 'h', 'ц': '
|
| 49 |
-
'ђ': 'd', 'љ': '
|
| 50 |
'ё': 'yo', 'й': 'y', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '',
|
| 51 |
'э': 'e', 'ю': 'io', 'я': 'ia',
|
| 52 |
'ѓ': 'y', 'ѕ': 's', 'ќ': 'k',
|
|
@@ -166,7 +166,7 @@ def fix_vocals(text, lang='ron'):
|
|
| 166 |
'^': ' la puterea ',
|
| 167 |
'+': ' plus ',
|
| 168 |
' - ': ' minus ', # only replace if standalone so to not say minus if is a-b-c
|
| 169 |
-
'*': ' ori ', # times
|
| 170 |
'/': ' împărțit la ', # divided by
|
| 171 |
'=': ' egal cu ', # equals
|
| 172 |
'pi': ' pi ',
|
|
@@ -199,7 +199,7 @@ def fix_vocals(text, lang='ron'):
|
|
| 199 |
'^': ' to the power of ',
|
| 200 |
'+': ' plus ',
|
| 201 |
' - ': ' minus ',
|
| 202 |
-
'*': ' times ',
|
| 203 |
' / ': ' divided by ',
|
| 204 |
'=': ' equals ',
|
| 205 |
'pi': ' pi ',
|
|
|
|
| 41 |
# 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
|
| 42 |
# 'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu',
|
| 43 |
# 'я': 'ya',
|
| 44 |
+
# ----------------кључеви
|
| 45 |
+
'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'di', 'е': 'e', 'ж': 'zu',
|
| 46 |
'з': 'z', 'и': 'i', 'ј': 'j', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n',
|
| 47 |
+
'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'ou', 'ф': 'f',
|
| 48 |
+
'х': 'h', 'ц': 'ts', 'ч': 'ts', 'ш': 'sch',
|
| 49 |
+
'ђ': 'd', 'љ': 'li', 'њ': 'nj', 'ћ': 'c', 'џ': 'dsz',
|
| 50 |
'ё': 'yo', 'й': 'y', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '',
|
| 51 |
'э': 'e', 'ю': 'io', 'я': 'ia',
|
| 52 |
'ѓ': 'y', 'ѕ': 's', 'ќ': 'k',
|
|
|
|
| 166 |
'^': ' la puterea ',
|
| 167 |
'+': ' plus ',
|
| 168 |
' - ': ' minus ', # only replace if standalone so to not say minus if is a-b-c
|
| 169 |
+
# '*': ' ori ', # times
|
| 170 |
'/': ' împărțit la ', # divided by
|
| 171 |
'=': ' egal cu ', # equals
|
| 172 |
'pi': ' pi ',
|
|
|
|
| 199 |
'^': ' to the power of ',
|
| 200 |
'+': ' plus ',
|
| 201 |
' - ': ' minus ',
|
| 202 |
+
# '*': ' times ',
|
| 203 |
' / ': ' divided by ',
|
| 204 |
'=': ' equals ',
|
| 205 |
'pi': ' pi ',
|