Spaces:

Dionyssos
/

speech-analysis2

Running

Dionyssos committed on Sep 23

Commit

d3db00d

1 Parent(s): b2772ad

T5 shorter descriptions

Files changed (2) hide show

app.py CHANGED Viewed

@@ -63,20 +63,20 @@ def audionar_tts(text=None,
         x = np.zeros(4 * 16000, dtype=np.float32)  # If no txt 4s of audiogen
     elif lang not in language_names:  # text exists / StyleTTS2
             text = only_greek_or_only_latin(text, lang='eng')
             x = _tts.inference(text,
                             ref_s='wav/' + lang + '.wav')[0, 0, :].numpy()  # 24 Khz
             if x.shape[0] > 10:
                 x = audresample.resample(signal=x.astype(np.float32),
                                         original_rate=24000,
                                         target_rate=16000)[0, :]   # 16 KHz
     else:  # VITS
             lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
             global cached_lang_code, cached_net_g, cached_tokenizer
@@ -111,19 +111,19 @@ def audionar_tts(text=None,
     if soundscape and soundscape.strip():
         speech_duration_secs = len(x) / 16000
         target_duration = max(speech_duration_secs + 0.74, 2.0)
         background_audio = audiogen.generate(
-            soundscape,
             duration=target_duration,
             max_tokens=min( max(7, int(max_tokens)), 288 )  # limit sounds tokens (clone beyond)
          ).numpy()
         # PAD
         len_speech = len(x)
         len_background = len(background_audio)

         x = np.zeros(4 * 16000, dtype=np.float32)  # If no txt 4s of audiogen
     elif lang not in language_names:  # text exists / StyleTTS2
             text = only_greek_or_only_latin(text, lang='eng')
             x = _tts.inference(text,
                             ref_s='wav/' + lang + '.wav')[0, 0, :].numpy()  # 24 Khz
             if x.shape[0] > 10:
                 x = audresample.resample(signal=x.astype(np.float32),
                                         original_rate=24000,
                                         target_rate=16000)[0, :]   # 16 KHz
     else:  # VITS
             lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
             global cached_lang_code, cached_net_g, cached_tokenizer
     if soundscape and soundscape.strip():
         speech_duration_secs = len(x) / 16000
         target_duration = max(speech_duration_secs + 0.74, 2.0)
         background_audio = audiogen.generate(
+            soundscape[:64],  # to have shape of cross attention not grow large of T5 Num tokens
             duration=target_duration,
             max_tokens=min( max(7, int(max_tokens)), 288 )  # limit sounds tokens (clone beyond)
          ).numpy()
         # PAD
         len_speech = len(x)
         len_background = len(background_audio)

textual.py CHANGED Viewed

@@ -41,12 +41,12 @@ def only_greek_or_only_latin(text, lang='grc'):
         # 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
         # 'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu',
         # 'я': 'ya',
-        # ----------------
-        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ж': 'zh',
         'з': 'z', 'и': 'i', 'ј': 'j', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n',
-        'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f',
-        'х': 'h', 'ц': 'c', 'ч': 'ts', 'ш': 'sch',
-        'ђ': 'd', 'љ': 'lj', 'њ': 'nj', 'ћ': 'c', 'џ': 'dsz',
         'ё': 'yo', 'й': 'y', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '',
         'э': 'e', 'ю': 'io', 'я': 'ia',
         'ѓ': 'y', 'ѕ': 's', 'ќ': 'k',
@@ -166,7 +166,7 @@ def fix_vocals(text, lang='ron'):
         '^': ' la puterea ',
         '+': ' plus ',
         ' - ': ' minus ',  # only replace if standalone so to not say minus if is a-b-c
-        '*': ' ori ',  # times
         '/': ' împărțit la ',  # divided by
         '=': ' egal cu ',  # equals
         'pi': ' pi ',
@@ -199,7 +199,7 @@ def fix_vocals(text, lang='ron'):
         '^': ' to the power of ',
         '+': ' plus ',
         ' - ': ' minus ',
-        '*': ' times ',
         ' / ': ' divided by ',
         '=': ' equals ',
         'pi': ' pi ',

         # 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
         # 'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu',
         # 'я': 'ya',
+        # ----------------кључеви
+        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'di', 'е': 'e', 'ж': 'zu',
         'з': 'z', 'и': 'i', 'ј': 'j', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n',
+        'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'ou', 'ф': 'f',
+        'х': 'h', 'ц': 'ts', 'ч': 'ts', 'ш': 'sch',
+        'ђ': 'd', 'љ': 'li', 'њ': 'nj', 'ћ': 'c', 'џ': 'dsz',
         'ё': 'yo', 'й': 'y', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '',
         'э': 'e', 'ю': 'io', 'я': 'ia',
         'ѓ': 'y', 'ѕ': 's', 'ќ': 'k',
         '^': ' la puterea ',
         '+': ' plus ',
         ' - ': ' minus ',  # only replace if standalone so to not say minus if is a-b-c
+        # '*': ' ori ',  # times
         '/': ' împărțit la ',  # divided by
         '=': ' egal cu ',  # equals
         'pi': ' pi ',
         '^': ' to the power of ',
         '+': ' plus ',
         ' - ': ' minus ',
+        # '*': ' times ',
         ' / ': ' divided by ',
         '=': ' equals ',
         'pi': ' pi ',