Spaces:

HusseinBashir
/

Somali_tts

Runtime error

App Files Files Community

HusseinBashir committed on May 28

Commit

1b065b4

verified ·

1 Parent(s): 9b0a893

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -8

app.py CHANGED Viewed

@@ -5,7 +5,6 @@ import scipy.io.wavfile
 from transformers import VitsModel, AutoTokenizer
 import re
 import time
-from scipy.signal import resample
 # Load model and tokenizer
 model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
@@ -91,23 +90,26 @@ def normalize_text(text):
         '&': 'iyo',
         '@': 'at',
         '#': 'hash',
-        '.': 'dhibic',
     }
     for sym, word in symbol_map.items():
         text = text.replace(sym, ' ' + word + ' ')
     # Special rule for 'z' or 'Z' prefix or suffix to sound as 's'
     def replace_z(match):
         """Return the matched word with a leading and/or trailing 'z' changed to 's'.

         ``match`` is a regex match object; only its full matched text
         (``match.group()``) is read. The comparison is case-sensitive, so
         an uppercase 'Z' is left as it is.
         """
         matched = match.group()
         # Swap a leading 'z' first; slicing keeps an empty match safe.
         swapped = 's' + matched[1:] if matched[:1] == 'z' else matched
         # Then swap a trailing 'z' on the (possibly already modified) word.
         return swapped[:-1] + 's' if swapped[-1:] == 'z' else swapped
     text = re.sub(r'\b[z][a-z]*\b', replace_z, text)  # words starting with z
     text = re.sub(r'\b[a-z]*[z]\b', replace_z, text)  # words ending with z
     text = text.replace("kh", "qa").replace("sh", "sha'a").replace("dh", "dha'a")
     return text
@@ -116,18 +118,16 @@ def tts(text):
     paragraphs = [p for p in text.strip().split("\n") if p.strip()]
     audio_list = []
     n = len(paragraphs)
     if n <= 5:
         max_duration = 30  # seconds
     elif n <= 20:
         max_duration = 60
-    elif n <= 50:
-        max_duration = 120
-    elif n <= 100:
-        max_duration = 240
     else:
-        max_duration = 300  # 5 minutes max
     waveforms = []
     for para in paragraphs:
         norm_para = normalize_text(para)
@@ -136,20 +136,26 @@ def tts(text):
             waveform = model(**inputs).waveform.squeeze().cpu().numpy()
         waveforms.append(waveform)
     total_samples = sum(wf.shape[0] for wf in waveforms)
     sampling_rate = model.config.sampling_rate
     total_duration = total_samples / sampling_rate
     speed_factor = total_duration / max_duration if total_duration > max_duration else 1.0
     for i, wf in enumerate(waveforms):
         new_length = int(len(wf) / speed_factor)
         waveforms[i] = resample(wf, new_length)
     pause = np.zeros(int(sampling_rate * 0.3))
     for i, wf in enumerate(waveforms):
         audio_list.append(wf)
-        if i < len(waveforms) - 1:
             audio_list.append(pause)
     final_audio = np.concatenate(audio_list)

 from transformers import VitsModel, AutoTokenizer
 import re
 import time
 # Load model and tokenizer
 model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
         '&': 'iyo',
         '@': 'at',
         '#': 'hash',
     }
     for sym, word in symbol_map.items():
         text = text.replace(sym, ' ' + word + ' ')
     # Special rule: a 'z' prefix or suffix on a word is rewritten to sound as 's'
+    # Only lowercase 'z' is matched below; an uppercase 'Z' is handled only if the text was lowercased earlier
     def replace_z(match):
         word = match.group()
+        # Replace a lowercase 'z' at the start and/or end of the word with 's'
         if word.startswith('z'):
             word = 's' + word[1:]
         if word.endswith('z'):
             word = word[:-1] + 's'
         return word
+    # Apply the helper to each lowercase word that starts or ends with 'z'
     text = re.sub(r'\b[z][a-z]*\b', replace_z, text)  # words starting with z
     text = re.sub(r'\b[a-z]*[z]\b', replace_z, text)  # words ending with z
+    # Optional character normalization ('z' is not included here because it is already handled above)
     text = text.replace("kh", "qa").replace("sh", "sha'a").replace("dh", "dha'a")
     return text
     paragraphs = [p for p in text.strip().split("\n") if p.strip()]
     audio_list = []
+    # Calculate max total duration allowed based on paragraph count
     n = len(paragraphs)
     if n <= 5:
         max_duration = 30  # seconds
     elif n <= 20:
         max_duration = 60
     else:
+        max_duration = 120
+    # Generate waveform per paragraph and keep track of lengths
     waveforms = []
     for para in paragraphs:
         norm_para = normalize_text(para)
             waveform = model(**inputs).waveform.squeeze().cpu().numpy()
         waveforms.append(waveform)
+    # Calculate total length of raw waveform (in samples)
     total_samples = sum(wf.shape[0] for wf in waveforms)
     sampling_rate = model.config.sampling_rate
+    # Compute speed factor to fit into max_duration seconds
     total_duration = total_samples / sampling_rate
     speed_factor = total_duration / max_duration if total_duration > max_duration else 1.0
+    # Adjust waveforms speed by resampling (speed up if needed)
+    from scipy.signal import resample
     for i, wf in enumerate(waveforms):
         new_length = int(len(wf) / speed_factor)
         waveforms[i] = resample(wf, new_length)
+    # Add 0.3 sec pause between paragraphs except last one
     pause = np.zeros(int(sampling_rate * 0.3))
     for i, wf in enumerate(waveforms):
         audio_list.append(wf)
+        if i < len(waveforms) -1:
             audio_list.append(pause)
     final_audio = np.concatenate(audio_list)