Spaces:

HusseinBashir
/

Somali_tts

Runtime error

App Files Community

HusseinBashir committed on May 28

Commit

35e3ab8

verified ·

1 Parent(s): 1b065b4

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -61

app.py CHANGED Viewed

@@ -4,9 +4,8 @@ import numpy as np
 import scipy.io.wavfile
 from transformers import VitsModel, AutoTokenizer
 import re
-import time
-# Load model and tokenizer
 model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
 tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -68,7 +67,6 @@ def number_to_words(number):
         return str(number)
 def normalize_text(text):
-    text = text.lower()
     # Remove commas from numbers like 1,000,000
     text = re.sub(r'(\d{1,3})(,\d{3})+', lambda m: m.group(0).replace(",", ""), text)
@@ -86,81 +84,40 @@ def normalize_text(text):
         '$': 'doolar',
         '=': 'egwal',
         '+': 'balaas',
-        '%': 'boqolkiiba',
-        '&': 'iyo',
-        '@': 'at',
-        '#': 'hash',
     }
     for sym, word in symbol_map.items():
         text = text.replace(sym, ' ' + word + ' ')
-    # Special rule for 'z' or 'Z' prefix or suffix to sound as 's'
-    # Replace 'z' or 'Z' at start or end of word with 's'
-    def replace_z(match):
-        word = match.group()
-        # Replace z or Z at start or end with s
-        if word.startswith('z'):
-            word = 's' + word[1:]
-        if word.endswith('z'):
-            word = word[:-1] + 's'
-        return word
-    # Apply regex word by word for words containing z or Z
-    text = re.sub(r'\b[z][a-z]*\b', replace_z, text)  # words starting with z
-    text = re.sub(r'\b[a-z]*[z]\b', replace_z, text)  # words ending with z
-    # Optional character normalization (kuma jirto 'z' sababtoo ah hadda la maamulo)
-    text = text.replace("kh", "qa").replace("sh", "sha'a").replace("dh", "dha'a")
     return text
 def tts(text):
-    paragraphs = [p for p in text.strip().split("\n") if p.strip()]
     audio_list = []
-    # Calculate max total duration allowed based on paragraph count
-    n = len(paragraphs)
-    if n <= 5:
-        max_duration = 30  # seconds
-    elif n <= 20:
-        max_duration = 60
-    else:
-        max_duration = 120
-    # Generate waveform per paragraph and keep track of lengths
-    waveforms = []
-    for para in paragraphs:
         norm_para = normalize_text(para)
         inputs = tokenizer(norm_para, return_tensors="pt").to(device)
         with torch.no_grad():
             waveform = model(**inputs).waveform.squeeze().cpu().numpy()
-        waveforms.append(waveform)
-    # Calculate total length of raw waveform (in samples)
-    total_samples = sum(wf.shape[0] for wf in waveforms)
-    sampling_rate = model.config.sampling_rate
-    # Compute speed factor to fit into max_duration seconds
-    total_duration = total_samples / sampling_rate
-    speed_factor = total_duration / max_duration if total_duration > max_duration else 1.0
-    # Adjust waveforms speed by resampling (speed up if needed)
-    from scipy.signal import resample
-    for i, wf in enumerate(waveforms):
-        new_length = int(len(wf) / speed_factor)
-        waveforms[i] = resample(wf, new_length)
-    # Add 0.3 sec pause between paragraphs except last one
-    pause = np.zeros(int(sampling_rate * 0.3))
-    for i, wf in enumerate(waveforms):
-        audio_list.append(wf)
-        if i < len(waveforms) -1:
-            audio_list.append(pause)
     final_audio = np.concatenate(audio_list)
-    filename = f"output_{int(time.time())}.wav"
-    scipy.io.wavfile.write(filename, rate=sampling_rate, data=(final_audio * 32767).astype(np.int16))
     return filename
 gr.Interface(

 import scipy.io.wavfile
 from transformers import VitsModel, AutoTokenizer
 import re
+# Load fine-tuned model from Hugging Face Hub or local path
 model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
 tokenizer = AutoTokenizer.from_pretrained("saleolow/somali-mms-tts")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         return str(number)
 def normalize_text(text):
     # Remove commas from numbers like 1,000,000
     text = re.sub(r'(\d{1,3})(,\d{3})+', lambda m: m.group(0).replace(",", ""), text)
         '$': 'doolar',
         '=': 'egwal',
         '+': 'balaas',
+        '-': 'miinas'
     }
     for sym, word in symbol_map.items():
         text = text.replace(sym, ' ' + word + ' ')
+    # Optional character normalization
+    text = text.replace("KH", "qa").replace("Z", "S")
+    text = text.replace("SH", "SHa'a").replace("DH", "Dha'a")
+    text = text.replace("ZamZam", "SamSam")
     return text
 def tts(text):
+    paragraphs = text.strip().split("\n")
     audio_list = []
+    for i, para in enumerate(paragraphs):
+        if not para.strip():
+            continue
         norm_para = normalize_text(para)
         inputs = tokenizer(norm_para, return_tensors="pt").to(device)
         with torch.no_grad():
             waveform = model(**inputs).waveform.squeeze().cpu().numpy()
+        # Add pause between paragraphs (only if it's not the last one)
+        if i < len(paragraphs) - 1:
+            pause = np.zeros(int(model.config.sampling_rate * 0.8))  # 0.8 seconds pause
+            audio_list.append(np.concatenate((waveform, pause)))
+        else:
+            audio_list.append(waveform)
     final_audio = np.concatenate(audio_list)
+    filename = "output.wav"
+    scipy.io.wavfile.write(filename, rate=model.config.sampling_rate, data=(final_audio * 32767).astype(np.int16))
     return filename
 gr.Interface(