Spaces:

Somalitts
/

8aad

Runtime error

App Files Files Community

Somalitts committed on Jul 20, 2025

Commit

c872044

verified ·

1 Parent(s): 5a3bbd1

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -44

app.py CHANGED Viewed

@@ -3,15 +3,16 @@ import torch
 import torchaudio
 import re
 import os
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from speechbrain.pretrained import EncoderClassifier
-import numpy as np
 # --- Configuration ---
 # Use the GPU when torch reports CUDA as available; otherwise fall back to CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
-VOICE_SAMPLE_FILES = ["1.wav"]  # Make sure this file is of good quality
 # The directory is created as soon as this code runs; exist_ok avoids an
 # error when it already exists.
 EMBEDDING_DIR = "speaker_embeddings"
 os.makedirs(EMBEDDING_DIR, exist_ok=True)
@@ -57,7 +58,7 @@ def get_speaker_embedding(wav_file_path):
     except Exception as e:
         raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
-# Number to words functions (as before) ...
 number_words = {
     0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
     6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
@@ -68,6 +69,7 @@ number_words = {
     60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
     100: "boqol", 1000: "kun",
 }
 def number_to_words(n):
     if n in number_words:
         return number_words[n]
@@ -83,27 +85,23 @@ def number_to_words(n):
         return (number_to_words(n // 1_000_000) + " milyan" if n // 1_000_000 > 1 else "milyan") + (
             " iyo " + number_to_words(n % 1_000_000) if n % 1_000_000 else "")
     return str(n)
 def replace_numbers_with_words(text):
     """Return *text* with each standalone digit run replaced by number_to_words()."""
     # \b\d+\b only matches digits bounded by non-word characters, so digits
     # glued to letters or underscores (e.g. "a1") are left as they are.
     return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
 def normalize_text(text):
     """Lower-case *text*, spell out its numbers, then strip punctuation."""
     text = text.lower()
     # Numbers are converted while the punctuation is still in place.
     text = replace_numbers_with_words(text)
     # Delete every character that is not a word character, whitespace or an
     # apostrophe. Nothing is inserted in its place, so "a,b" becomes "ab".
     # NOTE(review): a decimal such as "3.5" therefore ends up as two number
     # words joined without a space - confirm that this is acceptable.
     text = re.sub(r'[^\w\s\']', '', text)
     return text
-# **Split-into-sentences function**
 def split_into_sentences(text):
-    # One of the simple approaches to splitting sentences
     sentence_endings = re.compile(r'(?<=[.!?])\s+')
     sentences = sentence_endings.split(text)
-    # If the text has no sentence-ending marks, check again and split it into large chunks
-    if len(sentences) == 1:
-        # Split into large chunks because no mark was found. NOTE(review): this pattern looks equivalent to the one above, so the retry presumably returns the same single piece - confirm.
-        sentences = re.split(r'(?<=\.)\s+|(?<=\?)\s+|(?<=!)\s+', text)
-    # Clean up surrounding whitespace and drop empty sentences
-    sentences = [s.strip() for s in sentences if s.strip()]
-    return sentences
 def text_to_speech(text, voice_choice):
     if not text or not voice_choice:
         gr.Warning("Fadlan geli qoraal oo dooro cod.")
@@ -111,52 +109,65 @@ def text_to_speech(text, voice_choice):
     speaker_embedding = get_speaker_embedding(voice_choice)
-    sentences = split_into_sentences(text)
-    all_audios = []
-    for i, sentence in enumerate(sentences):
-        normalized_text = normalize_text(sentence)
-        inputs = processor(text=normalized_text, return_tensors="pt").to(device)
-        with torch.no_grad():
-            speech = model.generate(
-                input_ids=inputs["input_ids"],
-                speaker_embeddings=speaker_embedding.unsqueeze(0),
-                do_sample=True,
-                top_k=50,
-                temperature=0.75,
-                repetition_penalty=1.2,
-                max_new_tokens=512
-            )
-            audio = vocoder(speech).cpu()
-        all_audios.append(audio)
-        # Nasasho 0.5 ilbiriqsi haddii uusan ahayn jumladii ugu dambeysay
-        if i < len(sentences) - 1:
-            pause = torch.zeros((1, int(16000 * 0.5)))  # 0.5 sec silence
-            all_audios.append(pause)
-    final_audio = torch.cat(all_audios, dim=1)
-    return (16000, final_audio.numpy())
 # Wire text_to_speech to a two-input form: the text to speak and the voice to use.
 iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
         gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)", lines=7, placeholder="Qoraalka geli halkan..."),
         gr.Dropdown(
             VOICE_SAMPLE_FILES,
-            label="Select Voice",
-            info="Dooro codka aad rabto inaad isticmaasho.",
             # Preselect the first voice file; None when the list is empty.
             value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
         )
     ],
-    outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
     title="Multi-Voice Somali Text-to-Speech",
-    description="Geli qoraal Soomaali ah, dooro cod, kadibna riix 'Submit' si aad u abuurto hadal."
 )
 if __name__ == "__main__":
     if not all(os.path.exists(f) for f in VOICE_SAMPLE_FILES):
-        raise FileNotFoundError("Fadlan hubi inaad faylasha codka soo gelisay Space-ka.")
     print("Diyaarinta codadka...")
     for voice_file in VOICE_SAMPLE_FILES:

 import torchaudio
 import re
 import os
+import numpy as np
+import scipy.io.wavfile
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from speechbrain.pretrained import EncoderClassifier
 # --- Configuration ---
 # Use the GPU when torch reports CUDA as available; otherwise fall back to CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
+VOICE_SAMPLE_FILES = ["1.wav"]  # Example voice sample
 # The directory is created as soon as this code runs; exist_ok avoids an
 # error when it already exists.
 EMBEDDING_DIR = "speaker_embeddings"
 os.makedirs(EMBEDDING_DIR, exist_ok=True)
     except Exception as e:
         raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
+# --- Number words dictionary and functions ---
 number_words = {
     0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
     6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
     60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
     100: "boqol", 1000: "kun",
 }
 def number_to_words(n):
     if n in number_words:
         return number_words[n]
         return (number_to_words(n // 1_000_000) + " milyan" if n // 1_000_000 > 1 else "milyan") + (
             " iyo " + number_to_words(n % 1_000_000) if n % 1_000_000 else "")
     return str(n)
 def replace_numbers_with_words(text):
     """Return *text* with each standalone digit run replaced by number_to_words()."""
     # \b\d+\b only matches digits bounded by non-word characters, so digits
     # glued to letters or underscores (e.g. "a1") are left as they are.
     return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
 def normalize_text(text):
     """Lower-case *text*, spell out its numbers, then strip punctuation."""
     text = text.lower()
     # Numbers are converted while the punctuation is still in place.
     text = replace_numbers_with_words(text)
     # Delete every character that is not a word character, whitespace or an
     # apostrophe. Nothing is inserted in its place, so "a,b" becomes "ab".
     # NOTE(review): a decimal such as "3.5" therefore ends up as two number
     # words joined without a space - confirm that this is acceptable.
     text = re.sub(r'[^\w\s\']', '', text)
     return text
+# --- Helper to split text into sentences ---
 def split_into_sentences(text):
     """Split *text* into a list of stripped, non-empty sentences."""
     # Break on whitespace that follows '.', '!' or '?'. The lookbehind is
     # zero-width, so the punctuation stays attached to its sentence.
     sentence_endings = re.compile(r'(?<=[.!?])\s+')
     sentences = sentence_endings.split(text)
+    return [s.strip() for s in sentences if s.strip()]
+# --- Main TTS function with pauses between sentences ---
 def text_to_speech(text, voice_choice):
     if not text or not voice_choice:
         gr.Warning("Fadlan geli qoraal oo dooro cod.")
     speaker_embedding = get_speaker_embedding(voice_choice)
+    paragraphs = text.strip().split("\n")
+    audio_chunks = []
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+        sentences = split_into_sentences(para)
+        for idx, sentence in enumerate(sentences):
+            norm_sentence = normalize_text(sentence)
+            inputs = processor(text=norm_sentence, return_tensors="pt").to(device)
+            with torch.no_grad():
+                speech = model.generate(
+                    input_ids=inputs["input_ids"],
+                    speaker_embeddings=speaker_embedding.unsqueeze(0),
+                    do_sample=True,
+                    top_k=50,
+                    temperature=0.75,
+                    repetition_penalty=1.2,
+                    max_new_tokens=512
+                )
+                audio = vocoder(speech).cpu().squeeze().numpy()
+            audio_chunks.append(audio)
+            # Pause 0.5 sec between sentences (not after last)
+            if idx < len(sentences) - 1:
+                pause = np.zeros(int(16000 * 0.5))
+                audio_chunks.append(pause)
+        # Pause 0.8 sec between paragraphs (optional)
+        pause_para = np.zeros(int(16000 * 0.8))
+        audio_chunks.append(pause_para)
+    final_audio = np.concatenate(audio_chunks)
+    return (16000, final_audio)
+# --- Gradio Interface ---
 # Wire text_to_speech to a two-input form: the text to speak and the voice to use.
 iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
         gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)", lines=7, placeholder="Qoraalka geli halkan..."),
         gr.Dropdown(
             VOICE_SAMPLE_FILES,
+            label="Dooro Codka (Select Voice)",
             # Preselect the first voice file; None when the list is empty.
             value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
         )
     ],
+    outputs=gr.Audio(label="Codka La Abuuray (Generated Audio)", type="numpy"),
     title="Multi-Voice Somali Text-to-Speech",
+    description="Geli qoraal Soomaali ah, dooro cod, kadib riix 'Submit' si aad u abuurto hadal."
 )
+# --- Launch App ---
 if __name__ == "__main__":
     if not all(os.path.exists(f) for f in VOICE_SAMPLE_FILES):
+        raise FileNotFoundError("Fadlan hubi inaad faylasha codka ku dartay.")
     print("Diyaarinta codadka...")
     for voice_file in VOICE_SAMPLE_FILES: