Somalitts committed on
Commit
c872044
·
verified ·
1 Parent(s): 5a3bbd1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -44
app.py CHANGED
@@ -3,15 +3,16 @@ import torch
3
  import torchaudio
4
  import re
5
  import os
 
 
6
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
  from speechbrain.pretrained import EncoderClassifier
8
- import numpy as np
9
 
10
  # --- Configuration ---
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
  print(f"Using device: {device}")
13
 
14
- VOICE_SAMPLE_FILES = ["1.wav"] # Hubi in faylkan tayadiisu fiican tahay
15
  EMBEDDING_DIR = "speaker_embeddings"
16
  os.makedirs(EMBEDDING_DIR, exist_ok=True)
17
 
@@ -57,7 +58,7 @@ def get_speaker_embedding(wav_file_path):
57
  except Exception as e:
58
  raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
59
 
60
- # Number to words functions (as before) ...
61
  number_words = {
62
  0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
63
  6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
@@ -68,6 +69,7 @@ number_words = {
68
  60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
69
  100: "boqol", 1000: "kun",
70
  }
 
71
  def number_to_words(n):
72
  if n in number_words:
73
  return number_words[n]
@@ -83,27 +85,23 @@ def number_to_words(n):
83
  return (number_to_words(n // 1_000_000) + " milyan" if n // 1_000_000 > 1 else "milyan") + (
84
  " iyo " + number_to_words(n % 1_000_000) if n % 1_000_000 else "")
85
  return str(n)
 
86
  def replace_numbers_with_words(text):
87
  return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
 
88
  def normalize_text(text):
89
  text = text.lower()
90
  text = replace_numbers_with_words(text)
91
  text = re.sub(r'[^\w\s\']', '', text)
92
  return text
93
 
94
- # **Jumladaha kala saar (split into sentences) function**
95
  def split_into_sentences(text):
96
- # Qaar ka mid ah hababka fudud ee jumladaha kala saarista
97
  sentence_endings = re.compile(r'(?<=[.!?])\s+')
98
  sentences = sentence_endings.split(text)
99
- # Haddii qoraalka uusan lahayn calaamadaha dhamaadka jumlada, iska hubi oo qaybi ereyo waaweyn
100
- if len(sentences) == 1:
101
- # Ku kala jar ereyo waaweyn maxaa yeelay lama helin calaamad
102
- sentences = re.split(r'(?<=\.)\s+|(?<=\?)\s+|(?<=!)\s+', text)
103
- # Nadiifi meelaha banaan iyo jumladaha madhan
104
- sentences = [s.strip() for s in sentences if s.strip()]
105
- return sentences
106
 
 
107
  def text_to_speech(text, voice_choice):
108
  if not text or not voice_choice:
109
  gr.Warning("Fadlan geli qoraal oo dooro cod.")
@@ -111,52 +109,65 @@ def text_to_speech(text, voice_choice):
111
 
112
  speaker_embedding = get_speaker_embedding(voice_choice)
113
 
114
- sentences = split_into_sentences(text)
115
-
116
- all_audios = []
117
- for i, sentence in enumerate(sentences):
118
- normalized_text = normalize_text(sentence)
119
- inputs = processor(text=normalized_text, return_tensors="pt").to(device)
120
- with torch.no_grad():
121
- speech = model.generate(
122
- input_ids=inputs["input_ids"],
123
- speaker_embeddings=speaker_embedding.unsqueeze(0),
124
- do_sample=True,
125
- top_k=50,
126
- temperature=0.75,
127
- repetition_penalty=1.2,
128
- max_new_tokens=512
129
- )
130
- audio = vocoder(speech).cpu()
131
-
132
- all_audios.append(audio)
133
- # Nasasho 0.5 ilbiriqsi haddii uusan ahayn jumladii ugu dambeysay
134
- if i < len(sentences) - 1:
135
- pause = torch.zeros((1, int(16000 * 0.5))) # 0.5 sec silence
136
- all_audios.append(pause)
137
-
138
- final_audio = torch.cat(all_audios, dim=1)
139
- return (16000, final_audio.numpy())
140
-
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  iface = gr.Interface(
142
  fn=text_to_speech,
143
  inputs=[
144
  gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)", lines=7, placeholder="Qoraalka geli halkan..."),
145
  gr.Dropdown(
146
  VOICE_SAMPLE_FILES,
147
- label="Select Voice",
148
- info="Dooro codka aad rabto inaad isticmaasho.",
149
  value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
150
  )
151
  ],
152
- outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
153
  title="Multi-Voice Somali Text-to-Speech",
154
- description="Geli qoraal Soomaali ah, dooro cod, kadibna riix 'Submit' si aad u abuurto hadal."
155
  )
156
 
 
157
  if __name__ == "__main__":
158
  if not all(os.path.exists(f) for f in VOICE_SAMPLE_FILES):
159
- raise FileNotFoundError("Fadlan hubi inaad faylasha codka soo gelisay Space-ka.")
160
 
161
  print("Diyaarinta codadka...")
162
  for voice_file in VOICE_SAMPLE_FILES:
 
3
  import torchaudio
4
  import re
5
  import os
6
+ import numpy as np
7
+ import scipy.io.wavfile
8
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
9
  from speechbrain.pretrained import EncoderClassifier
 
10
 
11
  # --- Configuration ---
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
  print(f"Using device: {device}")
14
 
15
+ VOICE_SAMPLE_FILES = ["1.wav"] # Codka tusaale ahaan
16
  EMBEDDING_DIR = "speaker_embeddings"
17
  os.makedirs(EMBEDDING_DIR, exist_ok=True)
18
 
 
58
  except Exception as e:
59
  raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
60
 
61
+ # --- Number words dictionary and functions ---
62
  number_words = {
63
  0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
64
  6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
 
69
  60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
70
  100: "boqol", 1000: "kun",
71
  }
72
+
73
  def number_to_words(n):
74
  if n in number_words:
75
  return number_words[n]
 
85
  return (number_to_words(n // 1_000_000) + " milyan" if n // 1_000_000 > 1 else "milyan") + (
86
  " iyo " + number_to_words(n % 1_000_000) if n % 1_000_000 else "")
87
  return str(n)
88
+
89
  def replace_numbers_with_words(text):
90
  return re.sub(r'\b\d+\b', lambda m: number_to_words(int(m.group())), text)
91
+
92
  def normalize_text(text):
93
  text = text.lower()
94
  text = replace_numbers_with_words(text)
95
  text = re.sub(r'[^\w\s\']', '', text)
96
  return text
97
 
98
+ # --- Helper to split text into sentences ---
99
  def split_into_sentences(text):
 
100
  sentence_endings = re.compile(r'(?<=[.!?])\s+')
101
  sentences = sentence_endings.split(text)
102
+ return [s.strip() for s in sentences if s.strip()]
 
 
 
 
 
 
103
 
104
+ # --- Main TTS function with pauses between sentences ---
105
  def text_to_speech(text, voice_choice):
106
  if not text or not voice_choice:
107
  gr.Warning("Fadlan geli qoraal oo dooro cod.")
 
109
 
110
  speaker_embedding = get_speaker_embedding(voice_choice)
111
 
112
+ paragraphs = text.strip().split("\n")
113
+ audio_chunks = []
114
+
115
+ for para in paragraphs:
116
+ para = para.strip()
117
+ if not para:
118
+ continue
119
+ sentences = split_into_sentences(para)
120
+
121
+ for idx, sentence in enumerate(sentences):
122
+ norm_sentence = normalize_text(sentence)
123
+ inputs = processor(text=norm_sentence, return_tensors="pt").to(device)
124
+
125
+ with torch.no_grad():
126
+ speech = model.generate(
127
+ input_ids=inputs["input_ids"],
128
+ speaker_embeddings=speaker_embedding.unsqueeze(0),
129
+ do_sample=True,
130
+ top_k=50,
131
+ temperature=0.75,
132
+ repetition_penalty=1.2,
133
+ max_new_tokens=512
134
+ )
135
+ audio = vocoder(speech).cpu().squeeze().numpy()
136
+
137
+ audio_chunks.append(audio)
138
+
139
+ # Pause 0.5 sec between sentences (not after last)
140
+ if idx < len(sentences) - 1:
141
+ pause = np.zeros(int(16000 * 0.5))
142
+ audio_chunks.append(pause)
143
+
144
+ # Pause 0.8 sec between paragraphs (optional)
145
+ pause_para = np.zeros(int(16000 * 0.8))
146
+ audio_chunks.append(pause_para)
147
+
148
+ final_audio = np.concatenate(audio_chunks)
149
+ return (16000, final_audio)
150
+
151
+ # --- Gradio Interface ---
152
  iface = gr.Interface(
153
  fn=text_to_speech,
154
  inputs=[
155
  gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)", lines=7, placeholder="Qoraalka geli halkan..."),
156
  gr.Dropdown(
157
  VOICE_SAMPLE_FILES,
158
+ label="Dooro Codka (Select Voice)",
 
159
  value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
160
  )
161
  ],
162
+ outputs=gr.Audio(label="Codka La Abuuray (Generated Audio)", type="numpy"),
163
  title="Multi-Voice Somali Text-to-Speech",
164
+ description="Geli qoraal Soomaali ah, dooro cod, kadib riix 'Submit' si aad u abuurto hadal."
165
  )
166
 
167
+ # --- Launch App ---
168
  if __name__ == "__main__":
169
  if not all(os.path.exists(f) for f in VOICE_SAMPLE_FILES):
170
+ raise FileNotFoundError("Fadlan hubi inaad faylasha codka ku dartay.")
171
 
172
  print("Diyaarinta codadka...")
173
  for voice_file in VOICE_SAMPLE_FILES: