HusseinBashir committed on
Commit
1b065b4
·
verified ·
1 Parent(s): 9b0a893

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -8
app.py CHANGED
@@ -5,7 +5,6 @@ import scipy.io.wavfile
5
  from transformers import VitsModel, AutoTokenizer
6
  import re
7
  import time
8
- from scipy.signal import resample
9
 
10
  # Load model and tokenizer
11
  model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
@@ -91,23 +90,26 @@ def normalize_text(text):
91
  '&': 'iyo',
92
  '@': 'at',
93
  '#': 'hash',
94
- '.': 'dhibic',
95
  }
96
  for sym, word in symbol_map.items():
97
  text = text.replace(sym, ' ' + word + ' ')
98
 
99
  # Special rule for 'z' or 'Z' prefix or suffix to sound as 's'
 
100
  def replace_z(match):
101
  word = match.group()
 
102
  if word.startswith('z'):
103
  word = 's' + word[1:]
104
  if word.endswith('z'):
105
  word = word[:-1] + 's'
106
  return word
107
 
 
108
  text = re.sub(r'\b[z][a-z]*\b', replace_z, text) # words starting with z
109
  text = re.sub(r'\b[a-z]*[z]\b', replace_z, text) # words ending with z
110
 
 
111
  text = text.replace("kh", "qa").replace("sh", "sha'a").replace("dh", "dha'a")
112
 
113
  return text
@@ -116,18 +118,16 @@ def tts(text):
116
  paragraphs = [p for p in text.strip().split("\n") if p.strip()]
117
  audio_list = []
118
 
 
119
  n = len(paragraphs)
120
  if n <= 5:
121
  max_duration = 30 # seconds
122
  elif n <= 20:
123
  max_duration = 60
124
- elif n <= 50:
125
- max_duration = 120
126
- elif n <= 100:
127
- max_duration = 240
128
  else:
129
- max_duration = 300 # 5 minutes max
130
 
 
131
  waveforms = []
132
  for para in paragraphs:
133
  norm_para = normalize_text(para)
@@ -136,20 +136,26 @@ def tts(text):
136
  waveform = model(**inputs).waveform.squeeze().cpu().numpy()
137
  waveforms.append(waveform)
138
 
 
139
  total_samples = sum(wf.shape[0] for wf in waveforms)
140
  sampling_rate = model.config.sampling_rate
141
 
 
142
  total_duration = total_samples / sampling_rate
143
  speed_factor = total_duration / max_duration if total_duration > max_duration else 1.0
144
 
 
 
 
145
  for i, wf in enumerate(waveforms):
146
  new_length = int(len(wf) / speed_factor)
147
  waveforms[i] = resample(wf, new_length)
148
 
 
149
  pause = np.zeros(int(sampling_rate * 0.3))
150
  for i, wf in enumerate(waveforms):
151
  audio_list.append(wf)
152
- if i < len(waveforms) - 1:
153
  audio_list.append(pause)
154
 
155
  final_audio = np.concatenate(audio_list)
 
5
  from transformers import VitsModel, AutoTokenizer
6
  import re
7
  import time
 
8
 
9
  # Load model and tokenizer
10
  model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
 
90
  '&': 'iyo',
91
  '@': 'at',
92
  '#': 'hash',
 
93
  }
94
  for sym, word in symbol_map.items():
95
  text = text.replace(sym, ' ' + word + ' ')
96
 
97
  # Special rule for 'z' or 'Z' prefix or suffix to sound as 's'
98
+ # Replace 'z' or 'Z' at start or end of word with 's'
99
  def replace_z(match):
100
  word = match.group()
101
+ # Replace z or Z at start or end with s
102
  if word.startswith('z'):
103
  word = 's' + word[1:]
104
  if word.endswith('z'):
105
  word = word[:-1] + 's'
106
  return word
107
 
108
+ # Apply regex word by word for words containing z or Z
109
  text = re.sub(r'\b[z][a-z]*\b', replace_z, text) # words starting with z
110
  text = re.sub(r'\b[a-z]*[z]\b', replace_z, text) # words ending with z
111
 
112
+ # Optional character normalization (kuma jirto 'z' sababtoo ah hadda la maamulo)
113
  text = text.replace("kh", "qa").replace("sh", "sha'a").replace("dh", "dha'a")
114
 
115
  return text
 
118
  paragraphs = [p for p in text.strip().split("\n") if p.strip()]
119
  audio_list = []
120
 
121
+ # Calculate max total duration allowed based on paragraph count
122
  n = len(paragraphs)
123
  if n <= 5:
124
  max_duration = 30 # seconds
125
  elif n <= 20:
126
  max_duration = 60
 
 
 
 
127
  else:
128
+ max_duration = 120
129
 
130
+ # Generate waveform per paragraph and keep track of lengths
131
  waveforms = []
132
  for para in paragraphs:
133
  norm_para = normalize_text(para)
 
136
  waveform = model(**inputs).waveform.squeeze().cpu().numpy()
137
  waveforms.append(waveform)
138
 
139
+ # Calculate total length of raw waveform (in samples)
140
  total_samples = sum(wf.shape[0] for wf in waveforms)
141
  sampling_rate = model.config.sampling_rate
142
 
143
+ # Compute speed factor to fit into max_duration seconds
144
  total_duration = total_samples / sampling_rate
145
  speed_factor = total_duration / max_duration if total_duration > max_duration else 1.0
146
 
147
+ # Adjust waveforms speed by resampling (speed up if needed)
148
+ from scipy.signal import resample
149
+
150
  for i, wf in enumerate(waveforms):
151
  new_length = int(len(wf) / speed_factor)
152
  waveforms[i] = resample(wf, new_length)
153
 
154
+ # Add 0.3 sec pause between paragraphs except last one
155
  pause = np.zeros(int(sampling_rate * 0.3))
156
  for i, wf in enumerate(waveforms):
157
  audio_list.append(wf)
158
+ if i < len(waveforms) -1:
159
  audio_list.append(pause)
160
 
161
  final_audio = np.concatenate(audio_list)