Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,6 @@ import scipy.io.wavfile
|
|
| 5 |
from transformers import VitsModel, AutoTokenizer
|
| 6 |
import re
|
| 7 |
import time
|
| 8 |
-
from scipy.signal import resample
|
| 9 |
|
| 10 |
# Load model and tokenizer
|
| 11 |
model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
|
|
@@ -91,23 +90,26 @@ def normalize_text(text):
|
|
| 91 |
'&': 'iyo',
|
| 92 |
'@': 'at',
|
| 93 |
'#': 'hash',
|
| 94 |
-
'.': 'dhibic',
|
| 95 |
}
|
| 96 |
for sym, word in symbol_map.items():
|
| 97 |
text = text.replace(sym, ' ' + word + ' ')
|
| 98 |
|
| 99 |
# Special rule for 'z' or 'Z' prefix or suffix to sound as 's'
|
|
|
|
| 100 |
def replace_z(match):
|
| 101 |
word = match.group()
|
|
|
|
| 102 |
if word.startswith('z'):
|
| 103 |
word = 's' + word[1:]
|
| 104 |
if word.endswith('z'):
|
| 105 |
word = word[:-1] + 's'
|
| 106 |
return word
|
| 107 |
|
|
|
|
| 108 |
text = re.sub(r'\b[z][a-z]*\b', replace_z, text) # words starting with z
|
| 109 |
text = re.sub(r'\b[a-z]*[z]\b', replace_z, text) # words ending with z
|
| 110 |
|
|
|
|
| 111 |
text = text.replace("kh", "qa").replace("sh", "sha'a").replace("dh", "dha'a")
|
| 112 |
|
| 113 |
return text
|
|
@@ -116,18 +118,16 @@ def tts(text):
|
|
| 116 |
paragraphs = [p for p in text.strip().split("\n") if p.strip()]
|
| 117 |
audio_list = []
|
| 118 |
|
|
|
|
| 119 |
n = len(paragraphs)
|
| 120 |
if n <= 5:
|
| 121 |
max_duration = 30 # seconds
|
| 122 |
elif n <= 20:
|
| 123 |
max_duration = 60
|
| 124 |
-
elif n <= 50:
|
| 125 |
-
max_duration = 120
|
| 126 |
-
elif n <= 100:
|
| 127 |
-
max_duration = 240
|
| 128 |
else:
|
| 129 |
-
max_duration =
|
| 130 |
|
|
|
|
| 131 |
waveforms = []
|
| 132 |
for para in paragraphs:
|
| 133 |
norm_para = normalize_text(para)
|
|
@@ -136,20 +136,26 @@ def tts(text):
|
|
| 136 |
waveform = model(**inputs).waveform.squeeze().cpu().numpy()
|
| 137 |
waveforms.append(waveform)
|
| 138 |
|
|
|
|
| 139 |
total_samples = sum(wf.shape[0] for wf in waveforms)
|
| 140 |
sampling_rate = model.config.sampling_rate
|
| 141 |
|
|
|
|
| 142 |
total_duration = total_samples / sampling_rate
|
| 143 |
speed_factor = total_duration / max_duration if total_duration > max_duration else 1.0
|
| 144 |
|
|
|
|
|
|
|
|
|
|
| 145 |
for i, wf in enumerate(waveforms):
|
| 146 |
new_length = int(len(wf) / speed_factor)
|
| 147 |
waveforms[i] = resample(wf, new_length)
|
| 148 |
|
|
|
|
| 149 |
pause = np.zeros(int(sampling_rate * 0.3))
|
| 150 |
for i, wf in enumerate(waveforms):
|
| 151 |
audio_list.append(wf)
|
| 152 |
-
if i < len(waveforms) -
|
| 153 |
audio_list.append(pause)
|
| 154 |
|
| 155 |
final_audio = np.concatenate(audio_list)
|
|
|
|
| 5 |
from transformers import VitsModel, AutoTokenizer
|
| 6 |
import re
|
| 7 |
import time
|
|
|
|
| 8 |
|
| 9 |
# Load model and tokenizer
|
| 10 |
model = VitsModel.from_pretrained("Somali-tts/somali_tts_model")
|
|
|
|
| 90 |
'&': 'iyo',
|
| 91 |
'@': 'at',
|
| 92 |
'#': 'hash',
|
|
|
|
| 93 |
}
|
| 94 |
for sym, word in symbol_map.items():
|
| 95 |
text = text.replace(sym, ' ' + word + ' ')
|
| 96 |
|
| 97 |
# Special rule for 'z' or 'Z' prefix or suffix to sound as 's'
|
| 98 |
+
# Replace 'z' or 'Z' at start or end of word with 's'
|
| 99 |
def replace_z(match):
|
| 100 |
word = match.group()
|
| 101 |
+
# Replace a leading and/or trailing lowercase 'z' with 's' (uppercase 'Z' is not checked here)
|
| 102 |
if word.startswith('z'):
|
| 103 |
word = 's' + word[1:]
|
| 104 |
if word.endswith('z'):
|
| 105 |
word = word[:-1] + 's'
|
| 106 |
return word
|
| 107 |
|
| 108 |
+
# Apply the rule to whole words that start or end with 'z' (patterns are lowercase-only; presumably text was lowercased earlier, TODO confirm)
|
| 109 |
text = re.sub(r'\b[z][a-z]*\b', replace_z, text) # words starting with z
|
| 110 |
text = re.sub(r'\b[a-z]*[z]\b', replace_z, text) # words ending with z
|
| 111 |
|
| 112 |
+
# Optional character normalization ('z' is not included here because it is already handled above)
|
| 113 |
text = text.replace("kh", "qa").replace("sh", "sha'a").replace("dh", "dha'a")
|
| 114 |
|
| 115 |
return text
|
|
|
|
| 118 |
paragraphs = [p for p in text.strip().split("\n") if p.strip()]
|
| 119 |
audio_list = []
|
| 120 |
|
| 121 |
+
# Calculate max total duration allowed based on paragraph count
|
| 122 |
n = len(paragraphs)
|
| 123 |
if n <= 5:
|
| 124 |
max_duration = 30 # seconds
|
| 125 |
elif n <= 20:
|
| 126 |
max_duration = 60
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
else:
|
| 128 |
+
max_duration = 120
|
| 129 |
|
| 130 |
+
# Generate waveform per paragraph and keep track of lengths
|
| 131 |
waveforms = []
|
| 132 |
for para in paragraphs:
|
| 133 |
norm_para = normalize_text(para)
|
|
|
|
| 136 |
waveform = model(**inputs).waveform.squeeze().cpu().numpy()
|
| 137 |
waveforms.append(waveform)
|
| 138 |
|
| 139 |
+
# Calculate total length of raw waveform (in samples)
|
| 140 |
total_samples = sum(wf.shape[0] for wf in waveforms)
|
| 141 |
sampling_rate = model.config.sampling_rate
|
| 142 |
|
| 143 |
+
# Compute speed factor to fit into max_duration seconds
|
| 144 |
total_duration = total_samples / sampling_rate
|
| 145 |
speed_factor = total_duration / max_duration if total_duration > max_duration else 1.0
|
| 146 |
|
| 147 |
+
# Adjust waveforms speed by resampling (speed up if needed)
|
| 148 |
+
from scipy.signal import resample
|
| 149 |
+
|
| 150 |
for i, wf in enumerate(waveforms):
|
| 151 |
new_length = int(len(wf) / speed_factor)
|
| 152 |
waveforms[i] = resample(wf, new_length)
|
| 153 |
|
| 154 |
+
# Add 0.3 sec pause between paragraphs except last one
|
| 155 |
pause = np.zeros(int(sampling_rate * 0.3))
|
| 156 |
for i, wf in enumerate(waveforms):
|
| 157 |
audio_list.append(wf)
|
| 158 |
+
if i < len(waveforms) -1:
|
| 159 |
audio_list.append(pause)
|
| 160 |
|
| 161 |
final_audio = np.concatenate(audio_list)
|