Somalitts committed on
Commit
cd7d46e
·
verified ·
1 Parent(s): 50f6f95

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -36
app.py CHANGED
@@ -6,8 +6,6 @@ import os
6
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
  from speechbrain.pretrained import EncoderClassifier
8
 
9
- import torchaudio.sox_effects as sox
10
-
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
 
13
  # Load models
@@ -79,48 +77,21 @@ def normalize_text(text):
79
  text = re.sub(r'[^\w\s]', '', text)
80
  return text
81
 
82
- # Adjust speed using sox effects (preserves pitch and reduces noise)
83
- def adjust_speed(waveform, sample_rate, text):
84
- length = len(text)
85
- if length <= 100:
86
- speed_factor = 0.85
87
- elif length <= 150:
88
- speed_factor = 0.95
89
- elif length <= 500:
90
- speed_factor = 1.0
91
- elif length <= 2000:
92
- speed_factor = 1.1
93
- else:
94
- speed_factor = 1.2
95
-
96
- effects = [["speed", str(speed_factor)], ["rate", str(sample_rate)]]
97
- adjusted, _ = torchaudio.sox_effects.apply_effects_tensor(waveform, sample_rate, effects)
98
- return adjusted
99
-
100
- # TTS function with chunking for long text
101
  def text_to_speech(text):
102
  text = normalize_text(text)
103
- max_chars_per_chunk = 300
104
- chunks = [text[i:i+max_chars_per_chunk] for i in range(0, len(text), max_chars_per_chunk)]
105
-
106
- full_waveform = torch.tensor([], device=device)
107
-
108
- for chunk in chunks:
109
- inputs = processor(text=chunk, return_tensors="pt").to(device)
110
- with torch.no_grad():
111
- speech = model.generate_speech(inputs["input_ids"], speaker_embedding.unsqueeze(0), vocoder=vocoder)
112
- adjusted = adjust_speed(speech.unsqueeze(0), 16000, chunk)
113
- full_waveform = torch.cat((full_waveform, adjusted.squeeze(0)), dim=-1)
114
-
115
- return (16000, full_waveform.cpu().numpy())
116
 
117
  # Gradio Interface
118
  iface = gr.Interface(
119
  fn=text_to_speech,
120
  inputs=gr.Textbox(label="Geli qoraalka af-soomaali"),
121
  outputs=gr.Audio(label="Codka la abuuray", type="numpy"),
122
- title="Somali TTS - Degdeg ah",
123
- description="Cod abuurista Af-Soomaaliga iyadoo lagu daray xawaaraha saxda ah."
124
  )
125
 
126
  iface.launch()
 
6
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
  from speechbrain.pretrained import EncoderClassifier
8
 
 
 
9
  device = "cuda" if torch.cuda.is_available() else "cpu"
10
 
11
  # Load models
 
77
  text = re.sub(r'[^\w\s]', '', text)
78
  return text
79
 
80
+ # TTS function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  def text_to_speech(text):
82
  text = normalize_text(text)
83
+ inputs = processor(text=text, return_tensors="pt").to(device)
84
+ with torch.no_grad():
85
+ speech = model.generate_speech(inputs["input_ids"], speaker_embedding.unsqueeze(0), vocoder=vocoder)
86
+ return (16000, speech.cpu().numpy())
 
 
 
 
 
 
 
 
 
87
 
88
  # Gradio Interface
89
  iface = gr.Interface(
90
  fn=text_to_speech,
91
  inputs=gr.Textbox(label="Geli qoraalka af-soomaali"),
92
  outputs=gr.Audio(label="Codka la abuuray", type="numpy"),
93
+ title="Somali TTS",
94
+ description="TTS Soomaaliyeed oo la adeegsaday cod gaar ah (11.wav)"
95
  )
96
 
97
  iface.launch()