Somalitts committed on
Commit
51f97d6
·
verified ·
1 Parent(s): d8a204f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -55
app.py CHANGED
@@ -1,13 +1,13 @@
1
  # ==============================================================================
2
- # Advanced Somali TTS for Natural Voice Cloning (FIXED)
3
  # ==============================================================================
4
- # This script is specifically updated to address the issue of robotic or
5
- # distorted audio output.
6
  #
7
  # KEY FIX:
8
- # The `model.generate_speech` call now includes a `threshold` parameter.
9
- # This helps the model to end sentences more naturally, significantly
10
- # reducing the metallic/robotic artifacts and improving overall quality.
11
  # ==============================================================================
12
 
13
  import gradio as gr
@@ -19,20 +19,19 @@ import numpy as np
19
  import soundfile as sf
20
  from pydub import AudioSegment, effects
21
 
22
- # --- FIX: Ensure all required classes are imported ---
23
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
24
  from speechbrain.pretrained import EncoderClassifier
 
25
 
26
  # --- Model Loading ---
27
  print("Loading models, this may take a moment...")
28
-
29
  device = "cuda" if torch.cuda.is_available() else "cpu"
30
  print(f"Using device: {device}")
31
 
32
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
33
  model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
34
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
35
-
36
  speaker_model = EncoderClassifier.from_hparams(
37
  source="speechbrain/spkrec-xvect-voxceleb",
38
  run_opts={"device": device},
@@ -41,38 +40,26 @@ speaker_model = EncoderClassifier.from_hparams(
41
  print("Models loaded successfully.")
42
 
43
 
44
- # --- Speaker Embedding: The Key to Voice Quality ---
45
- # The quality of the output voice depends ENTIRELY on the quality of the
46
- # audio file used here (`1.wav`).
47
- # For best results, your `1.wav` file should be:
48
- # - At least 30 seconds long.
49
- # - Contain clear speech with NO background noise or echo.
50
- # - Contain only one speaker.
51
  def create_speaker_embedding(audio_path):
52
  print("Creating speaker embedding from:", audio_path)
53
  waveform, sr = torchaudio.load(audio_path)
54
-
55
- # Resample to 16000 Hz if it's not already
56
  if sr != 16000:
57
- resampler = torchaudio.transforms.Resample(sr, 16000)
58
- waveform = resampler(waveform)
59
-
60
  with torch.no_grad():
61
  embedding = speaker_model.encode_batch(waveform.to(device))
62
- # Normalize the embedding for the TTS model
63
  embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
64
  print("Speaker embedding created.")
65
  return embedding
66
 
67
  SPEAKER_WAV = "1.wav"
68
- EMB_PATH = "speaker_embedding.pt"
69
-
70
- # For Hugging Face Spaces, create a dummy file if `1.wav` doesn't exist
71
  if not os.path.exists(SPEAKER_WAV):
72
- print(f"Warning: '{SPEAKER_WAV}' not found. Creating a silent dummy file. Please upload a real voice sample for good results.")
73
- sf.write(SPEAKER_WAV, np.zeros(16000 * 2), 16000) # 2 seconds of silence
74
 
75
- # Generate and cache the speaker embedding
76
  print("Loading or creating speaker embedding...")
77
  speaker_embedding = create_speaker_embedding(SPEAKER_WAV)
78
  print("Speaker embedding ready.")
@@ -82,14 +69,11 @@ print("Speaker embedding ready.")
82
  def number_to_somali_words(num_str):
83
  try:
84
  num = int(num_str)
85
- except ValueError:
86
- return num_str # Not a number
87
-
88
  if num < 0: return "eber ka yar"
89
  units = ["", "koow", "labo", "saddex", "afar", "shan", "lix", "toddobo", "siddeed", "sagaal"]
90
  teens = ["toban", "kow iyo toban", "laba iyo toban", "saddex iyo toban", "afar iyo toban", "shan iyo toban", "lix iyo toban", "toddobo iyo toban", "siddeed iyo toban", "sagaal iyo toban"]
91
  tens = ["", "toban", "labaatan", "soddon", "afartan", "konton", "lixdan", "toddobaatan", "siddeetan", "sagaashan"]
92
-
93
  if num == 0: return "eber"
94
  if num < 10: return units[num]
95
  if num < 20: return teens[num-10]
@@ -111,42 +95,51 @@ def normalize_text(text):
111
  return text.strip()
112
 
113
 
114
- # --- Core TTS Function with Quality Fix ---
115
  def text_to_speech(text):
116
  print(f"Generating speech for: '{text}'")
117
  normalized_text = normalize_text(text)
118
  if not normalized_text:
119
  return (16000, np.zeros(0, dtype=np.int16))
120
 
 
121
  inputs = processor(text=normalized_text, return_tensors="pt").to(device)
122
-
123
  with torch.no_grad():
124
- # --- QUALITY IMPROVEMENT ---
125
- # The `threshold` parameter helps the model stop generating more naturally.
126
- # This is a key factor in reducing robotic artifacts.
127
  speech = model.generate_speech(
128
  inputs["input_ids"],
129
  speaker_embedding.unsqueeze(0),
130
  vocoder=vocoder,
131
- threshold=0.5 # This makes a big difference!
132
  )
133
-
134
- # The output from the model is a tensor, convert it to a numpy array
135
- speech_numpy = speech.cpu().numpy()
 
 
 
 
 
 
 
 
 
 
136
 
137
- # Post-processing: Normalize volume for a polished feel.
138
- # This does not fix distortion, but it improves listenability.
139
  audio_segment = AudioSegment(
140
- speech_numpy.tobytes(),
141
  frame_rate=16000,
142
- sample_width=speech_numpy.dtype.itemsize,
143
  channels=1
144
  )
 
 
145
  processed_audio = effects.normalize(audio_segment)
146
- processed_numpy = np.array(processed_audio.get_array_of_samples())
 
 
147
 
148
- print("Speech generation complete.")
149
- return (16000, processed_numpy)
150
 
151
 
152
  # --- Gradio Web Interface ---
@@ -154,21 +147,21 @@ iface = gr.Interface(
154
  fn=text_to_speech,
155
  inputs=gr.Textbox(
156
  label="Qoraalka Geli (Enter Somali Text)",
157
- placeholder="Ku qor qoraalkaaga halkan..."
158
  ),
159
  outputs=gr.Audio(
160
- label="Codka La Soo Saaray (Generated Audio)",
161
  type="numpy"
162
  ),
163
- title="🇸🇴 Somali TTS - Cod Tayo Sare Leh (Natural Voice)",
164
  description=(
165
- "Ku qor qoraal si aad ugu badasho cod u eg kan bini'aadamka. **Natiijada ugu fiican, hubi in faylka codka `1.wav` uu yahay mid tayo sare leh.**"
166
- "\n\n(Enter text to convert to a human-like voice. **For best results, ensure your `1.wav` voice file is high quality.**)"
167
  ),
168
  examples=[
169
  ["Sidee tahay saaxiib? Maanta waa maalin qurux badan."],
170
- ["Barnaamijkan wuxuu adeegsadaa sirdoonka macmalka ah si uu u abuuro codadka."],
171
- ["Natiijadu waxay ku xiran tahay tayada codka aad gelisay."],
172
  ]
173
  )
174
 
 
1
  # ==============================================================================
2
+ # Somali TTS with AI-Powered Noise Reduction
3
  # ==============================================================================
4
+ # This script is the final version, designed to produce a clean, studio-quality
5
+ # voice by removing background noise and digital artifacts.
6
  #
7
  # KEY FIX:
8
+ # A noise reduction filter (`noisereduce`) is applied directly to the
9
+ # generated audio. This intelligently removes hiss and unwanted noise,
10
+ # leaving only the clean voice.
11
  # ==============================================================================
12
 
13
  import gradio as gr
 
19
  import soundfile as sf
20
  from pydub import AudioSegment, effects
21
 
22
+ # --- Required Imports for TTS and Noise Reduction ---
23
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
24
  from speechbrain.pretrained import EncoderClassifier
25
+ import noisereduce as nr # Import the noise reduction library
26
 
27
  # --- Model Loading ---
28
  print("Loading models, this may take a moment...")
 
29
  device = "cuda" if torch.cuda.is_available() else "cpu"
30
  print(f"Using device: {device}")
31
 
32
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
33
  model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
34
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 
35
  speaker_model = EncoderClassifier.from_hparams(
36
  source="speechbrain/spkrec-xvect-voxceleb",
37
  run_opts={"device": device},
 
40
  print("Models loaded successfully.")
41
 
42
 
43
+ # --- Speaker Embedding Generation ---
44
+ # The quality of your `1.wav` file is CRITICAL for good results.
45
+ # It should be a clean, noise-free recording of a single speaker.
 
 
 
 
46
  def create_speaker_embedding(audio_path):
47
  print("Creating speaker embedding from:", audio_path)
48
  waveform, sr = torchaudio.load(audio_path)
 
 
49
  if sr != 16000:
50
+ waveform = torchaudio.functional.resample(waveform, sr, 16000)
51
+
 
52
  with torch.no_grad():
53
  embedding = speaker_model.encode_batch(waveform.to(device))
 
54
  embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
55
  print("Speaker embedding created.")
56
  return embedding
57
 
58
  SPEAKER_WAV = "1.wav"
 
 
 
59
  if not os.path.exists(SPEAKER_WAV):
60
+ print(f"Warning: '{SPEAKER_WAV}' not found. Creating a silent dummy file.")
61
+ sf.write(SPEAKER_WAV, np.zeros(16000 * 2), 16000)
62
 
 
63
  print("Loading or creating speaker embedding...")
64
  speaker_embedding = create_speaker_embedding(SPEAKER_WAV)
65
  print("Speaker embedding ready.")
 
69
  def number_to_somali_words(num_str):
70
  try:
71
  num = int(num_str)
72
+ except ValueError: return num_str
 
 
73
  if num < 0: return "eber ka yar"
74
  units = ["", "koow", "labo", "saddex", "afar", "shan", "lix", "toddobo", "siddeed", "sagaal"]
75
  teens = ["toban", "kow iyo toban", "laba iyo toban", "saddex iyo toban", "afar iyo toban", "shan iyo toban", "lix iyo toban", "toddobo iyo toban", "siddeed iyo toban", "sagaal iyo toban"]
76
  tens = ["", "toban", "labaatan", "soddon", "afartan", "konton", "lixdan", "toddobaatan", "siddeetan", "sagaashan"]
 
77
  if num == 0: return "eber"
78
  if num < 10: return units[num]
79
  if num < 20: return teens[num-10]
 
95
  return text.strip()
96
 
97
 
98
+ # --- Core TTS Function with AI Noise Reduction ---
99
  def text_to_speech(text):
100
  print(f"Generating speech for: '{text}'")
101
  normalized_text = normalize_text(text)
102
  if not normalized_text:
103
  return (16000, np.zeros(0, dtype=np.int16))
104
 
105
+ # --- Step 1: Generate the raw speech ---
106
  inputs = processor(text=normalized_text, return_tensors="pt").to(device)
 
107
  with torch.no_grad():
 
 
 
108
  speech = model.generate_speech(
109
  inputs["input_ids"],
110
  speaker_embedding.unsqueeze(0),
111
  vocoder=vocoder,
112
+ threshold=0.5
113
  )
114
+ raw_speech_numpy = speech.cpu().numpy()
115
+
116
+ # --- Step 2: AI-POWERED NOISE REDUCTION ---
117
+ # This is the crucial step to clean the audio.
118
+ print("Applying noise reduction filter...")
119
+ # The sample rate (sr) must match the audio's sample rate.
120
+ clean_speech = nr.reduce_noise(y=raw_speech_numpy, sr=16000)
121
+ print("Noise reduction complete.")
122
+
123
+ # --- Step 3: Final Polishing (Volume Normalization) ---
124
+ # Convert to pydub AudioSegment for easy volume handling.
125
+ # Note: Ensure the numpy array is in 16-bit integer format for pydub.
126
+ clean_speech_int16 = (clean_speech * 32767).astype(np.int16)
127
 
 
 
128
  audio_segment = AudioSegment(
129
+ clean_speech_int16.tobytes(),
130
  frame_rate=16000,
131
+ sample_width=clean_speech_int16.dtype.itemsize,
132
  channels=1
133
  )
134
+
135
+ # Normalize volume to a standard level for a professional feel.
136
  processed_audio = effects.normalize(audio_segment)
137
+
138
+ # Convert back to numpy array for Gradio output
139
+ final_output_numpy = np.array(processed_audio.get_array_of_samples())
140
 
141
+ print("Speech generation and cleaning finished.")
142
+ return (16000, final_output_numpy)
143
 
144
 
145
  # --- Gradio Web Interface ---
 
147
  fn=text_to_speech,
148
  inputs=gr.Textbox(
149
  label="Qoraalka Geli (Enter Somali Text)",
150
+ placeholder="Ku qor qoraalkaaga halkan si aad u hesho cod saafi ah..."
151
  ),
152
  outputs=gr.Audio(
153
+ label="Codka La Soo Saaray (Cleaned Audio)",
154
  type="numpy"
155
  ),
156
+ title="🇸🇴 Somali TTS oo leh Cod Sifeeye (with Noise Reduction)",
157
  description=(
158
+ "Ku qor qoraal si aad ugu beddesho cod saafi ah oo aan qaylo lahayn. Barnaamijkan wuxuu si toos ah uga saarayaa sawaxanka codka la soo saaray."
159
+ "\n\n(Enter text to convert it to a clean, noise-free voice. This model automatically removes background noise from the generated audio.)"
160
  ),
161
  examples=[
162
  ["Sidee tahay saaxiib? Maanta waa maalin qurux badan."],
163
+ ["Tani waa tijaabo si loo maqlo tayada codka oo saafi ah."],
164
+ ["Waan ku faraxsanahay inaan idinla hadlo maanta."],
165
  ]
166
  )
167