Somalitts committed on
Commit
f47a2b0
·
verified ·
1 Parent(s): 51f97d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -139
app.py CHANGED
@@ -1,169 +1,97 @@
1
- # ==============================================================================
2
- # Somali TTS with AI-Powered Noise Reduction
3
- # ==============================================================================
4
- # This script is the final version, designed to produce a clean, studio-quality
5
- # voice by removing background noise and digital artifacts.
6
- #
7
- # KEY FIX:
8
- # A noise reduction filter (`noisereduce`) is applied directly to the
9
- # generated audio. This intelligently removes hiss and unwanted noise,
10
- # leaving only the clean voice.
11
- # ==============================================================================
12
-
13
  import gradio as gr
14
  import torch
15
  import torchaudio
16
  import re
17
  import os
18
- import numpy as np
19
- import soundfile as sf
20
- from pydub import AudioSegment, effects
21
-
22
- # --- Required Imports for TTS and Noise Reduction ---
23
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
24
  from speechbrain.pretrained import EncoderClassifier
25
- import noisereduce as nr # Import the noise reduction library
26
 
27
- # --- Model Loading ---
28
- print("Loading models, this may take a moment...")
29
  device = "cuda" if torch.cuda.is_available() else "cpu"
30
- print(f"Using device: {device}")
31
 
 
32
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
33
  model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
34
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 
35
  speaker_model = EncoderClassifier.from_hparams(
36
  source="speechbrain/spkrec-xvect-voxceleb",
37
  run_opts={"device": device},
38
- savedir=os.path.join("models", "spk_model")
39
  )
40
- print("Models loaded successfully.")
41
 
42
-
43
- # --- Speaker Embedding Generation ---
44
- # The quality of your `1.wav` file is CRITICAL for good results.
45
- # It should be a clean, noise-free recording of a single speaker.
46
- def create_speaker_embedding(audio_path):
47
- print("Creating speaker embedding from:", audio_path)
48
- waveform, sr = torchaudio.load(audio_path)
49
- if sr != 16000:
50
- waveform = torchaudio.functional.resample(waveform, sr, 16000)
51
-
52
  with torch.no_grad():
53
- embedding = speaker_model.encode_batch(waveform.to(device))
54
- embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
55
- print("Speaker embedding created.")
56
- return embedding
57
-
58
- SPEAKER_WAV = "1.wav"
59
- if not os.path.exists(SPEAKER_WAV):
60
- print(f"Warning: '{SPEAKER_WAV}' not found. Creating a silent dummy file.")
61
- sf.write(SPEAKER_WAV, np.zeros(16000 * 2), 16000)
62
-
63
- print("Loading or creating speaker embedding...")
64
- speaker_embedding = create_speaker_embedding(SPEAKER_WAV)
65
- print("Speaker embedding ready.")
66
-
67
-
68
- # --- Text Normalization (Somali) ---
69
- def number_to_somali_words(num_str):
70
- try:
71
- num = int(num_str)
72
- except ValueError: return num_str
73
- if num < 0: return "eber ka yar"
74
- units = ["", "koow", "labo", "saddex", "afar", "shan", "lix", "toddobo", "siddeed", "sagaal"]
75
- teens = ["toban", "kow iyo toban", "laba iyo toban", "saddex iyo toban", "afar iyo toban", "shan iyo toban", "lix iyo toban", "toddobo iyo toban", "siddeed iyo toban", "sagaal iyo toban"]
76
- tens = ["", "toban", "labaatan", "soddon", "afartan", "konton", "lixdan", "toddobaatan", "siddeetan", "sagaashan"]
77
- if num == 0: return "eber"
78
- if num < 10: return units[num]
79
- if num < 20: return teens[num-10]
80
- if num < 100:
81
- ten, unit = divmod(num, 10)
82
- return tens[ten] + ((" iyo " + units[unit]) if unit != 0 else "")
83
- if num < 1000:
84
- hundred, rest = divmod(num, 100)
85
- return (units[hundred] if hundred > 1 else "") + " boqol" + ((" iyo " + number_to_somali_words(str(rest))) if rest != 0 else "")
86
- if num < 1000000:
87
- thousand, rest = divmod(num, 1000)
88
- return number_to_somali_words(str(thousand)) + " kun" + ((" iyo " + number_to_somali_words(str(rest))) if rest != 0 else "")
89
- return num_str
 
 
 
 
 
 
90
 
91
  def normalize_text(text):
92
  text = text.lower()
93
- text = re.sub(r"\d+", lambda m: number_to_somali_words(m.group(0)), text)
94
- text = re.sub(r'[^\w\s,\.]', '', text)
95
- return text.strip()
96
-
97
 
98
- # --- Core TTS Function with AI Noise Reduction ---
99
  def text_to_speech(text):
100
- print(f"Generating speech for: '{text}'")
101
- normalized_text = normalize_text(text)
102
- if not normalized_text:
103
- return (16000, np.zeros(0, dtype=np.int16))
104
-
105
- # --- Step 1: Generate the raw speech ---
106
- inputs = processor(text=normalized_text, return_tensors="pt").to(device)
107
  with torch.no_grad():
108
- speech = model.generate_speech(
109
- inputs["input_ids"],
110
- speaker_embedding.unsqueeze(0),
111
- vocoder=vocoder,
112
- threshold=0.5
113
- )
114
- raw_speech_numpy = speech.cpu().numpy()
115
-
116
- # --- Step 2: AI-POWERED NOISE REDUCTION ---
117
- # This is the crucial step to clean the audio.
118
- print("Applying noise reduction filter...")
119
- # The sample rate (sr) must match the audio's sample rate.
120
- clean_speech = nr.reduce_noise(y=raw_speech_numpy, sr=16000)
121
- print("Noise reduction complete.")
122
-
123
- # --- Step 3: Final Polishing (Volume Normalization) ---
124
- # Convert to pydub AudioSegment for easy volume handling.
125
- # Note: Ensure the numpy array is in 16-bit integer format for pydub.
126
- clean_speech_int16 = (clean_speech * 32767).astype(np.int16)
127
-
128
- audio_segment = AudioSegment(
129
- clean_speech_int16.tobytes(),
130
- frame_rate=16000,
131
- sample_width=clean_speech_int16.dtype.itemsize,
132
- channels=1
133
- )
134
-
135
- # Normalize volume to a standard level for a professional feel.
136
- processed_audio = effects.normalize(audio_segment)
137
-
138
- # Convert back to numpy array for Gradio output
139
- final_output_numpy = np.array(processed_audio.get_array_of_samples())
140
-
141
- print("Speech generation and cleaning finished.")
142
- return (16000, final_output_numpy)
143
-
144
 
145
- # --- Gradio Web Interface ---
146
  iface = gr.Interface(
147
  fn=text_to_speech,
148
- inputs=gr.Textbox(
149
- label="Qoraalka Geli (Enter Somali Text)",
150
- placeholder="Ku qor qoraalkaaga halkan si aad u hesho cod saafi ah..."
151
- ),
152
- outputs=gr.Audio(
153
- label="Codka La Soo Saaray (Cleaned Audio)",
154
- type="numpy"
155
- ),
156
- title="🇸🇴 Somali TTS oo leh Cod Sifeeye (with Noise Reduction)",
157
- description=(
158
- "Ku qor qoraal si aad ugu beddesho cod saafi ah oo aan qaylo lahayn. Barnaamijkan wuxuu si toos ah uga saarayaa sawaxanka codka la soo saaray."
159
- "\n\n(Enter text to convert it to a clean, noise-free voice. This model automatically removes background noise from the generated audio.)"
160
- ),
161
- examples=[
162
- ["Sidee tahay saaxiib? Maanta waa maalin qurux badan."],
163
- ["Tani waa tijaabo si loo maqlo tayada codka oo saafi ah."],
164
- ["Waan ku faraxsanahay inaan idinla hadlo maanta."],
165
- ]
166
  )
167
 
168
- if __name__ == "__main__":
169
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import torch
3
  import torchaudio
4
  import re
5
  import os
 
 
 
 
 
6
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
  from speechbrain.pretrained import EncoderClassifier
 
8
 
 
 
9
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
10
 
11
# --- Model loading ---------------------------------------------------------
# Text front-end, acoustic model and vocoder for SpeechT5; the two networks
# are moved to the selected device right after loading.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad")
model = model.to(device)

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# x-vector speaker encoder used to derive the voice embedding below.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir="./spk_model",
)

# --- Speaker embedding (cached on disk) --------------------------------------
# The embedding is computed once from the reference recording and stored in
# EMB_PATH; later start-ups just reload the saved tensor.
EMB_PATH = "speaker_embedding.pt"

if not os.path.exists(EMB_PATH):
    waveform, sample_rate = torchaudio.load("1.wav")
    resampled = torchaudio.functional.resample(waveform, sample_rate, 16000)
    # Average over dim 0 and restore a leading batch axis before encoding.
    mono_batch = resampled.mean(dim=0).unsqueeze(0).to(device)

    with torch.no_grad():
        raw_embedding = speaker_model.encode_batch(mono_batch)
        speaker_embedding = torch.nn.functional.normalize(
            raw_embedding, dim=2
        ).squeeze()

    # Persist a CPU copy so the cache can be loaded on any device.
    torch.save(speaker_embedding.cpu(), EMB_PATH)
else:
    speaker_embedding = torch.load(EMB_PATH).to(device)
34
+
35
+ # Number conversion (Somali)
36
# Lookup table for the Somali words that number_to_words() composes from:
# 0-19, the round tens, and the scale words for 100 and 1000.
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
    14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
    17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun",
}


def number_to_words(number):
    """Spell out a non-negative integer using the words in ``number_words``.

    Args:
        number: Non-negative integer to convert.

    Returns:
        The lowercase, space-separated spelling for values below 10**12.
        Larger values are returned unchanged as a decimal string.

    Raises:
        KeyError: If ``number`` is negative (there is no table entry for it).
    """
    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        return number_words[tens * 10] + (" " + number_words[unit] if unit else "")

    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        # A single hundred is spoken without a leading "one". The scale word
        # is taken from the table so it stays lowercase like all other words
        # (previously an uppercase literal was returned here).
        scale = number_words[100]
        head = number_words[hundreds] + " " + scale if hundreds > 1 else scale
        return head + (" " + number_to_words(remainder) if remainder else "")

    if number < 1000000:
        thousands, remainder = divmod(number, 1000)
        # Same rule as for hundreds: bare, lowercase scale word for exactly
        # one thousand.
        scale = number_words[1000]
        head = number_to_words(thousands) + " " + scale if thousands > 1 else scale
        return head + (" " + number_to_words(remainder) if remainder else "")

    if number < 1000000000:
        millions, remainder = divmod(number, 1000000)
        return number_to_words(millions) + " malyan" + (
            " " + number_to_words(remainder) if remainder else ""
        )

    if number < 1000000000000:
        billions, remainder = divmod(number, 1000000000)
        return number_to_words(billions) + " milyaar" + (
            " " + number_to_words(remainder) if remainder else ""
        )

    # Out of the supported range: hand the digits back untouched.
    return str(number)
67
+
68
def replace_numbers_with_words(text):
    """Replace every standalone run of digits in ``text`` with its spelling.

    Only digit runs delimited by word boundaries are converted; digits glued
    to letters (e.g. ``abc123``) are left as they are.

    Args:
        text: Input string.

    Returns:
        ``text`` with each matched number replaced by ``number_to_words``.
    """
    def replace(match):
        return number_to_words(int(match.group()))

    return re.sub(r'\b\d+\b', replace, text)


def normalize_text(text):
    """Prepare raw user text for the TTS processor.

    Numbers are spelled out, the result is lowercased, and every character
    that is neither a word character nor whitespace is removed.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    text = replace_numbers_with_words(text)
    # Lowercase AFTER number expansion so the words produced for digits get
    # the same case normalization as the rest of the text (lowercasing first
    # let any uppercase in the expansion slip through).
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text
 
79
 
80
# TTS function
def text_to_speech(text):
    """Synthesize speech for ``text`` with the preloaded model and voice.

    Args:
        text: Raw text from the UI; may be empty or ``None``.

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is the generated
        waveform as a NumPy array, or ``None`` when nothing is left to
        synthesize after normalization.
    """
    text = normalize_text(text or "")
    # Nothing to say: skip the model call instead of feeding it an empty
    # sequence. A None return leaves the audio output empty.
    if not text.strip():
        return None

    inputs = processor(text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embedding.unsqueeze(0),  # add the batch axis
            vocoder=vocoder,
        )
    return (16000, speech.cpu().numpy())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
# --- Gradio web UI -----------------------------------------------------------
# One text box in, one audio player out, wired to text_to_speech().
_text_input = gr.Textbox(label="Geli qoraalka af-soomaali")
_audio_output = gr.Audio(label="Codka la abuuray", type="numpy")

# NOTE(review): the description mentions "11.wav", while the speaker
# embedding in this file is built from "1.wav" - confirm which is intended.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=_text_input,
    outputs=_audio_output,
    title="Somali TTS",
    description="TTS Soomaaliyeed oo la adeegsaday cod gaar ah (11.wav)",
)

# Start the app as soon as the script runs.
iface.launch()