Spaces:

Somalitts
/

8aad

Running

App Files Community

Somalitts committed on Jul 13, 2025

Commit

51f97d6

verified ·

1 Parent(s): d8a204f

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -55

app.py CHANGED Viewed

@@ -1,13 +1,13 @@
 # ==============================================================================
-#           Advanced Somali TTS for Natural Voice Cloning (FIXED)
 # ==============================================================================
-# This script is specifically updated to address the issue of robotic or
-# distorted audio output.
 #
 # KEY FIX:
-# The `model.generate_speech` call now includes a `threshold` parameter.
-# This helps the model to end sentences more naturally, significantly
-# reducing the metallic/robotic artifacts and improving overall quality.
 # ==============================================================================
 import gradio as gr
@@ -19,20 +19,19 @@ import numpy as np
 import soundfile as sf
 from pydub import AudioSegment, effects
-# --- FIX: Ensure all required classes are imported ---
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from speechbrain.pretrained import EncoderClassifier
 # --- Model Loading ---
 print("Loading models, this may take a moment...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 speaker_model = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-xvect-voxceleb",
     run_opts={"device": device},
@@ -41,38 +40,26 @@ speaker_model = EncoderClassifier.from_hparams(
 print("Models loaded successfully.")
-# --- Speaker Embedding: The Key to Voice Quality ---
-# The quality of the output voice depends ENTIRELY on the quality of the
-# audio file used here (`1.wav`).
-# For best results, your `1.wav` file should be:
-#   - At least 30 seconds long.
-#   - Contain clear speech with NO background noise or echo.
-#   - Contain only one speaker.
 def create_speaker_embedding(audio_path):
     print("Creating speaker embedding from:", audio_path)
     waveform, sr = torchaudio.load(audio_path)
-    # Resample to 16000 Hz if it's not already
     if sr != 16000:
-        resampler = torchaudio.transforms.Resample(sr, 16000)
-        waveform = resampler(waveform)
     with torch.no_grad():
         embedding = speaker_model.encode_batch(waveform.to(device))
-        # Normalize the embedding for the TTS model
         embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
     print("Speaker embedding created.")
     return embedding
 SPEAKER_WAV = "1.wav"
-EMB_PATH = "speaker_embedding.pt"
-# For Hugging Face Spaces, create a dummy file if `1.wav` doesn't exist
 if not os.path.exists(SPEAKER_WAV):
-    print(f"Warning: '{SPEAKER_WAV}' not found. Creating a silent dummy file. Please upload a real voice sample for good results.")
-    sf.write(SPEAKER_WAV, np.zeros(16000 * 2), 16000) # 2 seconds of silence
-# Generate and cache the speaker embedding
 print("Loading or creating speaker embedding...")
 speaker_embedding = create_speaker_embedding(SPEAKER_WAV)
 print("Speaker embedding ready.")
@@ -82,14 +69,11 @@ print("Speaker embedding ready.")
 def number_to_somali_words(num_str):
     try:
         num = int(num_str)
-    except ValueError:
-        return num_str # Not a number
     if num < 0: return "eber ka yar"
     units = ["", "koow", "labo", "saddex", "afar", "shan", "lix", "toddobo", "siddeed", "sagaal"]
     teens = ["toban", "kow iyo toban", "laba iyo toban", "saddex iyo toban", "afar iyo toban", "shan iyo toban", "lix iyo toban", "toddobo iyo toban", "siddeed iyo toban", "sagaal iyo toban"]
     tens = ["", "toban", "labaatan", "soddon", "afartan", "konton", "lixdan", "toddobaatan", "siddeetan", "sagaashan"]
     if num == 0: return "eber"
     if num < 10: return units[num]
     if num < 20: return teens[num-10]
@@ -111,42 +95,51 @@ def normalize_text(text):
     return text.strip()
-# --- Core TTS Function with Quality Fix ---
 def text_to_speech(text):
     print(f"Generating speech for: '{text}'")
     normalized_text = normalize_text(text)
     if not normalized_text:
         return (16000, np.zeros(0, dtype=np.int16))
     inputs = processor(text=normalized_text, return_tensors="pt").to(device)
     with torch.no_grad():
-        # --- QUALITY IMPROVEMENT ---
-        # The `threshold` parameter helps the model stop generating more naturally.
-        # This is a key factor in reducing robotic artifacts.
         speech = model.generate_speech(
             inputs["input_ids"],
             speaker_embedding.unsqueeze(0),
             vocoder=vocoder,
-            threshold=0.5  # This makes a big difference!
         )
-    # The output from the model is a tensor, convert it to a numpy array
-    speech_numpy = speech.cpu().numpy()
-    # Post-processing: Normalize volume for a polished feel.
-    # This does not fix distortion, but it improves listenability.
     audio_segment = AudioSegment(
-        speech_numpy.tobytes(),
         frame_rate=16000,
-        sample_width=speech_numpy.dtype.itemsize,
         channels=1
     )
     processed_audio = effects.normalize(audio_segment)
-    processed_numpy = np.array(processed_audio.get_array_of_samples())
-    print("Speech generation complete.")
-    return (16000, processed_numpy)
 # --- Gradio Web Interface ---
@@ -154,21 +147,21 @@ iface = gr.Interface(
     fn=text_to_speech,
     inputs=gr.Textbox(
         label="Qoraalka Geli (Enter Somali Text)",
-        placeholder="Ku qor qoraalkaaga halkan..."
     ),
     outputs=gr.Audio(
-        label="Codka La Soo Saaray (Generated Audio)",
         type="numpy"
     ),
-    title="🇸🇴 Somali TTS - Cod Tayo Sare Leh (Natural Voice)",
     description=(
-        "Ku qor qoraal si aad ugu badasho cod u eg kan bini'aadamka. **Natiijada ugu fiican, hubi in faylka codka `1.wav` uu yahay mid tayo sare leh.**"
-        "\n\n(Enter text to convert to a human-like voice. **For best results, ensure your `1.wav` voice file is high quality.**)"
     ),
     examples=[
         ["Sidee tahay saaxiib? Maanta waa maalin qurux badan."],
-        ["Barnaamijkan wuxuu adeegsadaa sirdoonka macmalka ah si uu u abuuro codadka."],
-        ["Natiijadu waxay ku xiran tahay tayada codka aad gelisay."],
     ]
 )

 # ==============================================================================
+#           Somali TTS with AI-Powered Noise Reduction
 # ==============================================================================
+# This script is the final version, designed to produce a clean, studio-quality
+# voice by removing background noise and digital artifacts.
 #
 # KEY FIX:
+# A noise reduction filter (`noisereduce`) is applied directly to the
+# generated audio. This intelligently removes hiss and unwanted noise,
+# leaving only the clean voice.
 # ==============================================================================
 import gradio as gr
 import soundfile as sf
 from pydub import AudioSegment, effects
+# --- Required Imports for TTS and Noise Reduction ---
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from speechbrain.pretrained import EncoderClassifier
+import noisereduce as nr # Import the noise reduction library
 # --- Model Loading ---
 print("Loading models, this may take a moment...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 speaker_model = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-xvect-voxceleb",
     run_opts={"device": device},
 print("Models loaded successfully.")
+# --- Speaker Embedding Generation ---
+# The quality of your `1.wav` file is CRITICAL for good results.
+# It should be a clean, noise-free recording of a single speaker.
 def create_speaker_embedding(audio_path):
     print("Creating speaker embedding from:", audio_path)
     waveform, sr = torchaudio.load(audio_path)
     if sr != 16000:
+        waveform = torchaudio.functional.resample(waveform, sr, 16000)
     with torch.no_grad():
         embedding = speaker_model.encode_batch(waveform.to(device))
         embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
     print("Speaker embedding created.")
     return embedding
 SPEAKER_WAV = "1.wav"
 if not os.path.exists(SPEAKER_WAV):
+    print(f"Warning: '{SPEAKER_WAV}' not found. Creating a silent dummy file.")
+    sf.write(SPEAKER_WAV, np.zeros(16000 * 2), 16000)
 print("Loading or creating speaker embedding...")
 speaker_embedding = create_speaker_embedding(SPEAKER_WAV)
 print("Speaker embedding ready.")
 def number_to_somali_words(num_str):
     try:
         num = int(num_str)
+    except ValueError: return num_str
     if num < 0: return "eber ka yar"
     units = ["", "koow", "labo", "saddex", "afar", "shan", "lix", "toddobo", "siddeed", "sagaal"]
     teens = ["toban", "kow iyo toban", "laba iyo toban", "saddex iyo toban", "afar iyo toban", "shan iyo toban", "lix iyo toban", "toddobo iyo toban", "siddeed iyo toban", "sagaal iyo toban"]
     tens = ["", "toban", "labaatan", "soddon", "afartan", "konton", "lixdan", "toddobaatan", "siddeetan", "sagaashan"]
     if num == 0: return "eber"
     if num < 10: return units[num]
     if num < 20: return teens[num-10]
     return text.strip()
+# --- Core TTS Function with AI Noise Reduction ---
 def text_to_speech(text):
     print(f"Generating speech for: '{text}'")
     normalized_text = normalize_text(text)
     if not normalized_text:
         return (16000, np.zeros(0, dtype=np.int16))
+    # --- Step 1: Generate the raw speech ---
     inputs = processor(text=normalized_text, return_tensors="pt").to(device)
     with torch.no_grad():
         speech = model.generate_speech(
             inputs["input_ids"],
             speaker_embedding.unsqueeze(0),
             vocoder=vocoder,
+            threshold=0.5
         )
+    raw_speech_numpy = speech.cpu().numpy()
+    # --- Step 2: AI-POWERED NOISE REDUCTION ---
+    # This is the crucial step to clean the audio.
+    print("Applying noise reduction filter...")
+    # The sample rate (sr) must match the audio's sample rate.
+    clean_speech = nr.reduce_noise(y=raw_speech_numpy, sr=16000)
+    print("Noise reduction complete.")
+    # --- Step 3: Final Polishing (Volume Normalization) ---
+    # Convert to pydub AudioSegment for easy volume handling.
+    # Note: Ensure the numpy array is in 16-bit integer format for pydub.
+    clean_speech_int16 = (clean_speech * 32767).astype(np.int16)
     audio_segment = AudioSegment(
+        clean_speech_int16.tobytes(),
         frame_rate=16000,
+        sample_width=clean_speech_int16.dtype.itemsize,
         channels=1
     )
+    # Normalize volume to a standard level for a professional feel.
     processed_audio = effects.normalize(audio_segment)
+    # Convert back to numpy array for Gradio output
+    final_output_numpy = np.array(processed_audio.get_array_of_samples())
+    print("Speech generation and cleaning finished.")
+    return (16000, final_output_numpy)
 # --- Gradio Web Interface ---
     fn=text_to_speech,
     inputs=gr.Textbox(
         label="Qoraalka Geli (Enter Somali Text)",
+        placeholder="Ku qor qoraalkaaga halkan si aad u hesho cod saafi ah..."
     ),
     outputs=gr.Audio(
+        label="Codka La Soo Saaray (Cleaned Audio)",
         type="numpy"
     ),
+    title="🇸🇴 Somali TTS oo leh Cod Sifeeye (with Noise Reduction)",
     description=(
+        "Ku qor qoraal si aad ugu beddesho cod saafi ah oo aan qaylo lahayn. Barnaamijkan wuxuu si toos ah uga saarayaa sawaxanka codka la soo saaray."
+        "\n\n(Enter text to convert it to a clean, noise-free voice. This model automatically removes background noise from the generated audio.)"
     ),
     examples=[
         ["Sidee tahay saaxiib? Maanta waa maalin qurux badan."],
+        ["Tani waa tijaabo si loo maqlo tayada codka oo saafi ah."],
+        ["Waan ku faraxsanahay inaan idinla hadlo maanta."],
     ]
 )