Spaces:

Somalitts
/

8aad

Running

App Files Files Community

Somalitts committed on Jul 13

Commit

f47a2b0

verified ·

1 Parent(s): 51f97d6

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -139

app.py CHANGED Viewed

@@ -1,169 +1,97 @@
-# ==============================================================================
-#           Somali TTS with AI-Powered Noise Reduction
-# ==============================================================================
-# This script is the final version, designed to produce a clean, studio-quality
-# voice by removing background noise and digital artifacts.
-#
-# KEY FIX:
-# A noise reduction filter (`noisereduce`) is applied directly to the
-# generated audio. This intelligently removes hiss and unwanted noise,
-# leaving only the clean voice.
-# ==============================================================================
 import gradio as gr
 import torch
 import torchaudio
 import re
 import os
-import numpy as np
-import soundfile as sf
-from pydub import AudioSegment, effects
-# --- Required Imports for TTS and Noise Reduction ---
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from speechbrain.pretrained import EncoderClassifier
-import noisereduce as nr # Import the noise reduction library
-# --- Model Loading ---
-print("Loading models, this may take a moment...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {device}")
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 speaker_model = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-xvect-voxceleb",
     run_opts={"device": device},
-    savedir=os.path.join("models", "spk_model")
 )
-print("Models loaded successfully.")
-# --- Speaker Embedding Generation ---
-# The quality of your `1.wav` file is CRITICAL for good results.
-# It should be a clean, noise-free recording of a single speaker.
-def create_speaker_embedding(audio_path):
-    print("Creating speaker embedding from:", audio_path)
-    waveform, sr = torchaudio.load(audio_path)
-    if sr != 16000:
-        waveform = torchaudio.functional.resample(waveform, sr, 16000)
     with torch.no_grad():
-        embedding = speaker_model.encode_batch(waveform.to(device))
-        embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
-    print("Speaker embedding created.")
-    return embedding
-SPEAKER_WAV = "1.wav"
-if not os.path.exists(SPEAKER_WAV):
-    print(f"Warning: '{SPEAKER_WAV}' not found. Creating a silent dummy file.")
-    sf.write(SPEAKER_WAV, np.zeros(16000 * 2), 16000)
-print("Loading or creating speaker embedding...")
-speaker_embedding = create_speaker_embedding(SPEAKER_WAV)
-print("Speaker embedding ready.")
-# --- Text Normalization (Somali) ---
-def number_to_somali_words(num_str):
-    try:
-        num = int(num_str)
-    except ValueError: return num_str
-    if num < 0: return "eber ka yar"
-    units = ["", "koow", "labo", "saddex", "afar", "shan", "lix", "toddobo", "siddeed", "sagaal"]
-    teens = ["toban", "kow iyo toban", "laba iyo toban", "saddex iyo toban", "afar iyo toban", "shan iyo toban", "lix iyo toban", "toddobo iyo toban", "siddeed iyo toban", "sagaal iyo toban"]
-    tens = ["", "toban", "labaatan", "soddon", "afartan", "konton", "lixdan", "toddobaatan", "siddeetan", "sagaashan"]
-    if num == 0: return "eber"
-    if num < 10: return units[num]
-    if num < 20: return teens[num-10]
-    if num < 100:
-        ten, unit = divmod(num, 10)
-        return tens[ten] + ((" iyo " + units[unit]) if unit != 0 else "")
-    if num < 1000:
-        hundred, rest = divmod(num, 100)
-        return (units[hundred] if hundred > 1 else "") + " boqol" + ((" iyo " + number_to_somali_words(str(rest))) if rest != 0 else "")
-    if num < 1000000:
-        thousand, rest = divmod(num, 1000)
-        return number_to_somali_words(str(thousand)) + " kun" + ((" iyo " + number_to_somali_words(str(rest))) if rest != 0 else "")
-    return num_str
 def normalize_text(text):
     text = text.lower()
-    text = re.sub(r"\d+", lambda m: number_to_somali_words(m.group(0)), text)
-    text = re.sub(r'[^\w\s,\.]', '', text)
-    return text.strip()
-# --- Core TTS Function with AI Noise Reduction ---
 def text_to_speech(text):
-    print(f"Generating speech for: '{text}'")
-    normalized_text = normalize_text(text)
-    if not normalized_text:
-        return (16000, np.zeros(0, dtype=np.int16))
-    # --- Step 1: Generate the raw speech ---
-    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
     with torch.no_grad():
-        speech = model.generate_speech(
-            inputs["input_ids"],
-            speaker_embedding.unsqueeze(0),
-            vocoder=vocoder,
-            threshold=0.5
-        )
-    raw_speech_numpy = speech.cpu().numpy()
-    # --- Step 2: AI-POWERED NOISE REDUCTION ---
-    # This is the crucial step to clean the audio.
-    print("Applying noise reduction filter...")
-    # The sample rate (sr) must match the audio's sample rate.
-    clean_speech = nr.reduce_noise(y=raw_speech_numpy, sr=16000)
-    print("Noise reduction complete.")
-    # --- Step 3: Final Polishing (Volume Normalization) ---
-    # Convert to pydub AudioSegment for easy volume handling.
-    # Note: Ensure the numpy array is in 16-bit integer format for pydub.
-    clean_speech_int16 = (clean_speech * 32767).astype(np.int16)
-    audio_segment = AudioSegment(
-        clean_speech_int16.tobytes(),
-        frame_rate=16000,
-        sample_width=clean_speech_int16.dtype.itemsize,
-        channels=1
-    )
-    # Normalize volume to a standard level for a professional feel.
-    processed_audio = effects.normalize(audio_segment)
-    # Convert back to numpy array for Gradio output
-    final_output_numpy = np.array(processed_audio.get_array_of_samples())
-    print("Speech generation and cleaning finished.")
-    return (16000, final_output_numpy)
-# --- Gradio Web Interface ---
 iface = gr.Interface(
     fn=text_to_speech,
-    inputs=gr.Textbox(
-        label="Qoraalka Geli (Enter Somali Text)",
-        placeholder="Ku qor qoraalkaaga halkan si aad u hesho cod saafi ah..."
-    ),
-    outputs=gr.Audio(
-        label="Codka La Soo Saaray (Cleaned Audio)",
-        type="numpy"
-    ),
-    title="🇸🇴 Somali TTS oo leh Cod Sifeeye (with Noise Reduction)",
-    description=(
-        "Ku qor qoraal si aad ugu beddesho cod saafi ah oo aan qaylo lahayn. Barnaamijkan wuxuu si toos ah uga saarayaa sawaxanka codka la soo saaray."
-        "\n\n(Enter text to convert it to a clean, noise-free voice. This model automatically removes background noise from the generated audio.)"
-    ),
-    examples=[
-        ["Sidee tahay saaxiib? Maanta waa maalin qurux badan."],
-        ["Tani waa tijaabo si loo maqlo tayada codka oo saafi ah."],
-        ["Waan ku faraxsanahay inaan idinla hadlo maanta."],
-    ]
 )
-if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
 import torch
 import torchaudio
 import re
 import os
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from speechbrain.pretrained import EncoderClassifier
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load models
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 speaker_model = EncoderClassifier.from_hparams(
     source="speechbrain/spkrec-xvect-voxceleb",
     run_opts={"device": device},
+    savedir="./spk_model"
 )
+# Speaker embedding: computed once from "1.wav" and cached on disk at
+# EMB_PATH; when the cache file already exists it is loaded instead and
+# "1.wav" is not read at all.
+EMB_PATH = "speaker_embedding.pt"
+if os.path.exists(EMB_PATH):
+    # NOTE(review): torch.load unpickles the file — assumes EMB_PATH is a
+    # trusted file written by the else-branch below; confirm.
+    speaker_embedding = torch.load(EMB_PATH).to(device)
+else:
+    audio, sr = torchaudio.load("1.wav")
+    # Resample to 16000 Hz, average over dim 0 (presumably the channel axis —
+    # TODO confirm), then restore a leading dimension before encoding.
+    audio = torchaudio.functional.resample(audio, sr, 16000).mean(dim=0).unsqueeze(0).to(device)
     with torch.no_grad():
+        emb = speaker_model.encode_batch(audio)
+        emb = torch.nn.functional.normalize(emb, dim=2).squeeze()
+    # The cache is written from a CPU copy; the in-memory embedding used by
+    # this process stays on `device`.
+    torch.save(emb.cpu(), EMB_PATH)
+    speaker_embedding = emb
+# Number conversion (Somali)
+# Lookup table of Somali number words: 0-19, the tens 20-90, plus 100 and 1000.
+# The teens (11-19) are spelled in the "toban iyo <unit>" form.
+number_words = {
+    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
+    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
+    11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
+    14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
+    17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
+    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
+    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
+    100: "boqol", 1000: "kun",
+}
+def number_to_words(number):
+    if number < 20:
+        return number_words[number]
+    elif number < 100:
+        tens, unit = divmod(number, 10)
+        return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
+    elif number < 1000:
+        hundreds, remainder = divmod(number, 100)
+        return (number_words[hundreds] + " boqol" if hundreds > 1 else "BOQOL") + (" " + number_to_words(remainder) if remainder else "")
+    elif number < 1000000:
+        thousands, remainder = divmod(number, 1000)
+        return (number_to_words(thousands) + " kun" if thousands > 1 else "KUN") + (" " + number_to_words(remainder) if remainder else "")
+    elif number < 1000000000:
+        millions, remainder = divmod(number, 1000000)
+        return number_to_words(millions) + " malyan" + (" " + number_to_words(remainder) if remainder else "")
+    elif number < 1000000000000:
+        billions, remainder = divmod(number, 1000000000)
+        return number_to_words(billions) + " milyaar" + (" " + number_to_words(remainder) if remainder else "")
+    else:
+        return str(number)
+def replace_numbers_with_words(text):
+    # Replace each standalone run of digits in `text` with the wording
+    # returned by number_to_words, and return the resulting string.
+    def replace(match):
+        # re.sub hands over a match object; its digits are parsed as an int.
+        number = int(match.group())
+        return number_to_words(number)
+    # The \b anchors restrict matches to digit runs not glued to letters or
+    # underscores, so "abc123" is left untouched.
+    return re.sub(r'\b\d+\b', replace, text)
 def normalize_text(text):
+    # Prepare raw input for the TTS processor: lowercase it, spell out digit
+    # runs as words, then strip punctuation.
     text = text.lower()
+    text = replace_numbers_with_words(text)
+    # Remove every character that is neither a word character nor whitespace.
+    text = re.sub(r'[^\w\s]', '', text)
+    return text
+# TTS function
 def text_to_speech(text):
+    # Normalize `text`, run it through the SpeechT5 model with the module-level
+    # speaker embedding and vocoder, and return a (16000, samples) tuple.
+    # NOTE(review): nothing guards against text that normalizes to an empty
+    # string — confirm the processor/model handle that input.
+    text = normalize_text(text)
+    inputs = processor(text=text, return_tensors="pt").to(device)
     with torch.no_grad():
+        # The stored embedding was squeezed; unsqueeze(0) adds a leading dimension.
+        speech = model.generate_speech(inputs["input_ids"], speaker_embedding.unsqueeze(0), vocoder=vocoder)
+    return (16000, speech.cpu().numpy())
+# Gradio Interface
 iface = gr.Interface(
     fn=text_to_speech,
+    inputs=gr.Textbox(label="Geli qoraalka af-soomaali"),
+    outputs=gr.Audio(label="Codka la abuuray", type="numpy"),
+    title="Somali TTS",
+    # NOTE(review): the description names "11.wav", but the speaker embedding
+    # in this file is computed from "1.wav" — confirm which name is intended.
+    description="TTS Soomaaliyeed oo la adeegsaday cod gaar ah (11.wav)"
 )
+# Launched unconditionally at module level (no __main__ guard in this revision).
+iface.launch()