Spaces:

Somalitts
/

5aad

Runtime error

App Files Files Community

Somalitts committed on Jul 13, 2025

Commit

33739e4

verified ·

1 Parent(s): 0862d1a

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -82

app.py CHANGED Viewed

@@ -8,83 +8,77 @@ from speechbrain.pretrained import EncoderClassifier
 import numpy as np
 # --- Configuration ---
-# Choose the device to run the models on
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# --- CHANGE THIS TO THE PATH OF YOUR HIGH-QUALITY VOICE RECORDING ---
-# For the best results, use a clean, clear voice recording with no background noise.
-# The recording should be at least 10-15 seconds long.
-VOICE_SAMPLE_PATH = "7.wav"
-# Path to save the generated speaker embedding file for faster loading next time.
-EMB_PATH = "speaker_embedding.pt"
 # --- Load Models ---
-# It's generally a good practice to handle potential download issues.
 try:
     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
     model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/5aad").to(device)
     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
     speaker_model = EncoderClassifier.from_hparams(
         source="speechbrain/spkrec-xvect-voxceleb",
         run_opts={"device": device},
-        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb") # More organized model saving
     )
 except Exception as e:
-    raise gr.Error(f"Error loading models: {e}. Please check your internet connection and model names.")
-# --- Function to Create Speaker Embedding ---
-def create_speaker_embedding(wav_file_path, classifier):
     """
-    Analyzes a voice recording and creates a speaker embedding.
     """
-    if not os.path.exists(wav_file_path):
-        raise FileNotFoundError(f"The voice sample file was not found at: {wav_file_path}")
-    # Load the audio file
-    audio, sr = torchaudio.load(wav_file_path)
-    # Resample to 16000 Hz if necessary, which is what the model expects
-    if sr != 16000:
-        resampler = torchaudio.transforms.Resample(sr, 16000)
-        audio = resampler(audio)
-    # Ensure the audio is mono by averaging channels if it's stereo
-    if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True)
-    audio = audio.to(device)
-    # Generate the embedding
-    with torch.no_grad():
-        embedding = classifier.encode_batch(audio)
-        # Normalize the embedding to have a consistent scale
-        embedding = torch.nn.functional.normalize(embedding, dim=2)
-        # Remove unnecessary dimensions
-        embedding = embedding.squeeze()
-    return embedding
-# --- Get or Create the Speaker Embedding ---
-# This part of the code now clearly separates the creation of the embedding.
-if os.path.exists(EMB_PATH):
-    print("Loading existing speaker embedding.")
-    speaker_embedding = torch.load(EMB_PATH).to(device)
-else:
-    print("Creating a new speaker embedding from the voice sample.")
     try:
-        speaker_embedding = create_speaker_embedding(VOICE_SAMPLE_PATH, speaker_model)
-        # Save the embedding to avoid re-creating it every time
-        torch.save(speaker_embedding.cpu(), EMB_PATH)
-        print(f"New speaker embedding saved to {EMB_PATH}")
-    except FileNotFoundError as e:
-        raise gr.Error(str(e))
     except Exception as e:
-        raise gr.Error(f"Could not create speaker embedding. Ensure your audio file is valid. Error: {e}")
 # --- Text Processing Functions (Somali Number Conversion) ---
-# These functions for converting numbers to words remain the same.
 number_words = {
     0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
     6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
@@ -95,20 +89,11 @@ number_words = {
     60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
     100: "boqol", 1000: "kun",
 }
 def number_to_words(n):
-    if n in number_words:
-        return number_words[n]
-    if n < 100:
-        tens, unit = divmod(n, 10)
-        return number_words[tens * 10] + (" iyo " + number_words[unit] if unit else "")
-    if n < 1000:
-        hundreds, remainder = divmod(n, 100)
-        return (number_words[hundreds] + " boqol" if hundreds > 1 else "boqol") + (" iyo " + number_to_words(remainder) if remainder else "")
-    if n < 1000000:
-        thousands, remainder = divmod(n, 1000)
-        return (number_to_words(thousands) + " kun" if thousands > 1 else "kun") + (" iyo " + number_to_words(remainder) if remainder else "")
-    # Add more for larger numbers if needed
     return str(n)
 def replace_numbers_with_words(text):
@@ -117,44 +102,64 @@ def replace_numbers_with_words(text):
 def normalize_text(text):
     text = text.lower()
     text = replace_numbers_with_words(text)
-    # Allows for more Somali characters
     text = re.sub(r'[^\w\s\']', '', text)
     return text
 # --- Main Text-to-Speech Function ---
-def text_to_speech(text):
     """
-    Converts a string of text into speech using the loaded models and speaker embedding.
     """
     normalized_text = normalize_text(text)
     inputs = processor(text=normalized_text, return_tensors="pt").to(device)
     with torch.no_grad():
-        # The model generates the speech waveform
         speech = model.generate_speech(
             inputs["input_ids"],
-            speaker_embeddings=speaker_embedding.unsqueeze(0), # Add batch dimension
             vocoder=vocoder
         )
-    # Return the sampling rate and the speech audio as a NumPy array
     return (16000, speech.cpu().numpy())
 # --- Gradio Interface ---
-# The user interface for interacting with the TTS system.
 iface = gr.Interface(
     fn=text_to_speech,
-    inputs=gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)"),
     outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
-    title="Somali Text-to-Speech with Custom Voice",
-    description=f"This tool uses a custom voice from the file '{VOICE_SAMPLE_PATH}'. To change the voice, update the VOICE_SAMPLE_PATH variable in the code and restart.",
     examples=[
-        ["Sidee tahay saaxiib? Maanta waa maalin wanaagsan."],
-        ["Barnaamijkan wuxuu qoraalka u beddelaa hadal."],
-        ["Waxaan joogaa magaalada Muqdisho."],
     ]
 )
 # Launch the web interface
 if __name__ == "__main__":
-    iface.launch()

 import numpy as np
 # --- Configuration ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# --- ADD ALL YOUR VOICE FILES HERE ---
+# The code will automatically create a dropdown for these files.
+# Make sure these files are in the same directory as your script.
+VOICE_SAMPLE_FILES = ["7.wav", "46.wav", "90.wav", "150.wav", "355.wav"]
+# Directory to store speaker embedding files
+EMBEDDING_DIR = "speaker_embeddings"
+os.makedirs(EMBEDDING_DIR, exist_ok=True)
 # --- Load Models ---
+# This part loads all the necessary AI models.
 try:
+    print("Loading models... This may take a moment.")
     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
     model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/5aad").to(device)
     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
     speaker_model = EncoderClassifier.from_hparams(
         source="speechbrain/spkrec-xvect-voxceleb",
         run_opts={"device": device},
+        savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb")
     )
+    print("Models loaded successfully.")
 except Exception as e:
+    raise gr.Error(f"Error loading models: {e}. Check your internet connection.")
+# A dictionary to cache loaded speaker embeddings in memory
+speaker_embeddings_cache = {}
+# --- Function to Get or Create Speaker Embedding ---
def _embedding_cache_path(wav_file_path):
    """Return the .pt path under EMBEDDING_DIR used to persist this voice's embedding."""
    return os.path.join(EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt")


def _compute_speaker_embedding(wav_file_path):
    """Encode the recording at *wav_file_path* into a normalized, squeezed embedding."""
    audio, sr = torchaudio.load(wav_file_path)
    # The encoder is fed 16 kHz audio; anything else is resampled first.
    if sr != 16000:
        audio = torchaudio.functional.resample(audio, sr, 16000)
    # Collapse multi-channel recordings to a single channel by averaging.
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)
    with torch.no_grad():
        embedding = speaker_model.encode_batch(audio.to(device))
        return torch.nn.functional.normalize(embedding, dim=2).squeeze()


def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for a voice recording, creating it if needed.

    Lookup order: the in-memory cache, then the persisted ``.pt`` file in
    ``EMBEDDING_DIR``, and finally a fresh encoding of the recording, which is
    saved to disk and cached for later calls.

    Args:
        wav_file_path: Path of the voice recording.

    Returns:
        The embedding tensor, placed on ``device``.

    Raises:
        gr.Error: If the recording does not exist or cannot be processed.
    """
    cached = speaker_embeddings_cache.get(wav_file_path)
    if cached is not None:
        return cached

    embedding_path = _embedding_cache_path(wav_file_path)
    if os.path.exists(embedding_path):
        print(f"Loading existing embedding for {wav_file_path}")
        # NOTE(review): torch.load unpickles the file; this is only safe for
        # embedding files written by this function itself.
        embedding = torch.load(embedding_path, map_location=device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding

    print(f"Creating new speaker embedding for {wav_file_path}...")
    if not os.path.exists(wav_file_path):
        raise gr.Error(f"Audio file not found: {wav_file_path}. Please make sure it's in the correct directory.")

    try:
        embedding = _compute_speaker_embedding(wav_file_path)
        # Persist a CPU copy so the file loads on machines without a GPU.
        torch.save(embedding.cpu(), embedding_path)
    except Exception as e:
        raise gr.Error(f"Could not process audio file {wav_file_path}. Is it a valid WAV file? Error: {e}") from e

    # Move to the target device once and share that tensor with the cache.
    embedding = embedding.to(device)
    speaker_embeddings_cache[wav_file_path] = embedding
    print(f"Embedding created and saved for {wav_file_path}.")
    return embedding
 # --- Text Processing Functions (Somali Number Conversion) ---
+# These functions remain the same.
 number_words = {
     0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
     6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
     60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
     100: "boqol", 1000: "kun",
 }
 def number_to_words(n):
+    if n in number_words: return number_words[n]
+    if n < 100: return number_words[n//10 * 10] + (" iyo " + number_words[n%10] if n%10 else "")
+    if n < 1000: return (number_words[n//100] + " boqol" if n//100 > 1 else "boqol") + (" iyo " + number_to_words(n%100) if n%100 else "")
+    if n < 1000000: return (number_to_words(n//1000) + " kun" if n//1000 > 1 else "kun") + (" iyo " + number_to_words(n%1000) if n%1000 else "")
     return str(n)
 def replace_numbers_with_words(text):
 def normalize_text(text):
     text = text.lower()
     text = replace_numbers_with_words(text)
     text = re.sub(r'[^\w\s\']', '', text)
     return text
 # --- Main Text-to-Speech Function ---
def text_to_speech(text, voice_choice):
    """Synthesize speech for *text* with the selected voice.

    Args:
        text: Text entered by the user.
        voice_choice: Path of the voice recording chosen in the dropdown.

    Returns:
        A ``(16000, waveform)`` tuple, where ``waveform`` is a NumPy array, or
        ``None`` when the text or the voice selection is missing.
    """
    # The interface has a single audio output, so "no result" is one None,
    # not a tuple of Nones. Whitespace-only text is treated as empty too.
    if not text or not text.strip():
        gr.Warning("Please enter some text.")
        return None
    if not voice_choice:
        gr.Warning("Please select a voice from the dropdown.")
        return None

    # Get the speaker embedding that belongs to the chosen voice.
    speaker_embedding = get_speaker_embedding(voice_choice)

    normalized_text = normalize_text(text)
    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            # generate_speech is given the embedding with a leading batch dimension.
            speaker_embeddings=speaker_embedding.unsqueeze(0),
            vocoder=vocoder,
        )
    return (16000, speech.cpu().numpy())
 # --- Gradio Interface ---
# The user interface: a text box plus a dropdown for voice selection.
# Both the dropdown default and the examples are derived from guarded lookups,
# so an empty VOICE_SAMPLE_FILES list does not raise IndexError at import time.
_default_voice = VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
_second_voice = VOICE_SAMPLE_FILES[1] if len(VOICE_SAMPLE_FILES) > 1 else _default_voice

iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)"),
        gr.Dropdown(
            VOICE_SAMPLE_FILES,
            label="Select Voice",
            info="Choose the voice you want to use for the speech.",
            value=_default_voice,  # Default to the first voice
        ),
    ],
    outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
    title="Multi-Voice Somali Text-to-Speech",
    description="Enter Somali text, choose a voice from the dropdown, and click submit to generate speech.",
    # Examples need a voice to pair with the text; omit them when none exist.
    examples=[
        ["Sidee tahay saaxiib? Maanta waa maalin wanaagsan.", _default_voice],
        ["Nabad gelyo, is arag dambe.", _second_voice],
    ] if VOICE_SAMPLE_FILES else None,
)
 # Launch the web interface
 if __name__ == "__main__":
+    # Pre-load embeddings for a faster startup experience
+    print("Pre-loading all voice embeddings...")
+    for voice_file in VOICE_SAMPLE_FILES:
+        get_speaker_embedding(voice_file)
+    print("All voices are ready. Launching interface.")
+    iface.launch(share=True)