Somalitts committed on
Commit
33739e4
·
verified ·
1 Parent(s): 0862d1a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -82
app.py CHANGED
@@ -8,83 +8,77 @@ from speechbrain.pretrained import EncoderClassifier
8
  import numpy as np
9
 
10
  # --- Configuration ---
11
- # Choose the device to run the models on
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
 
14
- # --- CHANGE THIS TO THE PATH OF YOUR HIGH-QUALITY VOICE RECORDING ---
15
- # For the best results, use a clean, clear voice recording with no background noise.
16
- # The recording should be at least 10-15 seconds long.
17
- VOICE_SAMPLE_PATH = "7.wav"
18
 
19
- # Path to save the generated speaker embedding file for faster loading next time.
20
- EMB_PATH = "speaker_embedding.pt"
 
21
 
22
  # --- Load Models ---
23
- # It's generally a good practice to handle potential download issues.
24
  try:
 
25
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
26
  model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/5aad").to(device)
27
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
28
  speaker_model = EncoderClassifier.from_hparams(
29
  source="speechbrain/spkrec-xvect-voxceleb",
30
  run_opts={"device": device},
31
- savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb") # More organized model saving
32
  )
 
33
  except Exception as e:
34
- raise gr.Error(f"Error loading models: {e}. Please check your internet connection and model names.")
35
 
36
- # --- Function to Create Speaker Embedding ---
37
- def create_speaker_embedding(wav_file_path, classifier):
 
 
 
38
  """
39
- Analyzes a voice recording and creates a speaker embedding.
40
  """
41
- if not os.path.exists(wav_file_path):
42
- raise FileNotFoundError(f"The voice sample file was not found at: {wav_file_path}")
43
-
44
- # Load the audio file
45
- audio, sr = torchaudio.load(wav_file_path)
46
 
47
- # Resample to 16000 Hz if necessary, which is what the model expects
48
- if sr != 16000:
49
- resampler = torchaudio.transforms.Resample(sr, 16000)
50
- audio = resampler(audio)
51
 
52
- # Ensure the audio is mono by averaging channels if it's stereo
53
- if audio.shape[0] > 1:
54
- audio = torch.mean(audio, dim=0, keepdim=True)
55
-
56
- audio = audio.to(device)
 
 
 
 
57
 
58
- # Generate the embedding
59
- with torch.no_grad():
60
- embedding = classifier.encode_batch(audio)
61
- # Normalize the embedding to have a consistent scale
62
- embedding = torch.nn.functional.normalize(embedding, dim=2)
63
- # Remove unnecessary dimensions
64
- embedding = embedding.squeeze()
65
-
66
- return embedding
67
-
68
- # --- Get or Create the Speaker Embedding ---
69
- # This part of the code now clearly separates the creation of the embedding.
70
- if os.path.exists(EMB_PATH):
71
- print("Loading existing speaker embedding.")
72
- speaker_embedding = torch.load(EMB_PATH).to(device)
73
- else:
74
- print("Creating a new speaker embedding from the voice sample.")
75
  try:
76
- speaker_embedding = create_speaker_embedding(VOICE_SAMPLE_PATH, speaker_model)
77
- # Save the embedding to avoid re-creating it every time
78
- torch.save(speaker_embedding.cpu(), EMB_PATH)
79
- print(f"New speaker embedding saved to {EMB_PATH}")
80
- except FileNotFoundError as e:
81
- raise gr.Error(str(e))
 
 
 
 
 
 
 
 
82
  except Exception as e:
83
- raise gr.Error(f"Could not create speaker embedding. Ensure your audio file is valid. Error: {e}")
84
-
85
 
86
  # --- Text Processing Functions (Somali Number Conversion) ---
87
- # These functions for converting numbers to words remain the same.
88
  number_words = {
89
  0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
90
  6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
@@ -95,20 +89,11 @@ number_words = {
95
  60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
96
  100: "boqol", 1000: "kun",
97
  }
98
-
99
  def number_to_words(n):
100
- if n in number_words:
101
- return number_words[n]
102
- if n < 100:
103
- tens, unit = divmod(n, 10)
104
- return number_words[tens * 10] + (" iyo " + number_words[unit] if unit else "")
105
- if n < 1000:
106
- hundreds, remainder = divmod(n, 100)
107
- return (number_words[hundreds] + " boqol" if hundreds > 1 else "boqol") + (" iyo " + number_to_words(remainder) if remainder else "")
108
- if n < 1000000:
109
- thousands, remainder = divmod(n, 1000)
110
- return (number_to_words(thousands) + " kun" if thousands > 1 else "kun") + (" iyo " + number_to_words(remainder) if remainder else "")
111
- # Add more for larger numbers if needed
112
  return str(n)
113
 
114
  def replace_numbers_with_words(text):
@@ -117,44 +102,64 @@ def replace_numbers_with_words(text):
117
  def normalize_text(text):
118
  text = text.lower()
119
  text = replace_numbers_with_words(text)
120
- # Allows for more Somali characters
121
  text = re.sub(r'[^\w\s\']', '', text)
122
  return text
123
 
124
  # --- Main Text-to-Speech Function ---
125
- def text_to_speech(text):
126
  """
127
- Converts a string of text into speech using the loaded models and speaker embedding.
128
  """
 
 
 
 
 
 
 
 
 
 
129
  normalized_text = normalize_text(text)
130
  inputs = processor(text=normalized_text, return_tensors="pt").to(device)
131
-
132
  with torch.no_grad():
133
- # The model generates the speech waveform
134
  speech = model.generate_speech(
135
  inputs["input_ids"],
136
- speaker_embeddings=speaker_embedding.unsqueeze(0), # Add batch dimension
137
  vocoder=vocoder
138
  )
139
-
140
- # Return the sampling rate and the speech audio as a NumPy array
141
  return (16000, speech.cpu().numpy())
142
 
143
  # --- Gradio Interface ---
144
- # The user interface for interacting with the TTS system.
145
  iface = gr.Interface(
146
  fn=text_to_speech,
147
- inputs=gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)"),
 
 
 
 
 
 
 
 
148
  outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
149
- title="Somali Text-to-Speech with Custom Voice",
150
- description=f"This tool uses a custom voice from the file '{VOICE_SAMPLE_PATH}'. To change the voice, update the VOICE_SAMPLE_PATH variable in the code and restart.",
151
  examples=[
152
- ["Sidee tahay saaxiib? Maanta waa maalin wanaagsan."],
153
- ["Barnaamijkan wuxuu qoraalka u beddelaa hadal."],
154
- ["Waxaan joogaa magaalada Muqdisho."],
155
  ]
156
  )
157
 
158
  # Launch the web interface
159
  if __name__ == "__main__":
160
- iface.launch()
 
 
 
 
 
 
 
8
  import numpy as np
9
 
10
  # --- Configuration ---
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
 
13
+ # --- ADD ALL YOUR VOICE FILES HERE ---
14
+ # The code will automatically create a dropdown for these files.
15
+ # Make sure these files are in the same directory as your script.
16
+ VOICE_SAMPLE_FILES = ["7.wav", "46.wav", "90.wav", "150.wav", "355.wav"]
17
 
18
+ # Directory to store speaker embedding files
19
+ EMBEDDING_DIR = "speaker_embeddings"
20
+ os.makedirs(EMBEDDING_DIR, exist_ok=True)
21
 
22
  # --- Load Models ---
23
+ # This part loads all the necessary AI models.
24
  try:
25
+ print("Loading models... This may take a moment.")
26
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
27
  model = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/5aad").to(device)
28
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
29
  speaker_model = EncoderClassifier.from_hparams(
30
  source="speechbrain/spkrec-xvect-voxceleb",
31
  run_opts={"device": device},
32
+ savedir=os.path.join("pretrained_models", "spkrec-xvect-voxceleb")
33
  )
34
+ print("Models loaded successfully.")
35
  except Exception as e:
36
+ raise gr.Error(f"Error loading models: {e}. Check your internet connection.")
37
 
38
+ # A dictionary to cache loaded speaker embeddings in memory
39
+ speaker_embeddings_cache = {}
40
+
41
+ # --- Function to Get or Create Speaker Embedding ---
42
def get_speaker_embedding(wav_file_path):
    """Return the speaker embedding for ``wav_file_path``, creating it if needed.

    Lookup order:
      1. the in-memory ``speaker_embeddings_cache`` (keyed by the path as given),
      2. a saved ``<basename>.pt`` file inside ``EMBEDDING_DIR``,
      3. a fresh embedding computed from the audio file, which is then saved
         to ``EMBEDDING_DIR`` and cached.

    Raises:
        gr.Error: if the audio file does not exist, or if loading/encoding/
            saving it fails (the original exception is chained as the cause).
    """
    cached = speaker_embeddings_cache.get(wav_file_path)
    if cached is not None:
        return cached

    embedding_path = os.path.join(
        EMBEDDING_DIR, f"{os.path.basename(wav_file_path)}.pt"
    )

    if os.path.exists(embedding_path):
        print(f"Loading existing embedding for {wav_file_path}")
        # NOTE(review): torch.load unpickles the file; only load embedding
        # files that this script wrote itself.
        embedding = torch.load(embedding_path, map_location=device)
        speaker_embeddings_cache[wav_file_path] = embedding
        return embedding

    print(f"Creating new speaker embedding for {wav_file_path}...")
    if not os.path.exists(wav_file_path):
        raise gr.Error(f"Audio file not found: {wav_file_path}. Please make sure it's in the correct directory.")

    try:
        embedding = _compute_speaker_embedding(wav_file_path)
        # Persist a CPU copy so the file can be loaded on any device later.
        torch.save(embedding.cpu(), embedding_path)
        embedding = embedding.to(device)
        speaker_embeddings_cache[wav_file_path] = embedding
        print(f"Embedding created and saved for {wav_file_path}.")
        return embedding
    except Exception as e:
        # Chain the cause so the underlying failure stays visible in logs.
        raise gr.Error(f"Could not process audio file {wav_file_path}. Is it a valid WAV file? Error: {e}") from e


def _compute_speaker_embedding(wav_file_path):
    """Encode one audio file with ``speaker_model`` into a normalized embedding."""
    audio, sr = torchaudio.load(wav_file_path)
    # The audio is brought to 16 kHz before encoding.
    if sr != 16000:
        audio = torchaudio.functional.resample(audio, sr, 16000)
    # Multi-channel audio is averaged down to a single channel.
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    with torch.no_grad():
        embedding = speaker_model.encode_batch(audio.to(device))
        embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze()
    return embedding
 
79
 
80
  # --- Text Processing Functions (Somali Number Conversion) ---
81
+ # These functions remain the same.
82
  number_words = {
83
  0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
84
  6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
 
89
  60: "lixdan", 70: "toddobaatan", 80: "siddeetan", 90: "sagaashan",
90
  100: "boqol", 1000: "kun",
91
  }
 
92
  def number_to_words(n):
93
+ if n in number_words: return number_words[n]
94
+ if n < 100: return number_words[n//10 * 10] + (" iyo " + number_words[n%10] if n%10 else "")
95
+ if n < 1000: return (number_words[n//100] + " boqol" if n//100 > 1 else "boqol") + (" iyo " + number_to_words(n%100) if n%100 else "")
96
+ if n < 1000000: return (number_to_words(n//1000) + " kun" if n//1000 > 1 else "kun") + (" iyo " + number_to_words(n%1000) if n%1000 else "")
 
 
 
 
 
 
 
 
97
  return str(n)
98
 
99
  def replace_numbers_with_words(text):
 
102
  def normalize_text(text):
103
  text = text.lower()
104
  text = replace_numbers_with_words(text)
 
105
  text = re.sub(r'[^\w\s\']', '', text)
106
  return text
107
 
108
  # --- Main Text-to-Speech Function ---
109
def text_to_speech(text, voice_choice):
    """Synthesize speech for ``text`` using the voice selected in the UI.

    Args:
        text: The text to speak.
        voice_choice: The voice sample file chosen in the dropdown; it is
            passed to ``get_speaker_embedding`` to obtain the speaker embedding.

    Returns:
        A ``(16000, numpy_array)`` tuple for the audio output, or ``None``
        (after showing a warning) when the text is empty/blank or no voice
        is selected.
    """
    # The interface has a single Audio output, so "no result" must be a single
    # None; a (None, None) tuple would be read as (sample_rate, data).
    if not text or not text.strip():
        gr.Warning("Please enter some text.")
        return None
    if not voice_choice:
        gr.Warning("Please select a voice from the dropdown.")
        return None

    # Get the speaker embedding that matches the chosen voice.
    speaker_embedding = get_speaker_embedding(voice_choice)

    normalized_text = normalize_text(text)
    inputs = processor(text=normalized_text, return_tensors="pt").to(device)

    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            # unsqueeze(0) adds a leading dimension to the stored embedding.
            speaker_embeddings=speaker_embedding.unsqueeze(0),
            vocoder=vocoder,
        )

    return (16000, speech.cpu().numpy())
134
 
135
  # --- Gradio Interface ---
136
+ # The user interface now includes a dropdown menu for voice selection.
137
  iface = gr.Interface(
138
  fn=text_to_speech,
139
+ inputs=[
140
+ gr.Textbox(label="Geli qoraalka af-Soomaaliga (Enter Somali Text)"),
141
+ gr.Dropdown(
142
+ VOICE_SAMPLE_FILES,
143
+ label="Select Voice",
144
+ info="Choose the voice you want to use for the speech.",
145
+ value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None # Default to the first voice
146
+ )
147
+ ],
148
  outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
149
+ title="Multi-Voice Somali Text-to-Speech",
150
+ description="Enter Somali text, choose a voice from the dropdown, and click submit to generate speech.",
151
  examples=[
152
+ ["Sidee tahay saaxiib? Maanta waa maalin wanaagsan.", VOICE_SAMPLE_FILES[0]],
153
+ ["Nabad gelyo, is arag dambe.", VOICE_SAMPLE_FILES[1] if len(VOICE_SAMPLE_FILES) > 1 else VOICE_SAMPLE_FILES[0]],
 
154
  ]
155
  )
156
 
157
  # Launch the web interface
158
  if __name__ == "__main__":
159
+ # Pre-load embeddings at startup so the first request for each voice does not have to create them
160
+ print("Pre-loading all voice embeddings...")
161
+ for voice_file in VOICE_SAMPLE_FILES:
162
+ get_speaker_embedding(voice_file)
163
+ print("All voices are ready. Launching interface.")
164
+
165
+ iface.launch(share=True)