Somalitts committed on
Commit
4853a7f
·
verified ·
1 Parent(s): 80cd488

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -31
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import gradio as gr
3
  import torch
4
  import torchaudio
@@ -12,8 +11,7 @@ import numpy as np
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
 
14
  # --- ADD ALL YOUR VOICE FILES HERE ---
15
- # The code will automatically create a dropdown for these files.
16
- # Make sure these files are in the same directory as your script.
17
  VOICE_SAMPLE_FILES = ["1.wav", "1005.wav", "1060.wav", "737.wav"]
18
 
19
  # Directory to store speaker embedding files
@@ -21,7 +19,6 @@ EMBEDDING_DIR = "speaker_embeddings"
21
  os.makedirs(EMBEDDING_DIR, exist_ok=True)
22
 
23
  # --- Load Models ---
24
- # This part loads all the necessary AI models.
25
  try:
26
  print("Loading models... This may take a moment.")
27
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
@@ -36,15 +33,9 @@ try:
36
  except Exception as e:
37
  raise gr.Error(f"Error loading models: {e}. Check your internet connection.")
38
 
39
- # A dictionary to cache loaded speaker embeddings in memory
40
  speaker_embeddings_cache = {}
41
 
42
- # --- Function to Get or Create Speaker Embedding ---
43
  def get_speaker_embedding(wav_file_path):
44
- """
45
- Loads a speaker embedding from cache or file. If not found, creates and saves it.
46
- """
47
- # Check cache first
48
  if wav_file_path in speaker_embeddings_cache:
49
  return speaker_embeddings_cache[wav_file_path]
50
 
@@ -58,7 +49,7 @@ def get_speaker_embedding(wav_file_path):
58
 
59
  print(f"Creating new speaker embedding for {wav_file_path}...")
60
  if not os.path.exists(wav_file_path):
61
- raise gr.Error(f"Audio file not found: {wav_file_path}. Please make sure it's in the correct directory.")
62
 
63
  try:
64
  audio, sr = torchaudio.load(wav_file_path)
@@ -76,10 +67,9 @@ def get_speaker_embedding(wav_file_path):
76
  print(f"Embedding created and saved for {wav_file_path}.")
77
  return embedding.to(device)
78
  except Exception as e:
79
- raise gr.Error(f"Could not process audio file {wav_file_path}. Is it a valid WAV file? Error: {e}")
80
 
81
- # --- Text Processing Functions (Somali Number Conversion) ---
82
- # These functions remain the same.
83
  number_words = {
84
  0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
85
  6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
@@ -96,21 +86,16 @@ def number_to_words(n):
96
  if n < 1000: return (number_words[n//100] + " boqol" if n//100 > 1 else "boqol") + (" iyo " + number_to_words(n%100) if n%100 else "")
97
  if n < 1000000: return (number_to_words(n//1000) + " kun" if n//1000 > 1 else "kun") + (" iyo " + number_to_words(n%1000) if n%1000 else "")
98
  return str(n)
99
-
100
def replace_numbers_with_words(text):
    """Spell out every standalone run of digits in *text* as Somali number words."""
    def _spell_out(match):
        return number_to_words(int(match.group()))

    return re.sub(r'\b\d+\b', _spell_out, text)
102
-
103
def normalize_text(text):
    """Lowercase *text*, spell out digits, and drop punctuation (apostrophes kept)."""
    lowered = text.lower()
    spelled = replace_numbers_with_words(lowered)
    return re.sub(r'[^\w\s\']', '', spelled)
108
 
109
- # --- Main Text-to-Speech Function ---
110
  def text_to_speech(text, voice_choice):
111
- """
112
- Takes text and the chosen voice file, and returns audio.
113
- """
114
  if not text:
115
  gr.Warning("Please enter some text.")
116
  return None, None
@@ -118,23 +103,26 @@ def text_to_speech(text, voice_choice):
118
  gr.Warning("Please select a voice from the dropdown.")
119
  return None, None
120
 
121
- # Get the correct speaker embedding for the chosen voice
122
  speaker_embedding = get_speaker_embedding(voice_choice)
123
-
124
  normalized_text = normalize_text(text)
125
  inputs = processor(text=normalized_text, return_tensors="pt").to(device)
126
 
 
127
  with torch.no_grad():
128
- speech = model.generate_speech(
129
- inputs["input_ids"],
130
  speaker_embeddings=speaker_embedding.unsqueeze(0),
131
- vocoder=vocoder
 
 
132
  )
 
 
 
133
 
134
  return (16000, speech.cpu().numpy())
135
 
136
- # --- Gradio Interface ---
137
- # The user interface now includes a dropdown menu for voice selection.
138
  iface = gr.Interface(
139
  fn=text_to_speech,
140
  inputs=[
@@ -143,7 +131,7 @@ iface = gr.Interface(
143
  VOICE_SAMPLE_FILES,
144
  label="Select Voice",
145
  info="Choose the voice you want to use for the speech.",
146
- value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None # Default to the first voice
147
  )
148
  ],
149
  outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
@@ -155,12 +143,10 @@ iface = gr.Interface(
155
  ]
156
  )
157
 
158
- # Launch the web interface
159
  if __name__ == "__main__":
160
- # Pre-load embeddings for a faster startup experience
161
  print("Pre-loading all voice embeddings...")
162
  for voice_file in VOICE_SAMPLE_FILES:
163
  get_speaker_embedding(voice_file)
164
  print("All voices are ready. Launching interface.")
165
 
166
- iface.launch(share=True)
 
 
1
  import gradio as gr
2
  import torch
3
  import torchaudio
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
 
13
  # --- ADD ALL YOUR VOICE FILES HERE ---
14
+ # Make sure these files live in the same folder as this code.
 
15
  VOICE_SAMPLE_FILES = ["1.wav", "1005.wav", "1060.wav", "737.wav"]
16
 
17
  # Directory to store speaker embedding files
 
19
  os.makedirs(EMBEDDING_DIR, exist_ok=True)
20
 
21
  # --- Load Models ---
 
22
  try:
23
  print("Loading models... This may take a moment.")
24
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 
33
  except Exception as e:
34
  raise gr.Error(f"Error loading models: {e}. Check your internet connection.")
35
 
 
36
  speaker_embeddings_cache = {}
37
 
 
38
  def get_speaker_embedding(wav_file_path):
 
 
 
 
39
  if wav_file_path in speaker_embeddings_cache:
40
  return speaker_embeddings_cache[wav_file_path]
41
 
 
49
 
50
  print(f"Creating new speaker embedding for {wav_file_path}...")
51
  if not os.path.exists(wav_file_path):
52
+ raise gr.Error(f"Audio file not found: {wav_file_path}.")
53
 
54
  try:
55
  audio, sr = torchaudio.load(wav_file_path)
 
67
  print(f"Embedding created and saved for {wav_file_path}.")
68
  return embedding.to(device)
69
  except Exception as e:
70
+ raise gr.Error(f"Could not process audio file {wav_file_path}. Error: {e}")
71
 
72
+ # --- Text Processing Functions (Remains the same) ---
 
73
  number_words = {
74
  0: "eber", 1: "kow", 2: "labo", 3: "saddex", 4: "afar", 5: "shan",
75
  6: "lix", 7: "toddobo", 8: "siddeed", 9: "sagaal", 10: "toban",
 
86
  if n < 1000: return (number_words[n//100] + " boqol" if n//100 > 1 else "boqol") + (" iyo " + number_to_words(n%100) if n%100 else "")
87
  if n < 1000000: return (number_to_words(n//1000) + " kun" if n//1000 > 1 else "kun") + (" iyo " + number_to_words(n%1000) if n%1000 else "")
88
  return str(n)
 
89
def replace_numbers_with_words(text):
    """Return *text* with each whole-number token rewritten as Somali words."""
    return re.sub(r'\b\d+\b', lambda match: number_to_words(int(match.group())), text)
 
91
def normalize_text(text):
    """Prepare raw input for the TTS model: lowercase, digits to words, strip punctuation.

    Word characters, whitespace, and apostrophes survive; everything else is removed.
    """
    result = text.lower()
    result = replace_numbers_with_words(result)
    result = re.sub(r'[^\w\s\']', '', result)
    return result
96
 
97
+ # --- Main Text-to-Speech Function (a fix was added here) ---
98
def text_to_speech(text, voice_choice):
    """Synthesize Somali speech for *text* using the selected reference voice.

    Parameters
    ----------
    text : str
        Raw input text; digits are spelled out and punctuation stripped
        via normalize_text() before synthesis.
    voice_choice : str
        Path to one of the reference WAV files (an entry of VOICE_SAMPLE_FILES).

    Returns
    -------
    tuple[int, numpy.ndarray] | None
        (16000, waveform) for the Gradio Audio output, or None to clear the
        output when input is missing.
    """
    if not text:
        gr.Warning("Please enter some text.")
        # BUG FIX: a single Audio output expects one value; returning a
        # (None, None) tuple would be misread as (sample_rate, data).
        return None
    if not voice_choice:
        gr.Warning("Please select a voice from the dropdown.")
        return None

    speaker_embedding = get_speaker_embedding(voice_choice)
    normalized_text = normalize_text(text)
    inputs = processor(text=normalized_text, return_tensors="pt").to(device)

    with torch.no_grad():
        # BUG FIX: model.generate(..., do_sample=True, top_k=50) is not part of
        # the SpeechT5 TTS API -- SpeechT5ForTextToSpeech.generate forwards to
        # generate_speech, which rejects those sampling kwargs (TypeError at
        # runtime). Use the documented generate_speech call and let it drive
        # the vocoder directly, yielding a 16 kHz waveform tensor.
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=speaker_embedding.unsqueeze(0),
            vocoder=vocoder,
        )

    return (16000, speech.cpu().numpy())
124
 
125
+ # --- Gradio Interface (Remains the same) ---
 
126
  iface = gr.Interface(
127
  fn=text_to_speech,
128
  inputs=[
 
131
  VOICE_SAMPLE_FILES,
132
  label="Select Voice",
133
  info="Choose the voice you want to use for the speech.",
134
+ value=VOICE_SAMPLE_FILES[0] if VOICE_SAMPLE_FILES else None
135
  )
136
  ],
137
  outputs=gr.Audio(label="Codka La Abuuray (Generated Voice)", type="numpy"),
 
143
  ]
144
  )
145
 
 
146
if __name__ == "__main__":
    # Warm the embedding cache up front so the first synthesis request is fast.
    print("Pre-loading all voice embeddings...")
    for voice in VOICE_SAMPLE_FILES:
        get_speaker_embedding(voice)
    print("All voices are ready. Launching interface.")

    iface.launch(share=True)