rahul7star committed on
Commit
aaaab74
·
verified ·
1 Parent(s): 91fdade

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -41
app.py CHANGED
@@ -149,11 +149,10 @@ def get_or_load_model():
149
  map_location="cpu"
150
  )
151
 
152
- # Extra safety: force CPU
153
  if hasattr(MODEL, "to"):
154
  MODEL = MODEL.to("cpu")
155
 
156
- # Disable gradients (CPU optimization)
157
  MODEL.eval()
158
  for p in MODEL.parameters():
159
  p.requires_grad = False
@@ -177,13 +176,11 @@ except Exception as e:
177
  )
178
 
179
  def set_seed(seed: int):
180
- """Sets the random seed for reproducibility across torch, numpy, and random."""
181
  torch.manual_seed(seed)
182
- if DEVICE == "cuda":
183
- torch.cuda.manual_seed(seed)
184
- torch.cuda.manual_seed_all(seed)
185
  random.seed(seed)
186
  np.random.seed(seed)
 
187
 
188
  def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | None:
189
  """
@@ -206,37 +203,14 @@ def generate_tts_audio(
206
  seed_num_input: int = 0,
207
  cfgw_input: float = 0.5
208
  ) -> tuple[int, np.ndarray]:
209
- """
210
- Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
211
- Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi.
212
-
213
- This tool synthesizes natural-sounding speech from input text. When a reference audio file
214
- is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
215
- maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
216
-
217
- Args:
218
- text_input (str): The text to synthesize into speech (maximum 300 characters)
219
- language_id (str): The language code for synthesis (eg. en, fr, de, es, it, pt, hi)
220
- audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
221
- exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
222
- temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
223
- seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
224
- cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5, 0 for language transfer.
225
-
226
- Returns:
227
- tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
228
- """
229
- current_model = get_or_load_model()
230
 
 
231
  if current_model is None:
232
  raise RuntimeError("TTS model is not loaded.")
233
 
234
  if seed_num_input != 0:
235
  set_seed(int(seed_num_input))
236
 
237
- print(f"Generating audio for text: '{text_input[:50]}...'")
238
-
239
- # Handle optional audio prompt
240
  chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
241
 
242
  generate_kwargs = {
@@ -244,19 +218,22 @@ def generate_tts_audio(
244
  "temperature": temperature_input,
245
  "cfg_weight": cfgw_input,
246
  }
 
247
  if chosen_prompt:
248
  generate_kwargs["audio_prompt_path"] = chosen_prompt
249
- print(f"Using audio prompt: {chosen_prompt}")
250
- else:
251
- print("No audio prompt provided; using default voice.")
252
-
253
- wav = current_model.generate(
254
- text_input[:300], # Truncate text to max chars
255
- language_id=language_id,
256
- **generate_kwargs
257
- )
258
- print("Audio generation complete.")
259
- return (current_model.sr, wav.squeeze(0).numpy())
 
 
260
 
261
  with gr.Blocks() as demo:
262
  gr.Markdown(
 
149
  map_location="cpu"
150
  )
151
 
152
+ # Absolute safety
153
  if hasattr(MODEL, "to"):
154
  MODEL = MODEL.to("cpu")
155
 
 
156
  MODEL.eval()
157
  for p in MODEL.parameters():
158
  p.requires_grad = False
 
176
  )
177
 
178
  def set_seed(seed: int):
179
+ """CPU-only reproducibility."""
180
  torch.manual_seed(seed)
 
 
 
181
  random.seed(seed)
182
  np.random.seed(seed)
183
+
184
 
185
  def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | None:
186
  """
 
203
  seed_num_input: int = 0,
204
  cfgw_input: float = 0.5
205
  ) -> tuple[int, np.ndarray]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
+ current_model = get_or_load_model()
208
  if current_model is None:
209
  raise RuntimeError("TTS model is not loaded.")
210
 
211
  if seed_num_input != 0:
212
  set_seed(int(seed_num_input))
213
 
 
 
 
214
  chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
215
 
216
  generate_kwargs = {
 
218
  "temperature": temperature_input,
219
  "cfg_weight": cfgw_input,
220
  }
221
+
222
  if chosen_prompt:
223
  generate_kwargs["audio_prompt_path"] = chosen_prompt
224
+
225
+ # 🔒 CPU-safe inference
226
+ with torch.no_grad():
227
+ wav = current_model.generate(
228
+ text_input[:300],
229
+ language_id=language_id,
230
+ **generate_kwargs
231
+ )
232
+
233
+ # Ensure CPU numpy conversion
234
+ wav = wav.squeeze(0).detach().cpu().numpy()
235
+
236
+ return (current_model.sr, wav)
237
 
238
  with gr.Blocks() as demo:
239
  gr.Markdown(