MoHamdyy committed on
Commit
4f298d6
·
1 Parent(s): 8785760

Fix syntax error in TTS stage and complete pipeline

Browse files
Files changed (1) hide show
  1. app.py +15 -6
app.py CHANGED
@@ -345,13 +345,13 @@ class TransformerTTS(nn.Module):
345
  # Check stop condition but ensure minimum generation
346
  stop_prob = torch.sigmoid(stop_token[:, -1])
347
  if stop_prob > gate_threshold and frames_generated > 50: # Ensure at least 50 frames
348
- print(f"TTS: Stopping at frame {frames_generated}, stop_prob: {stop_prob:.6f}")
349
  break
350
  else:
351
  stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)
352
  mel_lengths = torch.tensor(mel_padded.shape[1]).unsqueeze(0).to(DEVICE)
353
 
354
- print(f"TTS: Generated {frames_generated} frames, final mel shape: {mel_postnet.shape}")
355
  return mel_postnet, stop_token_outputs
356
  # --- (End of your model definitions) ---
357
 
@@ -418,7 +418,7 @@ def full_speech_translation_pipeline(audio_input_path: str):
418
  msg = "Error: Audio file not provided or not found."
419
  print(msg)
420
  # Return empty/default values
421
- return "Error: No file", "", (hp.sr, np.array([]).astype(np.float32))
422
 
423
  # STT Stage
424
  arabic_transcript = "STT Error: Processing failed."
@@ -465,7 +465,7 @@ def full_speech_translation_pipeline(audio_input_path: str):
465
  print(english_translation)
466
 
467
  # TTS Stage
468
- synthesized_audio_np = np.array([]).astype(np.float32)
469
  if english_translation and not english_translation.startswith("TTT Error"):
470
  try:
471
  print("TTS: Synthesizing English speech...")
@@ -480,14 +480,23 @@ def full_speech_translation_pipeline(audio_input_path: str):
480
  audio_tensor = inverse_mel_spec_to_wav(mel_for_vocoder)
481
  synthesized_audio_np = audio_tensor.cpu().numpy()
482
  print(f"TTS: Synthesized audio shape: {synthesized_audio_np.shape}")
 
 
 
 
 
483
  else:
484
  print("TTS: Generated mel too small, using silence")
485
- synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32) # 1 second of silence
486
  else:
487
  print("TTS: Generated mel is empty or too small, using silence")
488
- synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32) # 1 second of silence
489
  except Exception as e:
490
  print(f"TTS Error: {e}")
 
 
 
 
491
 
492
  print(f"--- PIPELINE END ---")
493
  return arabic_transcript, english_translation, (hp.sr, synthesized_audio_np)
 
345
  # Check stop condition but ensure minimum generation
346
  stop_prob = torch.sigmoid(stop_token[:, -1])
347
  if stop_prob > gate_threshold and frames_generated > 50: # Ensure at least 50 frames
348
+ print(f"TTS: Stopping at frame {frames_generated}, stop_prob: {stop_prob.item():.6f}")
349
  break
350
  else:
351
  stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)
352
  mel_lengths = torch.tensor(mel_padded.shape[1]).unsqueeze(0).to(DEVICE)
353
 
354
+ print(f"TTS: Generated {frames_generated} frames, final mel shape: {list(mel_postnet.shape)}")
355
  return mel_postnet, stop_token_outputs
356
  # --- (End of your model definitions) ---
357
 
 
418
  msg = "Error: Audio file not provided or not found."
419
  print(msg)
420
  # Return empty/default values
421
+ return "Error: No file", "", (hp.sr, np.zeros(hp.sr, dtype=np.float32)) # 1 second of silence
422
 
423
  # STT Stage
424
  arabic_transcript = "STT Error: Processing failed."
 
465
  print(english_translation)
466
 
467
  # TTS Stage
468
+ synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32) # Default to 1 second of silence
469
  if english_translation and not english_translation.startswith("TTT Error"):
470
  try:
471
  print("TTS: Synthesizing English speech...")
 
480
  audio_tensor = inverse_mel_spec_to_wav(mel_for_vocoder)
481
  synthesized_audio_np = audio_tensor.cpu().numpy()
482
  print(f"TTS: Synthesized audio shape: {synthesized_audio_np.shape}")
483
+
484
+ # Ensure audio is not empty
485
+ if synthesized_audio_np.size == 0:
486
+ print("TTS: Generated audio is empty, using silence")
487
+ synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
488
  else:
489
  print("TTS: Generated mel too small, using silence")
490
+ synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
491
  else:
492
  print("TTS: Generated mel is empty or too small, using silence")
493
+ synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
494
  except Exception as e:
495
  print(f"TTS Error: {e}")
496
+ synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32) # Fallback to silence
497
+ else:
498
+ print("TTS: Skipped due to TTT failure or empty translation")
499
+ synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
500
 
501
  print(f"--- PIPELINE END ---")
502
  return arabic_transcript, english_translation, (hp.sr, synthesized_audio_np)