Spaces: Sleeping
Fix syntax error in TTS stage and complete pipeline
Browse files
app.py
CHANGED
|
@@ -345,13 +345,13 @@ class TransformerTTS(nn.Module):
|
|
| 345 |
# Check stop condition but ensure minimum generation
|
| 346 |
stop_prob = torch.sigmoid(stop_token[:, -1])
|
| 347 |
if stop_prob > gate_threshold and frames_generated > 50: # Ensure at least 50 frames
|
| 348 |
-
print(f"TTS: Stopping at frame {frames_generated}, stop_prob: {stop_prob:.6f}")
|
| 349 |
break
|
| 350 |
else:
|
| 351 |
stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)
|
| 352 |
mel_lengths = torch.tensor(mel_padded.shape[1]).unsqueeze(0).to(DEVICE)
|
| 353 |
|
| 354 |
-
print(f"TTS: Generated {frames_generated} frames, final mel shape: {mel_postnet.shape}")
|
| 355 |
return mel_postnet, stop_token_outputs
|
| 356 |
# --- (End of your model definitions) ---
|
| 357 |
|
|
@@ -418,7 +418,7 @@ def full_speech_translation_pipeline(audio_input_path: str):
|
|
| 418 |
msg = "Error: Audio file not provided or not found."
|
| 419 |
print(msg)
|
| 420 |
# Return empty/default values
|
| 421 |
-
return "Error: No file", "", (hp.sr, np.
|
| 422 |
|
| 423 |
# STT Stage
|
| 424 |
arabic_transcript = "STT Error: Processing failed."
|
|
@@ -465,7 +465,7 @@ def full_speech_translation_pipeline(audio_input_path: str):
|
|
| 465 |
print(english_translation)
|
| 466 |
|
| 467 |
# TTS Stage
|
| 468 |
-
synthesized_audio_np = np.
|
| 469 |
if english_translation and not english_translation.startswith("TTT Error"):
|
| 470 |
try:
|
| 471 |
print("TTS: Synthesizing English speech...")
|
|
@@ -480,14 +480,23 @@ def full_speech_translation_pipeline(audio_input_path: str):
|
|
| 480 |
audio_tensor = inverse_mel_spec_to_wav(mel_for_vocoder)
|
| 481 |
synthesized_audio_np = audio_tensor.cpu().numpy()
|
| 482 |
print(f"TTS: Synthesized audio shape: {synthesized_audio_np.shape}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
else:
|
| 484 |
print("TTS: Generated mel too small, using silence")
|
| 485 |
-
synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
|
| 486 |
else:
|
| 487 |
print("TTS: Generated mel is empty or too small, using silence")
|
| 488 |
-
synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
|
| 489 |
except Exception as e:
|
| 490 |
print(f"TTS Error: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 491 |
|
| 492 |
print(f"--- PIPELINE END ---")
|
| 493 |
return arabic_transcript, english_translation, (hp.sr, synthesized_audio_np)
|
|
|
|
| 345 |
# Check stop condition but ensure minimum generation
|
| 346 |
stop_prob = torch.sigmoid(stop_token[:, -1])
|
| 347 |
if stop_prob > gate_threshold and frames_generated > 50: # Ensure at least 50 frames
|
| 348 |
+
print(f"TTS: Stopping at frame {frames_generated}, stop_prob: {stop_prob.item():.6f}")
|
| 349 |
break
|
| 350 |
else:
|
| 351 |
stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)
|
| 352 |
mel_lengths = torch.tensor(mel_padded.shape[1]).unsqueeze(0).to(DEVICE)
|
| 353 |
|
| 354 |
+
print(f"TTS: Generated {frames_generated} frames, final mel shape: {list(mel_postnet.shape)}")
|
| 355 |
return mel_postnet, stop_token_outputs
|
| 356 |
# --- (End of your model definitions) ---
|
| 357 |
|
|
|
|
| 418 |
msg = "Error: Audio file not provided or not found."
|
| 419 |
print(msg)
|
| 420 |
# Return empty/default values
|
| 421 |
+
return "Error: No file", "", (hp.sr, np.zeros(hp.sr, dtype=np.float32)) # 1 second of silence
|
| 422 |
|
| 423 |
# STT Stage
|
| 424 |
arabic_transcript = "STT Error: Processing failed."
|
|
|
|
| 465 |
print(english_translation)
|
| 466 |
|
| 467 |
# TTS Stage
|
| 468 |
+
synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32) # Default to 1 second of silence
|
| 469 |
if english_translation and not english_translation.startswith("TTT Error"):
|
| 470 |
try:
|
| 471 |
print("TTS: Synthesizing English speech...")
|
|
|
|
| 480 |
audio_tensor = inverse_mel_spec_to_wav(mel_for_vocoder)
|
| 481 |
synthesized_audio_np = audio_tensor.cpu().numpy()
|
| 482 |
print(f"TTS: Synthesized audio shape: {synthesized_audio_np.shape}")
|
| 483 |
+
|
| 484 |
+
# Ensure audio is not empty
|
| 485 |
+
if synthesized_audio_np.size == 0:
|
| 486 |
+
print("TTS: Generated audio is empty, using silence")
|
| 487 |
+
synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
|
| 488 |
else:
|
| 489 |
print("TTS: Generated mel too small, using silence")
|
| 490 |
+
synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
|
| 491 |
else:
|
| 492 |
print("TTS: Generated mel is empty or too small, using silence")
|
| 493 |
+
synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
|
| 494 |
except Exception as e:
|
| 495 |
print(f"TTS Error: {e}")
|
| 496 |
+
synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32) # Fallback to silence
|
| 497 |
+
else:
|
| 498 |
+
print("TTS: Skipped due to TTT failure or empty translation")
|
| 499 |
+
synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
|
| 500 |
|
| 501 |
print(f"--- PIPELINE END ---")
|
| 502 |
return arabic_transcript, english_translation, (hp.sr, synthesized_audio_np)
|