Spaces:

MoHamdyy
/

Translation_Stack

Sleeping

App Files Community

MoHamdyy committed on Jul 13, 2025

Commit

4f298d6

1 Parent(s): 8785760

Fix syntax error in TTS stage and complete pipeline

Browse files

Files changed (1) hide show

app.py +15 -6

app.py CHANGED Viewed

@@ -345,13 +345,13 @@ class TransformerTTS(nn.Module):
             # Check stop condition but ensure minimum generation
             stop_prob = torch.sigmoid(stop_token[:, -1])
             if stop_prob > gate_threshold and frames_generated > 50:  # Ensure at least 50 frames
-                print(f"TTS: Stopping at frame {frames_generated}, stop_prob: {stop_prob:.6f}")
                 break
             else:
                 stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)
                 mel_lengths = torch.tensor(mel_padded.shape[1]).unsqueeze(0).to(DEVICE)
-        print(f"TTS: Generated {frames_generated} frames, final mel shape: {mel_postnet.shape}")
         return mel_postnet, stop_token_outputs
 # --- (End of your model definitions) ---
@@ -418,7 +418,7 @@ def full_speech_translation_pipeline(audio_input_path: str):
         msg = "Error: Audio file not provided or not found."
         print(msg)
         # Return empty/default values
-        return "Error: No file", "", (hp.sr, np.array([]).astype(np.float32))
     # STT Stage
     arabic_transcript = "STT Error: Processing failed."
@@ -465,7 +465,7 @@ def full_speech_translation_pipeline(audio_input_path: str):
         print(english_translation)
     # TTS Stage
-    synthesized_audio_np = np.array([]).astype(np.float32)
     if english_translation and not english_translation.startswith("TTT Error"):
         try:
             print("TTS: Synthesizing English speech...")
@@ -480,14 +480,23 @@ def full_speech_translation_pipeline(audio_input_path: str):
                     audio_tensor = inverse_mel_spec_to_wav(mel_for_vocoder)
                     synthesized_audio_np = audio_tensor.cpu().numpy()
                     print(f"TTS: Synthesized audio shape: {synthesized_audio_np.shape}")
                 else:
                     print("TTS: Generated mel too small, using silence")
-                    synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)  # 1 second of silence
             else:
                 print("TTS: Generated mel is empty or too small, using silence")
-                synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)  # 1 second of silence
         except Exception as e:
             print(f"TTS Error: {e}")
     print(f"--- PIPELINE END ---")
     return arabic_transcript, english_translation, (hp.sr, synthesized_audio_np)

             # Check stop condition but ensure minimum generation
             stop_prob = torch.sigmoid(stop_token[:, -1])
             if stop_prob > gate_threshold and frames_generated > 50:  # Ensure at least 50 frames
+                print(f"TTS: Stopping at frame {frames_generated}, stop_prob: {stop_prob.item():.6f}")
                 break
             else:
                 stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)
                 mel_lengths = torch.tensor(mel_padded.shape[1]).unsqueeze(0).to(DEVICE)
+        print(f"TTS: Generated {frames_generated} frames, final mel shape: {list(mel_postnet.shape)}")
         return mel_postnet, stop_token_outputs
 # --- (End of your model definitions) ---
         msg = "Error: Audio file not provided or not found."
         print(msg)
         # Return empty/default values
+        return "Error: No file", "", (hp.sr, np.zeros(hp.sr, dtype=np.float32))  # 1 second of silence
     # STT Stage
     arabic_transcript = "STT Error: Processing failed."
         print(english_translation)
     # TTS Stage
+    synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)  # Default to 1 second of silence
     if english_translation and not english_translation.startswith("TTT Error"):
         try:
             print("TTS: Synthesizing English speech...")
                     audio_tensor = inverse_mel_spec_to_wav(mel_for_vocoder)
                     synthesized_audio_np = audio_tensor.cpu().numpy()
                     print(f"TTS: Synthesized audio shape: {synthesized_audio_np.shape}")
+                    # Ensure audio is not empty
+                    if synthesized_audio_np.size == 0:
+                        print("TTS: Generated audio is empty, using silence")
+                        synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
                 else:
                     print("TTS: Generated mel too small, using silence")
+                    synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
             else:
                 print("TTS: Generated mel is empty or too small, using silence")
+                synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
         except Exception as e:
             print(f"TTS Error: {e}")
+            synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)  # Fallback to silence
+    else:
+        print("TTS: Skipped due to TTT failure or empty translation")
+        synthesized_audio_np = np.zeros(hp.sr, dtype=np.float32)
     print(f"--- PIPELINE END ---")
     return arabic_transcript, english_translation, (hp.sr, synthesized_audio_np)