Spaces:

MoHamdyy
/

Translation_Stack

Sleeping

App Files Community

MoHamdyy committed on Jul 13, 2025

Commit

e532f67

1 Parent(s): 2877500

Fix syntax error in TTS stage and complete pipeline

Browse files

Files changed (1) hide show

app.py +19 -1

app.py CHANGED Viewed

@@ -314,6 +314,12 @@ class TransformerTTS(nn.Module):
         mel_padded = SOS
         mel_lengths = torch.tensor(1).unsqueeze(0).to(DEVICE)
         stop_token_outputs = torch.FloatTensor([]).to(text.device)
         iters = range(max_length)
         for _ in iters:
             mel_postnet, mel_linear, stop_token = self(text, text_lengths, mel_padded, mel_lengths)
@@ -322,6 +328,18 @@ class TransformerTTS(nn.Module):
             if torch.sigmoid(stop_token[:, -1]) > stop_token_threshold:
                 break
             mel_padded = torch.cat([mel_padded, mel_postnet[:, -1:, :]], dim=1)
             stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)
             mel_lengths = torch.tensor(mel_padded.shape[1]).unsqueeze(0).to(DEVICE)
@@ -446,7 +464,7 @@ def full_speech_translation_pipeline(audio_input_path: str):
         try:
             print("TTS: Synthesizing English speech...")
             sequence = text_to_seq(english_translation).unsqueeze(0).to(DEVICE)
-            generated_mel, _ = TTS_MODEL.inference(sequence, max_length=hp.max_mel_time-50, stop_token_threshold=0.3, with_tqdm=False)
             print(f"TTS: Generated mel shape: {generated_mel.shape if generated_mel is not None else 'None'}")
             if generated_mel is not None and generated_mel.numel() > 0:

         mel_padded = SOS
         mel_lengths = torch.tensor(1).unsqueeze(0).to(DEVICE)
         stop_token_outputs = torch.FloatTensor([]).to(text.device)
+        # Silence detection parameters
+        silence_threshold = 0.1  # Consider frames below this as silence
+        consecutive_silence_limit = 20  # Stop after 20 consecutive silent frames
+        consecutive_silence_count = 0
         iters = range(max_length)
         for _ in iters:
             mel_postnet, mel_linear, stop_token = self(text, text_lengths, mel_padded, mel_lengths)
             if torch.sigmoid(stop_token[:, -1]) > stop_token_threshold:
                 break
+            # Check for silence in the generated mel frame
+            current_frame = mel_postnet[:, -1:, :]
+            frame_energy = torch.mean(torch.abs(current_frame))
+            if frame_energy < silence_threshold:
+                consecutive_silence_count += 1
+                if consecutive_silence_count >= consecutive_silence_limit:
+                    print(f"TTS: Stopping due to {consecutive_silence_limit} consecutive silent frames")
+                    break
+            else:
+                consecutive_silence_count = 0  # Reset silence counter
             mel_padded = torch.cat([mel_padded, mel_postnet[:, -1:, :]], dim=1)
             stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)
             mel_lengths = torch.tensor(mel_padded.shape[1]).unsqueeze(0).to(DEVICE)
         try:
             print("TTS: Synthesizing English speech...")
             sequence = text_to_seq(english_translation).unsqueeze(0).to(DEVICE)
+            generated_mel, _ = TTS_MODEL.inference(sequence, max_length=hp.max_mel_time-100, stop_token_threshold=0.1, with_tqdm=False)
             print(f"TTS: Generated mel shape: {generated_mel.shape if generated_mel is not None else 'None'}")
             if generated_mel is not None and generated_mel.numel() > 0: