Spaces:

MoHamdyy
/

Translation_Stack

Sleeping

MoHamdyy committed on Jul 13, 2025

Commit

2877500

1 Parent(s): 91c76f8

Fix syntax error in TTS stage and complete pipeline

Files changed (1) hide show

app.py CHANGED Viewed

@@ -317,13 +317,18 @@ class TransformerTTS(nn.Module):
         iters = range(max_length)
         for _ in iters:
             mel_postnet, mel_linear, stop_token = self(text, text_lengths, mel_padded, mel_lengths)
-            mel_padded = torch.cat([mel_padded, mel_postnet[:, -1:, :]], dim=1)
             if torch.sigmoid(stop_token[:, -1]) > stop_token_threshold:
                 break
-            else:
-                stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)
-                mel_lengths = torch.tensor(mel_padded.shape[1]).unsqueeze(0).to(DEVICE)
-        return mel_postnet, stop_token_outputs
 # --- (End of your model definitions) ---
 # --- Part 2: Model Loading ---
@@ -441,7 +446,7 @@ def full_speech_translation_pipeline(audio_input_path: str):
         try:
             print("TTS: Synthesizing English speech...")
             sequence = text_to_seq(english_translation).unsqueeze(0).to(DEVICE)
-            generated_mel, _ = TTS_MODEL.inference(sequence, max_length=hp.max_mel_time-20, stop_token_threshold=0.3, with_tqdm=False)
             print(f"TTS: Generated mel shape: {generated_mel.shape if generated_mel is not None else 'None'}")
             if generated_mel is not None and generated_mel.numel() > 0:

         iters = range(max_length)
         for _ in iters:
             mel_postnet, mel_linear, stop_token = self(text, text_lengths, mel_padded, mel_lengths)
+            # Check stop token BEFORE adding to mel_padded
             if torch.sigmoid(stop_token[:, -1]) > stop_token_threshold:
                 break
+            mel_padded = torch.cat([mel_padded, mel_postnet[:, -1:, :]], dim=1)
+            stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)
+            mel_lengths = torch.tensor(mel_padded.shape[1]).unsqueeze(0).to(DEVICE)
+        # Remove the initial SOS token and return only the generated mel
+        generated_mel = mel_padded[:, 1:, :]  # Remove first frame (SOS)
+        return generated_mel, stop_token_outputs
 # --- (End of your model definitions) ---
 # --- Part 2: Model Loading ---
         try:
             print("TTS: Synthesizing English speech...")
             sequence = text_to_seq(english_translation).unsqueeze(0).to(DEVICE)
+            generated_mel, _ = TTS_MODEL.inference(sequence, max_length=hp.max_mel_time-50, stop_token_threshold=0.3, with_tqdm=False)
             print(f"TTS: Generated mel shape: {generated_mel.shape if generated_mel is not None else 'None'}")
             if generated_mel is not None and generated_mel.numel() > 0: