Spaces:
Sleeping
Sleeping
Fix syntax error in TTS stage and complete pipeline
Browse files
app.py
CHANGED
|
@@ -306,80 +306,46 @@ class TransformerTTS(nn.Module):
|
|
| 306 |
return mel_postnet, mel_linear, stop_token
|
| 307 |
|
| 308 |
@torch.no_grad()
|
| 309 |
-
def inference(self, text, max_length=800,
|
| 310 |
-
self.eval()
|
|
|
|
| 311 |
text_lengths = torch.tensor(text.shape[1]).unsqueeze(0).to(DEVICE)
|
| 312 |
N = 1
|
| 313 |
SOS = torch.zeros((N, 1, hp.mel_freq), device=DEVICE)
|
|
|
|
| 314 |
mel_padded = SOS
|
| 315 |
mel_lengths = torch.tensor(1).unsqueeze(0).to(DEVICE)
|
| 316 |
stop_token_outputs = torch.FloatTensor([]).to(text.device)
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
-
|
| 334 |
-
if torch.sigmoid(stop_token[:, -1]) > stop_token_threshold:
|
| 335 |
-
print(f"TTS: Stopping due to stop token at frame {i}")
|
| 336 |
break
|
| 337 |
-
|
| 338 |
-
current_frame = mel_postnet[:, -1:, :]
|
| 339 |
-
frame_energy = torch.mean(torch.abs(current_frame))
|
| 340 |
-
|
| 341 |
-
# Check for silence with higher threshold
|
| 342 |
-
if frame_energy < silence_threshold:
|
| 343 |
-
consecutive_silence_count += 1
|
| 344 |
-
if consecutive_silence_count >= consecutive_silence_limit:
|
| 345 |
-
print(f"TTS: Stopping due to {consecutive_silence_limit} consecutive silent frames at frame {i}")
|
| 346 |
-
break
|
| 347 |
else:
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
# Compare current frame with recent frames
|
| 353 |
-
current_flat = current_frame.flatten()
|
| 354 |
-
is_repetitive = False
|
| 355 |
-
|
| 356 |
-
for prev_frame in previous_frames[-3:]: # Check last 3 frames
|
| 357 |
-
prev_flat = prev_frame.flatten()
|
| 358 |
-
# Calculate cosine similarity
|
| 359 |
-
similarity = torch.cosine_similarity(current_flat, prev_flat, dim=0)
|
| 360 |
-
if similarity > repetition_threshold:
|
| 361 |
-
repetition_count += 1
|
| 362 |
-
is_repetitive = True
|
| 363 |
-
break
|
| 364 |
-
|
| 365 |
-
if is_repetitive and repetition_count >= repetition_limit:
|
| 366 |
-
print(f"TTS: Stopping due to repetitive content at frame {i}")
|
| 367 |
-
break
|
| 368 |
-
elif not is_repetitive:
|
| 369 |
-
repetition_count = 0 # Reset if not repetitive
|
| 370 |
-
|
| 371 |
-
# Keep track of recent frames for repetition detection
|
| 372 |
-
previous_frames.append(current_frame.clone())
|
| 373 |
-
if len(previous_frames) > 5: # Keep only last 5 frames
|
| 374 |
-
previous_frames.pop(0)
|
| 375 |
-
|
| 376 |
-
mel_padded = torch.cat([mel_padded, mel_postnet[:, -1:, :]], dim=1)
|
| 377 |
-
stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)
|
| 378 |
-
mel_lengths = torch.tensor(mel_padded.shape[1]).unsqueeze(0).to(DEVICE)
|
| 379 |
-
|
| 380 |
-
# Remove the initial SOS token and return only the generated mel
|
| 381 |
-
generated_mel = mel_padded[:, 1:, :] # Remove first frame (SOS)
|
| 382 |
-
return generated_mel, stop_token_outputs
|
| 383 |
# --- (End of your model definitions) ---
|
| 384 |
|
| 385 |
# --- Part 2: Model Loading ---
|
|
@@ -497,7 +463,7 @@ def full_speech_translation_pipeline(audio_input_path: str):
|
|
| 497 |
try:
|
| 498 |
print("TTS: Synthesizing English speech...")
|
| 499 |
sequence = text_to_seq(english_translation).unsqueeze(0).to(DEVICE)
|
| 500 |
-
generated_mel, _ = TTS_MODEL.inference(sequence, max_length=hp.max_mel_time-50,
|
| 501 |
|
| 502 |
print(f"TTS: Generated mel shape: {generated_mel.shape if generated_mel is not None else 'None'}")
|
| 503 |
if generated_mel is not None and generated_mel.numel() > 0:
|
|
|
|
| 306 |
return mel_postnet, mel_linear, stop_token
|
| 307 |
|
| 308 |
@torch.no_grad()
def inference(self, text, max_length=800, gate_threshold=1e-5, with_tqdm=True):
    """Autoregressively synthesize a mel spectrogram from tokenized text.

    Decoding starts from a single all-zero SOS frame and repeatedly runs the
    full (non-incremental) forward pass, appending the last predicted postnet
    frame, until either ``max_length`` iterations elapse or the stop-token
    gate fires.

    Args:
        text: token-id tensor of shape (1, text_len), already on DEVICE.
        max_length: maximum number of decoding iterations.
        gate_threshold: decoding stops when sigmoid(stop logit) exceeds this.
            NOTE(review): 1e-5 is extremely permissive — sigmoid output
            exceeds it for any logit above roughly -11.5, so the loop stops
            as soon as the stop logit is not strongly negative. Confirm this
            matches how the stop token was trained.
        with_tqdm: if True, wrap the decoding loop in a tqdm progress bar.

    Returns:
        Tuple of (mel_postnet, stop_token_outputs): the postnet mel output of
        the final forward pass (covers the whole generated sequence including
        the frame derived from SOS) and the concatenated per-frame stop-token
        logits (the frame that triggered the stop is not included).
    """
    # eval() already sets training=False recursively, so the original
    # follow-up self.train(False) was redundant and has been removed.
    self.eval()

    text_lengths = torch.tensor(text.shape[1]).unsqueeze(0).to(DEVICE)
    N = 1
    # All-zero start-of-sequence frame seeds the autoregressive decoder.
    SOS = torch.zeros((N, 1, hp.mel_freq), device=DEVICE)

    mel_padded = SOS
    mel_lengths = torch.tensor(1).unsqueeze(0).to(DEVICE)
    stop_token_outputs = torch.FloatTensor([]).to(text.device)

    if with_tqdm:
        # Local import keeps tqdm optional when with_tqdm=False.
        from tqdm import tqdm
        iters = tqdm(range(max_length))
    else:
        iters = range(max_length)

    for _ in iters:
        # Re-run the full forward pass over everything generated so far.
        # O(T^2) overall, but simple and matches training-time behavior.
        mel_postnet, mel_linear, stop_token = self(
            text,
            text_lengths,
            mel_padded,
            mel_lengths,
        )

        # Append the newest predicted frame to the decoder input.
        mel_padded = torch.cat([mel_padded, mel_postnet[:, -1:, :]], dim=1)

        if torch.sigmoid(stop_token[:, -1]) > gate_threshold:
            break
        stop_token_outputs = torch.cat(
            [stop_token_outputs, stop_token[:, -1:]], dim=1
        )
        mel_lengths = torch.tensor(mel_padded.shape[1]).unsqueeze(0).to(DEVICE)

    return mel_postnet, stop_token_outputs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
# --- (End of your model definitions) ---
|
| 350 |
|
| 351 |
# --- Part 2: Model Loading ---
|
|
|
|
| 463 |
try:
|
| 464 |
print("TTS: Synthesizing English speech...")
|
| 465 |
sequence = text_to_seq(english_translation).unsqueeze(0).to(DEVICE)
|
| 466 |
+
generated_mel, _ = TTS_MODEL.inference(sequence, max_length=hp.max_mel_time-50, gate_threshold=1e-5, with_tqdm=False)
|
| 467 |
|
| 468 |
print(f"TTS: Generated mel shape: {generated_mel.shape if generated_mel is not None else 'None'}")
|
| 469 |
if generated_mel is not None and generated_mel.numel() > 0:
|