Spaces:

MoHamdyy
/

Translation_Stack

Sleeping

App Files Community

MoHamdyy committed on Jul 13, 2025

Commit

2fbce4b

1 Parent(s): e532f67

Fix syntax error in TTS stage and complete pipeline

Browse files

Files changed (1) hide show

app.py +41 -8

app.py CHANGED Viewed

@@ -315,30 +315,63 @@ class TransformerTTS(nn.Module):
         mel_lengths = torch.tensor(1).unsqueeze(0).to(DEVICE)
         stop_token_outputs = torch.FloatTensor([]).to(text.device)
-        # Silence detection parameters
-        silence_threshold = 0.1  # Consider frames below this as silence
-        consecutive_silence_limit = 20  # Stop after 20 consecutive silent frames
         consecutive_silence_count = 0
         iters = range(max_length)
-        for _ in iters:
             mel_postnet, mel_linear, stop_token = self(text, text_lengths, mel_padded, mel_lengths)
-            # Check stop token BEFORE adding to mel_padded
             if torch.sigmoid(stop_token[:, -1]) > stop_token_threshold:
                 break
-            # Check for silence in the generated mel frame
             current_frame = mel_postnet[:, -1:, :]
             frame_energy = torch.mean(torch.abs(current_frame))
             if frame_energy < silence_threshold:
                 consecutive_silence_count += 1
                 if consecutive_silence_count >= consecutive_silence_limit:
-                    print(f"TTS: Stopping due to {consecutive_silence_limit} consecutive silent frames")
                     break
             else:
-                consecutive_silence_count = 0  # Reset silence counter
             mel_padded = torch.cat([mel_padded, mel_postnet[:, -1:, :]], dim=1)
             stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)

         mel_lengths = torch.tensor(1).unsqueeze(0).to(DEVICE)
         stop_token_outputs = torch.FloatTensor([]).to(text.device)
+        # More aggressive stopping parameters
+        silence_threshold = 0.2  # Increased from 0.1 to catch low-energy repetitions
+        consecutive_silence_limit = 10  # Reduced from 20 to stop faster
         consecutive_silence_count = 0
+        # Repetition detection parameters
+        repetition_threshold = 0.95  # Cosine similarity threshold for detecting repeated frames
+        repetition_limit = 8  # Stop after 8 similar consecutive frames
+        repetition_count = 0
+        previous_frames = []
         iters = range(max_length)
+        for i, _ in enumerate(iters):
             mel_postnet, mel_linear, stop_token = self(text, text_lengths, mel_padded, mel_lengths)
+            # Check stop token BEFORE adding to mel_padded (even more aggressive)
             if torch.sigmoid(stop_token[:, -1]) > stop_token_threshold:
+                print(f"TTS: Stopping due to stop token at frame {i}")
                 break
             current_frame = mel_postnet[:, -1:, :]
             frame_energy = torch.mean(torch.abs(current_frame))
+            # Check for silence with higher threshold
             if frame_energy < silence_threshold:
                 consecutive_silence_count += 1
                 if consecutive_silence_count >= consecutive_silence_limit:
+                    print(f"TTS: Stopping due to {consecutive_silence_limit} consecutive silent frames at frame {i}")
                     break
             else:
+                consecutive_silence_count = 0
+            # NEW: Check for repetitive content (detecting loops)
+            if len(previous_frames) >= 3:  # Start checking after a few frames
+                # Compare current frame with recent frames
+                current_flat = current_frame.flatten()
+                is_repetitive = False
+                for prev_frame in previous_frames[-3:]:  # Check last 3 frames
+                    prev_flat = prev_frame.flatten()
+                    # Calculate cosine similarity
+                    similarity = torch.cosine_similarity(current_flat, prev_flat, dim=0)
+                    if similarity > repetition_threshold:
+                        repetition_count += 1
+                        is_repetitive = True
+                        break
+                if is_repetitive and repetition_count >= repetition_limit:
+                    print(f"TTS: Stopping due to repetitive content at frame {i}")
+                    break
+                elif not is_repetitive:
+                    repetition_count = 0  # Reset if not repetitive
+            # Keep track of recent frames for repetition detection
+            previous_frames.append(current_frame.clone())
+            if len(previous_frames) > 5:  # Keep only last 5 frames
+                previous_frames.pop(0)
             mel_padded = torch.cat([mel_padded, mel_postnet[:, -1:, :]], dim=1)
             stop_token_outputs = torch.cat([stop_token_outputs, stop_token[:, -1:]], dim=1)