Spaces:

MoHamdyy
/

Translation_Stack

Sleeping

MoHamdyy committed on Jul 13, 2025

Commit

8fbabff

1 Parent(s): 2fbce4b

Fix syntax error in TTS stage and complete pipeline

Files changed (1) hide show

app.py CHANGED Viewed

@@ -315,14 +315,14 @@ class TransformerTTS(nn.Module):
         mel_lengths = torch.tensor(1).unsqueeze(0).to(DEVICE)
         stop_token_outputs = torch.FloatTensor([]).to(text.device)
-        # More aggressive stopping parameters
-        silence_threshold = 0.2  # Increased from 0.1 to catch low-energy repetitions
-        consecutive_silence_limit = 10  # Reduced from 20 to stop faster
         consecutive_silence_count = 0
-        # Repetition detection parameters
-        repetition_threshold = 0.95  # Cosine similarity threshold for detecting repeated frames
-        repetition_limit = 8  # Stop after 8 similar consecutive frames
         repetition_count = 0
         previous_frames = []
@@ -497,7 +497,7 @@ def full_speech_translation_pipeline(audio_input_path: str):
         try:
             print("TTS: Synthesizing English speech...")
             sequence = text_to_seq(english_translation).unsqueeze(0).to(DEVICE)
-            generated_mel, _ = TTS_MODEL.inference(sequence, max_length=hp.max_mel_time-100, stop_token_threshold=0.1, with_tqdm=False)
             print(f"TTS: Generated mel shape: {generated_mel.shape if generated_mel is not None else 'None'}")
             if generated_mel is not None and generated_mel.numel() > 0:

         mel_lengths = torch.tensor(1).unsqueeze(0).to(DEVICE)
         stop_token_outputs = torch.FloatTensor([]).to(text.device)
+        # More balanced stopping parameters
+        silence_threshold = 0.05  # Much lower - only catch true silence
+        consecutive_silence_limit = 50  # Much higher - allow for natural pauses
         consecutive_silence_count = 0
+        # Less aggressive repetition detection parameters
+        repetition_threshold = 0.98  # Higher threshold - only catch very similar frames
+        repetition_limit = 20  # Allow more repetitive frames before stopping
         repetition_count = 0
         previous_frames = []
         try:
             print("TTS: Synthesizing English speech...")
             sequence = text_to_seq(english_translation).unsqueeze(0).to(DEVICE)
+            generated_mel, _ = TTS_MODEL.inference(sequence, max_length=hp.max_mel_time-50, stop_token_threshold=0.2, with_tqdm=False)
             print(f"TTS: Generated mel shape: {generated_mel.shape if generated_mel is not None else 'None'}")
             if generated_mel is not None and generated_mel.numel() > 0: