NGHIA_Test_Edge_TTS_transcript_w_timestamp

Sleeping

App Files Files Community

cnph001 committed on May 11, 2025

Commit

ad48cb2

verified ·

1 Parent(s): 2899f8f

Update app.py

Browse files

attempt to fix voice distortion when stretch is applied

Files changed (1) hide show

app.py +18 -1

app.py CHANGED Viewed

@@ -102,7 +102,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
             if target_duration_ms is not None and os.path.exists(audio_path):
                 audio = AudioSegment.from_mp3(audio_path)
                 audio_duration_ms = len(audio)
-                #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
                 if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
                     speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
                     #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
@@ -111,7 +111,24 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
                             speed_factor = 1.0
                         y, sr = librosa.load(audio_path, sr=None)
                         y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
                         sf.write(audio_path, y_stretched, sr)
                 else:
                     print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
             return audio_path

             if target_duration_ms is not None and os.path.exists(audio_path):
                 audio = AudioSegment.from_mp3(audio_path)
                 audio_duration_ms = len(audio)
+"""             #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
                 if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
                     speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
                     #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
                             speed_factor = 1.0
                         y, sr = librosa.load(audio_path, sr=None)
                         y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
+                        sf.write(audio_path, y_stretched, sr) """
+                if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
+                    speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
+                    if speed_factor > 0:
+                        if speed_factor < 1.0:
+                            speed_factor = 1.0
+                        y, sr = librosa.load(audio_path, sr=None)
+                        # Use the phase vocoder for time stretching without pitch change
+                        hop_length = 512  # You can adjust this parameter
+                        phase_vocoder_output = librosa.phase_vocoder(y, rate=speed_factor, hop_length=hop_length)
+                        # Reconstruct the audio signal from the phase vocoder output
+                        y_stretched = librosa.istft(phase_vocoder_output, hop_length=hop_length, length=len(y) if speed_factor < 1 else None)
                         sf.write(audio_path, y_stretched, sr)
                 else:
                     print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
             return audio_path