NGHIA_Test_Edge_TTS_transcript_w_timestamp

Sleeping

App Files Files Community

cnph001 committed on May 12, 2025

Commit

9733dac

verified ·

1 Parent(s): d474bed

Update app.py

Browse files

Restore working version without post filter

Files changed (1) hide show

app.py +2 -91

app.py CHANGED Viewed

@@ -12,45 +12,6 @@ import soundfile as sf
 import numpy as np
 from pydub import AudioSegment
 from pydub.playback import play
-import math
-from scipy.signal import butter, sosfiltfilt
-def apply_low_pass_filter(audio_segment, cutoff_freq, order=6):
-    """
-    Applies a low-pass filter to an AudioSegment.
-    Args:
-        audio_segment: The AudioSegment to filter.
-        cutoff_freq: The cutoff frequency in Hz.
-        order: The order of the Butterworth filter.
-    Returns:
-        A new AudioSegment with the filtered audio.
-    """
-    segment_array = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
-    frame_rate = audio_segment.frame_rate
-    nyquist_freq = 0.5 * frame_rate
-    normalized_cutoff = cutoff_freq / nyquist_freq
-    sos = butter(order, normalized_cutoff, btype='low', output='sos')
-    filtered_array = sosfiltfilt(sos, segment_array)
-    sample_width = audio_segment.sample_width
-    dtype = None
-    if sample_width == 1:
-        dtype = np.int8
-    elif sample_width == 2:
-        dtype = np.int16
-    elif sample_width == 3:
-        dtype = np.int32  # Or potentially a custom type depending on the library
-    elif sample_width == 4:
-        dtype = np.int32
-    if dtype is not None:
-        return audio_segment._spawn(filtered_array.astype(dtype))
-    else:
-        raise ValueError(f"Unsupported sample width: {sample_width}")
 def get_silence(duration_ms=1000):
@@ -128,19 +89,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
     match = re.search(r'([A-Za-z]+)([-]?\d*)', processed_text)
     if match:
         prefix_pitch = match.group(1)
-        number_str = match.group(2)
-        if number_str:  # Check if the second group (number part) is not empty
-            try:
-                number = int(number_str)
-                # Now you can use the 'number' variable
-                print(f"Prefix: {prefix_pitch}, Number: {number}") # Example usage
-            except ValueError as e:
-                print(f"Error converting number string to int: {e}")
-                number = 0 # Or some other default value
-        else:
-            number = 0  # Or some other default value if no number is found
-            print(f"Prefix: {prefix_pitch}, No number found.") # Example handling
         if prefix_pitch in voice_map:
             current_pitch += number
             #processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
@@ -278,32 +227,18 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
                             os.remove(path)
                         except FileNotFoundError:
                             print(f"Warning: Audio file not found: {path}")
-                Rem1='''
-                if combined_line_audio and overall_duration_ms is not None and overall_duration_ms > 0 and total_generated_duration_ms > overall_duration_ms:
-                    speed_factor = (total_generated_duration_ms / overall_duration_ms) * speed_adjustment_factor
-                    if speed_factor > 0:
-                        if speed_factor < 1.0:
-                            speed_factor = 1.0
-                        combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
-                if combined_line_audio:
-                    timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
-                    max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
-                    '''
                 if combined_line_audio and overall_duration_ms is not None and overall_duration_ms > 0 and total_generated_duration_ms > overall_duration_ms:
                     speed_factor = (total_generated_duration_ms / overall_duration_ms) * speed_adjustment_factor
                     if speed_factor > 0:
                         if speed_factor < 1.0:
                             speed_factor = 1.0
                         combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
-                        # Apply low-pass filter AFTER speed adjustment
-                        cutoff_freq = 7000.0  # Adjust as needed
-                        combined_line_audio = apply_low_pass_filter(combined_line_audio, cutoff_freq)
                 if combined_line_audio:
                     timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
                     max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
             elif audio_paths:
                 for path in audio_paths:
                     if path:
@@ -311,38 +246,14 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
                             os.remove(path)
                         except FileNotFoundError:
                             pass # Clean up even if no timestamp
     if not timed_audio_segments:
         return None, "No processable audio segments found."
-    oldx= '''
     final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
     for segment in timed_audio_segments:
         final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
-    '''
-    final_audio = AudioSegment.silent(duration=int(max_end_time_ms * 1000 + 500), frame_rate=24000)
-    for segment in timed_audio_segments:
-        start_position_ms = int(segment['start'] * 1000)
-        audio_to_overlay = segment['audio']
-        if start_position_ms + len(audio_to_overlay) > len(final_audio):
-            padding_needed = (start_position_ms + len(audio_to_overlay)) - len(final_audio)
-            final_audio += AudioSegment.silent(duration=padding_needed + 100, frame_rate=final_audio.frame_rate)
-        try:
-            final_audio = final_audio.overlay(audio_to_overlay, position=start_position_ms)
-        except Exception as e:
-            print(f"Error during overlay: {e}")
-            print(f"  - Start position (ms): {start_position_ms}")
-            print(f"  - Length of audio to overlay (ms): {len(audio_to_overlay)}")
-            print(f"  - Length of final_audio (ms): {len(final_audio)}")
-            # Consider adding logic here to handle the error, e.g., truncating audio_to_overlay
-            # or skipping the overlay if it consistently fails.
     combined_audio_path = tempfile.mktemp(suffix=".mp3")
     final_audio.export(combined_audio_path, format="mp3")
     return combined_audio_path, None

 import numpy as np
 from pydub import AudioSegment
 from pydub.playback import play
 def get_silence(duration_ms=1000):
     match = re.search(r'([A-Za-z]+)([-]?\d*)', processed_text)
     if match:
         prefix_pitch = match.group(1)
+        number = int(match.group(2))
         if prefix_pitch in voice_map:
             current_pitch += number
             #processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
                             os.remove(path)
                         except FileNotFoundError:
                             print(f"Warning: Audio file not found: {path}")
                 if combined_line_audio and overall_duration_ms is not None and overall_duration_ms > 0 and total_generated_duration_ms > overall_duration_ms:
                     speed_factor = (total_generated_duration_ms / overall_duration_ms) * speed_adjustment_factor
                     if speed_factor > 0:
                         if speed_factor < 1.0:
                             speed_factor = 1.0
                         combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
                 if combined_line_audio:
                     timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
                     max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
             elif audio_paths:
                 for path in audio_paths:
                     if path:
                             os.remove(path)
                         except FileNotFoundError:
                             pass # Clean up even if no timestamp
     if not timed_audio_segments:
         return None, "No processable audio segments found."
     final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
     for segment in timed_audio_segments:
         final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
     combined_audio_path = tempfile.mktemp(suffix=".mp3")
     final_audio.export(combined_audio_path, format="mp3")
     return combined_audio_path, None