Spaces:

sdafd
/

whisperx-test

Running

App Files Community

sdafd committed on Feb 26, 2025

Commit

435283b

verified ·

1 Parent(s): be1d19d

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -42

app.py CHANGED Viewed

@@ -134,33 +134,7 @@ def get_vocals(input_file):
         print(f"Unexpected error in get_vocals: {ex}")
         return None
-# -------------------------------
-# Advanced Normalization Function
-# -------------------------------
-def advanced_normalize_audio(audio, threshold_ratio=0.4, window_size=1024):
-    """
-    This advanced normalization function computes a moving-average envelope of the absolute
-    audio signal using a specified window size. It then zeroes out portions of the signal
-    where the envelope falls below a threshold (defined as a ratio of the maximum envelope value).
-    Parameters:
-        audio (np.ndarray): Input audio signal.
-        threshold_ratio (float): Ratio (0-1) to determine the minimum envelope value to keep.
-        window_size (int): Size of the moving window used to compute the envelope.
-    Returns:
-        np.ndarray: The normalized audio signal.
-    """
-    # Compute moving-average envelope
-    envelope = np.convolve(np.abs(audio), np.ones(window_size) / window_size, mode='same')
-    max_env = np.max(envelope)
-    threshold = threshold_ratio * max_env
-    # Create a mask: keep samples where the envelope meets or exceeds the threshold.
-    print(envelope)
-    mask = envelope >= threshold
-    # Optionally, you might smooth the mask further to avoid abrupt cuts.
-    normalized_audio = audio * mask.astype(audio.dtype)
-    return normalized_audio
 # -------------------------------
 # Logging and Model Setup
@@ -181,16 +155,8 @@ models = {
     "large-v3": whisperx.load_model("large-v3", device, compute_type=compute_type, vad_method='silero'),
 }
-def split_audio_by_pause(audio, sr, pause_threshold, top_db=30):
-    """
-    Splits the audio into segments using librosa's non-silent detection.
-    Adjacent non-silent intervals are merged if the gap between them is less than the pause_threshold.
-    Returns a list of (start_sample, end_sample) tuples.
-    """
     intervals = librosa.effects.split(audio, top_db=top_db)
-    if intervals.size == 0:
-        return [(0, len(audio))]
     merged_intervals = []
     current_start, current_end = intervals[0]
@@ -202,7 +168,16 @@ def split_audio_by_pause(audio, sr, pause_threshold, top_db=30):
             merged_intervals.append((current_start, current_end))
             current_start, current_end = start, end
     merged_intervals.append((current_start, current_end))
-    return merged_intervals
 # -------------------------------
 # Main Transcription Function
@@ -234,11 +209,6 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
         audio, sr = librosa.load(audio_file, sr=16000)
         debug_log.append(f"Audio loaded: {len(audio)/sr:.2f} seconds long at {sr} Hz")
-        # If vocal extraction was used, apply advanced normalization
-        #if vocal_extraction:
-        #    audio = advanced_normalize_audio(audio)
-        #    debug_log.append("Advanced normalization applied to extracted audio to remove low-amplitude segments.")
         # Select the model and set batch size
         model = models[model_size]
         batch_size = 8 if model_size == "tiny" else 4

         print(f"Unexpected error in get_vocals: {ex}")
         return None
 # -------------------------------
 # Logging and Model Setup
     "large-v3": whisperx.load_model("large-v3", device, compute_type=compute_type, vad_method='silero'),
 }
+def split_audio_by_pause(audio, sr, pause_threshold, top_db=30, energy_threshold=0.05):
     intervals = librosa.effects.split(audio, top_db=top_db)
     merged_intervals = []
     current_start, current_end = intervals[0]
             merged_intervals.append((current_start, current_end))
             current_start, current_end = start, end
     merged_intervals.append((current_start, current_end))
+    # Filter out segments with low average RMS energy
+    filtered_intervals = []
+    for start, end in merged_intervals:
+        segment = audio[start:end]
+        rms = np.mean(librosa.feature.rms(y=segment))
+        if rms >= energy_threshold:
+            filtered_intervals.append((start, end))
+    return filtered_intervals
 # -------------------------------
 # Main Transcription Function
         audio, sr = librosa.load(audio_file, sr=16000)
         debug_log.append(f"Audio loaded: {len(audio)/sr:.2f} seconds long at {sr} Hz")
         # Select the model and set batch size
         model = models[model_size]
         batch_size = 8 if model_size == "tiny" else 4