sdafd committed on
Commit
435283b
·
verified ·
1 Parent(s): be1d19d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -42
app.py CHANGED
@@ -134,33 +134,7 @@ def get_vocals(input_file):
134
  print(f"Unexpected error in get_vocals: {ex}")
135
  return None
136
 
137
- # -------------------------------
138
- # Advanced Normalization Function
139
- # -------------------------------
140
- def advanced_normalize_audio(audio, threshold_ratio=0.4, window_size=1024):
141
- """
142
- This advanced normalization function computes a moving-average envelope of the absolute
143
- audio signal using a specified window size. It then zeroes out portions of the signal
144
- where the envelope falls below a threshold (defined as a ratio of the maximum envelope value).
145
-
146
- Parameters:
147
- audio (np.ndarray): Input audio signal.
148
- threshold_ratio (float): Ratio (0-1) to determine the minimum envelope value to keep.
149
- window_size (int): Size of the moving window used to compute the envelope.
150
-
151
- Returns:
152
- np.ndarray: The normalized audio signal.
153
- """
154
- # Compute moving-average envelope
155
- envelope = np.convolve(np.abs(audio), np.ones(window_size) / window_size, mode='same')
156
- max_env = np.max(envelope)
157
- threshold = threshold_ratio * max_env
158
- # Create a mask: keep samples where the envelope meets or exceeds the threshold.
159
- print(envelope)
160
- mask = envelope >= threshold
161
- # Optionally, you might smooth the mask further to avoid abrupt cuts.
162
- normalized_audio = audio * mask.astype(audio.dtype)
163
- return normalized_audio
164
 
165
  # -------------------------------
166
  # Logging and Model Setup
@@ -181,16 +155,8 @@ models = {
181
  "large-v3": whisperx.load_model("large-v3", device, compute_type=compute_type, vad_method='silero'),
182
  }
183
 
184
- def split_audio_by_pause(audio, sr, pause_threshold, top_db=30):
185
- """
186
- Splits the audio into segments using librosa's non-silent detection.
187
- Adjacent non-silent intervals are merged if the gap between them is less than the pause_threshold.
188
- Returns a list of (start_sample, end_sample) tuples.
189
- """
190
  intervals = librosa.effects.split(audio, top_db=top_db)
191
- if intervals.size == 0:
192
- return [(0, len(audio))]
193
-
194
  merged_intervals = []
195
  current_start, current_end = intervals[0]
196
 
@@ -202,7 +168,16 @@ def split_audio_by_pause(audio, sr, pause_threshold, top_db=30):
202
  merged_intervals.append((current_start, current_end))
203
  current_start, current_end = start, end
204
  merged_intervals.append((current_start, current_end))
205
- return merged_intervals
 
 
 
 
 
 
 
 
 
206
 
207
  # -------------------------------
208
  # Main Transcription Function
@@ -234,11 +209,6 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
234
  audio, sr = librosa.load(audio_file, sr=16000)
235
  debug_log.append(f"Audio loaded: {len(audio)/sr:.2f} seconds long at {sr} Hz")
236
 
237
- # If vocal extraction was used, apply advanced normalization
238
- #if vocal_extraction:
239
- # audio = advanced_normalize_audio(audio)
240
- # debug_log.append("Advanced normalization applied to extracted audio to remove low-amplitude segments.")
241
-
242
  # Select the model and set batch size
243
  model = models[model_size]
244
  batch_size = 8 if model_size == "tiny" else 4
 
134
  print(f"Unexpected error in get_vocals: {ex}")
135
  return None
136
 
137
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  # -------------------------------
140
  # Logging and Model Setup
 
155
  "large-v3": whisperx.load_model("large-v3", device, compute_type=compute_type, vad_method='silero'),
156
  }
157
 
158
+ def split_audio_by_pause(audio, sr, pause_threshold, top_db=30, energy_threshold=0.05):
 
 
 
 
 
159
  intervals = librosa.effects.split(audio, top_db=top_db)
 
 
 
160
  merged_intervals = []
161
  current_start, current_end = intervals[0]
162
 
 
168
  merged_intervals.append((current_start, current_end))
169
  current_start, current_end = start, end
170
  merged_intervals.append((current_start, current_end))
171
+
172
+ # Filter out segments with low average RMS energy
173
+ filtered_intervals = []
174
+ for start, end in merged_intervals:
175
+ segment = audio[start:end]
176
+ rms = np.mean(librosa.feature.rms(y=segment))
177
+ if rms >= energy_threshold:
178
+ filtered_intervals.append((start, end))
179
+ return filtered_intervals
180
+
181
 
182
  # -------------------------------
183
  # Main Transcription Function
 
209
  audio, sr = librosa.load(audio_file, sr=16000)
210
  debug_log.append(f"Audio loaded: {len(audio)/sr:.2f} seconds long at {sr} Hz")
211
 
 
 
 
 
 
212
  # Select the model and set batch size
213
  model = models[model_size]
214
  batch_size = 8 if model_size == "tiny" else 4