Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -134,33 +134,7 @@ def get_vocals(input_file):
|
|
| 134 |
print(f"Unexpected error in get_vocals: {ex}")
|
| 135 |
return None
|
| 136 |
|
| 137 |
-
|
| 138 |
-
# Advanced Normalization Function
|
| 139 |
-
# -------------------------------
|
| 140 |
-
def advanced_normalize_audio(audio, threshold_ratio=0.4, window_size=1024):
    """
    Zero out the quiet portions of an audio signal (an amplitude gate).

    A moving-average envelope of the absolute signal is computed over
    `window_size` samples. Every sample whose envelope value is below
    `threshold_ratio * max(envelope)` is set to zero; all other samples are
    returned unchanged. Despite the function name, no gain is applied.

    Parameters:
        audio (np.ndarray): 1-D input audio signal (np.convolve only accepts 1-D input).
        threshold_ratio (float): Ratio (0-1) of the maximum envelope value below
            which samples are zeroed.
        window_size (int): Size of the moving window used to compute the envelope.
            Values larger than the signal are clamped to the signal length.

    Returns:
        np.ndarray: A new array with the same shape and dtype as `audio`.

    Raises:
        ValueError: If `window_size` is smaller than 1.
    """
    audio = np.asarray(audio)
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # Nothing to gate; np.max would raise on an empty envelope.
    if audio.size == 0:
        return audio.copy()

    # mode='same' yields max(len(audio), window_size) samples, so a window
    # longer than the signal would produce a mask that no longer lines up
    # with `audio`. Clamp the window to keep the envelope sample-aligned.
    window_size = min(window_size, audio.size)

    # Compute moving-average envelope
    kernel = np.ones(window_size) / window_size
    envelope = np.convolve(np.abs(audio), kernel, mode='same')
    threshold = threshold_ratio * np.max(envelope)

    # Keep samples where the envelope meets or exceeds the threshold.
    # Optionally, the mask could be smoothed further to avoid abrupt cuts.
    mask = envelope >= threshold
    return audio * mask.astype(audio.dtype)
|
| 164 |
|
| 165 |
# -------------------------------
|
| 166 |
# Logging and Model Setup
|
|
@@ -181,16 +155,8 @@ models = {
|
|
| 181 |
"large-v3": whisperx.load_model("large-v3", device, compute_type=compute_type, vad_method='silero'),
|
| 182 |
}
|
| 183 |
|
| 184 |
-
def split_audio_by_pause(audio, sr, pause_threshold, top_db=30):
|
| 185 |
-
"""
|
| 186 |
-
Splits the audio into segments using librosa's non-silent detection.
|
| 187 |
-
Adjacent non-silent intervals are merged if the gap between them is less than the pause_threshold.
|
| 188 |
-
Returns a list of (start_sample, end_sample) tuples.
|
| 189 |
-
"""
|
| 190 |
intervals = librosa.effects.split(audio, top_db=top_db)
|
| 191 |
-
if intervals.size == 0:
|
| 192 |
-
return [(0, len(audio))]
|
| 193 |
-
|
| 194 |
merged_intervals = []
|
| 195 |
current_start, current_end = intervals[0]
|
| 196 |
|
|
@@ -202,7 +168,16 @@ def split_audio_by_pause(audio, sr, pause_threshold, top_db=30):
|
|
| 202 |
merged_intervals.append((current_start, current_end))
|
| 203 |
current_start, current_end = start, end
|
| 204 |
merged_intervals.append((current_start, current_end))
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
# -------------------------------
|
| 208 |
# Main Transcription Function
|
|
@@ -234,11 +209,6 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
|
|
| 234 |
audio, sr = librosa.load(audio_file, sr=16000)
|
| 235 |
debug_log.append(f"Audio loaded: {len(audio)/sr:.2f} seconds long at {sr} Hz")
|
| 236 |
|
| 237 |
-
# If vocal extraction was used, apply advanced normalization
|
| 238 |
-
#if vocal_extraction:
|
| 239 |
-
# audio = advanced_normalize_audio(audio)
|
| 240 |
-
# debug_log.append("Advanced normalization applied to extracted audio to remove low-amplitude segments.")
|
| 241 |
-
|
| 242 |
# Select the model and set batch size
|
| 243 |
model = models[model_size]
|
| 244 |
batch_size = 8 if model_size == "tiny" else 4
|
|
|
|
| 134 |
print(f"Unexpected error in get_vocals: {ex}")
|
| 135 |
return None
|
| 136 |
|
| 137 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
# -------------------------------
|
| 140 |
# Logging and Model Setup
|
|
|
|
| 155 |
"large-v3": whisperx.load_model("large-v3", device, compute_type=compute_type, vad_method='silero'),
|
| 156 |
}
|
| 157 |
|
| 158 |
+
def split_audio_by_pause(audio, sr, pause_threshold, top_db=30, energy_threshold=0.05):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
intervals = librosa.effects.split(audio, top_db=top_db)
|
|
|
|
|
|
|
|
|
|
| 160 |
merged_intervals = []
|
| 161 |
current_start, current_end = intervals[0]
|
| 162 |
|
|
|
|
| 168 |
merged_intervals.append((current_start, current_end))
|
| 169 |
current_start, current_end = start, end
|
| 170 |
merged_intervals.append((current_start, current_end))
|
| 171 |
+
|
| 172 |
+
# Filter out segments with low average RMS energy
|
| 173 |
+
filtered_intervals = []
|
| 174 |
+
for start, end in merged_intervals:
|
| 175 |
+
segment = audio[start:end]
|
| 176 |
+
rms = np.mean(librosa.feature.rms(y=segment))
|
| 177 |
+
if rms >= energy_threshold:
|
| 178 |
+
filtered_intervals.append((start, end))
|
| 179 |
+
return filtered_intervals
|
| 180 |
+
|
| 181 |
|
| 182 |
# -------------------------------
|
| 183 |
# Main Transcription Function
|
|
|
|
| 209 |
audio, sr = librosa.load(audio_file, sr=16000)
|
| 210 |
debug_log.append(f"Audio loaded: {len(audio)/sr:.2f} seconds long at {sr} Hz")
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
# Select the model and set batch size
|
| 213 |
model = models[model_size]
|
| 214 |
batch_size = 8 if model_size == "tiny" else 4
|