Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -56,7 +56,7 @@ def get_vocals(input_file):
|
|
| 56 |
'data': [
|
| 57 |
{
|
| 58 |
'path': json_data[0],
|
| 59 |
-
'url': 'https://politrees-audio-separator-uvr.hf.space/gradio_api/file='+json_data[0],
|
| 60 |
'orig_name': pathlib.Path(input_file).name,
|
| 61 |
'size': file_len,
|
| 62 |
'mime_type': 'audio/wav',
|
|
@@ -135,17 +135,30 @@ def get_vocals(input_file):
|
|
| 135 |
return None
|
| 136 |
|
| 137 |
# -------------------------------
|
| 138 |
-
# Normalization Function
|
| 139 |
# -------------------------------
|
| 140 |
-
def
|
| 141 |
"""
|
| 142 |
-
|
| 143 |
-
a
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
"""
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
return normalized_audio
|
| 150 |
|
| 151 |
# -------------------------------
|
|
@@ -207,7 +220,6 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
|
|
| 207 |
debug_log.append("Vocal extraction succeeded; downloading extracted audio...")
|
| 208 |
response = requests.get(extracted_url)
|
| 209 |
if response.status_code == 200:
|
| 210 |
-
# Write to a temporary file
|
| 211 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
|
| 212 |
tmp.write(response.content)
|
| 213 |
audio_file = tmp.name
|
|
@@ -221,26 +233,26 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
|
|
| 221 |
audio, sr = librosa.load(audio_file, sr=16000)
|
| 222 |
debug_log.append(f"Audio loaded: {len(audio)/sr:.2f} seconds long at {sr} Hz")
|
| 223 |
|
| 224 |
-
# If
|
| 225 |
if vocal_extraction:
|
| 226 |
-
audio =
|
| 227 |
-
debug_log.append("
|
| 228 |
|
| 229 |
# Select the model and set batch size
|
| 230 |
model = models[model_size]
|
| 231 |
batch_size = 8 if model_size == "tiny" else 4
|
| 232 |
|
| 233 |
-
# Use
|
| 234 |
if language:
|
| 235 |
transcript = model.transcribe(audio, batch_size=batch_size, language=language)
|
| 236 |
else:
|
| 237 |
transcript = model.transcribe(audio, batch_size=batch_size)
|
| 238 |
language = transcript.get("language", "unknown")
|
| 239 |
|
| 240 |
-
# Load alignment model using the specified
|
| 241 |
model_a, metadata = whisperx.load_align_model(language_code=language, device=device)
|
| 242 |
|
| 243 |
-
# If pause_threshold > 0, split
|
| 244 |
if pause_threshold > 0:
|
| 245 |
segments = split_audio_by_pause(audio, sr, pause_threshold)
|
| 246 |
debug_log.append(f"Audio split into {len(segments)} segment(s) using a pause threshold of {pause_threshold}s")
|
|
@@ -307,12 +319,10 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
|
|
| 307 |
interactive=True,
|
| 308 |
info="Set a pause duration threshold. Audio pauses longer than this will be used to split the audio into segments."
|
| 309 |
)
|
| 310 |
-
# New input for vocal extraction feature
|
| 311 |
vocal_extraction_checkbox = gr.Checkbox(
|
| 312 |
label="Extract Vocals (improves accuracy on noisy audio)",
|
| 313 |
value=False
|
| 314 |
)
|
| 315 |
-
# New language selection (default English)
|
| 316 |
language_input = gr.Textbox(
|
| 317 |
label="Language Code (e.g., en, es, fr)",
|
| 318 |
placeholder="Enter language code",
|
|
@@ -334,7 +344,6 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
|
|
| 334 |
visible=False,
|
| 335 |
)
|
| 336 |
|
| 337 |
-
# Toggle debug visibility
|
| 338 |
def toggle_debug(debug_enabled):
|
| 339 |
return gr.update(visible=debug_enabled)
|
| 340 |
|
|
@@ -344,7 +353,6 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
|
|
| 344 |
outputs=[debug_output]
|
| 345 |
)
|
| 346 |
|
| 347 |
-
# Process transcription with all new parameters
|
| 348 |
transcribe_btn.click(
|
| 349 |
transcribe,
|
| 350 |
inputs=[audio_input, model_selector, debug_checkbox, pause_threshold_slider, vocal_extraction_checkbox, language_input],
|
|
|
|
| 56 |
'data': [
|
| 57 |
{
|
| 58 |
'path': json_data[0],
|
| 59 |
+
'url': 'https://politrees-audio-separator-uvr.hf.space/gradio_api/file=' + json_data[0],
|
| 60 |
'orig_name': pathlib.Path(input_file).name,
|
| 61 |
'size': file_len,
|
| 62 |
'mime_type': 'audio/wav',
|
|
|
|
| 135 |
return None
|
| 136 |
|
| 137 |
# -------------------------------
# Advanced Normalization Function
# -------------------------------
def advanced_normalize_audio(audio, threshold_ratio=0.6, window_size=1024):
    """
    Gate out low-amplitude regions of an audio signal.

    Computes a moving-average envelope of the absolute signal using a window
    of `window_size` samples, then zeroes every sample whose envelope falls
    below a threshold defined as `threshold_ratio * max(envelope)`.

    Parameters:
        audio (np.ndarray): Input audio signal (assumed 1-D mono — confirm with caller).
        threshold_ratio (float): Ratio (0-1) of the peak envelope value below
            which samples are silenced.
        window_size (int): Size of the moving window used to compute the envelope.

    Returns:
        np.ndarray: The normalized (gated) audio signal, same shape as input.
    """
    # Guard: np.max raises ValueError on an empty array; with no samples
    # there is nothing to gate, so return the input unchanged.
    if audio.size == 0:
        return audio
    # Compute moving-average envelope of the rectified signal.
    envelope = np.convolve(np.abs(audio), np.ones(window_size) / window_size, mode='same')
    max_env = np.max(envelope)
    threshold = threshold_ratio * max_env
    # Create a mask: keep samples where the envelope meets or exceeds the threshold.
    # Optionally, the mask could be smoothed further to avoid abrupt cuts.
    mask = envelope >= threshold
    normalized_audio = audio * mask.astype(audio.dtype)
    return normalized_audio
|
| 163 |
|
| 164 |
# -------------------------------
|
|
|
|
| 220 |
debug_log.append("Vocal extraction succeeded; downloading extracted audio...")
|
| 221 |
response = requests.get(extracted_url)
|
| 222 |
if response.status_code == 200:
|
|
|
|
| 223 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
|
| 224 |
tmp.write(response.content)
|
| 225 |
audio_file = tmp.name
|
|
|
|
| 233 |
audio, sr = librosa.load(audio_file, sr=16000)
|
| 234 |
debug_log.append(f"Audio loaded: {len(audio)/sr:.2f} seconds long at {sr} Hz")
|
| 235 |
|
| 236 |
+
# If vocal extraction was used, apply advanced normalization
|
| 237 |
if vocal_extraction:
|
| 238 |
+
audio = advanced_normalize_audio(audio)
|
| 239 |
+
debug_log.append("Advanced normalization applied to extracted audio to remove low-amplitude segments.")
|
| 240 |
|
| 241 |
# Select the model and set batch size
|
| 242 |
model = models[model_size]
|
| 243 |
batch_size = 8 if model_size == "tiny" else 4
|
| 244 |
|
| 245 |
+
# Use provided language if set; otherwise, use language detection.
|
| 246 |
if language:
|
| 247 |
transcript = model.transcribe(audio, batch_size=batch_size, language=language)
|
| 248 |
else:
|
| 249 |
transcript = model.transcribe(audio, batch_size=batch_size)
|
| 250 |
language = transcript.get("language", "unknown")
|
| 251 |
|
| 252 |
+
# Load alignment model using the specified language
|
| 253 |
model_a, metadata = whisperx.load_align_model(language_code=language, device=device)
|
| 254 |
|
| 255 |
+
# If pause_threshold > 0, split audio and process segments individually
|
| 256 |
if pause_threshold > 0:
|
| 257 |
segments = split_audio_by_pause(audio, sr, pause_threshold)
|
| 258 |
debug_log.append(f"Audio split into {len(segments)} segment(s) using a pause threshold of {pause_threshold}s")
|
|
|
|
| 319 |
interactive=True,
|
| 320 |
info="Set a pause duration threshold. Audio pauses longer than this will be used to split the audio into segments."
|
| 321 |
)
|
|
|
|
| 322 |
vocal_extraction_checkbox = gr.Checkbox(
|
| 323 |
label="Extract Vocals (improves accuracy on noisy audio)",
|
| 324 |
value=False
|
| 325 |
)
|
|
|
|
| 326 |
language_input = gr.Textbox(
|
| 327 |
label="Language Code (e.g., en, es, fr)",
|
| 328 |
placeholder="Enter language code",
|
|
|
|
| 344 |
visible=False,
|
| 345 |
)
|
| 346 |
|
|
|
|
| 347 |
def toggle_debug(debug_enabled):
|
| 348 |
return gr.update(visible=debug_enabled)
|
| 349 |
|
|
|
|
| 353 |
outputs=[debug_output]
|
| 354 |
)
|
| 355 |
|
|
|
|
| 356 |
transcribe_btn.click(
|
| 357 |
transcribe,
|
| 358 |
inputs=[audio_input, model_selector, debug_checkbox, pause_threshold_slider, vocal_extraction_checkbox, language_input],
|