Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,11 +2,10 @@ import gradio as gr
|
|
| 2 |
import whisper
|
| 3 |
import torch
|
| 4 |
import os
|
| 5 |
-
import numpy as np
|
| 6 |
from pydub import AudioSegment, silence
|
| 7 |
from faster_whisper import WhisperModel # Import faster-whisper
|
| 8 |
-
import
|
| 9 |
-
from
|
| 10 |
|
| 11 |
# Mapping of model names to Whisper model sizes
|
| 12 |
MODELS = {
|
|
@@ -188,63 +187,48 @@ def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
|
|
| 188 |
|
| 189 |
return output_path
|
| 190 |
|
| 191 |
-
def
|
| 192 |
"""
|
| 193 |
-
|
| 194 |
|
| 195 |
Args:
|
| 196 |
audio_file (str): Path to the input audio file.
|
| 197 |
-
|
| 198 |
|
| 199 |
Returns:
|
| 200 |
-
str: Path to the output audio file with
|
| 201 |
"""
|
| 202 |
# Load the audio file
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
# Convert audio to numpy array for noisereduce
|
| 206 |
-
samples = np.array(audio.get_array_of_samples())
|
| 207 |
-
sample_rate = audio.frame_rate
|
| 208 |
-
|
| 209 |
-
# Perform noise reduction
|
| 210 |
-
reduced_noise = nr.reduce_noise(
|
| 211 |
-
y=samples,
|
| 212 |
-
sr=sample_rate,
|
| 213 |
-
prop_decrease=noise_reduce_level
|
| 214 |
-
)
|
| 215 |
|
| 216 |
-
#
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
frame_rate=sample_rate,
|
| 220 |
-
sample_width=audio.sample_width,
|
| 221 |
-
channels=audio.channels
|
| 222 |
-
)
|
| 223 |
-
|
| 224 |
-
# Export the processed audio
|
| 225 |
-
output_path = "noise_reduced_audio.wav"
|
| 226 |
-
reduced_audio.export(output_path, format="wav")
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
#
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
return output_path
|
| 250 |
|
|
@@ -339,22 +323,18 @@ with gr.Blocks() as demo:
|
|
| 339 |
silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
|
| 340 |
silence_button = gr.Button("Remove Silence")
|
| 341 |
|
| 342 |
-
with gr.Tab("
|
| 343 |
-
gr.Markdown("Upload
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
|
|
|
| 349 |
)
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
with gr.Tab("Remove Background Music"):
|
| 354 |
-
gr.Markdown("Upload an audio file to remove background music.")
|
| 355 |
-
music_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
|
| 356 |
-
music_output = gr.Audio(label="Processed Audio (Music Removed)", type="filepath")
|
| 357 |
-
music_button = gr.Button("Remove Background Music")
|
| 358 |
|
| 359 |
# Link buttons to functions
|
| 360 |
detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
|
|
@@ -368,15 +348,10 @@ with gr.Blocks() as demo:
|
|
| 368 |
inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
|
| 369 |
outputs=silence_output
|
| 370 |
)
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
inputs=[
|
| 374 |
-
outputs=
|
| 375 |
-
)
|
| 376 |
-
music_button.click(
|
| 377 |
-
remove_background_music,
|
| 378 |
-
inputs=music_audio_input,
|
| 379 |
-
outputs=music_output
|
| 380 |
)
|
| 381 |
|
| 382 |
# Launch the Gradio interface
|
|
|
|
| 2 |
import whisper
|
| 3 |
import torch
|
| 4 |
import os
|
|
|
|
| 5 |
from pydub import AudioSegment, silence
|
| 6 |
from faster_whisper import WhisperModel # Import faster-whisper
|
| 7 |
+
import numpy as np
|
| 8 |
+
from scipy.io import wavfile
|
| 9 |
|
| 10 |
# Mapping of model names to Whisper model sizes
|
| 11 |
MODELS = {
|
|
|
|
| 187 |
|
| 188 |
return output_path
|
| 189 |
|
| 190 |
+
def detect_voice_activity(audio_file, threshold=0.02):
    """
    Detect voice activity in the audio file and trim the audio to include only voice segments.

    A sample (or, for multi-channel audio, a frame) is kept when its normalized
    absolute amplitude is strictly greater than ``threshold``. The kept samples
    are concatenated in their original order and written as a float32 WAV file.

    Args:
        audio_file (str): Path to the input WAV file. ``None`` (no file
            supplied) is accepted and yields ``None``.
        threshold (float): Amplitude threshold for voice detection, on the
            normalized [-1.0, 1.0] scale. Default is 0.02.

    Returns:
        str: Path to the output audio file with only voice segments, or
        ``None`` when ``audio_file`` is ``None``. Each call writes to its own
        freshly created file, so results of earlier calls are never overwritten.
    """
    # Local import: nothing else in this block needs tempfile.
    import tempfile

    # Nothing was uploaded for this input; return "no output" instead of crashing.
    if audio_file is None:
        return None

    # Load the audio file
    sample_rate, data = wavfile.read(audio_file)

    # Normalize the audio data to float32
    if data.dtype == np.uint8:
        # 8-bit PCM is unsigned with silence at 128, so re-centre before scaling.
        data = (data.astype(np.float32) - 128.0) / 128.0
    elif np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    elif data.dtype != np.float32:
        # Other float widths (e.g. float64) have no iinfo; just narrow them.
        data = data.astype(np.float32)

    # Detect voice activity
    amplitude = np.abs(data)
    if amplitude.ndim > 1:
        # Multi-channel: keep a frame if any channel is above the threshold,
        # so all channels stay aligned in the output.
        amplitude = amplitude.max(axis=1)
    voiced = amplitude > threshold

    # Trim the audio to include only voice segments. Selecting with the mask
    # is the same as concatenating every maximal run of above-threshold samples.
    trimmed_audio = data[voiced]

    # Export the trimmed audio to a unique file: the UI calls this function
    # twice per click and needs two distinct result files.
    with tempfile.NamedTemporaryFile(
        prefix="voice_trimmed_audio_", suffix=".wav", delete=False
    ) as tmp:
        output_path = tmp.name
    wavfile.write(output_path, sample_rate, trimmed_audio)

    return output_path
|
| 234 |
|
|
|
|
| 323 |
silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
|
| 324 |
silence_button = gr.Button("Remove Silence")
|
| 325 |
|
| 326 |
+
with gr.Tab("Voice Detection and Trimming"):
|
| 327 |
+
gr.Markdown("Upload two audio files to detect voice activity and trim the audio.")
|
| 328 |
+
voice_audio_input1 = gr.Audio(type="filepath", label="Upload Audio File 1")
|
| 329 |
+
voice_audio_input2 = gr.Audio(type="filepath", label="Upload Audio File 2")
|
| 330 |
+
voice_threshold_slider = gr.Slider(
|
| 331 |
+
minimum=0.01, maximum=0.1, value=0.02, step=0.01,
|
| 332 |
+
label="Voice Detection Threshold",
|
| 333 |
+
info="Higher values detect louder sounds as voice."
|
| 334 |
)
|
| 335 |
+
voice_output1 = gr.Audio(label="Trimmed Audio 1", type="filepath")
|
| 336 |
+
voice_output2 = gr.Audio(label="Trimmed Audio 2", type="filepath")
|
| 337 |
+
voice_button = gr.Button("Detect and Trim Voice")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
# Link buttons to functions
|
| 340 |
detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
|
|
|
|
| 348 |
inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
|
| 349 |
outputs=silence_output
|
| 350 |
)
|
| 351 |
+
voice_button.click(
|
| 352 |
+
lambda audio1, audio2, threshold: (detect_voice_activity(audio1, threshold), detect_voice_activity(audio2, threshold)),
|
| 353 |
+
inputs=[voice_audio_input1, voice_audio_input2, voice_threshold_slider],
|
| 354 |
+
outputs=[voice_output1, voice_output2]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
)
|
| 356 |
|
| 357 |
# Launch the Gradio interface
|