Update app.py
app.py CHANGED
@@ -3,9 +3,10 @@ import whisper
 import torch
 import os
 from pydub import AudioSegment, silence
 from faster_whisper import WhisperModel
 import numpy as np
 from scipy.io import wavfile
+from scipy.signal import correlate
 
 # Mapping of model names to Whisper model sizes
 MODELS = {
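The added `scipy.signal.correlate` import powers the new detection step below: it slides the target clip across the main recording and scores every alignment, and the offset with the highest score is where the clip occurs. A minimal sketch of that behaviour on synthetic data (all names here are illustrative; none of this is in app.py):

```python
# Illustrative only: locating a short clip inside a longer signal with
# cross-correlation, the same primitive detect_and_trim_audio builds on.
import numpy as np
from scipy.signal import correlate

rng = np.random.default_rng(0)
main = rng.standard_normal(8000).astype(np.float32)  # stand-in for the main audio
target = main[3000:3500].copy()                      # the clip we want to find

# mode='valid' produces one score per candidate start offset:
# len(main) - len(target) + 1 values in total.
scores = np.abs(correlate(main, target, mode="valid"))
print(int(np.argmax(scores)))  # 3000 -- the offset the clip was cut from
```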
@@ -187,141 +188,74 @@ def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
 
     return output_path
 
-def convert_to_wav(audio_file):
-    """
-    Convert the input audio file to WAV format.
-
-    Args:
-        audio_file (str): Path to the input audio file.
-
-    Returns:
-        str: Path to the converted WAV file.
-    """
-    audio = AudioSegment.from_file(audio_file)
-    wav_path = "converted_audio.wav"
-    audio.export(wav_path, format="wav")
-    return wav_path
-
-def detect_voice_activity(audio_file, threshold=0.02):
-    """
-    ...
-    """
-    ...
-
-    # Export the trimmed audio
-    output_path = "voice_trimmed_audio.wav"
-    wavfile.write(output_path, sample_rate, trimmed_audio_int16)
-
-    # Clean up the converted WAV file
-    os.remove(wav_path)
-
-    return output_path
-
-def detect_and_trim_audio(audio_file, threshold=0.02):
-    """
-    Detect voice activity in the audio file, trim the audio to include only voice segments,
-    and return the timestamps of the detected segments.
-
-    Args:
-        audio_file (str): Path to the input audio file.
-        threshold (float): Amplitude threshold for voice detection. Default is 0.02.
-
-    Returns:
-        str: Path to the output audio file with only voice segments.
-        list: List of timestamps (start, end) for the detected segments.
-    """
-    # Convert the input audio to WAV format
-    wav_path = convert_to_wav(audio_file)
-
-    # Load the WAV file
-    sample_rate, data = wavfile.read(wav_path)
-
-    # If the audio is stereo, convert it to mono by averaging the channels
-    if len(data.shape) > 1:
-        data = np.mean(data, axis=1)
-
-    # Normalize the audio data to the range [-1, 1]
-    if data.dtype != np.float32:
-        data = data.astype(np.float32) / np.iinfo(data.dtype).max
-
-    # Detect voice activity
-    voice_segments = []
-    is_voice = False
-    start = 0
-    for i, sample in enumerate(data):
-        if abs(sample) > threshold and not is_voice:
-            is_voice = True
-            start = i
-        elif abs(sample) <= threshold and is_voice:
-            is_voice = False
-            voice_segments.append((start, i))
-
-    # If the last segment is voice, add it
-    if is_voice:
-        voice_segments.append((start, len(data)))
-
-    # Trim the audio to include only voice segments
-    trimmed_audio = np.array([], dtype=np.float32)
-    for segment in voice_segments:
-        trimmed_audio = np.concatenate((trimmed_audio, data[segment[0]:segment[1]]))
-
-    # Convert the trimmed audio back to 16-bit integer format
-    trimmed_audio_int16 = np.int16(trimmed_audio * 32767)
+def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
+    """
+    Detect the target audio in the main audio and trim the main audio to include only the detected segments.
+
+    Args:
+        main_audio (str): Path to the main audio file.
+        target_audio (str): Path to the target audio file.
+        threshold (float): Detection threshold (0 to 1). Higher values mean stricter detection.
+
+    Returns:
+        str: Path to the trimmed audio file.
+        str: Detected timestamps in the format "start-end (in seconds)".
+    """
+    # Load audio files
+    main_rate, main_data = wavfile.read(main_audio)
+    target_rate, target_data = wavfile.read(target_audio)
+
+    # Ensure both audio files have the same sample rate
+    if main_rate != target_rate:
+        raise ValueError("Sample rates of the main audio and target audio must match.")
+
+    # Normalize audio data
+    main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
+    target_data = target_data.astype(np.float32) / np.iinfo(target_data.dtype).max
+
+    # Perform cross-correlation to detect the target audio in the main audio
+    correlation = correlate(main_data, target_data, mode='valid')
+    correlation = np.abs(correlation)
+    max_corr = np.max(correlation)
+
+    # Detect segments where the target audio is present
+    detected_segments = []
+    for i, corr_value in enumerate(correlation):
+        if corr_value >= threshold * max_corr:
+            start_time = i / main_rate
+            end_time = (i + len(target_data)) / main_rate
+            detected_segments.append((start_time, end_time))
+
+    # Merge overlapping or nearby segments
+    merged_segments = []
+    for segment in detected_segments:
+        if not merged_segments:
+            merged_segments.append(segment)
+        else:
+            last_segment = merged_segments[-1]
+            if segment[0] <= last_segment[1] + 1.0:  # Merge if within 1 second
+                merged_segments[-1] = (last_segment[0], max(last_segment[1], segment[1]))
+            else:
+                merged_segments.append(segment)
+
+    # Trim the main audio to include only the detected segments
+    main_audio_segment = AudioSegment.from_file(main_audio)
+    trimmed_audio = AudioSegment.empty()
+    timestamps = []
+    for segment in merged_segments:
+        start_ms = int(segment[0] * 1000)
+        end_ms = int(segment[1] * 1000)
+        trimmed_audio += main_audio_segment[start_ms:end_ms]
+        timestamps.append(f"{segment[0]:.2f}-{segment[1]:.2f}")
 
     # Export the trimmed audio
-    output_path = "..."
-    wavfile.write(output_path, sample_rate, trimmed_audio_int16)
-
-    # Calculate timestamps in seconds
-    timestamps = [(start / sample_rate, end / sample_rate) for start, end in voice_segments]
+    output_path = "trimmed_audio.wav"
+    trimmed_audio.export(output_path, format="wav")
 
-    # Clean up the converted WAV file
-    os.remove(wav_path)
+    # Format timestamps
+    timestamps_str = "\n".join(timestamps)
 
-    return output_path, timestamps
+    return output_path, timestamps_str
 
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
     """Transcribe the audio file."""
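The new `detect_and_trim_audio` keeps every offset whose correlation clears `threshold * max_corr`, so each real match shows up as a dense run of overlapping candidate windows; the second pass then merges windows that overlap or sit within one second of each other. A self-contained sketch of those two passes on fabricated scores (everything below is illustrative, not code from app.py):

```python
# Illustrative only: the detect-then-merge logic from detect_and_trim_audio,
# run on fake correlation scores instead of real audio.
import numpy as np

sample_rate = 1000          # 1 kHz, so index 1200 corresponds to 1.2 s
target_len = 200            # pretend the target clip is 0.2 s long
scores = np.zeros(5000, dtype=np.float32)
scores[1200:1210] = 1.0     # burst of near-max correlation (first match)
scores[3100:3105] = 0.9     # second, slightly weaker match

threshold = 0.5
max_corr = scores.max()

# Pass 1: every offset above threshold * max_corr becomes a candidate segment.
detected = [(i / sample_rate, (i + target_len) / sample_rate)
            for i, v in enumerate(scores) if v >= threshold * max_corr]

# Pass 2: merge segments that overlap or sit within 1 second of each other.
merged = []
for seg in detected:
    if merged and seg[0] <= merged[-1][1] + 1.0:
        merged[-1] = (merged[-1][0], max(merged[-1][1], seg[1]))
    else:
        merged.append(seg)

print(merged)  # [(1.2, 1.409), (3.1, 3.304)] -- one segment per real match
```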
@@ -373,7 +307,7 @@ def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
 
 # Define the Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Audio ...")
+    gr.Markdown("# Audio Processing Tool")
 
     with gr.Tab("Detect Language"):
         gr.Markdown("Upload an audio file to detect its language.")
@@ -414,17 +348,18 @@ with gr.Blocks() as demo:
         silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
         silence_button = gr.Button("Remove Silence")
 
-    with gr.Tab("..."):
-        gr.Markdown("Upload ...")
-        ...
-        ...
-        ...
-        ...
-        ...
+    with gr.Tab("Detect and Trim Audio"):
+        gr.Markdown("Upload a main audio file and a target audio file. The app will detect the target audio in the main audio and trim it.")
+        main_audio_input = gr.Audio(type="filepath", label="Upload Main Audio File")
+        target_audio_input = gr.Audio(type="filepath", label="Upload Target Audio File")
+        threshold_slider = gr.Slider(
+            minimum=0.1, maximum=1.0, value=0.5, step=0.1,
+            label="Detection Threshold",
+            info="Higher values mean stricter detection."
         )
-        ...
-        timestamps_output = gr.Textbox(label="Detected Timestamps (seconds)")
-        ...
+        trimmed_audio_output = gr.Audio(label="Trimmed Audio", type="filepath")
+        timestamps_output = gr.Textbox(label="Detected Timestamps (in seconds)")
+        trim_button = gr.Button("Detect and Trim")  # distinct name: detect_button already belongs to the Detect Language tab
 
     # Link buttons to functions
     detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
|
@@ -438,10 +373,10 @@ with gr.Blocks() as demo:
|
|
| 438 |
inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
|
| 439 |
outputs=silence_output
|
| 440 |
)
|
| 441 |
-
|
| 442 |
detect_and_trim_audio,
|
| 443 |
-
inputs=[
|
| 444 |
-
outputs=[
|
| 445 |
)
|
| 446 |
|
| 447 |
# Launch the Gradio interface
|