mixed-audio-seperation

Sleeping

App Files Files Community

snowsafed committed on Jan 3

Commit

5602abd

verified ·

1 Parent(s): f488e69

Create app.py

Browse files

Files changed (1) hide show

app.py +213 -0

app.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import gradio as gr
+import torch
+import torchaudio
+import numpy as np
+from asteroid.models import ConvTasNet
+from speechbrain.pretrained import SepformerSeparation
+from scipy.io import wavfile
+from scipy import signal
+import noisereduce as nr
+import warnings
+warnings.filterwarnings('ignore')
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {DEVICE}")
+# Global model variables
+convtasnet_model = None
+sepformer_model = None
+def load_convtasnet():
+    global convtasnet_model
+    if convtasnet_model is None:
+        print("Loading ConvTasNet model...")
+        convtasnet_model = ConvTasNet.from_pretrained("JorisCos/ConvTasNet_Libri2Mix_sepclean_16k")
+        convtasnet_model = convtasnet_model.to(DEVICE)
+        convtasnet_model.eval()
+        print("ConvTasNet loaded!")
+    return convtasnet_model
+def load_sepformer():
+    global sepformer_model
+    if sepformer_model is None:
+        print("Loading SepFormer model...")
+        sepformer_model = SepformerSeparation.from_hparams(
+            source="speechbrain/sepformer-wsj02mix",
+            savedir="pretrained_models/sepformer-wsj02mix",
+            run_opts={"device": DEVICE}
+        )
+        print("SepFormer loaded!")
+    return sepformer_model
+def apply_highpass_filter(audio, sr, cutoff=80):
+    if len(audio) < 18:
+        return audio
+    try:
+        nyquist = sr / 2
+        normalized_cutoff = cutoff / nyquist
+        filter_order = min(4, max(1, len(audio) // 10))
+        b, a = signal.butter(filter_order, normalized_cutoff, btype='high', analog=False)
+        padlen = min(len(audio) // 3, 3 * max(len(a), len(b)))
+        filtered = signal.filtfilt(b, a, audio, padlen=padlen)
+        return filtered
+    except:
+        return audio
+def normalize_audio(audio, target_level=-20):
+    rms = np.sqrt(np.mean(audio**2))
+    if rms > 0:
+        target_rms = 10**(target_level/20)
+        audio = audio * (target_rms / rms)
+    return np.clip(audio, -1.0, 1.0)
+def apply_gate(audio, threshold=-40):
+    if len(audio) < 10:
+        return audio
+    try:
+        threshold_linear = 10**(threshold/20)
+        envelope = np.abs(signal.hilbert(audio))
+        gate_mask = envelope > threshold_linear
+        window_size = max(1, int(len(audio) * 0.001))
+        if window_size > 1 and window_size < len(gate_mask):
+            gate_mask = signal.convolve(gate_mask.astype(float),
+                                         np.ones(window_size)/window_size,
+                                         mode='same')
+        return audio * gate_mask
+    except:
+        return audio
+def reduce_musical_noise(audio, sr):
+    if len(audio) < 100:
+        return audio
+    try:
+        reduced = nr.reduce_noise(y=audio, sr=sr, stationary=False, prop_decrease=0.6)
+        return reduced
+    except:
+        return audio
+def enhance_separation(audio, sr, is_convtasnet=True):
+    if len(audio) < 100:
+        return audio
+    audio = apply_highpass_filter(audio, sr, cutoff=80)
+    if is_convtasnet:
+        audio = reduce_musical_noise(audio, sr)
+    threshold = -40 if is_convtasnet else -45
+    audio = apply_gate(audio, threshold=threshold)
+    audio = normalize_audio(audio, target_level=-20)
+    return audio
+def separate_audio(audio_file, model_choice):
+    try:
+        # Load audio
+        waveform, sample_rate = torchaudio.load(audio_file)
+        # Convert to mono
+        if waveform.shape[0] > 1:
+            waveform = torch.mean(waveform, dim=0, keepdim=True)
+        # Resample
+        target_sr = 16000 if model_choice == "ConvTasNet" else 8000
+        if sample_rate != target_sr:
+            resampler = torchaudio.transforms.Resample(sample_rate, target_sr)
+            waveform = resampler(waveform)
+            sample_rate = target_sr
+        # Separate based on model choice
+        if model_choice == "ConvTasNet":
+            model = load_convtasnet()
+            with torch.no_grad():
+                waveform_input = waveform.to(DEVICE)
+                separated = model(waveform_input.unsqueeze(0))
+                separated = separated.squeeze(0).cpu()
+            source1 = separated[0].numpy()
+            source2 = separated[1].numpy()
+        else:  # SepFormer
+            model = load_sepformer()
+            separated = model.separate_file(path=audio_file)
+            separated = separated.squeeze()
+            # Handle shape
+            if len(separated.shape) == 2:
+                if separated.shape[1] == 2 and separated.shape[0] > separated.shape[1]:
+                    separated = separated.T
+                source1 = separated[0].cpu().numpy() if isinstance(separated[0], torch.Tensor) else separated[0]
+                source2 = separated[1].cpu().numpy() if isinstance(separated[1], torch.Tensor) else separated[1]
+            else:
+                raise ValueError(f"Unexpected shape: {separated.shape}")
+        # Enhance audio (always on)
+        is_convtasnet = (model_choice == "ConvTasNet")
+        source1 = enhance_separation(source1, sample_rate, is_convtasnet)
+        source2 = enhance_separation(source2, sample_rate, is_convtasnet)
+        # Save as WAV files
+        output1 = "speaker1.wav"
+        output2 = "speaker2.wav"
+        wavfile.write(output1, sample_rate, (source1 * 32767).astype(np.int16))
+        wavfile.write(output2, sample_rate, (source2 * 32767).astype(np.int16))
+        status = f"✅ Separation complete using {model_choice} with audio enhancement"
+        return output1, output2, status
+    except Exception as e:
+        error_msg = f"❌ Error: {str(e)}"
+        print(error_msg)
+        import traceback
+        traceback.print_exc()
+        return None, None, error_msg
+# Create Gradio Interface: an upload/model-selection column next to a status
+# box, a row with the two separated outputs, usage notes, and the click
+# handler that ties them to separate_audio.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    # Page header
+    gr.Markdown(
+        """
+        # 🎵 Audio Source Separator
+        Upload mixed audio to separate it into individual speakers using AI.
+        Enhancement is automatically applied for best quality.
+        """
+    )
+    with gr.Row():
+        # Left column: inputs and the trigger button
+        with gr.Column():
+            # type="filepath" hands separate_audio a path string
+            audio_input = gr.Audio(
+                label="Upload Mixed Audio",
+                type="filepath"
+            )
+            # The choice strings are compared against "ConvTasNet" inside
+            # separate_audio; any other value takes the SepFormer branch.
+            model_choice = gr.Radio(
+                ["ConvTasNet", "SepFormer"],
+                label="Select Model",
+                value="ConvTasNet",
+                info="ConvTasNet: Faster | SepFormer: Higher Quality"
+            )
+            separate_btn = gr.Button("🚀 Separate Audio", variant="primary")
+        # Right column: read-only status / error message
+        with gr.Column():
+            status_output = gr.Textbox(label="Status", interactive=False)
+    # One audio output per separated source
+    with gr.Row():
+        audio_output1 = gr.Audio(label="🎤 Speaker 1")
+        audio_output2 = gr.Audio(label="🎤 Speaker 2")
+    # Usage instructions shown below the outputs
+    gr.Markdown(
+        """
+        ### 📝 How to Use:
+        1. Upload your mixed audio file (MP3, WAV, etc.)
+        2. Choose a model (ConvTasNet is faster, SepFormer is more accurate)
+        3. Click "Separate Audio" and wait
+        4. Download the separated audio files
+        **Note:** First separation takes longer as models load. Subsequent separations are faster!
+        """
+    )
+    # The outputs list matches the (path1, path2, status) tuple returned by
+    # separate_audio, in that order.
+    separate_btn.click(
+        fn=separate_audio,
+        inputs=[audio_input, model_choice],
+        outputs=[audio_output1, audio_output2, status_output]
+    )
+# Preload the ConvTasNet model at import time (it is the default choice in
+# the UI). SepFormer is not preloaded; it is loaded on first use.
+print("Preloading ConvTasNet model...")
+load_convtasnet()
+# Start the Gradio server only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch()