#!/usr/bin/env python3
"""
Gradio App for Hugging Face Spaces
Audio Processing Pipeline: Demucs + Denoise + Normalize + Resample
"""
import functools
import os
import tempfile
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch
import torchaudio

print("Loading dependencies...")

# Check device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {DEVICE}")


@functools.lru_cache(maxsize=1)
def _load_demucs_model():
    """Load the htdemucs model once and cache it for subsequent requests.

    Without caching, every processing request re-instantiates (and on first
    use re-downloads) the pretrained weights.
    """
    from demucs.pretrained import get_model

    model = get_model('htdemucs')
    model.eval()
    return model


def separate_vocals_demucs(audio_path, device="cpu"):
    """Extract the vocal stem from an audio file using Demucs.

    Args:
        audio_path: Path to the input audio file (any format soundfile reads).
        device: Torch device string, "cpu" or "cuda".

    Returns:
        Tuple of (vocals, sr) where vocals is a float tensor of shape
        [channels, samples] on the CPU and sr is always 44100.
    """
    from demucs.apply import apply_model

    model = _load_demucs_model()
    model.to(device)

    # Load audio using soundfile instead of torchaudio
    audio_data, sr = sf.read(audio_path, dtype='float32')

    # Ensure tensor shape is [channels, samples]
    if audio_data.ndim == 1:
        wav = torch.from_numpy(audio_data).unsqueeze(0)
    else:
        wav = torch.from_numpy(audio_data.T)

    # Demucs models operate at 44.1 kHz; resample if needed
    if sr != 44100:
        wav = torchaudio.transforms.Resample(sr, 44100)(wav)
        sr = 44100

    # htdemucs expects stereo input; duplicate a mono channel so
    # apply_model receives the channel count the model was trained on.
    if wav.shape[0] == 1:
        wav = wav.repeat(2, 1)

    wav = wav.to(device)
    if wav.dim() == 2:
        wav = wav.unsqueeze(0)  # add batch dimension

    with torch.no_grad():
        sources = apply_model(model, wav, device=device)

    # Pick the vocals stem out of the separated sources
    vocals_idx = model.sources.index('vocals')
    vocals = sources[0, vocals_idx].cpu()
    return vocals, sr


def denoise_audio(audio, sr):
    """Apply spectral-gating noise reduction (best effort).

    If noisereduce is unavailable or fails, the input is returned
    unchanged — denoising is an optional enhancement, not a hard
    requirement of the pipeline.

    Args:
        audio: Tensor of shape [1, samples] (mono).
        sr: Sample rate in Hz.

    Returns:
        Denoised (or original) tensor of shape [1, samples].
    """
    try:
        import noisereduce as nr

        audio_np = audio.squeeze().numpy()
        reduced = nr.reduce_noise(
            y=audio_np,
            sr=sr,
            stationary=True,
            prop_decrease=1.0,
            freq_mask_smooth_hz=500,
            time_mask_smooth_ms=50,
        )
        audio = torch.from_numpy(reduced).unsqueeze(0).float()
    except Exception as e:
        # Deliberate best-effort: report and continue with the raw audio.
        print(f"Denoising skipped: {e}")
    return audio


def normalize_loudness(audio, target_dbfs=-20.0):
    """Scale audio so its RMS level matches target_dbfs.

    Args:
        audio: Tensor of shape [channels, samples].
        target_dbfs: Target RMS level in dBFS (negative; 0 is full scale).

    Returns:
        Gain-adjusted tensor, hard-clamped to [-1.0, 1.0].
    """
    rms = torch.sqrt(torch.mean(audio ** 2))
    if rms > 0:  # silent input: nothing to normalize, avoid log10(0)
        current_dbfs = 20 * torch.log10(rms)
        gain_db = target_dbfs - current_dbfs
        gain_linear = 10 ** (gain_db / 20)
        audio = audio * gain_linear
    audio = torch.clamp(audio, -1.0, 1.0)
    return audio


def convert_to_mono(audio):
    """Average all channels down to a single mono channel [1, samples]."""
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)
    return audio


def process_audio(
    input_file,
    target_sr,
    target_dbfs,
    use_demucs,
    use_denoise,
    progress=gr.Progress(),
):
    """Complete audio processing pipeline.

    Steps: optional Demucs vocal separation -> mono -> optional denoise ->
    loudness normalization -> resample -> write WAV.

    Args:
        input_file: Filepath from the Gradio Audio component (or None).
        target_sr: Desired output sample rate in Hz.
        target_dbfs: Target loudness for normalization.
        use_demucs: Whether to run vocal separation.
        use_denoise: Whether to run noise reduction.
        progress: Gradio progress reporter.

    Returns:
        (output_path, info_markdown) on success, (None, error_markdown)
        on failure — matching the two Gradio output components.
    """
    if input_file is None:
        return None, "❌ Please upload an audio file"

    try:
        progress(0.1, desc="Loading audio...")

        # Step 1: Vocal separation (optional)
        if use_demucs:
            progress(0.2, desc="Separating vocals with Demucs...")
            audio, sr = separate_vocals_demucs(input_file, DEVICE)
        else:
            # Load audio using soundfile
            audio_data, sr = sf.read(input_file, dtype='float32')
            if audio_data.ndim == 1:
                audio = torch.from_numpy(audio_data).unsqueeze(0)
            else:
                audio = torch.from_numpy(audio_data.T)

        # Step 2: Convert to mono
        progress(0.5, desc="Converting to mono...")
        audio = convert_to_mono(audio)

        # Step 3: Denoise (optional)
        if use_denoise:
            progress(0.6, desc="Removing noise...")
            audio = denoise_audio(audio, sr)

        # Step 4: Normalize
        progress(0.7, desc="Normalizing loudness...")
        audio = normalize_loudness(audio, target_dbfs)

        # Step 5: Resample
        if sr != target_sr:
            progress(0.8, desc=f"Resampling to {target_sr} Hz...")
            resampler = torchaudio.transforms.Resample(sr, target_sr)
            audio = resampler(audio)
            sr = target_sr

        # Save output
        progress(0.9, desc="Saving output...")
        # Close the handle immediately: keeping it open leaks the fd and
        # blocks the subsequent sf.write on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            output_path = tmp.name

        # Save using soundfile instead of torchaudio (mono 1-D array)
        sf.write(output_path, audio.squeeze().numpy(), sr)

        # Get info
        duration = audio.shape[1] / sr
        size_mb = os.path.getsize(output_path) / (1024 * 1024)

        info = f"""
✅ **Processing Complete!**

📊 **Output Info:**
- Duration: {duration:.1f} seconds
- Sample Rate: {sr} Hz
- Channels: {audio.shape[0]} (mono)
- Size: {size_mb:.2f} MB
- Loudness: {target_dbfs} dBFS

🎵 **Pipeline Steps:**
{"✓ Demucs vocal separation" if use_demucs else "⊗ Skipped vocal separation"}
{"✓ Noise reduction" if use_denoise else "⊗ Skipped noise reduction"}
✓ Loudness normalization
✓ Resampled to {target_sr} Hz
✓ Converted to mono
"""

        progress(1.0, desc="Done!")
        return output_path, info

    except Exception as e:
        import traceback

        error_msg = f"❌ **Error:** {str(e)}\n\n```\n{traceback.format_exc()}\n```"
        return None, error_msg


# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎵 Audio Processing Pipeline for TTS

    Extract clean vocals from podcasts/audio for TTS training

    **Pipeline:** Demucs Vocal Separation → Denoise → Normalize → Resample → Mono
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📁 Input")
            input_audio = gr.Audio(
                label="Upload Audio (WAV format, 44.1kHz recommended)",
                type="filepath",
            )

            gr.Markdown("### ⚙️ Options")
            target_sr = gr.Radio(
                choices=[16000, 22050, 24000, 44100, 48000],
                value=24000,
                label="Target Sample Rate",
                info="24kHz recommended for TTS",
            )
            target_dbfs = gr.Slider(
                minimum=-40,
                maximum=0,
                value=-20,
                step=1,
                label="Target Loudness (dBFS)",
                info="Normalization level (-20 recommended)",
            )
            use_demucs = gr.Checkbox(
                value=True,
                label="Use Demucs Vocal Separation",
                info="Extracts clean vocals (slower but better)",
            )
            use_denoise = gr.Checkbox(
                value=True,
                label="Apply Noise Reduction",
                info="Remove background noise",
            )

            process_btn = gr.Button("🚀 Process Audio", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### 📥 Output")
            output_audio = gr.Audio(
                label="Processed Audio",
                type="filepath",
            )
            output_info = gr.Markdown("Upload audio and click 'Process Audio' to start")

    gr.Markdown("""
    ---
    ### 📖 Usage Tips

    - **Input:** Upload WAV files (44.1kHz recommended for best quality)
    - **Demucs:** Enable for podcasts with music/background sounds
    - **Denoise:** Enable for noisy recordings
    - **Sample Rate:** Use 24kHz for TTS training, 16kHz for ASR
    - **Processing Time:** ~30-60 seconds for 5-minute audio (CPU mode)

    ### 🔧 Technical Details

    - **Device:** {} {}
    - **Demucs Model:** htdemucs (hybrid transformer)
    - **Denoise:** Spectral gating with noisereduce
    - **Output:** Mono WAV, normalized loudness

    ### 💡 Next Steps

    After processing:
    1. Download the clean audio
    2. Use Pyannote for speaker diarization
    3. Use Whisper for transcription
    4. Package as TTS training dataset

    ---
    Made with ❤️ for TTS dataset creation
    """.format(DEVICE, torch.cuda.get_device_name(0) if DEVICE == "cuda" else ""))

    # Connect button
    process_btn.click(
        fn=process_audio,
        inputs=[input_audio, target_sr, target_dbfs, use_demucs, use_denoise],
        outputs=[output_audio, output_info],
    )


if __name__ == "__main__":
    print("Starting Gradio app...")
    demo.launch()