Spaces:

redsky17
/

extractor

Sleeping

File size: 8,495 Bytes

#!/usr/bin/env python3
"""
Gradio App for Hugging Face Spaces
Audio Processing Pipeline: Demucs + Denoise + Normalize + Resample
"""

import gradio as gr
import torch
import torchaudio
import soundfile as sf
import os
import tempfile
from pathlib import Path
import numpy as np

print("Loading dependencies...")

# Pick the compute device once at import time: use the GPU when torch can
# see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Device: {DEVICE}")

# Loaded Demucs models keyed by device string, so the checkpoint is read and
# moved to the device once per process instead of on every request.
_DEMUCS_MODEL_CACHE = {}


def _get_demucs_model(device):
    """Return the 'htdemucs' model on *device* in eval mode, loading it once."""
    from demucs.pretrained import get_model

    key = str(device)
    model = _DEMUCS_MODEL_CACHE.get(key)
    if model is None:
        model = get_model('htdemucs')
        model.to(device)
        model.eval()
        _DEMUCS_MODEL_CACHE[key] = model
    return model


def separate_vocals_demucs(audio_path, device="cpu"):
    """Extract the vocals stem from an audio file using Demucs.

    Args:
        audio_path: Path of the audio file to read with soundfile.
        device: Torch device string the model runs on (e.g. "cpu", "cuda").

    Returns:
        A tuple ``(vocals, sr)`` where ``vocals`` is a CPU tensor shaped
        ``[channels, samples]`` and ``sr`` is its sample rate, i.e. the
        model's sample rate.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    from demucs.apply import apply_model

    model = _get_demucs_model(device)
    # Read the expected input format from the model rather than hard-coding
    # it; the fallbacks are the values the original code assumed.
    model_sr = int(getattr(model, "samplerate", 44100))
    model_channels = int(getattr(model, "audio_channels", 2))

    # Load audio using soundfile instead of torchaudio. always_2d gives
    # [samples, channels] even for mono files, so one code path suffices.
    audio_data, sr = sf.read(audio_path, dtype='float32', always_2d=True)
    wav = torch.from_numpy(audio_data.T).contiguous()  # [channels, samples]

    if wav.shape[-1] == 0:
        raise ValueError("Input audio contains no samples")

    # Match the model's channel count: duplicate a mono track, or keep the
    # leading channels when the file has more than the model accepts.
    if wav.shape[0] == 1 and model_channels > 1:
        wav = wav.repeat(model_channels, 1)
    elif wav.shape[0] > model_channels:
        wav = wav[:model_channels]

    # Resample to the model's sample rate if needed
    if sr != model_sr:
        wav = torchaudio.transforms.Resample(sr, model_sr)(wav)
        sr = model_sr

    # apply_model expects a leading batch dimension: [batch, channels, samples]
    wav = wav.to(device).unsqueeze(0)

    with torch.no_grad():
        sources = apply_model(model, wav, device=device)

    # Pick the vocals stem of the single batch item
    vocals_idx = model.sources.index('vocals')
    vocals = sources[0, vocals_idx].cpu()

    return vocals, sr


def denoise_audio(audio, sr):
    """Apply stationary spectral-gating noise reduction with noisereduce.

    This is a best-effort step: if noisereduce is not installed or the
    reduction fails for any reason, the problem is printed and the input
    tensor is returned unchanged.

    Args:
        audio: Audio tensor, ``[channels, samples]`` as used by the pipeline.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        The denoised float tensor on the same device as the input. A 1-D
        result is returned as ``[1, samples]``; a multi-channel result keeps
        its ``[channels, samples]`` shape.
    """
    try:
        import noisereduce as nr
        # .numpy() only works on CPU tensors without grad, so move/detach
        # first instead of letting a GPU tensor silently skip denoising.
        audio_np = audio.detach().cpu().squeeze().numpy()
        reduced = nr.reduce_noise(
            y=audio_np,
            sr=sr,
            stationary=True,
            prop_decrease=1.0,
            freq_mask_smooth_hz=500,
            time_mask_smooth_ms=50
        )
        denoised = torch.from_numpy(reduced).float()
        # Only restore the channel axis when squeeze() removed it; adding it
        # unconditionally would turn multi-channel audio into a 3-D tensor.
        if denoised.dim() == 1:
            denoised = denoised.unsqueeze(0)
        audio = denoised.to(audio.device)
    except Exception as e:
        # Deliberate best-effort: keep the pipeline running without denoise.
        print(f"Denoising skipped: {e}")
    return audio


def normalize_loudness(audio, target_dbfs=-20.0):
    """Scale audio so its RMS level matches ``target_dbfs``.

    The RMS is taken over the whole tensor. Audio whose RMS is not greater
    than zero (e.g. pure silence) is returned untouched. After the gain is
    applied, samples are clamped to the range [-1.0, 1.0].
    """
    rms = audio.pow(2).mean().sqrt()
    if not rms > 0:
        # Nothing to scale; log10 would be undefined for a zero level.
        return audio

    level_dbfs = 20 * torch.log10(rms)
    gain = 10 ** ((target_dbfs - level_dbfs) / 20)
    return (audio * gain).clamp(-1.0, 1.0)


def convert_to_mono(audio):
    """Average all channels into one, keeping the channel dimension.

    The first dimension of ``audio`` is treated as the channel axis. Input
    that already has a single channel is returned as-is.
    """
    channel_count = audio.shape[0]
    if channel_count <= 1:
        return audio
    return audio.mean(dim=0, keepdim=True)


def process_audio(
    input_file,
    target_sr,
    target_dbfs,
    use_demucs,
    use_denoise,
    progress=gr.Progress()
):
    """Run the complete audio processing pipeline on one uploaded file.

    Steps: optional Demucs vocal separation, mono conversion, optional
    noise reduction, loudness normalization, resampling, and writing the
    result to a temporary WAV file.

    Args:
        input_file: Path of the uploaded audio file, or None if nothing was
            uploaded.
        target_sr: Output sample rate in Hz.
        target_dbfs: Target RMS loudness in dBFS.
        use_demucs: Whether to isolate vocals with Demucs first.
        use_denoise: Whether to apply noise reduction.
        progress: Gradio progress tracker used to report each step.

    Returns:
        A tuple ``(output_path, info)``. On success ``output_path`` is the
        path of the written WAV file and ``info`` is a Markdown summary. On
        failure ``output_path`` is None and ``info`` holds the error message
        with its traceback; no exception propagates to the caller.
    """

    if input_file is None:
        return None, "❌ Please upload an audio file"

    try:
        # The resampler and the WAV writer need an integer sample rate.
        target_sr = int(target_sr)

        progress(0.1, desc="Loading audio...")

        # Step 1: Vocal separation (optional)
        if use_demucs:
            progress(0.2, desc="Separating vocals with Demucs...")
            audio, sr = separate_vocals_demucs(input_file, DEVICE)
        else:
            # Load audio using soundfile; convert to [channels, samples]
            audio_data, sr = sf.read(input_file, dtype='float32')
            if audio_data.ndim == 1:
                audio = torch.from_numpy(audio_data).unsqueeze(0)
            else:
                audio = torch.from_numpy(audio_data.T)

        # Step 2: Convert to mono
        progress(0.5, desc="Converting to mono...")
        audio = convert_to_mono(audio)

        # Step 3: Denoise (optional)
        if use_denoise:
            progress(0.6, desc="Removing noise...")
            audio = denoise_audio(audio, sr)

        # Step 4: Normalize
        progress(0.7, desc="Normalizing loudness...")
        audio = normalize_loudness(audio, target_dbfs)

        # Step 5: Resample
        if sr != target_sr:
            progress(0.8, desc=f"Resampling to {target_sr} Hz...")
            resampler = torchaudio.transforms.Resample(sr, target_sr)
            audio = resampler(audio)
            sr = target_sr

        # Save output
        progress(0.9, desc="Saving output...")
        # Reserve a unique path and close the handle right away; the file is
        # kept (delete=False) so soundfile can reopen it by name and Gradio
        # can serve it afterwards.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            output_path = tmp.name
        # Save using soundfile instead of torchaudio. Only the channel axis
        # is squeezed, so a one-sample clip stays 1-D instead of becoming 0-d.
        sf.write(output_path, audio.squeeze(0).numpy().T, sr)

        # Get info
        duration = audio.shape[1] / sr
        size_mb = os.path.getsize(output_path) / (1024 * 1024)

        info = f"""
✅ **Processing Complete!**

📊 **Output Info:**
- Duration: {duration:.1f} seconds
- Sample Rate: {sr} Hz
- Channels: {audio.shape[0]} (mono)
- Size: {size_mb:.2f} MB
- Loudness: {target_dbfs} dBFS

🎵 **Pipeline Steps:**
{"✓ Demucs vocal separation" if use_demucs else "⊗ Skipped vocal separation"}
{"✓ Noise reduction" if use_denoise else "⊗ Skipped noise reduction"}
✓ Loudness normalization
✓ Resampled to {target_sr} Hz
✓ Converted to mono
"""

        progress(1.0, desc="Done!")
        return output_path, info

    except Exception as e:
        # Top-level UI boundary: show the failure in the info panel instead
        # of letting the exception escape into Gradio.
        import traceback
        error_msg = f"❌ **Error:** {str(e)}\n\n```\n{traceback.format_exc()}\n```"
        return None, error_msg


# Create Gradio interface
# Components are laid out in the order they are created inside the nested
# context managers: a header, one row with two equal-width columns (input and
# options on the left, output on the right), then a static help section.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎵 Audio Processing Pipeline for TTS
    
    Extract clean vocals from podcasts/audio for TTS training
    
    **Pipeline:** Demucs Vocal Separation → Denoise → Normalize → Resample → Mono
    """)
    
    with gr.Row():
        # Left column: upload widget plus the processing options
        with gr.Column(scale=1):
            gr.Markdown("### 📁 Input")
            # type="filepath": the value is handed to process_audio as
            # input_file, which reads it from disk with soundfile
            input_audio = gr.Audio(
                label="Upload Audio (WAV format, 44.1kHz recommended)",
                type="filepath"
            )
            
            gr.Markdown("### ⚙️ Options")
            
            # Output sample rate in Hz (process_audio's target_sr)
            target_sr = gr.Radio(
                choices=[16000, 22050, 24000, 44100, 48000],
                value=24000,
                label="Target Sample Rate",
                info="24kHz recommended for TTS"
            )
            
            # Loudness target in dBFS (process_audio's target_dbfs)
            target_dbfs = gr.Slider(
                minimum=-40,
                maximum=0,
                value=-20,
                step=1,
                label="Target Loudness (dBFS)",
                info="Normalization level (-20 recommended)"
            )
            
            # Toggles for the two optional pipeline steps
            use_demucs = gr.Checkbox(
                value=True,
                label="Use Demucs Vocal Separation",
                info="Extracts clean vocals (slower but better)"
            )
            
            use_denoise = gr.Checkbox(
                value=True,
                label="Apply Noise Reduction",
                info="Remove background noise"
            )
            
            process_btn = gr.Button("🚀 Process Audio", variant="primary", size="lg")
        
        # Right column: processed audio and the Markdown status/summary text
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Output")
            output_audio = gr.Audio(
                label="Processed Audio",
                type="filepath"
            )
            output_info = gr.Markdown("Upload audio and click 'Process Audio' to start")
    
    # Static help text. The two "{}" placeholders on the Device line are
    # filled by .format() below with DEVICE and, on CUDA, the GPU name
    # (an empty string on CPU).
    gr.Markdown("""
    ---
    ### 📖 Usage Tips
    
    - **Input:** Upload WAV files (44.1kHz recommended for best quality)
    - **Demucs:** Enable for podcasts with music/background sounds
    - **Denoise:** Enable for noisy recordings
    - **Sample Rate:** Use 24kHz for TTS training, 16kHz for ASR
    - **Processing Time:** ~30-60 seconds for 5-minute audio (CPU mode)
    
    ### 🔧 Technical Details
    
    - **Device:** {} {}
    - **Demucs Model:** htdemucs (hybrid transformer)
    - **Denoise:** Spectral gating with noisereduce
    - **Output:** Mono WAV, normalized loudness
    
    ### 💡 Next Steps
    
    After processing:
    1. Download the clean audio
    2. Use Pyannote for speaker diarization
    3. Use Whisper for transcription
    4. Package as TTS training dataset
    
    ---
    Made with ❤️ for TTS dataset creation
    """.format(DEVICE, torch.cuda.get_device_name(0) if DEVICE == "cuda" else ""))
    
    # Connect button
    # The inputs list follows process_audio's positional parameter order; the
    # two outputs receive its (output_path, info) return tuple.
    process_btn.click(
        fn=process_audio,
        inputs=[input_audio, target_sr, target_dbfs, use_demucs, use_denoise],
        outputs=[output_audio, output_info]
    )

# Launch the app only when this file is executed directly, so importing the
# module builds `demo` without starting it.
if __name__ == "__main__":
    print("Starting Gradio app...")
    demo.launch()