extractor / app.py
redsky17's picture
Update app.py
8b9a956 verified
#!/usr/bin/env python3
"""
Gradio App for Hugging Face Spaces
Audio Processing Pipeline: Demucs + Denoise + Normalize + Resample
"""
import gradio as gr
import torch
import torchaudio
import soundfile as sf
import os
import tempfile
from pathlib import Path
import numpy as np
# Startup log line emitted once the module-level imports have run.
print("Loading dependencies...")

# Choose the compute device once at import time: CUDA when torch reports it
# as available, CPU otherwise.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Device: {DEVICE}")
def separate_vocals_demucs(audio_path, device="cpu"):
    """Extract the vocal stem from an audio file using Demucs.

    The file is read with soundfile, adapted to the sample rate and channel
    count of the pretrained ``htdemucs`` model, and run through
    ``apply_model``.

    Args:
        audio_path: Path of the audio file to read.
        device: Torch device string the model and audio are moved to.

    Returns:
        A ``(vocals, sample_rate)`` tuple. ``vocals`` is a CPU tensor holding
        the ``'vocals'`` source of the first (only) batch item, and
        ``sample_rate`` is the rate the audio was processed at.

    Raises:
        ValueError: If the file's channel layout cannot be mapped onto the
            channel count the model expects.
    """
    from demucs.pretrained import get_model
    from demucs.apply import apply_model

    # Load model
    model = get_model('htdemucs')
    model.to(device)
    model.eval()

    # Read the model's expectations instead of hard-coding them; the
    # fallbacks are the values the original code assumed.
    model_sr = getattr(model, "samplerate", 44100)
    model_channels = getattr(model, "audio_channels", 2)

    # Load audio using soundfile instead of torchaudio
    audio_data, sr = sf.read(audio_path, dtype='float32')

    # Convert to torch tensor and ensure correct shape [channels, samples]
    if audio_data.ndim == 1:
        wav = torch.from_numpy(audio_data).unsqueeze(0)
    else:
        wav = torch.from_numpy(audio_data.T)

    # Match the model's channel count. Without this, a mono (or >2 channel)
    # upload reaches the network with the wrong number of channels and fails.
    src_channels = wav.shape[0]
    if src_channels != model_channels:
        if model_channels == 1:
            wav = wav.mean(dim=0, keepdim=True)
        elif src_channels == 1:
            # Duplicate the mono signal (repeat => real copy, safe for
            # any in-place ops downstream).
            wav = wav.repeat(model_channels, 1)
        elif src_channels > model_channels:
            wav = wav[:model_channels]
        else:
            raise ValueError(
                f"Cannot convert audio with {src_channels} channels "
                f"to {model_channels} channels"
            )

    # Resample to the model's sample rate if needed
    if sr != model_sr:
        wav = torchaudio.transforms.Resample(sr, model_sr)(wav)
        sr = model_sr

    # Process: apply_model expects a leading batch dimension
    wav = wav.contiguous().to(device)
    if wav.dim() == 2:
        wav = wav.unsqueeze(0)
    with torch.no_grad():
        sources = apply_model(model, wav, device=device)

    # Extract vocals
    vocals_idx = model.sources.index('vocals')
    vocals = sources[0, vocals_idx].cpu()
    return vocals, sr
def denoise_audio(audio, sr):
    """Apply stationary noise reduction with ``noisereduce`` (best effort).

    Args:
        audio: Audio tensor. The pipeline in this file passes a mono
            ``[1, samples]`` tensor.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        A float tensor with the same shape and device as ``audio``. If
        ``noisereduce`` cannot be imported or raises, a message is printed and
        the input tensor is returned unchanged.
    """
    try:
        import noisereduce as nr

        # detach()/cpu() so tensors that track gradients or live on another
        # device can still be converted to NumPy.
        audio_np = audio.detach().cpu().squeeze().numpy()
        reduced = nr.reduce_noise(
            y=audio_np,
            sr=sr,
            stationary=True,
            prop_decrease=1.0,
            freq_mask_smooth_hz=500,
            time_mask_smooth_ms=50,
        )
        # Restore the caller's shape rather than blindly adding a leading
        # axis, which would turn a multi-channel input into a 3-D tensor.
        audio = (
            torch.from_numpy(reduced)
            .float()
            .reshape(audio.shape)
            .to(audio.device)
        )
    except Exception as e:
        # Deliberate best-effort: denoising is optional, so any failure
        # (missing package, bad input) falls back to the untouched audio.
        print(f"Denoising skipped: {e}")
    return audio
def normalize_loudness(audio, target_dbfs=-20.0):
    """Scale ``audio`` so its RMS level sits at ``target_dbfs``.

    A signal whose RMS is not positive (e.g. pure silence) receives no gain.
    The result is always limited to the range [-1.0, 1.0].
    """
    level = audio.pow(2).mean().sqrt()
    if level > 0:
        # dB distance between the requested level and the measured one,
        # converted to a linear multiplier.
        db_offset = target_dbfs - 20 * torch.log10(level)
        audio = audio * (10 ** (db_offset / 20))
    return audio.clamp(-1.0, 1.0)
def convert_to_mono(audio):
    """Average all channels of a ``[channels, samples]`` tensor into one.

    Input that already has a single channel is returned as-is.
    """
    channel_count = audio.shape[0]
    if channel_count <= 1:
        return audio
    return audio.mean(dim=0, keepdim=True)
def process_audio(
    input_file,
    target_sr,
    target_dbfs,
    use_demucs,
    use_denoise,
    progress=gr.Progress()
):
    """Run the complete audio processing pipeline on one uploaded file.

    Steps, in order: optional Demucs vocal separation, mono down-mix,
    optional noise reduction, loudness normalization, resampling to
    ``target_sr`` and export as a WAV file.

    Args:
        input_file: Path of the uploaded audio file, or ``None``.
        target_sr: Output sample rate in Hz.
        target_dbfs: Target RMS loudness in dBFS.
        use_demucs: Whether to run Demucs vocal separation first.
        use_denoise: Whether to apply noise reduction.
        progress: Gradio progress tracker used for status updates.

    Returns:
        ``(output_path, info_markdown)`` on success. ``(None, message)`` when
        no file was supplied or any step raised; exceptions are reported in
        the message instead of propagating.
    """
    if input_file is None:
        return None, "❌ Please upload an audio file"

    output_path = None
    try:
        progress(0.1, desc="Loading audio...")

        # Resample needs integer rates; coerce in case the UI delivers the
        # selected choice as a float or string.
        target_sr = int(target_sr)

        # Step 1: Vocal separation (optional)
        if use_demucs:
            progress(0.2, desc="Separating vocals with Demucs...")
            audio, sr = separate_vocals_demucs(input_file, DEVICE)
        else:
            # Load audio using soundfile, shaped as [channels, samples]
            audio_data, sr = sf.read(input_file, dtype='float32')
            if audio_data.ndim == 1:
                audio = torch.from_numpy(audio_data).unsqueeze(0)
            else:
                audio = torch.from_numpy(audio_data.T)

        # Step 2: Convert to mono
        progress(0.5, desc="Converting to mono...")
        audio = convert_to_mono(audio)

        # Step 3: Denoise (optional)
        if use_denoise:
            progress(0.6, desc="Removing noise...")
            audio = denoise_audio(audio, sr)

        # Step 4: Normalize
        progress(0.7, desc="Normalizing loudness...")
        audio = normalize_loudness(audio, target_dbfs)

        # Step 5: Resample
        if sr != target_sr:
            progress(0.8, desc=f"Resampling to {target_sr} Hz...")
            resampler = torchaudio.transforms.Resample(sr, target_sr)
            audio = resampler(audio)
            sr = target_sr

        # Save output
        progress(0.9, desc="Saving output...")
        # Reserve a unique path and close the handle immediately so the
        # descriptor is not leaked and soundfile can reopen the file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            output_path = tmp.name
        # squeeze(0) drops only the channel axis, so even a one-sample
        # signal stays 1-D for soundfile.
        sf.write(output_path, audio.squeeze(0).numpy(), sr)

        # Get info
        duration = audio.shape[1] / sr
        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        demucs_line = (
            "✓ Demucs vocal separation" if use_demucs
            else "⊗ Skipped vocal separation"
        )
        denoise_line = (
            "✓ Noise reduction" if use_denoise
            else "⊗ Skipped noise reduction"
        )
        info = "\n".join([
            "",
            "✅ **Processing Complete!**",
            "📊 **Output Info:**",
            f"- Duration: {duration:.1f} seconds",
            f"- Sample Rate: {sr} Hz",
            f"- Channels: {audio.shape[0]} (mono)",
            f"- Size: {size_mb:.2f} MB",
            f"- Loudness: {target_dbfs} dBFS",
            "🎵 **Pipeline Steps:**",
            demucs_line,
            denoise_line,
            "✓ Loudness normalization",
            f"✓ Resampled to {target_sr} Hz",
            "✓ Converted to mono",
            "",
        ])
        progress(1.0, desc="Done!")
        return output_path, info
    except Exception as e:
        import traceback
        error_msg = f"❌ **Error:** {str(e)}\n\n```\n{traceback.format_exc()}\n```"
        # Do not leave a half-written temp file behind after a failed run.
        if output_path is not None and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
        return None, error_msg
# Create Gradio interface
# Create Gradio interface: a two-column layout (inputs/options on the left,
# results on the right) followed by usage notes.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Header. The pipeline line lists the steps in the order process_audio
    # actually runs them (mono down-mix happens right after separation).
    gr.Markdown("""
# 🎵 Audio Processing Pipeline for TTS
Extract clean vocals from podcasts/audio for TTS training
**Pipeline:** Demucs Vocal Separation → Mono → Denoise → Normalize → Resample
""")
    with gr.Row():
        # Left column: upload widget and processing options
        with gr.Column(scale=1):
            gr.Markdown("### 📁 Input")
            input_audio = gr.Audio(
                label="Upload Audio (WAV format, 44.1kHz recommended)",
                type="filepath"
            )
            gr.Markdown("### ⚙️ Options")
            target_sr = gr.Radio(
                choices=[16000, 22050, 24000, 44100, 48000],
                value=24000,
                label="Target Sample Rate",
                info="24kHz recommended for TTS"
            )
            target_dbfs = gr.Slider(
                minimum=-40,
                maximum=0,
                value=-20,
                step=1,
                label="Target Loudness (dBFS)",
                info="Normalization level (-20 recommended)"
            )
            use_demucs = gr.Checkbox(
                value=True,
                label="Use Demucs Vocal Separation",
                info="Extracts clean vocals (slower but better)"
            )
            use_denoise = gr.Checkbox(
                value=True,
                label="Apply Noise Reduction",
                info="Remove background noise"
            )
            process_btn = gr.Button("🚀 Process Audio", variant="primary", size="lg")
        # Right column: processed audio and the status/report text
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Output")
            output_audio = gr.Audio(
                label="Processed Audio",
                type="filepath"
            )
            output_info = gr.Markdown("Upload audio and click 'Process Audio' to start")
    # Footer notes; the two {} placeholders receive the device string and,
    # on CUDA, the GPU name.
    gr.Markdown("""
---
### 📖 Usage Tips
- **Input:** Upload WAV files (44.1kHz recommended for best quality)
- **Demucs:** Enable for podcasts with music/background sounds
- **Denoise:** Enable for noisy recordings
- **Sample Rate:** Use 24kHz for TTS training, 16kHz for ASR
- **Processing Time:** ~30-60 seconds for 5-minute audio (CPU mode)
### 🔧 Technical Details
- **Device:** {} {}
- **Demucs Model:** htdemucs (hybrid transformer)
- **Denoise:** Spectral gating with noisereduce
- **Output:** Mono WAV, normalized loudness
### 💡 Next Steps
After processing:
1. Download the clean audio
2. Use Pyannote for speaker diarization
3. Use Whisper for transcription
4. Package as TTS training dataset
---
Made with ❤️ for TTS dataset creation
""".format(DEVICE, torch.cuda.get_device_name(0) if DEVICE == "cuda" else ""))
    # Connect button: inputs map positionally onto process_audio's parameters
    process_btn.click(
        fn=process_audio,
        inputs=[input_audio, target_sr, target_dbfs, use_demucs, use_denoise],
        outputs=[output_audio, output_info]
    )
# Script entry point: start the web server only when run directly.
if __name__ == "__main__":
    print("Starting Gradio app...")
    # process_audio reports status through gr.Progress, which is delivered via
    # the request queue. Enabling the queue explicitly keeps progress updates
    # working on Gradio versions where it is not on by default.
    demo.queue()
    demo.launch()