|
|
|
|
|
""" |
|
|
Gradio App for Hugging Face Spaces |
|
|
Audio Processing Pipeline: Demucs + Denoise + Normalize + Resample |
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import torch |
|
|
import torchaudio |
|
|
import soundfile as sf |
|
|
import os |
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
import numpy as np |
|
|
|
|
|
print("Loading dependencies...")

# Run on GPU when available; all torch work (Demucs separation, resampling)
# uses this device string.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {DEVICE}")
|
|
|
|
|
def separate_vocals_demucs(audio_path, device="cpu"):
    """Extract the vocal stem from an audio file using Demucs (htdemucs).

    Args:
        audio_path: Path to an input audio file readable by soundfile.
        device: Torch device string ("cpu" or "cuda") to run separation on.

    Returns:
        Tuple ``(vocals, sr)``: the separated vocals as a
        (channels, samples) float tensor on CPU, and the sample rate
        (always 44100, Demucs' native rate after resampling).
    """
    from demucs.pretrained import get_model
    from demucs.apply import apply_model

    # Cache the loaded model on the function object so repeated calls from
    # the Gradio handler don't re-download/re-load the weights every time.
    model = getattr(separate_vocals_demucs, "_model", None)
    if model is None:
        model = get_model('htdemucs')
        model.eval()
        separate_vocals_demucs._model = model
    model.to(device)

    audio_data, sr = sf.read(audio_path, dtype='float32')

    # soundfile returns (samples,) for mono or (samples, channels) for
    # multichannel; convert to torch's (channels, samples) layout.
    if audio_data.ndim == 1:
        wav = torch.from_numpy(audio_data).unsqueeze(0)
    else:
        wav = torch.from_numpy(audio_data.T)

    # htdemucs operates at 44.1 kHz; resample anything else first.
    if sr != 44100:
        wav = torchaudio.transforms.Resample(sr, 44100)(wav)
        sr = 44100

    # htdemucs expects stereo input; duplicate the channel for mono files
    # instead of feeding a mismatched channel count into apply_model.
    if wav.shape[0] == 1 and getattr(model, "audio_channels", 2) == 2:
        wav = wav.repeat(2, 1)

    wav = wav.to(device)
    if wav.dim() == 2:
        # apply_model wants a batch dimension: (batch, channels, samples).
        wav = wav.unsqueeze(0)

    with torch.no_grad():
        sources = apply_model(model, wav, device=device)

    # sources has shape (batch, stem, channels, samples); select vocals.
    vocals_idx = model.sources.index('vocals')
    vocals = sources[0, vocals_idx].cpu()

    return vocals, sr
|
|
|
|
|
|
|
|
def denoise_audio(audio, sr):
    """Reduce stationary background noise via spectral gating.

    Uses the optional ``noisereduce`` package; if it is missing or the
    reduction fails for any reason, the input tensor is returned unchanged
    (best-effort behavior).

    Args:
        audio: (1, samples) float tensor.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        A (1, samples) float tensor — denoised, or the original on failure.
    """
    try:
        import noisereduce as nr

        samples = audio.squeeze().numpy()
        cleaned = nr.reduce_noise(
            y=samples,
            sr=sr,
            stationary=True,
            prop_decrease=1.0,
            freq_mask_smooth_hz=500,
            time_mask_smooth_ms=50,
        )
        return torch.from_numpy(cleaned).unsqueeze(0).float()
    except Exception as e:
        # Deliberately broad: denoising is optional polish, never fatal.
        print(f"Denoising skipped: {e}")
        return audio
|
|
|
|
|
|
|
|
def normalize_loudness(audio, target_dbfs=-20.0):
    """Scale audio so its RMS level hits ``target_dbfs``, then hard-clip.

    Silent input (zero RMS) is returned untouched. Output samples are
    clamped to [-1.0, 1.0] after the gain is applied.

    Args:
        audio: Float tensor of samples.
        target_dbfs: Desired RMS level in dBFS (default -20).

    Returns:
        The gain-adjusted (and clamped) tensor.
    """
    rms = audio.pow(2).mean().sqrt()
    if rms > 0:
        gain_db = target_dbfs - 20 * torch.log10(rms)
        audio = (audio * 10 ** (gain_db / 20)).clamp(-1.0, 1.0)
    return audio
|
|
|
|
|
|
|
|
def convert_to_mono(audio):
    """Downmix a (channels, samples) tensor to mono by averaging channels.

    Already-mono input is returned as-is.
    """
    if audio.shape[0] <= 1:
        return audio
    return audio.mean(dim=0, keepdim=True)
|
|
|
|
|
|
|
|
def process_audio(
    input_file,
    target_sr,
    target_dbfs,
    use_demucs,
    use_denoise,
    progress=gr.Progress()
):
    """Run the full pipeline: load -> (Demucs) -> mono -> (denoise) -> normalize -> resample -> save.

    Args:
        input_file: Filepath from the gr.Audio input, or None if nothing uploaded.
        target_sr: Output sample rate in Hz (from the Radio control).
        target_dbfs: RMS normalization target in dBFS (from the Slider).
        use_demucs: Whether to run Demucs vocal separation.
        use_denoise: Whether to run noisereduce denoising.
        progress: Gradio progress tracker; the gr.Progress() default is the
            documented Gradio pattern for progress injection, not an accidental
            mutable default.

    Returns:
        (output_path, info_markdown) on success, or (None, error_markdown)
        on bad input or any pipeline failure.
    """

    if input_file is None:
        # NOTE(review): emoji in this and the strings below appear
        # mojibake'd in this copy of the file — verify source encoding.
        return None, "β Please upload an audio file"

    try:
        progress(0.1, desc="Loading audio...")

        if use_demucs:
            progress(0.2, desc="Separating vocals with Demucs...")
            audio, sr = separate_vocals_demucs(input_file, DEVICE)
        else:
            # Plain load path: (samples,) mono or (samples, channels);
            # transpose to torch's (channels, samples) layout.
            audio_data, sr = sf.read(input_file, dtype='float32')
            if audio_data.ndim == 1:
                audio = torch.from_numpy(audio_data).unsqueeze(0)
            else:
                audio = torch.from_numpy(audio_data.T)

        progress(0.5, desc="Converting to mono...")
        audio = convert_to_mono(audio)

        if use_denoise:
            progress(0.6, desc="Removing noise...")
            audio = denoise_audio(audio, sr)

        progress(0.7, desc="Normalizing loudness...")
        audio = normalize_loudness(audio, target_dbfs)

        # Resample only when the source rate differs from the requested one.
        if sr != target_sr:
            progress(0.8, desc=f"Resampling to {target_sr} Hz...")
            resampler = torchaudio.transforms.Resample(sr, target_sr)
            audio = resampler(audio)
            sr = target_sr

        progress(0.9, desc="Saving output...")
        # delete=False so the file survives for Gradio to serve; the temp
        # file is never cleaned up here (left to the OS/Spaces sandbox).
        output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name

        # squeeze -> (samples,) for mono; .T is a no-op on 1-D data.
        sf.write(output_path, audio.squeeze().numpy().T, sr)

        duration = audio.shape[1] / sr
        size_mb = os.path.getsize(output_path) / (1024 * 1024)

        # Markdown summary shown in the output panel. Note: the "Resampled"
        # line is printed even when resampling was skipped above.
        info = f"""
β
**Processing Complete!**

π **Output Info:**
- Duration: {duration:.1f} seconds
- Sample Rate: {sr} Hz
- Channels: {audio.shape[0]} (mono)
- Size: {size_mb:.2f} MB
- Loudness: {target_dbfs} dBFS

π΅ **Pipeline Steps:**
{"β Demucs vocal separation" if use_demucs else "β Skipped vocal separation"}
{"β Noise reduction" if use_denoise else "β Skipped noise reduction"}
β Loudness normalization
β Resampled to {target_sr} Hz
β Converted to mono
"""

        progress(1.0, desc="Done!")
        return output_path, info

    except Exception as e:
        # Boundary handler: surface the full traceback in the UI instead of
        # crashing the Gradio worker.
        import traceback
        error_msg = f"β **Error:** {str(e)}\n\n```\n{traceback.format_exc()}\n```"
        return None, error_msg
|
|
|
|
|
|
|
|
|
|
|
# ---- Gradio UI definition (built at import time) ----
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Header banner.
    gr.Markdown("""
# π΅ Audio Processing Pipeline for TTS

Extract clean vocals from podcasts/audio for TTS training

**Pipeline:** Demucs Vocal Separation β Denoise β Normalize β Resample β Mono
""")

    with gr.Row():
        # Left column: input file and pipeline options.
        with gr.Column(scale=1):
            gr.Markdown("### π Input")
            input_audio = gr.Audio(
                label="Upload Audio (WAV format, 44.1kHz recommended)",
                type="filepath"
            )

            gr.Markdown("### βοΈ Options")

            # Output sample rate passed to process_audio as target_sr.
            target_sr = gr.Radio(
                choices=[16000, 22050, 24000, 44100, 48000],
                value=24000,
                label="Target Sample Rate",
                info="24kHz recommended for TTS"
            )

            # RMS normalization target passed as target_dbfs.
            target_dbfs = gr.Slider(
                minimum=-40,
                maximum=0,
                value=-20,
                step=1,
                label="Target Loudness (dBFS)",
                info="Normalization level (-20 recommended)"
            )

            use_demucs = gr.Checkbox(
                value=True,
                label="Use Demucs Vocal Separation",
                info="Extracts clean vocals (slower but better)"
            )

            use_denoise = gr.Checkbox(
                value=True,
                label="Apply Noise Reduction",
                info="Remove background noise"
            )

            process_btn = gr.Button("π Process Audio", variant="primary", size="lg")

        # Right column: processed audio player and status markdown.
        with gr.Column(scale=1):
            gr.Markdown("### π₯ Output")
            output_audio = gr.Audio(
                label="Processed Audio",
                type="filepath"
            )
            output_info = gr.Markdown("Upload audio and click 'Process Audio' to start")

    # Footer: usage tips plus environment details; the two {} placeholders
    # are filled via str.format with the device and (if CUDA) GPU name.
    gr.Markdown("""
---
### π Usage Tips

- **Input:** Upload WAV files (44.1kHz recommended for best quality)
- **Demucs:** Enable for podcasts with music/background sounds
- **Denoise:** Enable for noisy recordings
- **Sample Rate:** Use 24kHz for TTS training, 16kHz for ASR
- **Processing Time:** ~30-60 seconds for 5-minute audio (CPU mode)

### π§ Technical Details

- **Device:** {} {}
- **Demucs Model:** htdemucs (hybrid transformer)
- **Denoise:** Spectral gating with noisereduce
- **Output:** Mono WAV, normalized loudness

### π‘ Next Steps

After processing:
1. Download the clean audio
2. Use Pyannote for speaker diarization
3. Use Whisper for transcription
4. Package as TTS training dataset

---
Made with β€οΈ for TTS dataset creation
""".format(DEVICE, torch.cuda.get_device_name(0) if DEVICE == "cuda" else ""))

    # Wire the button to the processing pipeline.
    process_btn.click(
        fn=process_audio,
        inputs=[input_audio, target_sr, target_dbfs, use_demucs, use_denoise],
        outputs=[output_audio, output_info]
    )
|
|
|
|
|
if __name__ == "__main__":
    print("Starting Gradio app...")
    # Start the web UI with Gradio's default launch settings.
    demo.launch()
|
|
|
|
|
|