|
|
|
|
|
""" |
|
|
Gradio App for Hugging Face Spaces |
|
|
Audio Processing Pipeline: Demucs + Denoise + Normalize + Resample |
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import torch |
|
|
import torchaudio |
|
|
import soundfile as sf |
|
|
import os |
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
import numpy as np |
|
|
|
|
|
print("Loading dependencies...")

# Run on GPU when available; all torch work (Demucs separation, resampling)
# uses this device string.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {DEVICE}")
|
|
|
|
|
def separate_vocals_demucs(audio_path, device="cpu"):
    """Extract the vocal stem from an audio file using Demucs (htdemucs).

    Args:
        audio_path: Path to an input audio file readable by soundfile.
        device: Torch device string ("cpu" or "cuda") to run separation on.

    Returns:
        Tuple ``(vocals, sr)``: the separated vocals as a
        (channels, samples) float tensor on CPU, and the sample rate
        (always 44100, Demucs' native rate after resampling).
    """
    from demucs.pretrained import get_model
    from demucs.apply import apply_model

    # Cache the loaded model on the function object so repeated calls from
    # the Gradio handler don't re-download/re-load the weights every time.
    model = getattr(separate_vocals_demucs, "_model", None)
    if model is None:
        model = get_model('htdemucs')
        model.eval()
        separate_vocals_demucs._model = model
    model.to(device)

    audio_data, sr = sf.read(audio_path, dtype='float32')

    # soundfile returns (samples,) for mono or (samples, channels) for
    # multichannel; convert to torch's (channels, samples) layout.
    if audio_data.ndim == 1:
        wav = torch.from_numpy(audio_data).unsqueeze(0)
    else:
        wav = torch.from_numpy(audio_data.T)

    # htdemucs operates at 44.1 kHz; resample anything else first.
    if sr != 44100:
        wav = torchaudio.transforms.Resample(sr, 44100)(wav)
        sr = 44100

    # htdemucs expects stereo input; duplicate the channel for mono files
    # instead of feeding a mismatched channel count into apply_model.
    if wav.shape[0] == 1 and getattr(model, "audio_channels", 2) == 2:
        wav = wav.repeat(2, 1)

    wav = wav.to(device)
    if wav.dim() == 2:
        # apply_model wants a batch dimension: (batch, channels, samples).
        wav = wav.unsqueeze(0)

    with torch.no_grad():
        sources = apply_model(model, wav, device=device)

    # sources has shape (batch, stem, channels, samples); select vocals.
    vocals_idx = model.sources.index('vocals')
    vocals = sources[0, vocals_idx].cpu()

    return vocals, sr
|
|
|
|
|
|
|
|
def denoise_audio(audio, sr):
    """Reduce stationary background noise via spectral gating.

    Uses the optional ``noisereduce`` package; if it is missing or the
    reduction fails for any reason, the input tensor is returned unchanged
    (best-effort behavior).

    Args:
        audio: (1, samples) float tensor.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        A (1, samples) float tensor — denoised, or the original on failure.
    """
    try:
        import noisereduce as nr

        samples = audio.squeeze().numpy()
        cleaned = nr.reduce_noise(
            y=samples,
            sr=sr,
            stationary=True,
            prop_decrease=1.0,
            freq_mask_smooth_hz=500,
            time_mask_smooth_ms=50,
        )
        return torch.from_numpy(cleaned).unsqueeze(0).float()
    except Exception as e:
        # Deliberately broad: denoising is optional polish, never fatal.
        print(f"Denoising skipped: {e}")
        return audio
|
|
|
|
|
|
|
|
def normalize_loudness(audio, target_dbfs=-20.0):
    """Scale audio so its RMS level hits ``target_dbfs``, then hard-clip.

    Silent input (zero RMS) is returned untouched. Output samples are
    clamped to [-1.0, 1.0] after the gain is applied.

    Args:
        audio: Float tensor of samples.
        target_dbfs: Desired RMS level in dBFS (default -20).

    Returns:
        The gain-adjusted (and clamped) tensor.
    """
    rms = audio.pow(2).mean().sqrt()
    if rms > 0:
        gain_db = target_dbfs - 20 * torch.log10(rms)
        audio = (audio * 10 ** (gain_db / 20)).clamp(-1.0, 1.0)
    return audio
|
|
|
|
|
|
|
|
def convert_to_mono(audio):
    """Downmix a (channels, samples) tensor to mono by averaging channels.

    Already-mono input is returned as-is.
    """
    if audio.shape[0] <= 1:
        return audio
    return audio.mean(dim=0, keepdim=True)
|
|
|
|
|
|
|
|
def process_audio(
    input_file,
    target_sr,
    target_dbfs,
    use_demucs,
    use_denoise,
    progress=gr.Progress()
):
    """Run the full pipeline: load -> (Demucs) -> mono -> (denoise) -> normalize -> resample -> save.

    Args:
        input_file: Filepath from the gr.Audio input, or None if nothing uploaded.
        target_sr: Output sample rate in Hz (from the Radio control).
        target_dbfs: RMS normalization target in dBFS (from the Slider).
        use_demucs: Whether to run Demucs vocal separation.
        use_denoise: Whether to run noisereduce denoising.
        progress: Gradio progress tracker; the gr.Progress() default is the
            documented Gradio pattern for progress injection, not an accidental
            mutable default.

    Returns:
        (output_path, info_markdown) on success, or (None, error_markdown)
        on bad input or any pipeline failure.
    """

    if input_file is None:
        # NOTE(review): emoji in this and the strings below appear
        # mojibake'd in this copy of the file — verify source encoding.
        return None, "β Please upload an audio file"

    try:
        progress(0.1, desc="Loading audio...")

        if use_demucs:
            progress(0.2, desc="Separating vocals with Demucs...")
            audio, sr = separate_vocals_demucs(input_file, DEVICE)
        else:
            # Plain load path: (samples,) mono or (samples, channels);
            # transpose to torch's (channels, samples) layout.
            audio_data, sr = sf.read(input_file, dtype='float32')
            if audio_data.ndim == 1:
                audio = torch.from_numpy(audio_data).unsqueeze(0)
            else:
                audio = torch.from_numpy(audio_data.T)

        progress(0.5, desc="Converting to mono...")
        audio = convert_to_mono(audio)

        if use_denoise:
            progress(0.6, desc="Removing noise...")
            audio = denoise_audio(audio, sr)

        progress(0.7, desc="Normalizing loudness...")
        audio = normalize_loudness(audio, target_dbfs)

        # Resample only when the source rate differs from the requested one.
        if sr != target_sr:
            progress(0.8, desc=f"Resampling to {target_sr} Hz...")
            resampler = torchaudio.transforms.Resample(sr, target_sr)
            audio = resampler(audio)
            sr = target_sr

        progress(0.9, desc="Saving output...")
        # delete=False so the file survives for Gradio to serve; the temp
        # file is never cleaned up here (left to the OS/Spaces sandbox).
        output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name

        # squeeze -> (samples,) for mono; .T is a no-op on 1-D data.
        sf.write(output_path, audio.squeeze().numpy().T, sr)

        duration = audio.shape[1] / sr
        size_mb = os.path.getsize(output_path) / (1024 * 1024)

        # Markdown summary shown in the output panel. Note: the "Resampled"
        # line is printed even when resampling was skipped above.
        info = f"""
β
**Processing Complete!**

π **Output Info:**
- Duration: {duration:.1f} seconds
- Sample Rate: {sr} Hz
- Channels: {audio.shape[0]} (mono)
- Size: {size_mb:.2f} MB
- Loudness: {target_dbfs} dBFS

π΅ **Pipeline Steps:**
{"β Demucs vocal separation" if use_demucs else "β Skipped vocal separation"}
{"β Noise reduction" if use_denoise else "β Skipped noise reduction"}
β Loudness normalization
β Resampled to {target_sr} Hz
β Converted to mono
"""

        progress(1.0, desc="Done!")
        return output_path, info

    except Exception as e:
        # Boundary handler: surface the full traceback in the UI instead of
        # crashing the Gradio worker.
        import traceback
        error_msg = f"β **Error:** {str(e)}\n\n```\n{traceback.format_exc()}\n```"
        return None, error_msg
|
|
|
|
|
|
|
|
|
|
|
# ---- Gradio UI definition (built at import time) ----
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Header banner.
    gr.Markdown("""
# π΅ Audio Processing Pipeline for TTS

Extract clean vocals from podcasts/audio for TTS training

**Pipeline:** Demucs Vocal Separation β Denoise β Normalize β Resample β Mono
""")

    with gr.Row():
        # Left column: input file and pipeline options.
        with gr.Column(scale=1):
            gr.Markdown("### π Input")
            input_audio = gr.Audio(
                label="Upload Audio (WAV format, 44.1kHz recommended)",
                type="filepath"
            )

            gr.Markdown("### βοΈ Options")

            # Output sample rate passed to process_audio as target_sr.
            target_sr = gr.Radio(
                choices=[16000, 22050, 24000, 44100, 48000],
                value=24000,
                label="Target Sample Rate",
                info="24kHz recommended for TTS"
            )

            # RMS normalization target passed as target_dbfs.
            target_dbfs = gr.Slider(
                minimum=-40,
                maximum=0,
                value=-20,
                step=1,
                label="Target Loudness (dBFS)",
                info="Normalization level (-20 recommended)"
            )

            use_demucs = gr.Checkbox(
                value=True,
                label="Use Demucs Vocal Separation",
                info="Extracts clean vocals (slower but better)"
            )

            use_denoise = gr.Checkbox(
                value=True,
                label="Apply Noise Reduction",
                info="Remove background noise"
            )

            process_btn = gr.Button("π Process Audio", variant="primary", size="lg")

        # Right column: processed audio player and status markdown.
        with gr.Column(scale=1):
            gr.Markdown("### π₯ Output")
            output_audio = gr.Audio(
                label="Processed Audio",
                type="filepath"
            )
            output_info = gr.Markdown("Upload audio and click 'Process Audio' to start")

    # Footer: usage tips plus environment details; the two {} placeholders
    # are filled via str.format with the device and (if CUDA) GPU name.
    gr.Markdown("""
---
### π Usage Tips

- **Input:** Upload WAV files (44.1kHz recommended for best quality)
- **Demucs:** Enable for podcasts with music/background sounds
- **Denoise:** Enable for noisy recordings
- **Sample Rate:** Use 24kHz for TTS training, 16kHz for ASR
- **Processing Time:** ~30-60 seconds for 5-minute audio (CPU mode)

### π§ Technical Details

- **Device:** {} {}
- **Demucs Model:** htdemucs (hybrid transformer)
- **Denoise:** Spectral gating with noisereduce
- **Output:** Mono WAV, normalized loudness

### π‘ Next Steps

After processing:
1. Download the clean audio
2. Use Pyannote for speaker diarization
3. Use Whisper for transcription
4. Package as TTS training dataset

---
Made with β€οΈ for TTS dataset creation
""".format(DEVICE, torch.cuda.get_device_name(0) if DEVICE == "cuda" else ""))

    # Wire the button to the processing pipeline.
    process_btn.click(
        fn=process_audio,
        inputs=[input_audio, target_sr, target_dbfs, use_demucs, use_denoise],
        outputs=[output_audio, output_info]
    )
|
|
|
|
|
if __name__ == "__main__":
    print("Starting Gradio app...")
    # Start the web UI with Gradio's default launch settings.
    demo.launch()
|
|
|
|
|
|