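"""Gradio demo app for DeepFilterNet2 speech enhancement.

Loads a DeepFilterNet2 model, optionally mixes a selected background noise
into the input at a chosen SNR, denoises the result, and renders input and
enhanced spectrograms for comparison.
"""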
import glob
import math
import os
import subprocess
import tempfile
import time
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple, Union

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import torch
from loguru import logger
from PIL import Image as PILImage
from torch import Tensor
from torchaudio.backend.common import AudioMetaData

from df import config
from df.enhance import enhance, init_df, load_audio, save_audio
from df.io import resample
# ============================================================================
# Configuration and Setup
# ============================================================================
@dataclass
class AppConfig:
    """Application configuration."""

    device: torch.device
    sample_rate: int = 48000
    max_duration_seconds: int = 3600
    cleanup_hours: int = 2
    temp_dir: str = "/tmp"
    model_path: str = "./DeepFilterNet2"
    fade_duration: float = 0.15
class AudioProcessor:
    """Handles audio processing operations."""

    def __init__(self, model, df, config: AppConfig):
        self.model = model
        self.df = df
        self.config = config
    def mix_at_snr(
        self, clean: Tensor, noise: Tensor, snr: float, eps: float = 1e-10
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Mix clean speech and noise at the given SNR (dB), rescaling on clipping."""
        clean = torch.as_tensor(clean).mean(0, keepdim=True)
        noise = torch.as_tensor(noise).mean(0, keepdim=True)
        if noise.shape[1] < clean.shape[1]:
            repeats = int(math.ceil(clean.shape[1] / noise.shape[1]))
            noise = noise.repeat((1, repeats))
        max_start = int(noise.shape[1] - clean.shape[1])
        start = torch.randint(0, max_start, ()).item() if max_start > 0 else 0
        noise = noise[:, start : start + clean.shape[1]]
        E_speech = torch.mean(clean.pow(2)) + eps
        E_noise = torch.mean(noise.pow(2)) + eps
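        # Noise gain derivation: after dividing the noise by K, the mixture's
        # SNR is 10 * log10(E_speech / (E_noise / K^2)); solving for the
        # target SNR gives K = sqrt((E_noise / E_speech) * 10^(snr / 10)).
        # eps keeps the ratio and sqrt well-defined for near-silent signals.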
        K = torch.sqrt((E_noise / E_speech) * 10 ** (snr / 10) + eps)
        noise = noise / K
        mixture = clean + noise
        assert torch.isfinite(mixture).all(), "Non-finite values detected in mixture"
        max_m = mixture.abs().max()
        if max_m > 1:
            logger.warning(f"Clipping detected during mixing. Reducing gain by {1 / max_m:.3f}")
            clean, noise, mixture = clean / max_m, noise / max_m, mixture / max_m
        return clean, noise, mixture
    def enhance_audio(self, audio: Tensor) -> Tensor:
        """Enhance audio using the DeepFilterNet model."""
        logger.info(f"Enhancing audio with shape {audio.shape}")
        with torch.no_grad():
            enhanced = enhance(self.model, self.df, audio)
        sr = self.config.sample_rate
        # Clamp the fade length so very short clips cannot produce a negative pad below.
        fade_samples = min(int(sr * self.config.fade_duration), enhanced.shape[1])
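        # Build a fade-in envelope: a linear 0 -> 1 ramp over the first
        # fade_samples, then unity gain for the remainder (presumably to
        # mask any transient at the very start of the enhanced clip).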
        lim = torch.linspace(0.0, 1.0, fade_samples).unsqueeze(0)
        lim = torch.cat((lim, torch.ones(1, enhanced.shape[1] - lim.shape[1])), dim=1)
        enhanced = enhanced * lim
        return enhanced
class AudioLoader:
    """Handles audio loading from various sources."""

    @staticmethod
    def ensure_wav(filepath: str) -> str:
        """Convert audio files to WAV using ffmpeg if needed."""
        if not filepath:
            return filepath
        file_ext = Path(filepath).suffix.lower()
        if file_ext in ['.mp3', '.m4a', '.ogg', '.flac', '.aac']:
            wav_path = str(Path(filepath).with_suffix('.wav'))
            try:
                subprocess.run(
                    ["ffmpeg", "-y", "-i", filepath, "-acodec", "pcm_s16le", wav_path],
                    check=True,
                    capture_output=True,
                )
                logger.info(f"Converted {file_ext} to WAV: {wav_path}")
                return wav_path
            except subprocess.CalledProcessError as e:
                logger.error(f"FFmpeg conversion failed: {e.stderr}")
                raise
        return filepath
    @staticmethod
    def load_audio_gradio(
        audio_or_file: Union[None, str, Tuple[int, np.ndarray]],
        sr: int,
    ) -> Optional[Tuple[Tensor, AudioMetaData]]:
        """Load audio from Gradio input."""
        if audio_or_file is None:
            return None
        if isinstance(audio_or_file, str):
            if audio_or_file.lower() == "none":
                return None
            audio_or_file = AudioLoader.ensure_wav(audio_or_file)
            audio, meta = load_audio(audio_or_file, sr)
        else:
            meta = AudioMetaData(-1, -1, -1, -1, "")
            assert isinstance(audio_or_file, (tuple, list))
            meta.sample_rate, audio_np = audio_or_file
            audio_np = audio_np.reshape(audio_np.shape[0], -1).T
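            # Gradio hands over raw numpy PCM; integer samples are normalized
            # to float32 in [-1, 1] by full scale (2^15 for int16, 2^31 for
            # int32) before resampling.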
            if audio_np.dtype == np.int16:
                audio_np = (audio_np / (1 << 15)).astype(np.float32)
            elif audio_np.dtype == np.int32:
                audio_np = (audio_np / (1 << 31)).astype(np.float32)
            audio = resample(torch.from_numpy(audio_np), meta.sample_rate, sr)
        return audio, meta
class SpectrogramVisualizer:
    """Handles spectrogram visualization."""

    def __init__(self, figsize: Tuple[float, float] = (15.2, 4)):
        self.figsize = figsize
        plt.style.use('dark_background')
        self.fig_noisy, self.ax_noisy = plt.subplots(figsize=figsize)
        self.fig_noisy.set_tight_layout(True)
        self.fig_enh, self.ax_enh = plt.subplots(figsize=figsize)
        self.fig_enh.set_tight_layout(True)
    def specshow(
        self,
        spec: Union[Tensor, np.ndarray],
        ax: Optional[plt.Axes] = None,
        title: Optional[str] = None,
        xlabel: Optional[str] = None,
        ylabel: Optional[str] = None,
        sr: int = 48000,
        n_fft: Optional[int] = None,
        hop: Optional[int] = None,
        vmin: float = -100,
        vmax: float = 0,
        cmap: str = "plasma",
    ):
        """Plot a spectrogram of shape [F, T]."""
        if ax is None:
            ax = plt.gca()
        spec_np = spec.cpu().numpy() if isinstance(spec, torch.Tensor) else spec
        if n_fft is None:
            # Infer the FFT size from the bin count (F = n_fft // 2 + 1).
            n_fft = spec_np.shape[0] * 2 if spec_np.shape[0] % 2 == 0 else (spec_np.shape[0] - 1) * 2
        hop = hop or n_fft // 4
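        # Axis scaling: frame index -> seconds via hop / sr; bin index -> kHz,
        # since each of the n_fft // 2 bins spans (sr / 2) / (n_fft / 2) Hz.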
        t = np.arange(0, spec_np.shape[-1]) * hop / sr
        f = np.arange(0, spec_np.shape[0]) * (sr / 2) / (n_fft / 2) / 1000
        im = ax.pcolormesh(
            t, f, spec_np,
            rasterized=True,
            shading="auto",
            vmin=vmin,
            vmax=vmax,
            cmap=cmap,
        )
        if title:
            ax.set_title(title, fontsize=14, fontweight='bold', pad=15, color='#e0e0e0')
        if xlabel:
            ax.set_xlabel(xlabel, fontsize=11, color='#b0b0b0')
        if ylabel:
            ax.set_ylabel(ylabel, fontsize=11, color='#b0b0b0')
        ax.grid(True, alpha=0.15, linestyle='--', linewidth=0.5, color='#555')
        ax.tick_params(colors='#888', labelsize=9)
        for side in ('top', 'bottom', 'left', 'right'):
            ax.spines[side].set_color('#333')
        return im
    def create_spectrogram(
        self,
        audio: Tensor,
        figure: plt.Figure,
        ax: plt.Axes,
        sr: int = 48000,
        n_fft: int = 1024,
        hop: int = 512,
        title: Optional[str] = None,
    ) -> PILImage.Image:
        """Create a spectrogram image from an audio tensor."""
        audio = torch.as_tensor(audio)
        w = torch.hann_window(n_fft, device=audio.device)
        spec = torch.stft(audio, n_fft, hop, window=w, return_complex=True)
        spec = spec / w.pow(2).sum()  # Normalize by the window power.
        spec = spec.abs().clamp_min(1e-12).log10().mul(10)  # Magnitude in dB.
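        # Let the color scale's ceiling follow the actual peak when it
        # exceeds 0 dB; quieter signals keep the default 0 dB ceiling.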
        vmax = max(0.0, spec.max().item())
        if spec.dim() > 2:
            spec = spec.squeeze(0)
        ax.clear()
        self.specshow(
            spec,
            ax=ax,
            title=title,
            xlabel="Time [s]",
            ylabel="Frequency [kHz]",
            sr=sr,
            n_fft=n_fft,
            hop=hop,
            vmax=vmax,
        )
        figure.patch.set_facecolor('#0a0a0a')
        ax.set_facecolor('#0a0a0a')
        figure.canvas.draw()
        return PILImage.frombytes(
            "RGB",
            figure.canvas.get_width_height(),
            figure.canvas.tostring_rgb(),
        )
class FileManager:
    """Manages temporary file cleanup."""

    @staticmethod
    def cleanup_tmp(filter_list: Optional[List[str]] = None, hours_keep: int = 2, temp_dir: str = "/tmp"):
        """Clean up old temporary files, keeping anything matched by filter_list."""
        # Copy so the caller's list is not mutated; always protect the
        # bundled "p232" sample files.
        filter_list = list(filter_list) if filter_list else []
        filter_list.append("p232")
        if not os.path.exists(temp_dir):
            return
        logger.info(f"Cleaning up temporary files older than {hours_keep} hours")
        cleaned = 0
        for filepath in glob.glob(os.path.join(temp_dir, "*")):
            try:
                is_old = (time.time() - os.path.getmtime(filepath)) / 3600 > hours_keep
                filtered = any(filt in filepath for filt in filter_list if filt is not None)
                if is_old and not filtered:
                    os.remove(filepath)
                    cleaned += 1
                    logger.debug(f"Removed file {filepath}")
            except Exception as e:
                logger.warning(f"Failed to remove file {filepath}: {e}")
        if cleaned > 0:
            logger.info(f"Cleaned up {cleaned} temporary files")
# ============================================================================
# Initialize Application
# ============================================================================
app_config = AppConfig(
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
)
logger.info(f"Loading DeepFilterNet2 model on {app_config.device}")
model, df, _ = init_df(app_config.model_path, config_allow_defaults=True)
model = model.to(device=app_config.device).eval()

audio_processor = AudioProcessor(model, df, app_config)
audio_loader = AudioLoader()
visualizer = SpectrogramVisualizer()
file_manager = FileManager()
NOISES = {
    "None": None,
    "🍳 Kitchen": "samples/dkitchen.wav",
    "🛋️ Living Room": "samples/dliving.wav",
    "🌊 River": "samples/nriver.wav",
    "☕ Cafe": "samples/scafe.wav",
}
# ============================================================================
# Main Processing Function
# ============================================================================
def process_audio(
    speech_file: Optional[str],
    noise_type: str,
    snr: Union[int, str],
    mic_input: Optional[str] = None,
) -> Tuple[str, PILImage.Image, str, PILImage.Image]:
| """Main audio processing pipeline.""" | |
| try: | |
| if mic_input: | |
| speech_file = mic_input | |
| sr = app_config.sample_rate | |
| logger.info(f"Processing: file={speech_file}, noise={noise_type}, snr={snr}") | |
| if speech_file is not None: | |
| speech_file = audio_loader.ensure_wav(speech_file) | |
| sample, meta = load_audio(speech_file, sr) | |
| max_len = app_config.max_duration_seconds * sr | |
| if sample.shape[-1] > max_len: | |
| logger.warning(f"Audio too long, truncating to {app_config.max_duration_seconds}s") | |
| start = torch.randint(0, sample.shape[-1] - max_len, ()).item() | |
| sample = sample[..., start : start + max_len] | |
| else: | |
| sample, meta = load_audio("samples/p232_013_clean.wav", sr) | |
| sample = sample[..., : app_config.max_duration_seconds * sr] | |
| if sample.dim() > 1 and sample.shape[0] > 1: | |
| logger.info(f"Converting from {sample.shape[0]} channels to mono") | |
| sample = sample.mean(dim=0, keepdim=True) | |
| logger.info(f"Loaded audio with shape {sample.shape}") | |
| noise_fn = NOISES.get(noise_type) | |
| if noise_fn is not None: | |
| noise, _ = load_audio(noise_fn, sr) | |
| logger.info(f"Adding {noise_type} noise at {snr} dB SNR") | |
| _, _, sample = audio_processor.mix_at_snr(sample, noise, int(snr)) | |
| enhanced = audio_processor.enhance_audio(sample) | |
| logger.info("Audio enhancement completed") | |
        if meta.sample_rate != sr and meta.sample_rate > 0:
            enhanced = resample(enhanced, sr, meta.sample_rate)
            sample = resample(sample, sr, meta.sample_rate)
            sr = meta.sample_rate
        noisy_wav = tempfile.NamedTemporaryFile(suffix="_noisy.wav", delete=False).name
        save_audio(noisy_wav, sample, sr)
        enhanced_wav = tempfile.NamedTemporaryFile(suffix="_enhanced.wav", delete=False).name
        save_audio(enhanced_wav, enhanced, sr)
        logger.info(f"Saved outputs: {noisy_wav}, {enhanced_wav}")
        noisy_spec = visualizer.create_spectrogram(
            sample,
            visualizer.fig_noisy,
            visualizer.ax_noisy,
            sr=sr,
            title="Input Audio Spectrogram",
        )
        enhanced_spec = visualizer.create_spectrogram(
            enhanced,
            visualizer.fig_enh,
            visualizer.ax_enh,
            sr=sr,
            title="Enhanced Audio Spectrogram",
        )
        filter_files = [speech_file, noisy_wav, enhanced_wav]
        if mic_input:
            filter_files.append(mic_input)
        file_manager.cleanup_tmp(filter_files, app_config.cleanup_hours)
        return noisy_wav, noisy_spec, enhanced_wav, enhanced_spec
    except Exception as e:
        # loguru does not take the stdlib exc_info kwarg; logger.exception
        # records the traceback.
        logger.exception(f"Error processing audio: {e}")
        raise gr.Error(f"Processing failed: {str(e)}")
def toggle_input_mode(choice: str):
    """Toggle between microphone and file upload."""
    if choice == "mic":
        return gr.update(visible=True, value=None), gr.update(visible=False, value=None)
    else:
        return gr.update(visible=False, value=None), gr.update(visible=True, value=None)
# ============================================================================
# Custom CSS - Dark Theme
# ============================================================================
custom_css = """
/* Global Dark Theme */
.gradio-container {
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 100%) !important;
}
body {
    background: #0a0a0a !important;
}
/* Hero Section */
#hero-section {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 50px 30px;
    border-radius: 20px;
    margin-bottom: 40px;
    box-shadow: 0 15px 40px rgba(102, 126, 234, 0.6);
    text-align: center;
    border: 1px solid rgba(255, 255, 255, 0.1);
}
#hero-section h1 {
    color: #ffffff;
    font-size: 3.2em;
    font-weight: 800;
    margin: 0 0 15px 0;
    text-shadow: 2px 2px 8px rgba(0,0,0,0.4);
    letter-spacing: -1px;
}
#hero-section p {
    color: rgba(255,255,255,0.95);
    font-size: 1.25em;
    margin: 10px auto;
    max-width: 800px;
    line-height: 1.6;
    font-weight: 300;
}
/* Feature Cards - Dark */
.feature-card {
    background: linear-gradient(135deg, #1e1e2e 0%, #2d2d44 100%);
    padding: 25px;
    border-radius: 15px;
    box-shadow: 0 8px 25px rgba(0,0,0,0.4);
    margin-bottom: 20px;
    border: 1px solid rgba(102, 126, 234, 0.3);
    transition: all 0.3s ease;
}
.feature-card:hover {
    transform: translateY(-3px);
    box-shadow: 0 12px 35px rgba(102, 126, 234, 0.5);
    border-color: rgba(102, 126, 234, 0.6);
}
/* Input Controls Section */
.input-controls {
    background: linear-gradient(135deg, #1a1a2e 0%, #252545 100%);
    padding: 30px;
    border-radius: 15px;
    box-shadow: 0 5px 20px rgba(0,0,0,0.5);
    border: 1px solid rgba(102, 126, 234, 0.2);
}
/* Output Section */
.output-section {
    background: linear-gradient(135deg, #2d1b3d 0%, #3d2952 100%);
    padding: 30px;
    border-radius: 15px;
    box-shadow: 0 5px 20px rgba(0,0,0,0.5);
    border: 1px solid rgba(118, 75, 162, 0.3);
}
/* Section Headers */
.section-header {
    color: #a78bfa;
    font-size: 1.8em;
    font-weight: 700;
    margin: 30px 0 20px 0;
    text-align: center;
    text-shadow: 0 0 20px rgba(167, 139, 250, 0.5);
}
/* Process Button */
.process-button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    font-size: 1.4em !important;
    font-weight: 700 !important;
    padding: 20px 50px !important;
    border-radius: 50px !important;
    box-shadow: 0 10px 40px rgba(102, 126, 234, 0.7) !important;
    transition: all 0.3s ease !important;
    color: #ffffff !important;
    text-transform: uppercase;
    letter-spacing: 1px;
}
.process-button:hover {
    transform: translateY(-3px) scale(1.02) !important;
    box-shadow: 0 15px 50px rgba(102, 126, 234, 0.9) !important;
}
/* Audio Components */
.audio-wrapper {
    background: linear-gradient(135deg, #1e1e2e 0%, #2a2a40 100%);
    padding: 20px;
    border-radius: 12px;
    box-shadow: 0 3px 12px rgba(0,0,0,0.6);
    margin: 15px 0;
    border: 1px solid rgba(102, 126, 234, 0.2);
}
/* Tabs */
.tab-nav button {
    font-weight: 600 !important;
    font-size: 1.1em !important;
    padding: 12px 24px !important;
    border-radius: 10px 10px 0 0 !important;
    background: #1a1a2e !important;
    color: #a0a0b0 !important;
    border: 1px solid rgba(102, 126, 234, 0.2) !important;
}
.tab-nav button[aria-selected="true"] {
    background: linear-gradient(135deg, #667eea, #764ba2) !important;
    color: #ffffff !important;
}
/* Info Box */
.info-box {
    background: linear-gradient(135deg, #1e1e3f 0%, #2d2d52 100%);
    padding: 25px;
    border-radius: 15px;
    margin: 25px 0;
    border-left: 5px solid #667eea;
    box-shadow: 0 4px 20px rgba(0,0,0,0.5);
}
.info-box h3 {
    color: #a78bfa;
    font-size: 1.4em;
    font-weight: 700;
    margin-top: 0;
}
.info-box p, .info-box ul, .info-box li {
    color: #c0c0d0;
}
.info-box ul {
    margin: 10px 0;
    padding-left: 25px;
}
.info-box li {
    margin: 8px 0;
    line-height: 1.6;
}
/* Examples Section */
.examples-section {
    background: linear-gradient(135deg, #2a2a3e 0%, #35354f 100%);
    padding: 25px;
    border-radius: 15px;
    margin-top: 30px;
    box-shadow: 0 4px 20px rgba(0,0,0,0.5);
    border: 1px solid rgba(102, 126, 234, 0.2);
}
/* Footer */
#footer {
    text-align: center;
    padding: 30px 20px;
    margin-top: 50px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 15px;
    color: #ffffff;
    box-shadow: 0 10px 30px rgba(102, 126, 234, 0.5);
}
#footer h3 {
    margin: 0 0 10px 0;
    font-size: 1.5em;
    font-weight: 700;
}
#footer p {
    margin: 5px 0;
    opacity: 0.95;
}
/* Radio Buttons */
.radio-group label {
    padding: 12px 20px !important;
    border-radius: 10px !important;
    font-weight: 600 !important;
    transition: all 0.3s ease !important;
    background: #1a1a2e !important;
    color: #b0b0c0 !important;
    border: 1px solid rgba(102, 126, 234, 0.3) !important;
}
.radio-group label:hover {
    background: #252545 !important;
    border-color: rgba(102, 126, 234, 0.6) !important;
}
/* Dropdowns */
.dropdown select {
    border-radius: 10px !important;
    padding: 12px !important;
    font-size: 1.05em !important;
    background: #1a1a2e !important;
    color: #c0c0d0 !important;
    border: 2px solid rgba(102, 126, 234, 0.3) !important;
    transition: all 0.3s ease !important;
}
.dropdown select:focus {
    border-color: #667eea !important;
    box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.3) !important;
}
/* Labels and Text */
label, .label {
    color: #b0b0c0 !important;
}
/* Markdown Text */
.markdown-text, .prose {
    color: #c0c0d0 !important;
}
/* Input Fields */
input, textarea {
    background: #1a1a2e !important;
    color: #c0c0d0 !important;
    border: 1px solid rgba(102, 126, 234, 0.3) !important;
}
/* Scrollbars */
::-webkit-scrollbar {
    width: 10px;
    background: #1a1a2e;
}
::-webkit-scrollbar-thumb {
    background: linear-gradient(135deg, #667eea, #764ba2);
    border-radius: 5px;
}
::-webkit-scrollbar-thumb:hover {
    background: linear-gradient(135deg, #764ba2, #667eea);
}
"""
# ============================================================================
# Gradio Interface
# ============================================================================
with gr.Blocks(css=custom_css, theme=gr.themes.Base()) as demo:
    # Hero Section
    gr.HTML("""
        <div id="hero-section">
            <h1>🎵 DeepFilterNet2 Audio Enhancement</h1>
            <p>Transform noisy audio into crystal-clear sound using cutting-edge AI technology</p>
            <p style="font-size: 0.95em; margin-top: 15px;">
                ✨ Real-time Processing | 🎯 State-of-the-Art Quality | 🚀 Lightning Fast
            </p>
        </div>
    """)
    # Quick Start Guide
    with gr.Row():
        gr.Markdown("""
        <div class="info-box">
            <h3>🚀 Quick Start Guide</h3>
            <ul>
                <li><strong>Step 1:</strong> Upload an audio file or record using your microphone</li>
                <li><strong>Step 2:</strong> Optionally add synthetic noise to test the denoiser</li>
                <li><strong>Step 3:</strong> Adjust SNR settings if needed</li>
                <li><strong>Step 4:</strong> Click the "Denoise Audio" button</li>
                <li><strong>Step 5:</strong> Compare results with interactive spectrograms</li>
            </ul>
        </div>
        """)
    # Main Interface
    with gr.Row():
        # Left Column - Input Controls
        with gr.Column(scale=1):
            gr.HTML('<h2 class="section-header">🎤 Audio Input</h2>')
            with gr.Group(elem_classes="input-controls"):
                input_mode = gr.Radio(
                    ["file", "mic"],
                    value="file",
                    label="🎛️ Input Method",
                    info="Choose your preferred input source",
                    elem_classes="radio-group",
                )
                audio_file = gr.Audio(
                    type="filepath",
                    label="📁 Upload Audio File",
                    visible=True,
                    elem_classes="audio-wrapper",
                )
                mic_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="🎤 Record Audio",
                    visible=False,
                    elem_classes="audio-wrapper",
                )
            gr.HTML('<h2 class="section-header">⚙️ Enhancement Settings</h2>')
            with gr.Group(elem_classes="feature-card"):
                noise_type = gr.Dropdown(
                    label="🔊 Background Noise Type",
                    choices=list(NOISES.keys()),
                    value="None",
                    info="Add synthetic noise for testing",
                    elem_classes="dropdown",
                )
                snr = gr.Dropdown(
                    label="📊 Signal-to-Noise Ratio (dB)",
                    choices=["-5", "0", "10", "20"],
                    value="10",
                    info="Higher = cleaner signal",
                    elem_classes="dropdown",
                )
            process_btn = gr.Button(
                "🚀 Denoise Audio",
                elem_classes="process-button",
                size="lg",
            )
        # Right Column - Results
        with gr.Column(scale=2):
            gr.HTML('<h2 class="section-header">📊 Results &amp; Comparison</h2>')
            with gr.Tabs():
                with gr.Tab("🔴 Input Audio", elem_classes="output-section"):
                    noisy_audio = gr.Audio(
                        type="filepath",
                        label="Original/Noisy Audio",
                        elem_classes="audio-wrapper",
                    )
                    noisy_spec = gr.Image(
                        label="Input Spectrogram",
                        elem_classes="audio-wrapper",
                    )
                with gr.Tab("🟢 Enhanced Audio", elem_classes="output-section"):
                    enhanced_audio = gr.Audio(
                        type="filepath",
                        label="Enhanced Audio",
                        elem_classes="audio-wrapper",
                    )
                    enhanced_spec = gr.Image(
                        label="Enhanced Spectrogram",
                        elem_classes="audio-wrapper",
                    )
    # Examples Section
    gr.HTML('<h2 class="section-header">🎯 Try These Examples</h2>')
    with gr.Group(elem_classes="examples-section"):
        gr.Examples(
            examples=[
                ["./samples/p232_013_clean.wav", "🍳 Kitchen", "10"],
                ["./samples/p232_013_clean.wav", "☕ Cafe", "10"],
                ["./samples/p232_019_clean.wav", "☕ Cafe", "10"],
                ["./samples/p232_019_clean.wav", "🌊 River", "10"],
            ],
            inputs=[audio_file, noise_type, snr],
            outputs=[noisy_audio, noisy_spec, enhanced_audio, enhanced_spec],
            fn=process_audio,
            cache_examples=True,
            label="Click any example to try it instantly",
        )
    # Technical Information
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            <div class="info-box">
                <h3>💡 How It Works</h3>
                <p><strong>DeepFilterNet2</strong> uses advanced deep learning to identify and remove unwanted background noise while preserving speech clarity. The model analyzes spectral patterns to distinguish between signal and noise components.</p>
            </div>
            """)
        with gr.Column():
            gr.Markdown("""
            <div class="info-box">
                <h3>📋 Technical Specifications</h3>
                <ul>
                    <li><strong>Model:</strong> DeepFilterNet2 (State-of-the-art)</li>
                    <li><strong>Sample Rate:</strong> 48 kHz</li>
                    <li><strong>Max Duration:</strong> 1 hour</li>
                    <li><strong>Formats:</strong> WAV, MP3, M4A, OGG, FLAC, AAC</li>
                    <li><strong>Processing:</strong> Real-time capable</li>
                </ul>
            </div>
            """)
    # Footer
    gr.HTML("""
    <div id="footer">
        <h3>🎵 Powered by DeepFilterNet2</h3>
        <p>Advanced AI-driven audio enhancement technology</p>
        <p><em>Built with Gradio • Optimized for Performance</em></p>
    </div>
    """)
    # Event Handlers
    process_btn.click(
        fn=process_audio,
        inputs=[audio_file, noise_type, snr, mic_input],
        outputs=[noisy_audio, noisy_spec, enhanced_audio, enhanced_spec],
        api_name="denoise",
    )
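    # Hedged usage sketch: api_name="denoise" exposes this handler over the
    # Gradio API. With the separate gradio_client package (an assumption, not
    # a dependency declared in this file), a remote call could look like:
    #   from gradio_client import Client
    #   client = Client("http://localhost:7860/")
    #   client.predict("speech.wav", "None", "10", None, api_name="/denoise")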
    input_mode.change(
        fn=toggle_input_mode,
        inputs=input_mode,
        outputs=[mic_input, audio_file],
    )
# Initial cleanup
file_manager.cleanup_tmp()

# Launch application
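# queue() routes requests through Gradio's job queue so concurrent users are
# served in turn rather than contending for the model; queue defaults are
# left as-is here.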
if __name__ == "__main__":
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )