humairawan committed
Commit cf1b40e · verified · 1 Parent(s): fb24e3e

Update app.py

Files changed (1):
1. app.py +661 -81
app.py CHANGED
@@ -29,124 +29,281 @@ from df.io import resample
 class AppConfig:
     """Application configuration"""
     device: torch.device
-    model_sample_rate: int = 48000
     max_duration_seconds: int = 3600
     cleanup_hours: int = 2
     temp_dir: str = "/tmp"
     model_path: str = "./DeepFilterNet2"
     fade_duration: float = 0.15
-
-# ============================================================================
-# Audio Processing Classes
-# ============================================================================
 
 class AudioProcessor:
     def __init__(self, model, df, config: AppConfig):
         self.model = model
         self.df = df
         self.config = config
-
     def mix_at_snr(self, clean: Tensor, noise: Tensor, snr: float, eps: float = 1e-10) -> Tuple[Tensor, Tensor, Tensor]:
         clean = torch.as_tensor(clean).mean(0, keepdim=True)
         noise = torch.as_tensor(noise).mean(0, keepdim=True)
         if noise.shape[1] < clean.shape[1]:
             repeats = int(math.ceil(clean.shape[1] / noise.shape[1]))
             noise = noise.repeat((1, repeats))
         max_start = int(noise.shape[1] - clean.shape[1])
         start = torch.randint(0, max_start, ()).item() if max_start > 0 else 0
-        noise = noise[:, start:start + clean.shape[1]]
         E_speech = torch.mean(clean.pow(2)) + eps
         E_noise = torch.mean(noise.pow(2)) + eps
         K = torch.sqrt((E_noise / E_speech) * 10 ** (snr / 10) + eps)
         noise = noise / K
         mixture = clean + noise
         max_m = mixture.abs().max()
         if max_m > 1:
             clean, noise, mixture = clean / max_m, noise / max_m, mixture / max_m
         return clean, noise, mixture
-
     def enhance_audio(self, audio: Tensor) -> Tensor:
         with torch.no_grad():
             enhanced = enhance(self.model, self.df, audio)
-        sr = self.config.model_sample_rate
         fade_samples = int(sr * self.config.fade_duration)
         lim = torch.linspace(0.0, 1.0, fade_samples).unsqueeze(0)
-        lim = torch.cat((lim, torch.ones(1, enhanced.shape[1] - fade_samples)), dim=1)
-        return enhanced * lim
 
 class AudioLoader:
     @staticmethod
     def ensure_wav(filepath: str) -> str:
         if not filepath:
             return filepath
-        ext = Path(filepath).suffix.lower()
-        if ext in [".mp3", ".m4a", ".ogg", ".flac", ".aac"]:
-            wav_path = str(Path(filepath).with_suffix(".wav"))
-            subprocess.run(["ffmpeg", "-y", "-i", filepath, "-acodec", "pcm_s16le", wav_path],
-                           check=True, capture_output=True)
-            return wav_path
         return filepath
-
     @staticmethod
-    def load_and_resample(audio_or_file: Union[str, Tuple[int, np.ndarray]], target_sr: int) -> Tensor:
         if isinstance(audio_or_file, str):
             audio_or_file = AudioLoader.ensure_wav(audio_or_file)
-            audio, meta = load_audio(audio_or_file, target_sr)
         else:
-            sr, audio_np = audio_or_file
             audio_np = audio_np.reshape(audio_np.shape[0], -1).T
             if audio_np.dtype == np.int16:
                 audio_np = (audio_np / (1 << 15)).astype(np.float32)
             elif audio_np.dtype == np.int32:
                 audio_np = (audio_np / (1 << 31)).astype(np.float32)
-            audio = torch.from_numpy(audio_np)
-            if sr != target_sr:
-                audio = resample(audio, target_sr, sr)
-        return audio
 
 class SpectrogramVisualizer:
-    def __init__(self, figsize=(15,4)):
         self.figsize = figsize
         self.fig_noisy, self.ax_noisy = plt.subplots(figsize=figsize)
         self.fig_enh, self.ax_enh = plt.subplots(figsize=figsize)
-
-    def create_spectrogram(self, audio: Tensor, figure: plt.Figure, ax: plt.Axes,
-                           sr: int, n_fft: int = 1024, hop: int = 512, title: str = None) -> PILImage.Image:
         audio = torch.as_tensor(audio)
         w = torch.hann_window(n_fft, device=audio.device)
         spec = torch.stft(audio, n_fft, hop, window=w, return_complex=False)
         spec = spec.div_(w.pow(2).sum())
         spec = torch.view_as_complex(spec).abs().clamp_min(1e-12).log10().mul(10)
         if spec.dim() > 2:
             spec = spec.squeeze(0)
         ax.clear()
-        t = np.arange(spec.shape[-1]) * hop / sr
-        f = np.arange(spec.shape[0]) * sr // 2 / (n_fft // 2) / 1000
-        ax.pcolormesh(t, f, spec.cpu().numpy(), shading="auto", cmap="inferno", vmin=-100, vmax=0)
-        if title:
-            ax.set_title(title)
         figure.canvas.draw()
-        return PILImage.frombytes("RGB", figure.canvas.get_width_height(), figure.canvas.tostring_rgb())
 
 # ============================================================================
-# Initialization
 # ============================================================================
 
-app_config = AppConfig(device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
 model, df, _ = init_df(app_config.model_path, config_allow_defaults=True)
 model = model.to(device=app_config.device).eval()
 audio_processor = AudioProcessor(model, df, app_config)
 audio_loader = AudioLoader()
 visualizer = SpectrogramVisualizer()
 
 NOISES = {
     "None": None,
-    "Kitchen": "samples/dkitchen.wav",
-    "Living Room": "samples/dliving.wav",
-    "River": "samples/nriver.wav",
-    "Cafe": "samples/scafe.wav",
 }
 
 # ============================================================================
 # Main Processing Function
 # ============================================================================
@@ -155,61 +312,484 @@ def process_audio(
     speech_file: Optional[str],
     noise_type: str,
     snr: int,
-    target_rate: int = 22050,
     mic_input: Optional[str] = None,
 ) -> Tuple[str, PILImage.Image, str, PILImage.Image]:
 
-    if mic_input:
-        speech_file = mic_input
-    model_sr = app_config.model_sample_rate
-    audio = audio_loader.load_and_resample(speech_file, model_sr)
 
-    # Add noise if requested
-    noise_fn = NOISES.get(noise_type)
-    if noise_fn:
-        noise_audio = audio_loader.load_and_resample(noise_fn, model_sr)
-        _, _, audio = audio_processor.mix_at_snr(audio, noise_audio, snr)
 
-    enhanced = audio_processor.enhance_audio(audio)
 
-    # Downsample back to target rate if needed
-    if target_rate != model_sr:
-        enhanced = resample(enhanced, target_rate, model_sr)
-        audio = resample(audio, target_rate, model_sr)
 
-    noisy_wav = tempfile.NamedTemporaryFile(suffix="_noisy.wav", delete=False).name
-    enhanced_wav = tempfile.NamedTemporaryFile(suffix="_enhanced.wav", delete=False).name
-    save_audio(noisy_wav, audio, target_rate)
-    save_audio(enhanced_wav, enhanced, target_rate)
 
-    noisy_spec = visualizer.create_spectrogram(audio, visualizer.fig_noisy, visualizer.ax_noisy,
-                                               sr=target_rate, title="Noisy Audio")
-    enhanced_spec = visualizer.create_spectrogram(enhanced, visualizer.fig_enh, visualizer.ax_enh,
-                                                  sr=target_rate, title="Enhanced Audio")
-    return noisy_wav, noisy_spec, enhanced_wav, enhanced_spec
 
 # ============================================================================
 # Gradio Interface
 # ============================================================================
 
-with gr.Blocks() as demo:
-    gr.Markdown("# 🎵 DeepFilterNet2 Denoiser with Resampling Support")
-    audio_file = gr.Audio(type="filepath", label="Upload Audio")
-    mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
-    noise_type = gr.Dropdown(label="Noise Type", choices=list(NOISES.keys()), value="None")
-    snr = gr.Slider(label="SNR (dB)", minimum=-10, maximum=30, step=1, value=10)
-    target_rate = gr.Dropdown(label="Output Sample Rate", choices=[16000, 22050, 44100, 48000], value=22050)
-    process_btn = gr.Button("🚀 Enhance Audio")
-    noisy_audio = gr.Audio(type="filepath")
-    noisy_spec = gr.Image()
-    enhanced_audio = gr.Audio(type="filepath")
-    enhanced_spec = gr.Image()
-
     process_btn.click(
         fn=process_audio,
-        inputs=[audio_file, noise_type, snr, target_rate, mic_input],
-        outputs=[noisy_audio, noisy_spec, enhanced_audio, enhanced_spec]
     )
 
 if __name__ == "__main__":
-    demo.launch()
The updated version of the same regions, as committed:

 class AppConfig:
     """Application configuration"""
     device: torch.device
+    sample_rate: int = 48000
     max_duration_seconds: int = 3600
     cleanup_hours: int = 2
     temp_dir: str = "/tmp"
     model_path: str = "./DeepFilterNet2"
     fade_duration: float = 0.15
+
 
 class AudioProcessor:
+    """Handles audio processing operations"""
+
     def __init__(self, model, df, config: AppConfig):
         self.model = model
         self.df = df
         self.config = config
+
     def mix_at_snr(self, clean: Tensor, noise: Tensor, snr: float, eps: float = 1e-10) -> Tuple[Tensor, Tensor, Tensor]:
+        """Mix clean and noise signal at a given SNR with improved error handling."""
         clean = torch.as_tensor(clean).mean(0, keepdim=True)
         noise = torch.as_tensor(noise).mean(0, keepdim=True)
+
         if noise.shape[1] < clean.shape[1]:
             repeats = int(math.ceil(clean.shape[1] / noise.shape[1]))
             noise = noise.repeat((1, repeats))
+
         max_start = int(noise.shape[1] - clean.shape[1])
         start = torch.randint(0, max_start, ()).item() if max_start > 0 else 0
+        noise = noise[:, start : start + clean.shape[1]]
+
         E_speech = torch.mean(clean.pow(2)) + eps
         E_noise = torch.mean(noise.pow(2)) + eps
         K = torch.sqrt((E_noise / E_speech) * 10 ** (snr / 10) + eps)
         noise = noise / K
         mixture = clean + noise
+
+        assert torch.isfinite(mixture).all(), "Non-finite values detected in mixture"
         max_m = mixture.abs().max()
         if max_m > 1:
+            logger.warning(f"Clipping detected during mixing. Reducing gain by {1/max_m:.3f}")
             clean, noise, mixture = clean / max_m, noise / max_m, mixture / max_m
+
         return clean, noise, mixture
+
     def enhance_audio(self, audio: Tensor) -> Tensor:
+        """Enhance audio using the DeepFilterNet model."""
+        logger.info(f"Enhancing audio with shape {audio.shape}")
         with torch.no_grad():
             enhanced = enhance(self.model, self.df, audio)
+
+        sr = self.config.sample_rate
         fade_samples = int(sr * self.config.fade_duration)
         lim = torch.linspace(0.0, 1.0, fade_samples).unsqueeze(0)
+        lim = torch.cat((lim, torch.ones(1, enhanced.shape[1] - lim.shape[1])), dim=1)
+        enhanced = enhanced * lim
+
+        return enhanced
+
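The scaling factor in mix_at_snr follows from the SNR definition 10·log10(E_speech/E_noise): dividing the noise by K = sqrt((E_noise/E_speech)·10^(snr/10)) makes the measured SNR of the mixture equal the requested value. A quick standalone sanity check (synthetic tensors, not part of the commit):

```python
# Sanity check of the SNR scaling used in mix_at_snr above.
import torch

def measured_snr_db(clean: torch.Tensor, noise: torch.Tensor) -> float:
    # SNR in dB = 10 * log10(E_speech / E_noise)
    return 10 * torch.log10(clean.pow(2).mean() / noise.pow(2).mean()).item()

clean = torch.randn(1, 48000)        # 1 s of fake "speech" at 48 kHz
noise = torch.randn(1, 48000) * 0.5  # fake noise at an arbitrary level
target_snr = 10.0

K = torch.sqrt((noise.pow(2).mean() / clean.pow(2).mean()) * 10 ** (target_snr / 10))
scaled_noise = noise / K

print(measured_snr_db(clean, scaled_noise))  # ~10.0
```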
 class AudioLoader:
+    """Handles audio loading from various sources"""
+
     @staticmethod
     def ensure_wav(filepath: str) -> str:
+        """Convert audio files to WAV using ffmpeg if needed."""
         if not filepath:
             return filepath
+
+        file_ext = Path(filepath).suffix.lower()
+        if file_ext in ['.mp3', '.m4a', '.ogg', '.flac', '.aac']:
+            wav_path = str(Path(filepath).with_suffix('.wav'))
+            try:
+                subprocess.run(
+                    ["ffmpeg", "-y", "-i", filepath, "-acodec", "pcm_s16le", wav_path],
+                    check=True,
+                    capture_output=True
+                )
+                logger.info(f"Converted {file_ext} to WAV: {wav_path}")
+                return wav_path
+            except subprocess.CalledProcessError as e:
+                logger.error(f"FFmpeg conversion failed: {e.stderr}")
+                raise
         return filepath
+
     @staticmethod
+    def load_audio_gradio(
+        audio_or_file: Union[None, str, Tuple[int, np.ndarray]],
+        sr: int
+    ) -> Optional[Tuple[Tensor, AudioMetaData]]:
+        """Load audio from Gradio input."""
+        if audio_or_file is None:
+            return None
+
         if isinstance(audio_or_file, str):
+            if audio_or_file.lower() == "none":
+                return None
             audio_or_file = AudioLoader.ensure_wav(audio_or_file)
+            audio, meta = load_audio(audio_or_file, sr)
         else:
+            meta = AudioMetaData(-1, -1, -1, -1, "")
+            assert isinstance(audio_or_file, (tuple, list))
+            meta.sample_rate, audio_np = audio_or_file
+
             audio_np = audio_np.reshape(audio_np.shape[0], -1).T
+
             if audio_np.dtype == np.int16:
                 audio_np = (audio_np / (1 << 15)).astype(np.float32)
             elif audio_np.dtype == np.int32:
                 audio_np = (audio_np / (1 << 31)).astype(np.float32)
+
+            audio = resample(torch.from_numpy(audio_np), meta.sample_rate, sr)
+
+        return audio, meta
+
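The numpy branch handles Gradio's raw-microphone path, which delivers integer PCM; dividing by 2^15 (int16) or 2^31 (int32) maps the samples into [-1.0, 1.0). A small illustration with made-up sample values (not part of the commit):

```python
# Integer-PCM-to-float conversion as used by load_audio_gradio above.
import numpy as np

pcm16 = np.array([0, 16384, -32768, 32767], dtype=np.int16)
floats = (pcm16 / (1 << 15)).astype(np.float32)
print(floats)  # approximately [ 0.   0.5  -1.   0.99997 ]
```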
 class SpectrogramVisualizer:
+    """Handles spectrogram visualization"""
+
+    def __init__(self, figsize: Tuple[float, float] = (15.2, 4)):
         self.figsize = figsize
+        plt.style.use('dark_background')
         self.fig_noisy, self.ax_noisy = plt.subplots(figsize=figsize)
+        self.fig_noisy.set_tight_layout(True)
         self.fig_enh, self.ax_enh = plt.subplots(figsize=figsize)
+        self.fig_enh.set_tight_layout(True)
+
+    def specshow(
+        self,
+        spec: Union[Tensor, np.ndarray],
+        ax: Optional[plt.Axes] = None,
+        title: Optional[str] = None,
+        xlabel: Optional[str] = None,
+        ylabel: Optional[str] = None,
+        sr: int = 48000,
+        n_fft: Optional[int] = None,
+        hop: Optional[int] = None,
+        vmin: float = -100,
+        vmax: float = 0,
+        cmap: str = "viridis",
+    ):
+        """Plot a spectrogram of shape [F, T]"""
+        spec_np = spec.cpu().numpy() if isinstance(spec, torch.Tensor) else spec
+
+        if n_fft is None:
+            n_fft = spec.shape[0] * 2 if spec.shape[0] % 2 == 0 else (spec.shape[0] - 1) * 2
+        hop = hop or n_fft // 4
+
+        t = np.arange(0, spec_np.shape[-1]) * hop / sr
+        f = np.arange(0, spec_np.shape[0]) * sr // 2 / (n_fft // 2) / 1000
+
+        im = ax.pcolormesh(
+            t, f, spec_np,
+            rasterized=True,
+            shading="auto",
+            vmin=vmin,
+            vmax=vmax,
+            cmap=cmap
+        )
+
+        if title:
+            ax.set_title(title, fontsize=14, fontweight='bold', pad=15, color='white')
+        if xlabel:
+            ax.set_xlabel(xlabel, fontsize=11, color='white')
+        if ylabel:
+            ax.set_ylabel(ylabel, fontsize=11, color='white')
+
+        ax.grid(True, alpha=0.15, linestyle='--', linewidth=0.5)
+        ax.tick_params(colors='white', labelsize=9)
+
+        return im
+
+    def create_spectrogram(
+        self,
+        audio: Tensor,
+        figure: plt.Figure,
+        ax: plt.Axes,
+        sr: int = 48000,
+        n_fft: int = 1024,
+        hop: int = 512,
+        title: Optional[str] = None,
+    ) -> PILImage.Image:
+        """Create spectrogram image from audio tensor"""
         audio = torch.as_tensor(audio)
+
         w = torch.hann_window(n_fft, device=audio.device)
         spec = torch.stft(audio, n_fft, hop, window=w, return_complex=False)
         spec = spec.div_(w.pow(2).sum())
         spec = torch.view_as_complex(spec).abs().clamp_min(1e-12).log10().mul(10)
+
+        vmax = max(0.0, spec.max().item())
+
         if spec.dim() > 2:
             spec = spec.squeeze(0)
+
         ax.clear()
+        self.specshow(
+            spec,
+            ax=ax,
+            title=title,
+            xlabel="Time [s]",
+            ylabel="Frequency [kHz]",
+            sr=sr,
+            n_fft=n_fft,
+            hop=hop,
+            vmax=vmax,
+        )
+
+        figure.patch.set_facecolor('#0f0f0f')
+        ax.set_facecolor('#0f0f0f')
         figure.canvas.draw()
+
+        return PILImage.frombytes(
+            "RGB",
+            figure.canvas.get_width_height(),
+            figure.canvas.tostring_rgb()
+        )
+
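The STFT-to-decibel pipeline in create_spectrogram can be exercised on its own; a minimal sketch mirroring the defaults above (illustrative, not part of the commit):

```python
# Standalone version of the STFT -> dB magnitude pipeline used above.
import torch

def db_spectrogram(audio: torch.Tensor, n_fft: int = 1024, hop: int = 512) -> torch.Tensor:
    w = torch.hann_window(n_fft)
    spec = torch.stft(audio, n_fft, hop, window=w, return_complex=True)
    spec = spec / w.pow(2).sum()                         # window energy normalization
    return spec.abs().clamp_min(1e-12).log10().mul(10)  # magnitude in dB

audio = torch.randn(1, 48000)  # 1 s of noise at 48 kHz
spec = db_spectrogram(audio)
print(spec.shape)              # torch.Size([1, 513, 94]) -> [channel, F, T]
```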
+class FileManager:
+    """Manages temporary file cleanup"""
+
+    @staticmethod
+    def cleanup_tmp(filter_list: List[str] = None, hours_keep: int = 2, temp_dir: str = "/tmp"):
+        """Clean up old temporary files."""
+        if filter_list is None:
+            filter_list = []
+        filter_list.append("p232")
+
+        if not os.path.exists(temp_dir):
+            return
+
+        logger.info(f"Cleaning up temporary files older than {hours_keep} hours")
+        cleaned = 0
+
+        for filepath in glob.glob(os.path.join(temp_dir, "*")):
+            try:
+                is_old = (time.time() - os.path.getmtime(filepath)) / 3600 > hours_keep
+                filtered = any(filt in filepath for filt in filter_list if filt is not None)
+
+                if is_old and not filtered:
+                    os.remove(filepath)
+                    cleaned += 1
+                    logger.debug(f"Removed file {filepath}")
+            except Exception as e:
+                logger.warning(f"Failed to remove file {filepath}: {e}")
+
+        if cleaned > 0:
+            logger.info(f"Cleaned up {cleaned} temporary files")
+
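The age test cleanup_tmp applies to each file, shown in isolation (hypothetical path, not part of the commit): a file is removed once its mtime is more than hours_keep hours in the past and no keep-filter substring matches its path.

```python
# The per-file staleness check used by cleanup_tmp above, in isolation.
import os, time

def is_stale(filepath: str, hours_keep: float = 2.0) -> bool:
    age_hours = (time.time() - os.path.getmtime(filepath)) / 3600
    return age_hours > hours_keep

# e.g. is_stale("/tmp/abc123_noisy.wav")  # True once the file is >2 h old
```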
 
 # ============================================================================
+# Initialize Application
 # ============================================================================
 
+app_config = AppConfig(
+    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
+)
+
+logger.info(f"Loading DeepFilterNet2 model on {app_config.device}")
 model, df, _ = init_df(app_config.model_path, config_allow_defaults=True)
 model = model.to(device=app_config.device).eval()
+
 audio_processor = AudioProcessor(model, df, app_config)
 audio_loader = AudioLoader()
 visualizer = SpectrogramVisualizer()
+file_manager = FileManager()
 
 NOISES = {
     "None": None,
+    "🍳 Kitchen": "samples/dkitchen.wav",
+    "🛋️ Living Room": "samples/dliving.wav",
+    "🌊 River": "samples/nriver.wav",
+    "☕ Cafe": "samples/scafe.wav",
 }
 
+
 # ============================================================================
 # Main Processing Function
 # ============================================================================
     speech_file: Optional[str],
     noise_type: str,
     snr: int,
     mic_input: Optional[str] = None,
 ) -> Tuple[str, PILImage.Image, str, PILImage.Image]:
+    """Main audio processing pipeline."""
+    try:
+        if mic_input:
+            speech_file = mic_input
+
+        sr = app_config.sample_rate
+        logger.info(f"Processing: file={speech_file}, noise={noise_type}, snr={snr}")
+
+        if speech_file is not None:
+            speech_file = audio_loader.ensure_wav(speech_file)
+            sample, meta = load_audio(speech_file, sr)
+
+            max_len = app_config.max_duration_seconds * sr
+            if sample.shape[-1] > max_len:
+                logger.warning(f"Audio too long, truncating to {app_config.max_duration_seconds}s")
+                start = torch.randint(0, sample.shape[-1] - max_len, ()).item()
+                sample = sample[..., start : start + max_len]
+        else:
+            sample, meta = load_audio("samples/p232_013_clean.wav", sr)
+            sample = sample[..., : app_config.max_duration_seconds * sr]
+
+        if sample.dim() > 1 and sample.shape[0] > 1:
+            logger.info(f"Converting from {sample.shape[0]} channels to mono")
+            sample = sample.mean(dim=0, keepdim=True)
+
+        logger.info(f"Loaded audio with shape {sample.shape}")
+
+        noise_fn = NOISES.get(noise_type)
+        if noise_fn is not None:
+            noise, _ = load_audio(noise_fn, sr)
+            logger.info(f"Adding {noise_type} noise at {snr} dB SNR")
+            _, _, sample = audio_processor.mix_at_snr(sample, noise, int(snr))
+
+        enhanced = audio_processor.enhance_audio(sample)
+        logger.info("Audio enhancement completed")
+
+        if meta.sample_rate != sr and meta.sample_rate > 0:
+            enhanced = resample(enhanced, sr, meta.sample_rate)
+            sample = resample(sample, sr, meta.sample_rate)
+            sr = meta.sample_rate
+
+        noisy_wav = tempfile.NamedTemporaryFile(suffix="_noisy.wav", delete=False).name
+        save_audio(noisy_wav, sample, sr)
+
+        enhanced_wav = tempfile.NamedTemporaryFile(suffix="_enhanced.wav", delete=False).name
+        save_audio(enhanced_wav, enhanced, sr)
+
+        logger.info(f"Saved outputs: {noisy_wav}, {enhanced_wav}")
+
+        noisy_spec = visualizer.create_spectrogram(
+            sample,
+            visualizer.fig_noisy,
+            visualizer.ax_noisy,
+            sr=sr,
+            title="Input Audio Spectrogram"
+        )
+
+        enhanced_spec = visualizer.create_spectrogram(
+            enhanced,
+            visualizer.fig_enh,
+            visualizer.ax_enh,
+            sr=sr,
+            title="Enhanced Audio Spectrogram"
+        )
+
+        filter_files = [speech_file, noisy_wav, enhanced_wav]
+        if mic_input:
+            filter_files.append(mic_input)
+        file_manager.cleanup_tmp(filter_files, app_config.cleanup_hours)
+
+        return noisy_wav, noisy_spec, enhanced_wav, enhanced_spec
+
+    except Exception as e:
+        logger.error(f"Error processing audio: {e}", exc_info=True)
+        raise gr.Error(f"Processing failed: {str(e)}")
+
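Once the module is loaded (model and helpers initialized above), the pipeline can be exercised directly, bypassing the UI. A sketch, assuming the bundled sample and noise files exist:

```python
# Direct call into process_audio as defined above (illustrative only).
noisy_wav, noisy_img, enhanced_wav, enhanced_img = process_audio(
    speech_file="samples/p232_013_clean.wav",
    noise_type="🍳 Kitchen",  # key into NOISES
    snr=10,
)
print(noisy_wav, enhanced_wav)  # paths to the saved WAV files in /tmp
```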
+def toggle_input_mode(choice: str):
+    """Toggle between microphone and file upload."""
+    if choice == "mic":
+        return gr.update(visible=True, value=None), gr.update(visible=False, value=None)
+    else:
+        return gr.update(visible=False, value=None), gr.update(visible=True, value=None)
+
+# ============================================================================
+# Custom CSS
+# ============================================================================
+
+custom_css = """
+/* Global Styles */
+.gradio-container {
+    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
+}
+
+/* Hero Section */
+#hero-section {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    padding: 50px 30px;
+    border-radius: 20px;
+    margin-bottom: 40px;
+    box-shadow: 0 15px 40px rgba(102, 126, 234, 0.4);
+    text-align: center;
+}
+
+#hero-section h1 {
+    color: white;
+    font-size: 3.2em;
+    font-weight: 800;
+    margin: 0 0 15px 0;
+    text-shadow: 2px 2px 8px rgba(0,0,0,0.2);
+    letter-spacing: -1px;
+}
+
+#hero-section p {
+    color: rgba(255,255,255,0.95);
+    font-size: 1.25em;
+    margin: 10px auto;
+    max-width: 800px;
+    line-height: 1.6;
+    font-weight: 300;
+}
+
+/* Feature Cards */
+.feature-card {
+    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
+    padding: 25px;
+    border-radius: 15px;
+    box-shadow: 0 4px 15px rgba(0,0,0,0.08);
+    margin-bottom: 20px;
+    border: 1px solid rgba(255,255,255,0.5);
+    transition: all 0.3s ease;
+}
+
+.feature-card:hover {
+    transform: translateY(-3px);
+    box-shadow: 0 8px 25px rgba(0,0,0,0.12);
+}
+
+/* Input Controls Section */
+.input-controls {
+    background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%);
+    padding: 30px;
+    border-radius: 15px;
+    box-shadow: 0 5px 20px rgba(0,0,0,0.1);
+}
+
+/* Output Section */
+.output-section {
+    background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
+    padding: 30px;
+    border-radius: 15px;
+    box-shadow: 0 5px 20px rgba(0,0,0,0.1);
+}
+
+/* Section Headers */
+.section-header {
+    color: #667eea;
+    font-size: 1.8em;
+    font-weight: 700;
+    margin: 30px 0 20px 0;
+    text-align: center;
+    background: linear-gradient(135deg, #667eea, #764ba2);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    background-clip: text;
+}
+
+/* Process Button */
+.process-button {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+    border: none !important;
+    font-size: 1.4em !important;
+    font-weight: 700 !important;
+    padding: 20px 50px !important;
+    border-radius: 50px !important;
+    box-shadow: 0 10px 30px rgba(102, 126, 234, 0.5) !important;
+    transition: all 0.3s ease !important;
+    color: white !important;
+    text-transform: uppercase;
+    letter-spacing: 1px;
+}
+
+.process-button:hover {
+    transform: translateY(-3px) scale(1.02) !important;
+    box-shadow: 0 15px 40px rgba(102, 126, 234, 0.7) !important;
+}
+
+/* Audio Components */
+.audio-wrapper {
+    background: white;
+    padding: 20px;
+    border-radius: 12px;
+    box-shadow: 0 3px 12px rgba(0,0,0,0.08);
+    margin: 15px 0;
+}
+
+/* Tabs */
+.tab-nav button {
+    font-weight: 600 !important;
+    font-size: 1.1em !important;
+    padding: 12px 24px !important;
+    border-radius: 10px 10px 0 0 !important;
+}
+
+.tab-nav button[aria-selected="true"] {
+    background: linear-gradient(135deg, #667eea, #764ba2) !important;
+    color: white !important;
+}
+
+/* Info Box */
+.info-box {
+    background: linear-gradient(135deg, #e0c3fc 0%, #8ec5fc 100%);
+    padding: 25px;
+    border-radius: 15px;
+    margin: 25px 0;
+    border-left: 5px solid #667eea;
+    box-shadow: 0 4px 15px rgba(0,0,0,0.1);
+}
+
+.info-box h3 {
+    color: #667eea;
+    font-size: 1.4em;
+    font-weight: 700;
+    margin-top: 0;
+}
+
+.info-box ul {
+    margin: 10px 0;
+    padding-left: 25px;
+}
+
+.info-box li {
+    margin: 8px 0;
+    line-height: 1.6;
+}
+
+/* Examples Section */
+.examples-section {
+    background: linear-gradient(135deg, #ffeaa7 0%, #dfe6e9 100%);
+    padding: 25px;
+    border-radius: 15px;
+    margin-top: 30px;
+    box-shadow: 0 4px 15px rgba(0,0,0,0.08);
+}
+
+/* Footer */
+#footer {
+    text-align: center;
+    padding: 30px 20px;
+    margin-top: 50px;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    border-radius: 15px;
+    color: white;
+}
+
+#footer h3 {
+    margin: 0 0 10px 0;
+    font-size: 1.5em;
+    font-weight: 700;
+}
+
+#footer p {
+    margin: 5px 0;
+    opacity: 0.9;
+}
+
+/* Radio Buttons */
+.radio-group label {
+    padding: 12px 20px !important;
+    border-radius: 10px !important;
+    font-weight: 600 !important;
+    transition: all 0.3s ease !important;
+}
+
+/* Dropdowns */
+.dropdown select {
+    border-radius: 10px !important;
+    padding: 12px !important;
+    font-size: 1.05em !important;
+    border: 2px solid #e0e0e0 !important;
+    transition: all 0.3s ease !important;
+}
+
+.dropdown select:focus {
+    border-color: #667eea !important;
+    box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
+}
+"""
+
 # ============================================================================
 # Gradio Interface
 # ============================================================================
 
+with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as demo:
+
+    # Hero Section
+    gr.HTML("""
+    <div id="hero-section">
+        <h1>🎵 DeepFilterNet2 Audio Enhancement</h1>
+        <p>Transform noisy audio into crystal-clear sound using cutting-edge AI technology</p>
+        <p style="font-size: 0.95em; margin-top: 15px;">
+            ✨ Real-time Processing | 🎯 State-of-the-Art Quality | 🚀 Lightning Fast
+        </p>
+    </div>
+    """)
+
+    # Quick Start Guide
+    with gr.Row():
+        gr.Markdown("""
+        <div class="info-box">
+        <h3>🚀 Quick Start Guide</h3>
+        <ul>
+            <li><strong>Step 1:</strong> Upload an audio file or record using your microphone</li>
+            <li><strong>Step 2:</strong> Optionally add synthetic noise to test the denoiser</li>
+            <li><strong>Step 3:</strong> Adjust SNR settings if needed</li>
+            <li><strong>Step 4:</strong> Click the "Denoise Audio" button</li>
+            <li><strong>Step 5:</strong> Compare results with interactive spectrograms</li>
+        </ul>
+        </div>
+        """)
+
+    # Main Interface
+    with gr.Row():
+        # Left Column - Input Controls
+        with gr.Column(scale=1):
+            gr.HTML('<h2 class="section-header">📤 Audio Input</h2>')
+
+            with gr.Group(elem_classes="input-controls"):
+                input_mode = gr.Radio(
+                    ["file", "mic"],
+                    value="file",
+                    label="🎙️ Input Method",
+                    info="Choose your preferred input source",
+                    elem_classes="radio-group"
+                )
+
+                audio_file = gr.Audio(
+                    type="filepath",
+                    label="📁 Upload Audio File",
+                    visible=True,
+                    elem_classes="audio-wrapper"
+                )
+
+                mic_input = gr.Audio(
+                    sources=["microphone"],
+                    type="filepath",
+                    label="🎤 Record Audio",
+                    visible=False,
+                    elem_classes="audio-wrapper"
+                )
+
+            gr.HTML('<h2 class="section-header">⚙️ Enhancement Settings</h2>')
+
+            with gr.Group(elem_classes="feature-card"):
+                noise_type = gr.Dropdown(
+                    label="🔊 Background Noise Type",
+                    choices=list(NOISES.keys()),
+                    value="None",
+                    info="Add synthetic noise for testing",
+                    elem_classes="dropdown"
+                )
+
+                snr = gr.Dropdown(
+                    label="📊 Signal-to-Noise Ratio (dB)",
+                    choices=["-5", "0", "10", "20"],
+                    value="10",
+                    info="Higher = cleaner signal",
+                    elem_classes="dropdown"
+                )
+
+            process_btn = gr.Button(
+                "🚀 Denoise Audio",
+                elem_classes="process-button",
+                size="lg"
+            )
+
+        # Right Column - Results
+        with gr.Column(scale=2):
+            gr.HTML('<h2 class="section-header">📊 Results & Comparison</h2>')
+
+            with gr.Tabs():
+                with gr.Tab("🔴 Input Audio", elem_classes="output-section"):
+                    noisy_audio = gr.Audio(
+                        type="filepath",
+                        label="Original/Noisy Audio",
+                        elem_classes="audio-wrapper"
+                    )
+                    noisy_spec = gr.Image(
+                        label="Input Spectrogram",
+                        elem_classes="audio-wrapper"
+                    )
+
+                with gr.Tab("🟢 Enhanced Audio", elem_classes="output-section"):
+                    enhanced_audio = gr.Audio(
+                        type="filepath",
+                        label="Enhanced Audio",
+                        elem_classes="audio-wrapper"
+                    )
+                    enhanced_spec = gr.Image(
+                        label="Enhanced Spectrogram",
+                        elem_classes="audio-wrapper"
+                    )
+
+    # Examples Section
+    gr.HTML('<h2 class="section-header">🎯 Try These Examples</h2>')
+
+    with gr.Group(elem_classes="examples-section"):
+        gr.Examples(
+            examples=[
+                ["./samples/p232_013_clean.wav", "🍳 Kitchen", "10"],
+                ["./samples/p232_013_clean.wav", "☕ Cafe", "10"],
+                ["./samples/p232_019_clean.wav", "☕ Cafe", "10"],
+                ["./samples/p232_019_clean.wav", "🌊 River", "10"],
+            ],
+            inputs=[audio_file, noise_type, snr],
+            outputs=[noisy_audio, noisy_spec, enhanced_audio, enhanced_spec],
+            fn=process_audio,
+            cache_examples=True,
+            label="Click any example to try it instantly",
+        )
+
+    # Technical Information
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("""
+            <div class="info-box">
+            <h3>💡 How It Works</h3>
+            <p><strong>DeepFilterNet2</strong> uses advanced deep learning to identify and remove unwanted background noise while preserving speech clarity. The model analyzes spectral patterns to distinguish between signal and noise components.</p>
+            </div>
+            """)
+
+        with gr.Column():
+            gr.Markdown("""
+            <div class="info-box">
+            <h3>📋 Technical Specifications</h3>
+            <ul>
+                <li><strong>Model:</strong> DeepFilterNet2 (State-of-the-art)</li>
+                <li><strong>Sample Rate:</strong> 48 kHz</li>
+                <li><strong>Max Duration:</strong> 1 hour</li>
+                <li><strong>Formats:</strong> WAV, MP3, M4A, OGG, FLAC, AAC</li>
+                <li><strong>Processing:</strong> Real-time capable</li>
+            </ul>
+            </div>
+            """)
+
+    # Footer
+    gr.HTML("""
+    <div id="footer">
+        <h3>🎵 Powered by DeepFilterNet2</h3>
+        <p>Advanced AI-driven audio enhancement technology</p>
+        <p><em>Built with Gradio • Optimized for Performance</em></p>
+    </div>
+    """)
+
+    # Event Handlers
     process_btn.click(
         fn=process_audio,
+        inputs=[audio_file, noise_type, snr, mic_input],
+        outputs=[noisy_audio, noisy_spec, enhanced_audio, enhanced_spec],
+        api_name="denoise",
     )
+
+    input_mode.change(
+        fn=toggle_input_mode,
+        inputs=input_mode,
+        outputs=[mic_input, audio_file],
+    )
+
+# Initial cleanup
+file_manager.cleanup_tmp()
 
+# Launch application
 if __name__ == "__main__":
+    demo.queue().launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+    )
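The Radio-driven visibility toggle wired up above is a general Gradio pattern; a self-contained miniature with hypothetical component names, independent of this app:

```python
# Miniature of the input-mode toggle pattern used in the interface above.
import gradio as gr

def toggle(choice: str):
    # Show exactly one of the two inputs, clearing the hidden one.
    return (gr.update(visible=choice == "mic", value=None),
            gr.update(visible=choice == "file", value=None))

with gr.Blocks() as mini:
    mode = gr.Radio(["file", "mic"], value="file", label="Input Method")
    mic = gr.Audio(sources=["microphone"], type="filepath", visible=False)
    upload = gr.Audio(type="filepath", visible=True)
    mode.change(fn=toggle, inputs=mode, outputs=[mic, upload])

# mini.launch()
```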