Spaces:

Clearwave48
/

clearwave-api

Sleeping

App Files Files Community

Clearwave48 committed 19 days ago

Commit

aff15cd

verified ·

1 Parent(s): fa1e7ae

Update denoiser.py

Browse files

Files changed (1) hide show

denoiser.py +52 -52

denoiser.py CHANGED Viewed

@@ -72,7 +72,6 @@ FILLER_WORDS = {
 # ---------------------------------------------------------------------------
 # Module-level model cache (survives across Denoiser() instances on same Space)
 # ---------------------------------------------------------------------------
-_SEPFORMER_MODEL = None   # speechbrain SepFormer
 _SILERO_MODEL    = None   # Silero VAD
 _SILERO_UTILS    = None
@@ -91,7 +90,8 @@ class Denoiser:
                 remove_breaths: bool      = True,
                 remove_mouth_sounds: bool = True,
                 remove_stutters: bool     = True,
-                word_segments: list       = None) -> dict:
         """
         Full professional pipeline.
@@ -159,10 +159,40 @@ class Denoiser:
         # ── 8. Normalize Loudness ─────────────────────────────────────
         mono = self._normalise(mono, sr)
-        # ── 9. Restore stereo / save ──────────────────────────────────
         out_audio = np.stack([mono, mono], axis=1) if n_ch == 2 else mono
-        out_path  = os.path.join(out_dir, "denoised.wav")
-        sf.write(out_path, out_audio, sr, subtype="PCM_24")
         stats['processing_sec'] = round(time.time() - t0, 2)
         print(f"[Denoiser] ✅ Done in {stats['processing_sec']}s | {stats}")
@@ -274,10 +304,16 @@ class Denoiser:
         return result.astype(np.float32)
     # ══════════════════════════════════════════════════════════════════
-    # BACKGROUND NOISE REMOVAL  ← UPGRADED
-    # Chain: DeepFilterNet → SepFormer → two-pass noisereduce → passthrough
-    # DeepFilterNet is PRIMARY — Rust installed in Dockerfile, weights
-    # pre-downloaded at build time, native 48kHz matches TARGET_SR exactly.
     # ══════════════════════════════════════════════════════════════════
     def _remove_background_noise(self, audio, sr):
         # ── Primary: DeepFilterNet (SOTA, Rust available via Docker) ─────
@@ -288,65 +324,29 @@ class Denoiser:
         except Exception as e:
             logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
-        # ── Fallback A: SepFormer (speechbrain, CPU-safe) ─────────────────
-        try:
-            result = self._sepformer_enhance(audio, sr)
-            print("[Denoiser] ✅ SepFormer noise removal done")
-            return result, "SepFormer"
-        except Exception as e:
-            logger.warning(f"[Denoiser] SepFormer unavailable ({e})")
-        # ── Fallback B: Two-pass noisereduce ─────────────────────────────
-        # Pass 1 (stationary) removes steady hum/hiss.
-        # Pass 2 (non-stationary, gentler) catches residual without artifacts.
         try:
             import noisereduce as nr
             pass1 = nr.reduce_noise(
                 y=audio, sr=sr,
                 stationary=True,
-                prop_decrease=0.70,
             ).astype(np.float32)
             pass2 = nr.reduce_noise(
                 y=pass1, sr=sr,
                 stationary=False,
-                prop_decrease=0.40,   # gentle — avoids introducing artifacts
-                freq_mask_smooth_hz=300,
-                time_mask_smooth_ms=60,
             ).astype(np.float32)
-            print("[Denoiser] ✅ Two-pass noisereduce done")
             return pass2, "noisereduce_2pass"
         except Exception as e:
             logger.warning(f"noisereduce failed: {e}")
         return audio, "none"
-    def _sepformer_enhance(self, audio: np.ndarray, sr: int) -> np.ndarray:
-        """
-        SepFormer speech enhancement via speechbrain (HuggingFace weights).
-        Cached globally so the model is only downloaded/loaded once per Space.
-        """
-        global _SEPFORMER_MODEL
-        import torch
-        if _SEPFORMER_MODEL is None:
-            from speechbrain.pretrained import SepformerSeparation
-            _SEPFORMER_MODEL = SepformerSeparation.from_hparams(
-                source="speechbrain/sepformer-wham16k-enhancement",
-                savedir="/tmp/sepformer_cache",
-                run_opts={"device": "cpu"},
-            )
-            print("[Denoiser] SepFormer model loaded (cached)")
-        model_sr = 16000
-        a = self._resample(audio, sr, model_sr)
-        t = torch.from_numpy(a).unsqueeze(0)   # (1, T)
-        with torch.no_grad():
-            out = _SEPFORMER_MODEL.separate_batch(t)   # (1, T, 1)
-        enhanced = out[0, :, 0].numpy().astype(np.float32)
-        return self._resample(enhanced, model_sr, sr)
     def _deepfilter(self, audio: np.ndarray, sr: int) -> np.ndarray:
         """DeepFilterNet enhancement (local only — requires Rust compiler)."""
         from df.enhance import enhance, init_df

 # ---------------------------------------------------------------------------
 # Module-level model cache (survives across Denoiser() instances on same Space)
 # ---------------------------------------------------------------------------
 _SILERO_MODEL    = None   # Silero VAD
 _SILERO_UTILS    = None
                 remove_breaths: bool      = True,
                 remove_mouth_sounds: bool = True,
                 remove_stutters: bool     = True,
+                word_segments: list       = None,
+                original_filename: str    = None) -> dict:
         """
         Full professional pipeline.
         # ── 8. Normalize Loudness ─────────────────────────────────────
         mono = self._normalise(mono, sr)
+        # ── 9. Restore stereo / save as MP3 ──────────────────────────
         out_audio = np.stack([mono, mono], axis=1) if n_ch == 2 else mono
+        # Build output filename: strip original extension, append _cleared.mp3
+        # e.g. "output.wav" → "output_cleared.mp3"
+        if original_filename:
+            base = os.path.splitext(os.path.basename(original_filename))[0]
+        else:
+            base = os.path.splitext(os.path.basename(audio_path))[0]
+        out_name = f"{base}_cleared.mp3"
+        # Write a temporary WAV first (soundfile can't encode MP3),
+        # then convert to MP3 via ffmpeg (already in the Dockerfile).
+        tmp_wav  = os.path.join(out_dir, "denoised_tmp.wav")
+        out_path = os.path.join(out_dir, out_name)
+        sf.write(tmp_wav, out_audio, sr, subtype="PCM_24")
+        result = subprocess.run([
+            "ffmpeg", "-y", "-i", tmp_wav,
+            "-codec:a", "libmp3lame",
+            "-qscale:a", "2",   # VBR quality 2 ≈ 190 kbps — transparent quality
+            "-ar", str(sr),
+            out_path
+        ], capture_output=True)
+        if result.returncode != 0:
+            stderr = result.stderr.decode(errors="replace")
+            logger.warning(f"MP3 export failed, falling back to WAV: {stderr[-300:]}")
+            out_path = tmp_wav   # graceful fallback — still return something
+        else:
+            try:
+                os.remove(tmp_wav)   # clean up temp WAV
+            except OSError:
+                pass
         stats['processing_sec'] = round(time.time() - t0, 2)
         print(f"[Denoiser] ✅ Done in {stats['processing_sec']}s | {stats}")
         return result.astype(np.float32)
     # ══════════════════════════════════════════════════════════════════
+    # BACKGROUND NOISE REMOVAL
+    # Chain: DeepFilterNet → two-pass noisereduce → passthrough
+    #
+    # SepFormer REMOVED — it is a speech separation model, not a denoiser.
+    # It reconstructs voice artificially → robotic output.
+    #
+    # Two-pass noisereduce is the safe CPU fallback:
+    #   Pass 1 (stationary)     — removes steady hum/hiss/fan noise
+    #   Pass 2 (non-stationary) — catches residual at low prop_decrease
+    #                             so original voice character is preserved
     # ══════════════════════════════════════════════════════════════════
     def _remove_background_noise(self, audio, sr):
         # ── Primary: DeepFilterNet (SOTA, Rust available via Docker) ─────
         except Exception as e:
             logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
+        # ── Fallback: Two-pass noisereduce (voice-preserving) ─────────────
+        # prop_decrease kept LOW on both passes to avoid speech artifacts.
         try:
             import noisereduce as nr
             pass1 = nr.reduce_noise(
                 y=audio, sr=sr,
                 stationary=True,
+                prop_decrease=0.65,
             ).astype(np.float32)
             pass2 = nr.reduce_noise(
                 y=pass1, sr=sr,
                 stationary=False,
+                prop_decrease=0.30,       # very gentle — voice stays natural
+                freq_mask_smooth_hz=400,
+                time_mask_smooth_ms=80,
             ).astype(np.float32)
+            print("[Denoiser] ✅ Two-pass noisereduce done (voice-preserving)")
             return pass2, "noisereduce_2pass"
         except Exception as e:
             logger.warning(f"noisereduce failed: {e}")
         return audio, "none"
     def _deepfilter(self, audio: np.ndarray, sr: int) -> np.ndarray:
         """DeepFilterNet enhancement (local only — requires Rust compiler)."""
         from df.enhance import enhance, init_df