Spaces:
Running
Running
Update denoiser.py
Browse files- denoiser.py +28 -16
denoiser.py
CHANGED
|
@@ -14,8 +14,10 @@ Matches CleanVoice feature-for-feature using FREE local models:
|
|
| 14 |
|
| 15 |
FIXES APPLIED:
|
| 16 |
- TARGET_SR set to 48000 to match DeepFilterNet natively (no double resampling)
|
|
|
|
|
|
|
| 17 |
- Mouth sound threshold raised 4.5→6.0 std (was removing real consonants p/b/t)
|
| 18 |
-
- noisereduce prop_decrease
|
| 19 |
- Room tone fallback: uses first 100ms if audio too short
|
| 20 |
- Stutter detection fixed: now catches triple+ repeats (I I I was → I was)
|
| 21 |
- Filler removal: also returns cleaned transcript text
|
|
@@ -33,9 +35,9 @@ import logging
|
|
| 33 |
|
| 34 |
logger = logging.getLogger(__name__)
|
| 35 |
|
| 36 |
-
#
|
| 37 |
-
#
|
| 38 |
-
TARGET_SR =
|
| 39 |
TARGET_LOUDNESS = -18.0
|
| 40 |
|
| 41 |
# Filler words (English + Telugu + Hindi)
|
|
@@ -193,7 +195,7 @@ class Denoiser:
|
|
| 193 |
# BACKGROUND NOISE REMOVAL
|
| 194 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 195 |
def _remove_background_noise(self, audio, sr):
|
| 196 |
-
# Try DeepFilterNet (SOTA) β
|
| 197 |
try:
|
| 198 |
result = self._deepfilter(audio, sr)
|
| 199 |
print("[Denoiser] β
DeepFilterNet noise removal done")
|
|
@@ -201,13 +203,14 @@ class Denoiser:
|
|
| 201 |
except Exception as e:
|
| 202 |
logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
|
| 203 |
|
| 204 |
-
# FIX
|
| 205 |
try:
|
| 206 |
import noisereduce as nr
|
| 207 |
cleaned = nr.reduce_noise(
|
| 208 |
y=audio, sr=sr,
|
| 209 |
stationary=True,
|
| 210 |
-
prop_decrease=0.
|
|
|
|
| 211 |
).astype(np.float32)
|
| 212 |
print("[Denoiser] β
noisereduce noise removal done")
|
| 213 |
return cleaned, "noisereduce"
|
|
@@ -216,19 +219,30 @@ class Denoiser:
|
|
| 216 |
return audio, "none"
|
| 217 |
|
| 218 |
def _deepfilter(self, audio, sr):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
if not self._df_loaded:
|
| 220 |
from df.enhance import enhance, init_df
|
| 221 |
self._df_model, self._df_state, _ = init_df()
|
| 222 |
self._df_loaded = True
|
| 223 |
from df.enhance import enhance
|
| 224 |
import torch
|
|
|
|
| 225 |
df_sr = self._df_state.sr()
|
| 226 |
-
#
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
return self._resample(res, df_sr, sr) if df_sr != sr else res
|
| 233 |
|
| 234 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -494,12 +508,10 @@ class Denoiser:
|
|
| 494 |
print(f"[Denoiser] β
Normalized: {loudness:.1f} β {TARGET_LOUDNESS} LUFS")
|
| 495 |
except Exception:
|
| 496 |
# FIX: Corrected RMS fallback formula
|
| 497 |
-
# Old: audio * (10 ** (TARGET_LOUDNESS / 20.0) / rms) β wrong
|
| 498 |
-
# New: scale so RMS matches target linear amplitude
|
| 499 |
rms = np.sqrt(np.mean(audio**2))
|
| 500 |
if rms > 1e-9:
|
| 501 |
target_rms = 10 ** (TARGET_LOUDNESS / 20.0) # β 0.126
|
| 502 |
-
audio = audio * (target_rms / rms)
|
| 503 |
return np.clip(audio, -1.0, 1.0).astype(np.float32)
|
| 504 |
|
| 505 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 14 |
|
| 15 |
FIXES APPLIED:
|
| 16 |
- TARGET_SR set to 48000 to match DeepFilterNet natively (no double resampling)
|
| 17 |
+
- DeepFilterNet now installed via Dockerfile (deepfilternet pip package)
|
| 18 |
+
- Double-pass DeepFilterNet for Zoom audio (removes layered noise + echo)
|
| 19 |
- Mouth sound threshold raised 4.5→6.0 std (was removing real consonants p/b/t)
|
| 20 |
+
- noisereduce fallback prop_decrease raised back to 0.85 + n_std_thresh=1.5 (stronger fallback)
|
| 21 |
- Room tone fallback: uses first 100ms if audio too short
|
| 22 |
- Stutter detection fixed: now catches triple+ repeats (I I I was → I was)
|
| 23 |
- Filler removal: also returns cleaned transcript text
|
|
|
|
| 35 |
|
| 36 |
logger = logging.getLogger(__name__)
|
| 37 |
|
| 38 |
+
# FIX: Changed from 44100 → 48000 to match DeepFilterNet's native SR
|
| 39 |
+
# DeepFilterNet is now properly installed via Dockerfile (no more Rust compiler issue)
|
| 40 |
+
TARGET_SR = 48000
|
| 41 |
TARGET_LOUDNESS = -18.0
|
| 42 |
|
| 43 |
# Filler words (English + Telugu + Hindi)
|
|
|
|
| 195 |
# BACKGROUND NOISE REMOVAL
|
| 196 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 197 |
def _remove_background_noise(self, audio, sr):
|
| 198 |
+
# Try DeepFilterNet (SOTA) β now properly installed via Dockerfile
|
| 199 |
try:
|
| 200 |
result = self._deepfilter(audio, sr)
|
| 201 |
print("[Denoiser] β
DeepFilterNet noise removal done")
|
|
|
|
| 203 |
except Exception as e:
|
| 204 |
logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
|
| 205 |
|
| 206 |
+
# FIX: Raised prop_decrease back to 0.85 + added n_std_thresh for stronger fallback
|
| 207 |
try:
|
| 208 |
import noisereduce as nr
|
| 209 |
cleaned = nr.reduce_noise(
|
| 210 |
y=audio, sr=sr,
|
| 211 |
stationary=True,
|
| 212 |
+
prop_decrease=0.85,
|
| 213 |
+
n_std_thresh_stationary=1.5, # FIX: more aggressive noise floor
|
| 214 |
).astype(np.float32)
|
| 215 |
print("[Denoiser] β
noisereduce noise removal done")
|
| 216 |
return cleaned, "noisereduce"
|
|
|
|
| 219 |
return audio, "none"
|
| 220 |
|
| 221 |
def _deepfilter(self, audio, sr):
|
| 222 |
+
"""
|
| 223 |
+
FIX: Added double-pass enhancement for Zoom audio.
|
| 224 |
+
Zoom meetings have layered noise (background + echo + mic hiss).
|
| 225 |
+
One pass removes the main noise; second pass cleans the residual.
|
| 226 |
+
"""
|
| 227 |
if not self._df_loaded:
|
| 228 |
from df.enhance import enhance, init_df
|
| 229 |
self._df_model, self._df_state, _ = init_df()
|
| 230 |
self._df_loaded = True
|
| 231 |
from df.enhance import enhance
|
| 232 |
import torch
|
| 233 |
+
|
| 234 |
df_sr = self._df_state.sr()
|
| 235 |
+
# TARGET_SR now matches DeepFilterNet's native SR (48kHz) β no resampling needed
|
| 236 |
+
a = self._resample(audio, sr, df_sr) if sr != df_sr else audio
|
| 237 |
+
t = torch.from_numpy(a).unsqueeze(0)
|
| 238 |
+
|
| 239 |
+
# FIX: Pass 1 β remove main background noise
|
| 240 |
+
out1 = enhance(self._df_model, self._df_state, t)
|
| 241 |
+
|
| 242 |
+
# FIX: Pass 2 β clean residual noise (critical for Zoom/meeting audio)
|
| 243 |
+
out2 = enhance(self._df_model, self._df_state, out1)
|
| 244 |
+
|
| 245 |
+
res = out2.squeeze().numpy().astype(np.float32)
|
| 246 |
return self._resample(res, df_sr, sr) if df_sr != sr else res
|
| 247 |
|
| 248 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 508 |
print(f"[Denoiser] β
Normalized: {loudness:.1f} β {TARGET_LOUDNESS} LUFS")
|
| 509 |
except Exception:
|
| 510 |
# FIX: Corrected RMS fallback formula
|
|
|
|
|
|
|
| 511 |
rms = np.sqrt(np.mean(audio**2))
|
| 512 |
if rms > 1e-9:
|
| 513 |
target_rms = 10 ** (TARGET_LOUDNESS / 20.0) # β 0.126
|
| 514 |
+
audio = audio * (target_rms / rms)
|
| 515 |
return np.clip(audio, -1.0, 1.0).astype(np.float32)
|
| 516 |
|
| 517 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|