Clearwave48 committed on
Commit
aff15cd
·
verified ·
1 Parent(s): fa1e7ae

Update denoiser.py

Browse files
Files changed (1) hide show
  1. denoiser.py +52 -52
denoiser.py CHANGED
@@ -72,7 +72,6 @@ FILLER_WORDS = {
72
  # ---------------------------------------------------------------------------
73
  # Module-level model cache (survives across Denoiser() instances on same Space)
74
  # ---------------------------------------------------------------------------
75
- _SEPFORMER_MODEL = None # speechbrain SepFormer
76
  _SILERO_MODEL = None # Silero VAD
77
  _SILERO_UTILS = None
78
 
@@ -91,7 +90,8 @@ class Denoiser:
91
  remove_breaths: bool = True,
92
  remove_mouth_sounds: bool = True,
93
  remove_stutters: bool = True,
94
- word_segments: list = None) -> dict:
 
95
  """
96
  Full professional pipeline.
97
 
@@ -159,10 +159,40 @@ class Denoiser:
159
  # ── 8. Normalize Loudness ─────────────────────────────────────
160
  mono = self._normalise(mono, sr)
161
 
162
- # ── 9. Restore stereo / save ──────────────────────────────────
163
  out_audio = np.stack([mono, mono], axis=1) if n_ch == 2 else mono
164
- out_path = os.path.join(out_dir, "denoised.wav")
165
- sf.write(out_path, out_audio, sr, subtype="PCM_24")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  stats['processing_sec'] = round(time.time() - t0, 2)
168
  print(f"[Denoiser] βœ… Done in {stats['processing_sec']}s | {stats}")
@@ -274,10 +304,16 @@ class Denoiser:
274
  return result.astype(np.float32)
275
 
276
  # ══════════════════════════════════════════════════════════════════
277
- # BACKGROUND NOISE REMOVAL ← UPGRADED
278
- # Chain: DeepFilterNet → SepFormer → two-pass noisereduce → passthrough
279
- # DeepFilterNet is PRIMARY — Rust installed in Dockerfile, weights
280
- # pre-downloaded at build time, native 48kHz matches TARGET_SR exactly.
 
 
 
 
 
 
281
  # ══════════════════════════════════════════════════════════════════
282
  def _remove_background_noise(self, audio, sr):
283
  # ── Primary: DeepFilterNet (SOTA, Rust available via Docker) ─────
@@ -288,65 +324,29 @@ class Denoiser:
288
  except Exception as e:
289
  logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
290
 
291
- # ── Fallback A: SepFormer (speechbrain, CPU-safe) ─────────────────
292
- try:
293
- result = self._sepformer_enhance(audio, sr)
294
- print("[Denoiser] βœ… SepFormer noise removal done")
295
- return result, "SepFormer"
296
- except Exception as e:
297
- logger.warning(f"[Denoiser] SepFormer unavailable ({e})")
298
-
299
- # ── Fallback B: Two-pass noisereduce ─────────────────────────────
300
- # Pass 1 (stationary) removes steady hum/hiss.
301
- # Pass 2 (non-stationary, gentler) catches residual without artifacts.
302
  try:
303
  import noisereduce as nr
304
  pass1 = nr.reduce_noise(
305
  y=audio, sr=sr,
306
  stationary=True,
307
- prop_decrease=0.70,
308
  ).astype(np.float32)
309
  pass2 = nr.reduce_noise(
310
  y=pass1, sr=sr,
311
  stationary=False,
312
- prop_decrease=0.40, # gentle — avoids introducing artifacts
313
- freq_mask_smooth_hz=300,
314
- time_mask_smooth_ms=60,
315
  ).astype(np.float32)
316
- print("[Denoiser] βœ… Two-pass noisereduce done")
317
  return pass2, "noisereduce_2pass"
318
  except Exception as e:
319
  logger.warning(f"noisereduce failed: {e}")
320
 
321
  return audio, "none"
322
 
323
- def _sepformer_enhance(self, audio: np.ndarray, sr: int) -> np.ndarray:
324
- """
325
- SepFormer speech enhancement via speechbrain (HuggingFace weights).
326
- Cached globally so the model is only downloaded/loaded once per Space.
327
- """
328
- global _SEPFORMER_MODEL
329
- import torch
330
-
331
- if _SEPFORMER_MODEL is None:
332
- from speechbrain.pretrained import SepformerSeparation
333
- _SEPFORMER_MODEL = SepformerSeparation.from_hparams(
334
- source="speechbrain/sepformer-wham16k-enhancement",
335
- savedir="/tmp/sepformer_cache",
336
- run_opts={"device": "cpu"},
337
- )
338
- print("[Denoiser] SepFormer model loaded (cached)")
339
-
340
- model_sr = 16000
341
- a = self._resample(audio, sr, model_sr)
342
- t = torch.from_numpy(a).unsqueeze(0) # (1, T)
343
-
344
- with torch.no_grad():
345
- out = _SEPFORMER_MODEL.separate_batch(t) # (1, T, 1)
346
-
347
- enhanced = out[0, :, 0].numpy().astype(np.float32)
348
- return self._resample(enhanced, model_sr, sr)
349
-
350
  def _deepfilter(self, audio: np.ndarray, sr: int) -> np.ndarray:
351
  """DeepFilterNet enhancement (local only — requires Rust compiler)."""
352
  from df.enhance import enhance, init_df
 
72
  # ---------------------------------------------------------------------------
73
  # Module-level model cache (survives across Denoiser() instances on same Space)
74
  # ---------------------------------------------------------------------------
 
75
  _SILERO_MODEL = None # Silero VAD
76
  _SILERO_UTILS = None
77
 
 
90
  remove_breaths: bool = True,
91
  remove_mouth_sounds: bool = True,
92
  remove_stutters: bool = True,
93
+ word_segments: list = None,
94
+ original_filename: str = None) -> dict:
95
  """
96
  Full professional pipeline.
97
 
 
159
  # ── 8. Normalize Loudness ─────────────────────────────────────
160
  mono = self._normalise(mono, sr)
161
 
162
+ # ── 9. Restore stereo / save as MP3 ──────────────────────────
163
  out_audio = np.stack([mono, mono], axis=1) if n_ch == 2 else mono
164
+
165
+ # Build output filename: strip original extension, append _cleared.mp3
166
+ # e.g. "output.wav" → "output_cleared.mp3"
167
+ if original_filename:
168
+ base = os.path.splitext(os.path.basename(original_filename))[0]
169
+ else:
170
+ base = os.path.splitext(os.path.basename(audio_path))[0]
171
+ out_name = f"{base}_cleared.mp3"
172
+
173
+ # Write a temporary WAV first (soundfile can't encode MP3),
174
+ # then convert to MP3 via ffmpeg (already in the Dockerfile).
175
+ tmp_wav = os.path.join(out_dir, "denoised_tmp.wav")
176
+ out_path = os.path.join(out_dir, out_name)
177
+ sf.write(tmp_wav, out_audio, sr, subtype="PCM_24")
178
+
179
+ result = subprocess.run([
180
+ "ffmpeg", "-y", "-i", tmp_wav,
181
+ "-codec:a", "libmp3lame",
182
+ "-qscale:a", "2", # VBR quality 2 ≈ 190 kbps — transparent quality
183
+ "-ar", str(sr),
184
+ out_path
185
+ ], capture_output=True)
186
+
187
+ if result.returncode != 0:
188
+ stderr = result.stderr.decode(errors="replace")
189
+ logger.warning(f"MP3 export failed, falling back to WAV: {stderr[-300:]}")
190
+ out_path = tmp_wav # graceful fallback — still return something
191
+ else:
192
+ try:
193
+ os.remove(tmp_wav) # clean up temp WAV
194
+ except OSError:
195
+ pass
196
 
197
  stats['processing_sec'] = round(time.time() - t0, 2)
198
  print(f"[Denoiser] βœ… Done in {stats['processing_sec']}s | {stats}")
 
304
  return result.astype(np.float32)
305
 
306
  # ══════════════════════════════════════════════════════════════════
307
+ # BACKGROUND NOISE REMOVAL
308
+ # Chain: DeepFilterNet → two-pass noisereduce → passthrough
309
+ #
310
+ # SepFormer REMOVED — it is a speech separation model, not a denoiser.
311
+ # It reconstructs voice artificially → robotic output.
312
+ #
313
+ # Two-pass noisereduce is the safe CPU fallback:
314
+ # Pass 1 (stationary) — removes steady hum/hiss/fan noise
315
+ # Pass 2 (non-stationary) — catches residual at low prop_decrease
316
+ # so original voice character is preserved
317
  # ══════════════════════════════════════════════════════════════════
318
  def _remove_background_noise(self, audio, sr):
319
  # ── Primary: DeepFilterNet (SOTA, Rust available via Docker) ─────
 
324
  except Exception as e:
325
  logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
326
 
327
+ # ── Fallback: Two-pass noisereduce (voice-preserving) ─────────────
328
+ # prop_decrease kept LOW on both passes to avoid speech artifacts.
 
 
 
 
 
 
 
 
 
329
  try:
330
  import noisereduce as nr
331
  pass1 = nr.reduce_noise(
332
  y=audio, sr=sr,
333
  stationary=True,
334
+ prop_decrease=0.65,
335
  ).astype(np.float32)
336
  pass2 = nr.reduce_noise(
337
  y=pass1, sr=sr,
338
  stationary=False,
339
+ prop_decrease=0.30, # very gentle — voice stays natural
340
+ freq_mask_smooth_hz=400,
341
+ time_mask_smooth_ms=80,
342
  ).astype(np.float32)
343
+ print("[Denoiser] βœ… Two-pass noisereduce done (voice-preserving)")
344
  return pass2, "noisereduce_2pass"
345
  except Exception as e:
346
  logger.warning(f"noisereduce failed: {e}")
347
 
348
  return audio, "none"
349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  def _deepfilter(self, audio: np.ndarray, sr: int) -> np.ndarray:
351
  """DeepFilterNet enhancement (local only — requires Rust compiler)."""
352
  from df.enhance import enhance, init_df