Clearwave48 committed on
Commit
455f45c
·
verified ·
1 Parent(s): 5224e6a

Update denoiser.py

Browse files
Files changed (1) hide show
  1. denoiser.py +28 -16
denoiser.py CHANGED
@@ -14,8 +14,10 @@ Matches CleanVoice feature-for-feature using FREE local models:
14
 
15
  FIXES APPLIED:
16
  - TARGET_SR set to 48000 to match DeepFilterNet natively (no double resampling)
 
 
17
  - Mouth sound threshold raised 4.5→6.0 std (was removing real consonants p/b/t)
18
- - noisereduce prop_decrease lowered 0.85→0.70 (was causing speech artifacts)
19
  - Room tone fallback: uses first 100ms if audio too short
20
  - Stutter detection fixed: now catches triple+ repeats (I I I was → I was)
21
  - Filler removal: also returns cleaned transcript text
@@ -33,9 +35,9 @@ import logging
33
 
34
  logger = logging.getLogger(__name__)
35
 
36
- # NOTE: 44100 used on HF Spaces (DeepFilterNet not available — no Rust compiler)
37
- # Locally with DeepFilterNet installed, change this to 48000 for best quality
38
- TARGET_SR = 44100
39
  TARGET_LOUDNESS = -18.0
40
 
41
  # Filler words (English + Telugu + Hindi)
@@ -193,7 +195,7 @@ class Denoiser:
193
  # BACKGROUND NOISE REMOVAL
194
  # ══════════════════════════════════════════════════════════════════
195
  def _remove_background_noise(self, audio, sr):
196
- # Try DeepFilterNet (SOTA) — native SR is 48kHz, matches TARGET_SR now
197
  try:
198
  result = self._deepfilter(audio, sr)
199
  print("[Denoiser] ✅ DeepFilterNet noise removal done")
@@ -201,13 +203,14 @@ class Denoiser:
201
  except Exception as e:
202
  logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
203
 
204
- # FIX 2: Lower prop_decrease 0.85→0.70 to reduce speech artifacts
205
  try:
206
  import noisereduce as nr
207
  cleaned = nr.reduce_noise(
208
  y=audio, sr=sr,
209
  stationary=True,
210
- prop_decrease=0.70, # was 0.85 — too aggressive, caused artifacts
 
211
  ).astype(np.float32)
212
  print("[Denoiser] ✅ noisereduce noise removal done")
213
  return cleaned, "noisereduce"
@@ -216,19 +219,30 @@ class Denoiser:
216
  return audio, "none"
217
 
218
  def _deepfilter(self, audio, sr):
 
 
 
 
 
219
  if not self._df_loaded:
220
  from df.enhance import enhance, init_df
221
  self._df_model, self._df_state, _ = init_df()
222
  self._df_loaded = True
223
  from df.enhance import enhance
224
  import torch
 
225
  df_sr = self._df_state.sr()
226
- # FIX: TARGET_SR now matches DeepFilterNet's native SR (48kHz)
227
- # so resampling is skipped in most cases
228
- a = self._resample(audio, sr, df_sr) if sr != df_sr else audio
229
- t = torch.from_numpy(a).unsqueeze(0)
230
- out = enhance(self._df_model, self._df_state, t)
231
- res = out.squeeze().numpy().astype(np.float32)
 
 
 
 
 
232
  return self._resample(res, df_sr, sr) if df_sr != sr else res
233
 
234
  # ══════════════════════════════════════════════════════════════════
@@ -494,12 +508,10 @@ class Denoiser:
494
  print(f"[Denoiser] ✅ Normalized: {loudness:.1f} → {TARGET_LOUDNESS} LUFS")
495
  except Exception:
496
  # FIX: Corrected RMS fallback formula
497
- # Old: audio * (10 ** (TARGET_LOUDNESS / 20.0) / rms) ← wrong
498
- # New: scale so RMS matches target linear amplitude
499
  rms = np.sqrt(np.mean(audio**2))
500
  if rms > 1e-9:
501
  target_rms = 10 ** (TARGET_LOUDNESS / 20.0) # ≈ 0.126
502
- audio = audio * (target_rms / rms) # correct ratio
503
  return np.clip(audio, -1.0, 1.0).astype(np.float32)
504
 
505
  # ══════════════════════════════════════════════════════════════════
 
14
 
15
  FIXES APPLIED:
16
  - TARGET_SR set to 48000 to match DeepFilterNet natively (no double resampling)
17
+ - DeepFilterNet now installed via Dockerfile (deepfilternet pip package)
18
+ - Double-pass DeepFilterNet for Zoom audio (removes layered noise + echo)
19
  - Mouth sound threshold raised 4.5→6.0 std (was removing real consonants p/b/t)
20
+ - noisereduce fallback prop_decrease raised back to 0.85 + n_std_thresh=1.5 (stronger fallback)
21
  - Room tone fallback: uses first 100ms if audio too short
22
  - Stutter detection fixed: now catches triple+ repeats (I I I was → I was)
23
  - Filler removal: also returns cleaned transcript text
 
35
 
36
  logger = logging.getLogger(__name__)
37
 
38
+ # FIX: Changed from 44100 → 48000 to match DeepFilterNet's native SR
39
+ # DeepFilterNet is now properly installed via Dockerfile (no more Rust compiler issue)
40
+ TARGET_SR = 48000
41
  TARGET_LOUDNESS = -18.0
42
 
43
  # Filler words (English + Telugu + Hindi)
 
195
  # BACKGROUND NOISE REMOVAL
196
  # ══════════════════════════════════════════════════════════════════
197
  def _remove_background_noise(self, audio, sr):
198
+ # Try DeepFilterNet (SOTA) — now properly installed via Dockerfile
199
  try:
200
  result = self._deepfilter(audio, sr)
201
  print("[Denoiser] ✅ DeepFilterNet noise removal done")
 
203
  except Exception as e:
204
  logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
205
 
206
+ # FIX: Raised prop_decrease back to 0.85 + added n_std_thresh for stronger fallback
207
  try:
208
  import noisereduce as nr
209
  cleaned = nr.reduce_noise(
210
  y=audio, sr=sr,
211
  stationary=True,
212
+ prop_decrease=0.85,
213
+ n_std_thresh_stationary=1.5, # FIX: more aggressive noise floor
214
  ).astype(np.float32)
215
  print("[Denoiser] ✅ noisereduce noise removal done")
216
  return cleaned, "noisereduce"
 
219
  return audio, "none"
220
 
221
  def _deepfilter(self, audio, sr):
222
+ """
223
+ FIX: Added double-pass enhancement for Zoom audio.
224
+ Zoom meetings have layered noise (background + echo + mic hiss).
225
+ One pass removes the main noise; second pass cleans the residual.
226
+ """
227
  if not self._df_loaded:
228
  from df.enhance import enhance, init_df
229
  self._df_model, self._df_state, _ = init_df()
230
  self._df_loaded = True
231
  from df.enhance import enhance
232
  import torch
233
+
234
  df_sr = self._df_state.sr()
235
+ # TARGET_SR now matches DeepFilterNet's native SR (48kHz) — no resampling needed
236
+ a = self._resample(audio, sr, df_sr) if sr != df_sr else audio
237
+ t = torch.from_numpy(a).unsqueeze(0)
238
+
239
+ # FIX: Pass 1 — remove main background noise
240
+ out1 = enhance(self._df_model, self._df_state, t)
241
+
242
+ # FIX: Pass 2 — clean residual noise (critical for Zoom/meeting audio)
243
+ out2 = enhance(self._df_model, self._df_state, out1)
244
+
245
+ res = out2.squeeze().numpy().astype(np.float32)
246
  return self._resample(res, df_sr, sr) if df_sr != sr else res
247
 
248
  # ══════════════════════════════════════════════════════════════════
 
508
  print(f"[Denoiser] ✅ Normalized: {loudness:.1f} → {TARGET_LOUDNESS} LUFS")
509
  except Exception:
510
  # FIX: Corrected RMS fallback formula
 
 
511
  rms = np.sqrt(np.mean(audio**2))
512
  if rms > 1e-9:
513
  target_rms = 10 ** (TARGET_LOUDNESS / 20.0) # ≈ 0.126
514
+ audio = audio * (target_rms / rms)
515
  return np.clip(audio, -1.0, 1.0).astype(np.float32)
516
 
517
  # ══════════════════════════════════════════════════════════════════