Spaces:
Running
Running
Update denoiser.py
Browse files- denoiser.py +28 -16
denoiser.py
CHANGED
|
@@ -14,8 +14,10 @@ Matches CleanVoice feature-for-feature using FREE local models:
|
|
| 14 |
|
| 15 |
FIXES APPLIED:
|
| 16 |
- TARGET_SR set to 48000 to match DeepFilterNet natively (no double resampling)
|
|
|
|
|
|
|
| 17 |
- Mouth sound threshold raised 4.5→6.0 std (was removing real consonants p/b/t)
|
| 18 |
-
- noisereduce prop_decrease
|
| 19 |
- Room tone fallback: uses first 100ms if audio too short
|
| 20 |
- Stutter detection fixed: now catches triple+ repeats (I I I was → I was)
|
| 21 |
- Filler removal: also returns cleaned transcript text
|
|
@@ -33,9 +35,9 @@ import logging
|
|
| 33 |
|
| 34 |
logger = logging.getLogger(__name__)
|
| 35 |
|
| 36 |
-
#
|
| 37 |
-
#
|
| 38 |
-
TARGET_SR =
|
| 39 |
TARGET_LOUDNESS = -18.0
|
| 40 |
|
| 41 |
# Filler words (English + Telugu + Hindi)
|
|
@@ -193,7 +195,7 @@ class Denoiser:
|
|
| 193 |
# BACKGROUND NOISE REMOVAL
|
| 194 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 195 |
def _remove_background_noise(self, audio, sr):
|
| 196 |
-
# Try DeepFilterNet (SOTA) β
|
| 197 |
try:
|
| 198 |
result = self._deepfilter(audio, sr)
|
| 199 |
print("[Denoiser] β
DeepFilterNet noise removal done")
|
|
@@ -201,13 +203,14 @@ class Denoiser:
|
|
| 201 |
except Exception as e:
|
| 202 |
logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
|
| 203 |
|
| 204 |
-
# FIX
|
| 205 |
try:
|
| 206 |
import noisereduce as nr
|
| 207 |
cleaned = nr.reduce_noise(
|
| 208 |
y=audio, sr=sr,
|
| 209 |
stationary=True,
|
| 210 |
-
prop_decrease=0.
|
|
|
|
| 211 |
).astype(np.float32)
|
| 212 |
print("[Denoiser] β
noisereduce noise removal done")
|
| 213 |
return cleaned, "noisereduce"
|
|
@@ -216,19 +219,30 @@ class Denoiser:
|
|
| 216 |
return audio, "none"
|
| 217 |
|
| 218 |
def _deepfilter(self, audio, sr):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
if not self._df_loaded:
|
| 220 |
from df.enhance import enhance, init_df
|
| 221 |
self._df_model, self._df_state, _ = init_df()
|
| 222 |
self._df_loaded = True
|
| 223 |
from df.enhance import enhance
|
| 224 |
import torch
|
|
|
|
| 225 |
df_sr = self._df_state.sr()
|
| 226 |
-
#
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
return self._resample(res, df_sr, sr) if df_sr != sr else res
|
| 233 |
|
| 234 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -494,12 +508,10 @@ class Denoiser:
|
|
| 494 |
print(f"[Denoiser] β
Normalized: {loudness:.1f} β {TARGET_LOUDNESS} LUFS")
|
| 495 |
except Exception:
|
| 496 |
# FIX: Corrected RMS fallback formula
|
| 497 |
-
# Old: audio * (10 ** (TARGET_LOUDNESS / 20.0) / rms) β wrong
|
| 498 |
-
# New: scale so RMS matches target linear amplitude
|
| 499 |
rms = np.sqrt(np.mean(audio**2))
|
| 500 |
if rms > 1e-9:
|
| 501 |
target_rms = 10 ** (TARGET_LOUDNESS / 20.0) # β 0.126
|
| 502 |
-
audio = audio * (target_rms / rms)
|
| 503 |
return np.clip(audio, -1.0, 1.0).astype(np.float32)
|
| 504 |
|
| 505 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 14 |
|
| 15 |
FIXES APPLIED:
|
| 16 |
- TARGET_SR set to 48000 to match DeepFilterNet natively (no double resampling)
|
| 17 |
+
- DeepFilterNet now installed via Dockerfile (deepfilternet pip package)
|
| 18 |
+
- Double-pass DeepFilterNet for Zoom audio (removes layered noise + echo)
|
| 19 |
- Mouth sound threshold raised 4.5→6.0 std (was removing real consonants p/b/t)
|
| 20 |
+
- noisereduce fallback prop_decrease raised back to 0.85 + n_std_thresh=1.5 (stronger fallback)
|
| 21 |
- Room tone fallback: uses first 100ms if audio too short
|
| 22 |
- Stutter detection fixed: now catches triple+ repeats (I I I was → I was)
|
| 23 |
- Filler removal: also returns cleaned transcript text
|
|
|
|
| 35 |
|
| 36 |
logger = logging.getLogger(__name__)
|
| 37 |
|
| 38 |
+
# FIX: Changed from 44100 → 48000 to match DeepFilterNet's native SR
|
| 39 |
+
# DeepFilterNet is now properly installed via Dockerfile (no more Rust compiler issue)
|
| 40 |
+
TARGET_SR = 48000
|
| 41 |
TARGET_LOUDNESS = -18.0
|
| 42 |
|
| 43 |
# Filler words (English + Telugu + Hindi)
|
|
|
|
| 195 |
# BACKGROUND NOISE REMOVAL
|
| 196 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 197 |
def _remove_background_noise(self, audio, sr):
|
| 198 |
+
# Try DeepFilterNet (SOTA) β now properly installed via Dockerfile
|
| 199 |
try:
|
| 200 |
result = self._deepfilter(audio, sr)
|
| 201 |
print("[Denoiser] β
DeepFilterNet noise removal done")
|
|
|
|
| 203 |
except Exception as e:
|
| 204 |
logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
|
| 205 |
|
| 206 |
+
# FIX: Raised prop_decrease back to 0.85 + added n_std_thresh for stronger fallback
|
| 207 |
try:
|
| 208 |
import noisereduce as nr
|
| 209 |
cleaned = nr.reduce_noise(
|
| 210 |
y=audio, sr=sr,
|
| 211 |
stationary=True,
|
| 212 |
+
prop_decrease=0.85,
|
| 213 |
+
n_std_thresh_stationary=1.5, # FIX: more aggressive noise floor
|
| 214 |
).astype(np.float32)
|
| 215 |
print("[Denoiser] β
noisereduce noise removal done")
|
| 216 |
return cleaned, "noisereduce"
|
|
|
|
| 219 |
return audio, "none"
|
| 220 |
|
| 221 |
def _deepfilter(self, audio, sr):
|
| 222 |
+
"""
|
| 223 |
+
FIX: Added double-pass enhancement for Zoom audio.
|
| 224 |
+
Zoom meetings have layered noise (background + echo + mic hiss).
|
| 225 |
+
One pass removes the main noise; second pass cleans the residual.
|
| 226 |
+
"""
|
| 227 |
if not self._df_loaded:
|
| 228 |
from df.enhance import enhance, init_df
|
| 229 |
self._df_model, self._df_state, _ = init_df()
|
| 230 |
self._df_loaded = True
|
| 231 |
from df.enhance import enhance
|
| 232 |
import torch
|
| 233 |
+
|
| 234 |
df_sr = self._df_state.sr()
|
| 235 |
+
# TARGET_SR now matches DeepFilterNet's native SR (48kHz) β no resampling needed
|
| 236 |
+
a = self._resample(audio, sr, df_sr) if sr != df_sr else audio
|
| 237 |
+
t = torch.from_numpy(a).unsqueeze(0)
|
| 238 |
+
|
| 239 |
+
# FIX: Pass 1 β remove main background noise
|
| 240 |
+
out1 = enhance(self._df_model, self._df_state, t)
|
| 241 |
+
|
| 242 |
+
# FIX: Pass 2 β clean residual noise (critical for Zoom/meeting audio)
|
| 243 |
+
out2 = enhance(self._df_model, self._df_state, out1)
|
| 244 |
+
|
| 245 |
+
res = out2.squeeze().numpy().astype(np.float32)
|
| 246 |
return self._resample(res, df_sr, sr) if df_sr != sr else res
|
| 247 |
|
| 248 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 508 |
print(f"[Denoiser] β
Normalized: {loudness:.1f} β {TARGET_LOUDNESS} LUFS")
|
| 509 |
except Exception:
|
| 510 |
# FIX: Corrected RMS fallback formula
|
|
|
|
|
|
|
| 511 |
rms = np.sqrt(np.mean(audio**2))
|
| 512 |
if rms > 1e-9:
|
| 513 |
target_rms = 10 ** (TARGET_LOUDNESS / 20.0) # β 0.126
|
| 514 |
+
audio = audio * (target_rms / rms)
|
| 515 |
return np.clip(audio, -1.0, 1.0).astype(np.float32)
|
| 516 |
|
| 517 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|