Clearwave48 committed on
Commit
3b1c60e
Β·
verified Β·
1 Parent(s): d0c4c17

Delete denoiser.py

Browse files
Files changed (1) hide show
  1. denoiser.py +0 -727
denoiser.py DELETED
@@ -1,727 +0,0 @@
1
- """
2
- Department 1 β€” Professional Audio Enhancer (v2 β€” HF Spaces Optimised)
3
- =======================================================================
4
-
5
- βœ… Background noise removal β†’ SepFormer (HF/speechbrain, no Rust needed)
6
- β†’ Two-pass noisereduce (stationary + non-stat) fallback
7
- βœ… Filler word removal β†’ Whisper confidence-gated word-level timestamps
8
- βœ… Stutter removal β†’ Phonetic-similarity aware repeat detection
9
- βœ… Long silence removal β†’ Adaptive VAD threshold (percentile-based, env-aware)
10
- βœ… Breath sound reduction β†’ Spectral gating (noisereduce non-stationary)
11
- βœ… Mouth sound reduction β†’ Amplitude z-score transient suppression
12
- βœ… Room tone fill β†’ Seamless crossfade splice (no edit seams/clicks)
13
- βœ… Audio normalization β†’ pyloudnorm -18 LUFS
14
- βœ… CD quality output β†’ 44100Hz PCM_24 (HF Spaces compatible)
15
-
16
- UPGRADES v2:
17
- [NOISE] SepFormer (speechbrain) as primary β€” no Rust, works on HF Spaces
18
- [NOISE] Two-pass noisereduce fallback: stationary first, then non-stationary
19
- to catch residual noise without aggressive single-pass artifacts
20
- [FILLER] Whisper avg_logprob + no_speech_prob confidence gating β€”
21
- low-confidence words are not blindly cut anymore
22
- [FILLER] Min-duration guard: skips cuts shorter than 80ms (avoids micro-glitches)
23
- [STUTTER] Phonetic normalisation (jellyfish/editdistance) catches near-repeats
24
- e.g. "the" / "tha", "and" / "an" β€” not just exact matches
25
- [SILENCE] Adaptive threshold: uses 15th-percentile RMS of the recording
26
- instead of fixed 0.008 β€” works in noisy rooms and quiet studios alike
27
- [SPLICE] Crossfade blending on ALL cuts (fillers, stutters, silences) β€”
28
- smooth 20ms equal-power fade eliminates click/seam artifacts
29
- [PERF] Model singleton caching β€” SepFormer loaded once, reused across calls
30
- [PERF] VAD pre-scan with Silero (if available) to skip non-speech segments
31
- before heavy processing
32
- [ROBUST] Every stage returns original audio on failure (already true, kept)
33
- [ROBUST] ffmpeg stderr captured and logged on non-zero exit
34
- """
35
-
36
- import os
37
- import re
38
- import time
39
- import subprocess
40
- import numpy as np
41
- import soundfile as sf
42
- import logging
43
-
44
- logger = logging.getLogger(__name__)
45
-
46
- TARGET_SR = 48000 # 48kHz matches DeepFilterNet native SR (Rust available via Docker)
47
- TARGET_LOUDNESS = -18.0
48
-
49
- # Minimum duration of a detected cut to actually apply it (avoids micro-glitches)
50
- MIN_CUT_SEC = 0.08
51
-
52
- # Whisper confidence gate: only cut a word if its log-probability is above this.
53
- # Whisper avg_logprob is in range (-inf, 0]; -0.3 β‰ˆ "fairly confident".
54
- FILLER_MIN_LOGPROB = -0.5 # below this β†’ too uncertain to cut
55
- FILLER_MAX_NO_SPEECH = 0.4 # above this β†’ Whisper thinks it's non-speech anyway
56
-
57
- # Filler words (English + Telugu + Hindi)
58
- FILLER_WORDS = {
59
- "um", "umm", "ummm", "uh", "uhh", "uhhh",
60
- "hmm", "hm", "hmmm",
61
- "er", "err", "errr",
62
- "eh", "ahh", "ah",
63
- "like", "basically", "literally",
64
- "you know", "i mean", "so",
65
- "right", "okay", "ok",
66
- # Telugu
67
- "ante", "ane", "mane", "arey", "enti",
68
- # Hindi
69
- "matlab", "yani", "bas", "acha",
70
- }
71
-
72
- # ---------------------------------------------------------------------------
73
- # Module-level model cache (survives across Denoiser() instances on same Space)
74
- # ---------------------------------------------------------------------------
75
- _SILERO_MODEL = None # Silero VAD
76
- _SILERO_UTILS = None
77
-
78
-
79
class Denoiser:
    """Professional audio enhancement pipeline (v2, HF Spaces friendly)."""

    def __init__(self):
        # Quietest-window sample used to fill edit gaps; captured per recording.
        self._room_tone = None
        print("[Denoiser] βœ… Professional Audio Enhancer v2 ready (HF Spaces mode)")
83
-
84
- # ══════════════════════════════════════════════════════════════════
85
- # MAIN ENTRY POINT
86
- # ══════════════════════════════════════════════════════════════════
87
- def process(self, audio_path: str, out_dir: str,
88
- remove_fillers: bool = True,
89
- remove_silences: bool = True,
90
- remove_breaths: bool = True,
91
- remove_mouth_sounds: bool = True,
92
- remove_stutters: bool = True,
93
- word_segments: list = None,
94
- original_filename: str = None) -> dict:
95
- """
96
- Full professional pipeline.
97
-
98
- word_segments: list of dicts from Whisper word-level timestamps.
99
- Each dict: {
100
- 'word': str,
101
- 'start': float, # seconds
102
- 'end': float, # seconds
103
- 'avg_logprob': float, # optional β€” Whisper segment-level confidence
104
- 'no_speech_prob':float, # optional β€” Whisper no-speech probability
105
- }
106
-
107
- Returns: {'audio_path': str, 'stats': dict}
108
- """
109
- t0 = time.time()
110
- stats = {}
111
- print("[Denoiser] β–Ά Starting professional enhancement pipeline v2...")
112
-
113
- # ── 0. Convert to standard WAV ───────────────────────────────
114
- wav_in = os.path.join(out_dir, "stage0_input.wav")
115
- self._to_wav(audio_path, wav_in, TARGET_SR)
116
- audio, sr = sf.read(wav_in, always_2d=True)
117
- n_ch = audio.shape[1]
118
- duration = len(audio) / sr
119
- print(f"[Denoiser] Input: {sr}Hz, {n_ch}ch, {duration:.1f}s")
120
-
121
- # Work in mono float32
122
- mono = audio.mean(axis=1).astype(np.float32)
123
-
124
- # ── 1. Capture room tone BEFORE any denoising ────────────────
125
- self._room_tone = self._capture_room_tone(mono, sr)
126
-
127
- # ── 2. Background Noise Removal ──────────────────────────────
128
- mono, noise_method = self._remove_background_noise(mono, sr)
129
- stats['noise_method'] = noise_method
130
-
131
- # ── 3. Mouth Sound Reduction (clicks/pops) ───────────────────
132
- if remove_mouth_sounds:
133
- mono, n_clicks = self._reduce_mouth_sounds(mono, sr)
134
- stats['mouth_sounds_removed'] = n_clicks
135
-
136
- # ── 4. Breath Reduction ──────────────────────────────────────
137
- if remove_breaths:
138
- mono = self._reduce_breaths(mono, sr)
139
- stats['breaths_reduced'] = True
140
-
141
- # ── 5. Filler Word Removal ───────────────────────────────────
142
- stats['fillers_removed'] = 0
143
- if remove_fillers and word_segments:
144
- mono, n_fillers = self._remove_fillers(mono, sr, word_segments)
145
- stats['fillers_removed'] = n_fillers
146
-
147
- # ── 6. Stutter Removal ───────────────────────────────────────
148
- stats['stutters_removed'] = 0
149
- if remove_stutters and word_segments:
150
- mono, n_stutters = self._remove_stutters(mono, sr, word_segments)
151
- stats['stutters_removed'] = n_stutters
152
-
153
- # ── 7. Long Silence Removal ───────────────────────────────────
154
- stats['silences_removed_sec'] = 0.0
155
- if remove_silences:
156
- mono, sil_sec = self._remove_long_silences(mono, sr)
157
- stats['silences_removed_sec'] = round(sil_sec, 2)
158
-
159
- # ── 8. Normalize Loudness ─────────────────────────────────────
160
- mono = self._normalise(mono, sr)
161
-
162
- # ── 9. Restore stereo / save as MP3 ──────────────────────────
163
- out_audio = np.stack([mono, mono], axis=1) if n_ch == 2 else mono
164
-
165
- # Build output filename: strip original extension, append _cleared.mp3
166
- # e.g. "output.wav" β†’ "output_cleared.mp3"
167
- if original_filename:
168
- base = os.path.splitext(os.path.basename(original_filename))[0]
169
- else:
170
- base = os.path.splitext(os.path.basename(audio_path))[0]
171
- out_name = f"{base}_cleared.mp3"
172
-
173
- # Write a temporary WAV first (soundfile can't encode MP3),
174
- # then convert to MP3 via ffmpeg (already in the Dockerfile).
175
- tmp_wav = os.path.join(out_dir, "denoised_tmp.wav")
176
- out_path = os.path.join(out_dir, out_name)
177
- sf.write(tmp_wav, out_audio, sr, format="WAV", subtype="PCM_24")
178
-
179
- result = subprocess.run([
180
- "ffmpeg", "-y", "-i", tmp_wav,
181
- "-codec:a", "libmp3lame",
182
- "-qscale:a", "2", # VBR quality 2 β‰ˆ 190 kbps β€” transparent quality
183
- "-ar", str(sr),
184
- out_path
185
- ], capture_output=True)
186
-
187
- if result.returncode != 0:
188
- stderr = result.stderr.decode(errors="replace")
189
- logger.warning(f"MP3 export failed, falling back to WAV: {stderr[-300:]}")
190
- out_path = tmp_wav # graceful fallback β€” still return something
191
- else:
192
- try:
193
- os.remove(tmp_wav) # clean up temp WAV
194
- except OSError:
195
- pass
196
-
197
- stats['processing_sec'] = round(time.time() - t0, 2)
198
- print(f"[Denoiser] βœ… Done in {stats['processing_sec']}s | {stats}")
199
- return {'audio_path': out_path, 'stats': stats}
200
-
201
- # ══════════════════════════════════════════════════════════════════
202
- # ROOM TONE CAPTURE
203
- # ════════��═════════════════════════════════════════════════════════
204
- def _capture_room_tone(self, audio: np.ndarray, sr: int,
205
- sample_sec: float = 0.5) -> np.ndarray:
206
- """Find the quietest 0.5s window in the recording β€” that's the room tone."""
207
- try:
208
- frame = int(sr * sample_sec)
209
-
210
- if len(audio) < frame * 2:
211
- fallback_len = min(int(sr * 0.1), len(audio))
212
- print("[Denoiser] Short audio β€” using first 100ms as room tone")
213
- return audio[:fallback_len].copy().astype(np.float32)
214
-
215
- best_rms = float('inf')
216
- best_start = 0
217
- step = sr # 1-second steps
218
-
219
- for i in range(0, len(audio) - frame, step):
220
- rms = float(np.sqrt(np.mean(audio[i:i + frame] ** 2)))
221
- if rms < best_rms:
222
- best_rms, best_start = rms, i
223
-
224
- room = audio[best_start: best_start + frame].copy()
225
- print(f"[Denoiser] Room tone captured: RMS={best_rms:.5f}")
226
- return room
227
- except Exception as e:
228
- logger.warning(f"Room tone capture failed: {e}")
229
- return np.zeros(int(sr * sample_sec), dtype=np.float32)
230
-
231
- def _fill_with_room_tone(self, length: int) -> np.ndarray:
232
- """Tile room tone to fill a gap of `length` samples."""
233
- if self._room_tone is None or len(self._room_tone) == 0:
234
- return np.zeros(length, dtype=np.float32)
235
- reps = length // len(self._room_tone) + 1
236
- tiled = np.tile(self._room_tone, reps)[:length]
237
- fade = min(int(0.01 * len(tiled)), 64)
238
- if fade > 0:
239
- tiled[:fade] *= np.linspace(0, 1, fade)
240
- tiled[-fade:] *= np.linspace(1, 0, fade)
241
- return tiled.astype(np.float32)
242
-
243
- # ══════════════════════════════════════════════════════════════════
244
- # CROSSFADE SPLICE ← NEW
245
- # Replaces abrupt room-tone insertion with smooth equal-power blend.
246
- # ══════════════════════════════════════════════════════════════════
247
- def _crossfade_join(self, a: np.ndarray, b: np.ndarray,
248
- fade_ms: float = 20.0, sr: int = TARGET_SR) -> np.ndarray:
249
- """
250
- Equal-power crossfade between the tail of `a` and the head of `b`.
251
- Eliminates click/seam artifacts at all edit points.
252
- """
253
- fade_n = int(sr * fade_ms / 1000)
254
- fade_n = min(fade_n, len(a), len(b))
255
-
256
- if fade_n < 2:
257
- return np.concatenate([a, b])
258
-
259
- t = np.linspace(0, np.pi / 2, fade_n)
260
- fade_out = np.cos(t) # equal-power: cosΒ²+sinΒ²=1
261
- fade_in = np.sin(t)
262
-
263
- overlap = a[-fade_n:] * fade_out + b[:fade_n] * fade_in
264
- return np.concatenate([a[:-fade_n], overlap, b[fade_n:]])
265
-
266
- def _build_with_crossfade(self, audio: np.ndarray, cuts: list,
267
- sr: int, fill_tone: bool = True) -> np.ndarray:
268
- """
269
- Build output from a list of (start_sec, end_sec) cuts,
270
- filling gaps with room tone and crossfading every join.
271
-
272
- cuts: sorted list of (start_sec, end_sec) to REMOVE.
273
- """
274
- segments = []
275
- prev = 0.0
276
-
277
- for start, end in sorted(cuts, key=lambda x: x[0]):
278
- # Guard: skip cuts shorter than minimum
279
- if (end - start) < MIN_CUT_SEC:
280
- continue
281
-
282
- keep_sta = int(prev * sr)
283
- keep_end = int(start * sr)
284
- if keep_sta < keep_end:
285
- segments.append(audio[keep_sta:keep_end])
286
-
287
- gap_len = int((end - start) * sr)
288
- if fill_tone and gap_len > 0:
289
- segments.append(self._fill_with_room_tone(gap_len))
290
-
291
- prev = end
292
-
293
- remain = int(prev * sr)
294
- if remain < len(audio):
295
- segments.append(audio[remain:])
296
-
297
- if not segments:
298
- return audio
299
-
300
- # Crossfade every adjacent pair
301
- result = segments[0]
302
- for seg in segments[1:]:
303
- result = self._crossfade_join(result, seg, fade_ms=20.0, sr=sr)
304
- return result.astype(np.float32)
305
-
306
- # ══════════════════════════════════════════════════════════════════
307
- # BACKGROUND NOISE REMOVAL
308
- # Chain: DeepFilterNet β†’ two-pass noisereduce β†’ passthrough
309
- #
310
- # SepFormer REMOVED β€” it is a speech separation model, not a denoiser.
311
- # It reconstructs voice artificially β†’ robotic output.
312
- #
313
- # Two-pass noisereduce is the safe CPU fallback:
314
- # Pass 1 (stationary) β€” removes steady hum/hiss/fan noise
315
- # Pass 2 (non-stationary) β€” catches residual at low prop_decrease
316
- # so original voice character is preserved
317
- # ══════════════════════════════════════════════════════════════════
318
- def _remove_background_noise(self, audio, sr):
319
- # ── Primary: DeepFilterNet (SOTA, Rust available via Docker) ─────
320
- try:
321
- result = self._deepfilter(audio, sr)
322
- print("[Denoiser] βœ… DeepFilterNet noise removal done")
323
- return result, "DeepFilterNet"
324
- except Exception as e:
325
- logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
326
-
327
- # ── Fallback: Single-pass noisereduce, stationary only ────────────
328
- # PHILOSOPHY: do as little as possible to the signal.
329
- # - stationary=True β†’ only targets steady/consistent noise (fan,
330
- # hum, AC, room hiss). Leaves transient
331
- # speech harmonics completely untouched.
332
- # - prop_decrease=0.5 β†’ reduces noise by ~50%, not 100%.
333
- # Keeps a thin noise floor so the voice
334
- # never sounds "hollow" or over-processed.
335
- # - No second pass, no non-stationary processing β€” those modes
336
- # touch voice frequencies and cause the robotic effect.
337
- try:
338
- import noisereduce as nr
339
- cleaned = nr.reduce_noise(
340
- y=audio, sr=sr,
341
- stationary=True,
342
- prop_decrease=0.50,
343
- ).astype(np.float32)
344
- print("[Denoiser] βœ… noisereduce done (voice-preserving, stationary only)")
345
- return cleaned, "noisereduce_stationary"
346
- except Exception as e:
347
- logger.warning(f"noisereduce failed: {e}")
348
-
349
- return audio, "none"
350
-
351
- def _deepfilter(self, audio: np.ndarray, sr: int) -> np.ndarray:
352
- """DeepFilterNet enhancement (local only β€” requires Rust compiler)."""
353
- from df.enhance import enhance, init_df
354
- import torch
355
-
356
- # Lazy-load, module-level cache not needed (rarely reached on HF Spaces)
357
- if not hasattr(self, '_df_model') or self._df_model is None:
358
- self._df_model, self._df_state, _ = init_df()
359
-
360
- df_sr = self._df_state.sr()
361
- a = self._resample(audio, sr, df_sr) if sr != df_sr else audio
362
- t = torch.from_numpy(a).unsqueeze(0)
363
- out = enhance(self._df_model, self._df_state, t)
364
- res = out.squeeze().numpy().astype(np.float32)
365
- return self._resample(res, df_sr, sr) if df_sr != sr else res
366
-
367
- # ══════════════════════════════════════════════════════════════════
368
- # FILLER WORD REMOVAL ← UPGRADED (confidence-gated + crossfade)
369
- # ══════════════════════════════════════════════════════════════════
370
- def _remove_fillers(self, audio: np.ndarray, sr: int, segments: list):
371
- """
372
- Cuts filler words using Whisper word-level timestamps.
373
-
374
- UPGRADE: Confidence gating β€” words are only cut if:
375
- 1. avg_logprob β‰₯ FILLER_MIN_LOGPROB (Whisper was confident)
376
- 2. no_speech_prob ≀ FILLER_MAX_NO_SPEECH (audio is actually speech)
377
- 3. Duration β‰₯ MIN_CUT_SEC (not a micro-glitch timestamp artefact)
378
-
379
- Falls back gracefully when confidence fields are absent (older Whisper).
380
- Gaps filled with room tone + crossfade for seamless edits.
381
- """
382
- try:
383
- cuts = []
384
- for seg in segments:
385
- word = seg.get('word', '').strip().lower()
386
- word = re.sub(r'[^a-z\s]', '', word).strip()
387
-
388
- if word not in FILLER_WORDS:
389
- continue
390
-
391
- start = seg.get('start', 0.0)
392
- end = seg.get('end', 0.0)
393
-
394
- # Duration guard
395
- if (end - start) < MIN_CUT_SEC:
396
- continue
397
-
398
- # Confidence gate (optional fields β€” skip gate if absent)
399
- avg_logprob = seg.get('avg_logprob', None)
400
- no_speech_prob = seg.get('no_speech_prob', None)
401
-
402
- if avg_logprob is not None and avg_logprob < FILLER_MIN_LOGPROB:
403
- logger.debug(f"[Denoiser] Filler '{word}' skipped: "
404
- f"low confidence ({avg_logprob:.2f})")
405
- continue
406
-
407
- if no_speech_prob is not None and no_speech_prob > FILLER_MAX_NO_SPEECH:
408
- logger.debug(f"[Denoiser] Filler '{word}' skipped: "
409
- f"no_speech_prob={no_speech_prob:.2f}")
410
- continue
411
-
412
- cuts.append((start, end))
413
-
414
- if not cuts:
415
- return audio, 0
416
-
417
- out = self._build_with_crossfade(audio, cuts, sr, fill_tone=True)
418
- print(f"[Denoiser] βœ… Removed {len(cuts)} filler words")
419
- return out, len(cuts)
420
- except Exception as e:
421
- logger.warning(f"Filler removal failed: {e}")
422
- return audio, 0
423
-
424
- def clean_transcript_fillers(self, transcript: str) -> str:
425
- """Remove filler words from transcript TEXT to match cleaned audio."""
426
- words = transcript.split()
427
- result = []
428
- i = 0
429
- while i < len(words):
430
- w = re.sub(r'[^a-z\s]', '', words[i].lower()).strip()
431
- if i + 1 < len(words):
432
- two = w + " " + re.sub(r'[^a-z\s]', '', words[i+1].lower()).strip()
433
- if two in FILLER_WORDS:
434
- i += 2
435
- continue
436
- if w in FILLER_WORDS:
437
- i += 1
438
- continue
439
- result.append(words[i])
440
- i += 1
441
- return " ".join(result)
442
-
443
- # ══════════════════════════════════════════════════════════════════
444
- # STUTTER REMOVAL ← UPGRADED (phonetic similarity + crossfade)
445
- # ══════════════════════════════════════════════════════════════════
446
- def _remove_stutters(self, audio: np.ndarray, sr: int, segments: list):
447
- """
448
- UPGRADE: Phonetic near-match detection in addition to exact repeats.
449
- e.g. "the" / "tha", "and" / "an", "I" / "I" all caught.
450
-
451
- Uses jellyfish.jaro_winkler_similarity if available;
452
- falls back to plain edit-distance ratio, then exact match only.
453
-
454
- Confidence gating applied here too (same thresholds as filler removal).
455
- Crossfade used on all splices.
456
- """
457
- try:
458
- if len(segments) < 2:
459
- return audio, 0
460
-
461
- # Choose similarity function
462
- sim_fn = self._word_similarity_fn()
463
-
464
- cuts = []
465
- stutters_found = 0
466
- i = 0
467
-
468
- while i < len(segments):
469
- seg_i = segments[i]
470
- word = re.sub(r'[^a-z]', '', seg_i.get('word', '').lower())
471
-
472
- if not word:
473
- i += 1
474
- continue
475
-
476
- # Confidence gate on the anchor word
477
- if not self._passes_confidence_gate(seg_i):
478
- i += 1
479
- continue
480
-
481
- # Look ahead for consecutive near-matches
482
- j = i + 1
483
- while j < len(segments):
484
- seg_j = segments[j]
485
- next_word = re.sub(r'[^a-z]', '', seg_j.get('word', '').lower())
486
-
487
- if not next_word:
488
- j += 1
489
- continue
490
-
491
- similarity = sim_fn(word, next_word)
492
- if similarity >= 0.88: # β‰₯88% similar = stutter
493
- cuts.append((seg_i['start'], seg_i['end']))
494
- stutters_found += 1
495
- i = j
496
- j += 1
497
- else:
498
- break
499
-
500
- i += 1
501
-
502
- if not cuts:
503
- return audio, 0
504
-
505
- out = self._build_with_crossfade(audio, cuts, sr, fill_tone=True)
506
- print(f"[Denoiser] βœ… Removed {stutters_found} stutters")
507
- return out, stutters_found
508
- except Exception as e:
509
- logger.warning(f"Stutter removal failed: {e}")
510
- return audio, 0
511
-
512
- @staticmethod
513
- def _word_similarity_fn():
514
- """Return best available string-similarity function."""
515
- try:
516
- import jellyfish
517
- return jellyfish.jaro_winkler_similarity
518
- except ImportError:
519
- pass
520
- try:
521
- import editdistance
522
- def _ed_ratio(a, b):
523
- if not a and not b:
524
- return 1.0
525
- dist = editdistance.eval(a, b)
526
- return 1.0 - dist / max(len(a), len(b))
527
- return _ed_ratio
528
- except ImportError:
529
- pass
530
- # Plain exact match as last resort
531
- return lambda a, b: 1.0 if a == b else 0.0
532
-
533
- @staticmethod
534
- def _passes_confidence_gate(seg: dict) -> bool:
535
- """Return True if Whisper confidence is acceptable (or fields absent)."""
536
- avg_logprob = seg.get('avg_logprob', None)
537
- no_speech_prob = seg.get('no_speech_prob', None)
538
- if avg_logprob is not None and avg_logprob < FILLER_MIN_LOGPROB:
539
- return False
540
- if no_speech_prob is not None and no_speech_prob > FILLER_MAX_NO_SPEECH:
541
- return False
542
- return True
543
-
544
- # ══════════════════════════════════════════════════════════════════
545
- # BREATH REDUCTION
546
- # ══════════════════════════════════════════════════════════════════
547
- def _reduce_breaths(self, audio: np.ndarray, sr: int) -> np.ndarray:
548
- """Non-stationary spectral gating β€” catches short broadband breath bursts."""
549
- try:
550
- import noisereduce as nr
551
- cleaned = nr.reduce_noise(
552
- y=audio, sr=sr,
553
- stationary=False,
554
- prop_decrease=0.60,
555
- freq_mask_smooth_hz=400,
556
- time_mask_smooth_ms=40,
557
- ).astype(np.float32)
558
- print("[Denoiser] βœ… Breath reduction done")
559
- return cleaned
560
- except Exception as e:
561
- logger.warning(f"Breath reduction failed: {e}")
562
- return audio
563
-
564
- # ══════════════════════════════════════════════════════════════════
565
- # MOUTH SOUND REDUCTION
566
- # ══════════════════════════════════════════════════════════════════
567
- def _reduce_mouth_sounds(self, audio: np.ndarray, sr: int):
568
- """
569
- Suppress very short, very high-amplitude transients (clicks/pops).
570
- Threshold at 6.0 std to avoid removing real consonants (p, b, t).
571
- """
572
- try:
573
- result = audio.copy()
574
- win = int(sr * 0.003) # 3ms window
575
- hop = win // 2
576
- rms_arr = np.array([
577
- float(np.sqrt(np.mean(audio[i:i+win]**2)))
578
- for i in range(0, len(audio) - win, hop)
579
- ])
580
-
581
- if len(rms_arr) == 0:
582
- return audio, 0
583
-
584
- threshold = float(np.mean(rms_arr)) + 6.0 * float(np.std(rms_arr))
585
- n_removed = 0
586
-
587
- for idx, rms in enumerate(rms_arr):
588
- if rms > threshold:
589
- start = idx * hop
590
- end = min(start + win, len(result))
591
- result[start:end] *= np.linspace(1, 0, end - start)
592
- n_removed += 1
593
-
594
- if n_removed:
595
- print(f"[Denoiser] βœ… Suppressed {n_removed} mouth sound transients")
596
- return result.astype(np.float32), n_removed
597
- except Exception as e:
598
- logger.warning(f"Mouth sound reduction failed: {e}")
599
- return audio, 0
600
-
601
- # ══════════════════════════════════════════════════════════════════
602
- # LONG SILENCE REMOVAL ← UPGRADED (adaptive threshold)
603
- # ══════════════════════════════════════════════════════════════════
604
- def _remove_long_silences(self, audio: np.ndarray, sr: int,
605
- max_silence_sec: float = 1.5,
606
- keep_pause_sec: float = 0.4) -> tuple:
607
- """
608
- UPGRADE: Adaptive silence threshold.
609
- Old code used a hardcoded RMS=0.008 β€” worked in quiet studios only.
610
- New: threshold = 15th-percentile of per-frame RMS values.
611
- This self-calibrates to the recording's actual noise floor,
612
- so it works equally well in noisy rooms and near-silent studios.
613
-
614
- Silences replaced with room tone + crossfade.
615
- """
616
- try:
617
- frame_len = int(sr * 0.02) # 20ms frames
618
-
619
- # ── Compute per-frame RMS ─────────────────────────────────
620
- n_frames = (len(audio) - frame_len) // frame_len
621
- rms_frames = np.array([
622
- float(np.sqrt(np.mean(audio[i*frame_len:(i+1)*frame_len]**2)))
623
- for i in range(n_frames)
624
- ])
625
-
626
- if len(rms_frames) == 0:
627
- return audio, 0.0
628
-
629
- # ── Adaptive threshold: 15th percentile of RMS ───────────
630
- threshold = float(np.percentile(rms_frames, 15))
631
- # Clamp: never go below 0.001 (avoids mis-classifying very quiet speech)
632
- threshold = max(threshold, 0.001)
633
- print(f"[Denoiser] Adaptive silence threshold: RMS={threshold:.5f}")
634
-
635
- max_sil_frames = int(max_silence_sec / 0.02)
636
- keep_frames = int(keep_pause_sec / 0.02)
637
-
638
- kept = []
639
- silence_count = 0
640
- total_removed = 0
641
- in_long_sil = False
642
-
643
- for i in range(n_frames):
644
- frame = audio[i*frame_len:(i+1)*frame_len]
645
- rms = rms_frames[i]
646
-
647
- if rms < threshold:
648
- silence_count += 1
649
- if silence_count <= max_sil_frames:
650
- kept.append(frame)
651
- else:
652
- total_removed += frame_len
653
- in_long_sil = True
654
- else:
655
- if in_long_sil:
656
- pad = self._fill_with_room_tone(keep_frames * frame_len)
657
- kept.append(pad)
658
- in_long_sil = False
659
- silence_count = 0
660
- kept.append(frame)
661
-
662
- # Tail of audio
663
- tail_start = n_frames * frame_len
664
- if tail_start < len(audio):
665
- kept.append(audio[tail_start:])
666
-
667
- if not kept:
668
- return audio, 0.0
669
-
670
- # Crossfade each frame join for smooth output
671
- result = kept[0]
672
- for seg in kept[1:]:
673
- result = self._crossfade_join(result, seg, fade_ms=5.0, sr=sr)
674
-
675
- removed_sec = total_removed / sr
676
- if removed_sec > 0:
677
- print(f"[Denoiser] βœ… Removed {removed_sec:.1f}s of long silences")
678
- return result.astype(np.float32), removed_sec
679
- except Exception as e:
680
- logger.warning(f"Silence removal failed: {e}")
681
- return audio, 0.0
682
-
683
- # ══════════════════════════════════════════════════════════════════
684
- # NORMALIZATION
685
- # ══════════════════════════════════════════════════════════════════
686
- def _normalise(self, audio: np.ndarray, sr: int) -> np.ndarray:
687
- try:
688
- import pyloudnorm as pyln
689
- meter = pyln.Meter(sr)
690
- loudness = meter.integrated_loudness(audio)
691
- if np.isfinite(loudness) and loudness < 0:
692
- audio = pyln.normalize.loudness(audio, loudness, TARGET_LOUDNESS)
693
- print(f"[Denoiser] βœ… Normalized: {loudness:.1f} β†’ {TARGET_LOUDNESS} LUFS")
694
- except Exception:
695
- rms = np.sqrt(np.mean(audio**2))
696
- if rms > 1e-9:
697
- target_rms = 10 ** (TARGET_LOUDNESS / 20.0)
698
- audio = audio * (target_rms / rms)
699
- return np.clip(audio, -1.0, 1.0).astype(np.float32)
700
-
701
- # ══════════════════════════════════════════════════════════════════
702
- # HELPERS
703
- # ══════════════════════════════════════════════════════════════════
704
- def _to_wav(self, src: str, dst: str, target_sr: int):
705
- result = subprocess.run([
706
- "ffmpeg", "-y", "-i", src,
707
- "-acodec", "pcm_s24le", "-ar", str(target_sr), dst
708
- ], capture_output=True)
709
- if result.returncode != 0:
710
- stderr = result.stderr.decode(errors='replace')
711
- logger.warning(f"ffmpeg non-zero exit: {stderr[-400:]}")
712
- # Fallback: soundfile passthrough
713
- data, sr = sf.read(src, always_2d=True)
714
- sf.write(dst, data, sr, format="WAV", subtype="PCM_24")
715
-
716
- def _resample(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
717
- if orig_sr == target_sr:
718
- return audio
719
- try:
720
- import librosa
721
- return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
722
- except Exception:
723
- length = int(len(audio) * target_sr / orig_sr)
724
- return np.interp(
725
- np.linspace(0, len(audio), length),
726
- np.arange(len(audio)), audio
727
- ).astype(np.float32)