Clearwave48 committed on
Commit
3b1c60e
Β·
verified Β·
1 Parent(s): d0c4c17

Delete denoiser.py

Browse files
Files changed (1) hide show
  1. denoiser.py +0 -727
denoiser.py DELETED
@@ -1,727 +0,0 @@
1
- """
2
- Department 1 β€” Professional Audio Enhancer (v2 β€” HF Spaces Optimised)
3
- =======================================================================
4
-
5
- βœ… Background noise removal β†’ SepFormer (HF/speechbrain, no Rust needed)
6
- β†’ Two-pass noisereduce (stationary + non-stat) fallback
7
- βœ… Filler word removal β†’ Whisper confidence-gated word-level timestamps
8
- βœ… Stutter removal β†’ Phonetic-similarity aware repeat detection
9
- βœ… Long silence removal β†’ Adaptive VAD threshold (percentile-based, env-aware)
10
- βœ… Breath sound reduction β†’ Spectral gating (noisereduce non-stationary)
11
- βœ… Mouth sound reduction β†’ Amplitude z-score transient suppression
12
- βœ… Room tone fill β†’ Seamless crossfade splice (no edit seams/clicks)
13
- βœ… Audio normalization β†’ pyloudnorm -18 LUFS
14
- βœ… CD quality output β†’ 44100Hz PCM_24 (HF Spaces compatible)
15
-
16
- UPGRADES v2:
17
- [NOISE] SepFormer (speechbrain) as primary β€” no Rust, works on HF Spaces
18
- [NOISE] Two-pass noisereduce fallback: stationary first, then non-stationary
19
- to catch residual noise without aggressive single-pass artifacts
20
- [FILLER] Whisper avg_logprob + no_speech_prob confidence gating β€”
21
- low-confidence words are not blindly cut anymore
22
- [FILLER] Min-duration guard: skips cuts shorter than 80ms (avoids micro-glitches)
23
- [STUTTER] Phonetic normalisation (jellyfish/editdistance) catches near-repeats
24
- e.g. "the" / "tha", "and" / "an" β€” not just exact matches
25
- [SILENCE] Adaptive threshold: uses 15th-percentile RMS of the recording
26
- instead of fixed 0.008 β€” works in noisy rooms and quiet studios alike
27
- [SPLICE] Crossfade blending on ALL cuts (fillers, stutters, silences) β€”
28
- smooth 20ms equal-power fade eliminates click/seam artifacts
29
- [PERF] Model singleton caching β€” SepFormer loaded once, reused across calls
30
- [PERF] VAD pre-scan with Silero (if available) to skip non-speech segments
31
- before heavy processing
32
- [ROBUST] Every stage returns original audio on failure (already true, kept)
33
- [ROBUST] ffmpeg stderr captured and logged on non-zero exit
34
- """
35
-
36
- import os
37
- import re
38
- import time
39
- import subprocess
40
- import numpy as np
41
- import soundfile as sf
42
- import logging
43
-
44
- logger = logging.getLogger(__name__)
45
-
46
- TARGET_SR = 48000 # 48kHz matches DeepFilterNet native SR (Rust available via Docker)
47
- TARGET_LOUDNESS = -18.0
48
-
49
- # Minimum duration of a detected cut to actually apply it (avoids micro-glitches)
50
- MIN_CUT_SEC = 0.08
51
-
52
- # Whisper confidence gate: only cut a word if its log-probability is above this.
53
- # Whisper avg_logprob is in range (-inf, 0]; -0.3 β‰ˆ "fairly confident".
54
- FILLER_MIN_LOGPROB = -0.5 # below this β†’ too uncertain to cut
55
- FILLER_MAX_NO_SPEECH = 0.4 # above this β†’ Whisper thinks it's non-speech anyway
56
-
57
- # Filler words (English + Telugu + Hindi)
58
- FILLER_WORDS = {
59
- "um", "umm", "ummm", "uh", "uhh", "uhhh",
60
- "hmm", "hm", "hmmm",
61
- "er", "err", "errr",
62
- "eh", "ahh", "ah",
63
- "like", "basically", "literally",
64
- "you know", "i mean", "so",
65
- "right", "okay", "ok",
66
- # Telugu
67
- "ante", "ane", "mane", "arey", "enti",
68
- # Hindi
69
- "matlab", "yani", "bas", "acha",
70
- }
71
-
72
- # ---------------------------------------------------------------------------
73
- # Module-level model cache (survives across Denoiser() instances on same Space)
74
- # ---------------------------------------------------------------------------
75
- _SILERO_MODEL = None # Silero VAD
76
- _SILERO_UTILS = None
77
-
78
-
79
class Denoiser:
    """Professional audio enhancement pipeline (v2, HF Spaces friendly)."""

    def __init__(self):
        # Quietest-window sample used to fill edit gaps; captured per recording.
        self._room_tone = None
        print("[Denoiser] βœ… Professional Audio Enhancer v2 ready (HF Spaces mode)")
83
-
84
- # ══════════════════════════════════════════════════════════════════
85
- # MAIN ENTRY POINT
86
- # ══════════════════════════════════════════════════════════════════
87
- def process(self, audio_path: str, out_dir: str,
88
- remove_fillers: bool = True,
89
- remove_silences: bool = True,
90
- remove_breaths: bool = True,
91
- remove_mouth_sounds: bool = True,
92
- remove_stutters: bool = True,
93
- word_segments: list = None,
94
- original_filename: str = None) -> dict:
95
- """
96
- Full professional pipeline.
97
-
98
- word_segments: list of dicts from Whisper word-level timestamps.
99
- Each dict: {
100
- 'word': str,
101
- 'start': float, # seconds
102
- 'end': float, # seconds
103
- 'avg_logprob': float, # optional β€” Whisper segment-level confidence
104
- 'no_speech_prob':float, # optional β€” Whisper no-speech probability
105
- }
106
-
107
- Returns: {'audio_path': str, 'stats': dict}
108
- """
109
- t0 = time.time()
110
- stats = {}
111
- print("[Denoiser] β–Ά Starting professional enhancement pipeline v2...")
112
-
113
- # ── 0. Convert to standard WAV ───────────────────────────────
114
- wav_in = os.path.join(out_dir, "stage0_input.wav")
115
- self._to_wav(audio_path, wav_in, TARGET_SR)
116
- audio, sr = sf.read(wav_in, always_2d=True)
117
- n_ch = audio.shape[1]
118
- duration = len(audio) / sr
119
- print(f"[Denoiser] Input: {sr}Hz, {n_ch}ch, {duration:.1f}s")
120
-
121
- # Work in mono float32
122
- mono = audio.mean(axis=1).astype(np.float32)
123
-
124
- # ── 1. Capture room tone BEFORE any denoising ────────────────
125
- self._room_tone = self._capture_room_tone(mono, sr)
126
-
127
- # ── 2. Background Noise Removal ──────────────────────────────
128
- mono, noise_method = self._remove_background_noise(mono, sr)
129
- stats['noise_method'] = noise_method
130
-
131
- # ── 3. Mouth Sound Reduction (clicks/pops) ───────────────────
132
- if remove_mouth_sounds:
133
- mono, n_clicks = self._reduce_mouth_sounds(mono, sr)
134
- stats['mouth_sounds_removed'] = n_clicks
135
-
136
- # ── 4. Breath Reduction ──────────────────────────────────────
137
- if remove_breaths:
138
- mono = self._reduce_breaths(mono, sr)
139
- stats['breaths_reduced'] = True
140
-
141
- # ── 5. Filler Word Removal ───────────────────────────────────
142
- stats['fillers_removed'] = 0
143
- if remove_fillers and word_segments:
144
- mono, n_fillers = self._remove_fillers(mono, sr, word_segments)
145
- stats['fillers_removed'] = n_fillers
146
-
147
- # ── 6. Stutter Removal ───────────────────────────────────────
148
- stats['stutters_removed'] = 0
149
- if remove_stutters and word_segments:
150
- mono, n_stutters = self._remove_stutters(mono, sr, word_segments)
151
- stats['stutters_removed'] = n_stutters
152
-
153
- # ── 7. Long Silence Removal ───────────────────────────────────
154
- stats['silences_removed_sec'] = 0.0
155
- if remove_silences:
156
- mono, sil_sec = self._remove_long_silences(mono, sr)
157
- stats['silences_removed_sec'] = round(sil_sec, 2)
158
-
159
- # ── 8. Normalize Loudness ─────────────────────────────────────
160
- mono = self._normalise(mono, sr)
161
-
162
- # ── 9. Restore stereo / save as MP3 ──────────────────────────
163
- out_audio = np.stack([mono, mono], axis=1) if n_ch == 2 else mono
164
-
165
- # Build output filename: strip original extension, append _cleared.mp3
166
- # e.g. "output.wav" β†’ "output_cleared.mp3"
167
- if original_filename:
168
- base = os.path.splitext(os.path.basename(original_filename))[0]
169
- else:
170
- base = os.path.splitext(os.path.basename(audio_path))[0]
171
- out_name = f"{base}_cleared.mp3"
172
-
173
- # Write a temporary WAV first (soundfile can't encode MP3),
174
- # then convert to MP3 via ffmpeg (already in the Dockerfile).
175
- tmp_wav = os.path.join(out_dir, "denoised_tmp.wav")
176
- out_path = os.path.join(out_dir, out_name)
177
- sf.write(tmp_wav, out_audio, sr, format="WAV", subtype="PCM_24")
178
-
179
- result = subprocess.run([
180
- "ffmpeg", "-y", "-i", tmp_wav,
181
- "-codec:a", "libmp3lame",
182
- "-qscale:a", "2", # VBR quality 2 β‰ˆ 190 kbps β€” transparent quality
183
- "-ar", str(sr),
184
- out_path
185
- ], capture_output=True)
186
-
187
- if result.returncode != 0:
188
- stderr = result.stderr.decode(errors="replace")
189
- logger.warning(f"MP3 export failed, falling back to WAV: {stderr[-300:]}")
190
- out_path = tmp_wav # graceful fallback β€” still return something
191
- else:
192
- try:
193
- os.remove(tmp_wav) # clean up temp WAV
194
- except OSError:
195
- pass
196
-
197
- stats['processing_sec'] = round(time.time() - t0, 2)
198
- print(f"[Denoiser] βœ… Done in {stats['processing_sec']}s | {stats}")
199
- return {'audio_path': out_path, 'stats': stats}
200
-
201
- # ══════════════════════════════════════════════════════════════════
202
- # ROOM TONE CAPTURE
203
- # ════════��═════════════════════════════════════════════════════════
204
- def _capture_room_tone(self, audio: np.ndarray, sr: int,
205
- sample_sec: float = 0.5) -> np.ndarray:
206
- """Find the quietest 0.5s window in the recording β€” that's the room tone."""
207
- try:
208
- frame = int(sr * sample_sec)
209
-
210
- if len(audio) < frame * 2:
211
- fallback_len = min(int(sr * 0.1), len(audio))
212
- print("[Denoiser] Short audio β€” using first 100ms as room tone")
213
- return audio[:fallback_len].copy().astype(np.float32)
214
-
215
- best_rms = float('inf')
216
- best_start = 0
217
- step = sr # 1-second steps
218
-
219
- for i in range(0, len(audio) - frame, step):
220
- rms = float(np.sqrt(np.mean(audio[i:i + frame] ** 2)))
221
- if rms < best_rms:
222
- best_rms, best_start = rms, i
223
-
224
- room = audio[best_start: best_start + frame].copy()
225
- print(f"[Denoiser] Room tone captured: RMS={best_rms:.5f}")
226
- return room
227
- except Exception as e:
228
- logger.warning(f"Room tone capture failed: {e}")
229
- return np.zeros(int(sr * sample_sec), dtype=np.float32)
230
-
231
- def _fill_with_room_tone(self, length: int) -> np.ndarray:
232
- """Tile room tone to fill a gap of `length` samples."""
233
- if self._room_tone is None or len(self._room_tone) == 0:
234
- return np.zeros(length, dtype=np.float32)
235
- reps = length // len(self._room_tone) + 1
236
- tiled = np.tile(self._room_tone, reps)[:length]
237
- fade = min(int(0.01 * len(tiled)), 64)
238
- if fade > 0:
239
- tiled[:fade] *= np.linspace(0, 1, fade)
240
- tiled[-fade:] *= np.linspace(1, 0, fade)
241
- return tiled.astype(np.float32)
242
-
243
- # ══════════════════════════════════════════════════════════════════
244
- # CROSSFADE SPLICE ← NEW
245
- # Replaces abrupt room-tone insertion with smooth equal-power blend.
246
- # ══════════════════════════════════════════════════════════════════
247
- def _crossfade_join(self, a: np.ndarray, b: np.ndarray,
248
- fade_ms: float = 20.0, sr: int = TARGET_SR) -> np.ndarray:
249
- """
250
- Equal-power crossfade between the tail of `a` and the head of `b`.
251
- Eliminates click/seam artifacts at all edit points.
252
- """
253
- fade_n = int(sr * fade_ms / 1000)
254
- fade_n = min(fade_n, len(a), len(b))
255
-
256
- if fade_n < 2:
257
- return np.concatenate([a, b])
258
-
259
- t = np.linspace(0, np.pi / 2, fade_n)
260
- fade_out = np.cos(t) # equal-power: cosΒ²+sinΒ²=1
261
- fade_in = np.sin(t)
262
-
263
- overlap = a[-fade_n:] * fade_out + b[:fade_n] * fade_in
264
- return np.concatenate([a[:-fade_n], overlap, b[fade_n:]])
265
-
266
- def _build_with_crossfade(self, audio: np.ndarray, cuts: list,
267
- sr: int, fill_tone: bool = True) -> np.ndarray:
268
- """
269
- Build output from a list of (start_sec, end_sec) cuts,
270
- filling gaps with room tone and crossfading every join.
271
-
272
- cuts: sorted list of (start_sec, end_sec) to REMOVE.
273
- """
274
- segments = []
275
- prev = 0.0
276
-
277
- for start, end in sorted(cuts, key=lambda x: x[0]):
278
- # Guard: skip cuts shorter than minimum
279
- if (end - start) < MIN_CUT_SEC:
280
- continue
281
-
282
- keep_sta = int(prev * sr)
283
- keep_end = int(start * sr)
284
- if keep_sta < keep_end:
285
- segments.append(audio[keep_sta:keep_end])
286
-
287
- gap_len = int((end - start) * sr)
288
- if fill_tone and gap_len > 0:
289
- segments.append(self._fill_with_room_tone(gap_len))
290
-
291
- prev = end
292
-
293
- remain = int(prev * sr)
294
- if remain < len(audio):
295
- segments.append(audio[remain:])
296
-
297
- if not segments:
298
- return audio
299
-
300
- # Crossfade every adjacent pair
301
- result = segments[0]
302
- for seg in segments[1:]:
303
- result = self._crossfade_join(result, seg, fade_ms=20.0, sr=sr)
304
- return result.astype(np.float32)
305
-
306
- # ══════════════════════════════════════════════════════════════════
307
- # BACKGROUND NOISE REMOVAL
308
- # Chain: DeepFilterNet β†’ two-pass noisereduce β†’ passthrough
309
- #
310
- # SepFormer REMOVED β€” it is a speech separation model, not a denoiser.
311
- # It reconstructs voice artificially β†’ robotic output.
312
- #
313
- # Two-pass noisereduce is the safe CPU fallback:
314
- # Pass 1 (stationary) β€” removes steady hum/hiss/fan noise
315
- # Pass 2 (non-stationary) β€” catches residual at low prop_decrease
316
- # so original voice character is preserved
317
- # ══════════════════════════════════════════════════════════════════
318
- def _remove_background_noise(self, audio, sr):
319
- # ── Primary: DeepFilterNet (SOTA, Rust available via Docker) ─────
320
- try:
321
- result = self._deepfilter(audio, sr)
322
- print("[Denoiser] βœ… DeepFilterNet noise removal done")
323
- return result, "DeepFilterNet"
324
- except Exception as e:
325
- logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
326
-
327
- # ── Fallback: Single-pass noisereduce, stationary only ────────────
328
- # PHILOSOPHY: do as little as possible to the signal.
329
- # - stationary=True β†’ only targets steady/consistent noise (fan,
330
- # hum, AC, room hiss). Leaves transient
331
- # speech harmonics completely untouched.
332
- # - prop_decrease=0.5 β†’ reduces noise by ~50%, not 100%.
333
- # Keeps a thin noise floor so the voice
334
- # never sounds "hollow" or over-processed.
335
- # - No second pass, no non-stationary processing β€” those modes
336
- # touch voice frequencies and cause the robotic effect.
337
- try:
338
- import noisereduce as nr
339
- cleaned = nr.reduce_noise(
340
- y=audio, sr=sr,
341
- stationary=True,
342
- prop_decrease=0.50,
343
- ).astype(np.float32)
344
- print("[Denoiser] βœ… noisereduce done (voice-preserving, stationary only)")
345
- return cleaned, "noisereduce_stationary"
346
- except Exception as e:
347
- logger.warning(f"noisereduce failed: {e}")
348
-
349
- return audio, "none"
350
-
351
- def _deepfilter(self, audio: np.ndarray, sr: int) -> np.ndarray:
352
- """DeepFilterNet enhancement (local only β€” requires Rust compiler)."""
353
- from df.enhance import enhance, init_df
354
- import torch
355
-
356
- # Lazy-load, module-level cache not needed (rarely reached on HF Spaces)
357
- if not hasattr(self, '_df_model') or self._df_model is None:
358
- self._df_model, self._df_state, _ = init_df()
359
-
360
- df_sr = self._df_state.sr()
361
- a = self._resample(audio, sr, df_sr) if sr != df_sr else audio
362
- t = torch.from_numpy(a).unsqueeze(0)
363
- out = enhance(self._df_model, self._df_state, t)
364
- res = out.squeeze().numpy().astype(np.float32)
365
- return self._resample(res, df_sr, sr) if df_sr != sr else res
366
-
367
- # ══════════════════════════════════════════════════════════════════
368
- # FILLER WORD REMOVAL ← UPGRADED (confidence-gated + crossfade)
369
- # ══════════════════════════════════════════════════════════════════
370
- def _remove_fillers(self, audio: np.ndarray, sr: int, segments: list):
371
- """
372
- Cuts filler words using Whisper word-level timestamps.
373
-
374
- UPGRADE: Confidence gating β€” words are only cut if:
375
- 1. avg_logprob β‰₯ FILLER_MIN_LOGPROB (Whisper was confident)
376
- 2. no_speech_prob ≀ FILLER_MAX_NO_SPEECH (audio is actually speech)
377
- 3. Duration β‰₯ MIN_CUT_SEC (not a micro-glitch timestamp artefact)
378
-
379
- Falls back gracefully when confidence fields are absent (older Whisper).
380
- Gaps filled with room tone + crossfade for seamless edits.
381
- """
382
- try:
383
- cuts = []
384
- for seg in segments:
385
- word = seg.get('word', '').strip().lower()
386
- word = re.sub(r'[^a-z\s]', '', word).strip()
387
-
388
- if word not in FILLER_WORDS:
389
- continue
390
-
391
- start = seg.get('start', 0.0)
392
- end = seg.get('end', 0.0)
393
-
394
- # Duration guard
395
- if (end - start) < MIN_CUT_SEC:
396
- continue
397
-
398
- # Confidence gate (optional fields β€” skip gate if absent)
399
- avg_logprob = seg.get('avg_logprob', None)
400
- no_speech_prob = seg.get('no_speech_prob', None)
401
-
402
- if avg_logprob is not None and avg_logprob < FILLER_MIN_LOGPROB:
403
- logger.debug(f"[Denoiser] Filler '{word}' skipped: "
404
- f"low confidence ({avg_logprob:.2f})")
405
- continue
406
-
407
- if no_speech_prob is not None and no_speech_prob > FILLER_MAX_NO_SPEECH:
408
- logger.debug(f"[Denoiser] Filler '{word}' skipped: "
409
- f"no_speech_prob={no_speech_prob:.2f}")
410
- continue
411
-
412
- cuts.append((start, end))
413
-
414
- if not cuts:
415
- return audio, 0
416
-
417
- out = self._build_with_crossfade(audio, cuts, sr, fill_tone=True)
418
- print(f"[Denoiser] βœ… Removed {len(cuts)} filler words")
419
- return out, len(cuts)
420
- except Exception as e:
421
- logger.warning(f"Filler removal failed: {e}")
422
- return audio, 0
423
-
424
- def clean_transcript_fillers(self, transcript: str) -> str:
425
- """Remove filler words from transcript TEXT to match cleaned audio."""
426
- words = transcript.split()
427
- result = []
428
- i = 0
429
- while i < len(words):
430
- w = re.sub(r'[^a-z\s]', '', words[i].lower()).strip()
431
- if i + 1 < len(words):
432
- two = w + " " + re.sub(r'[^a-z\s]', '', words[i+1].lower()).strip()
433
- if two in FILLER_WORDS:
434
- i += 2
435
- continue
436
- if w in FILLER_WORDS:
437
- i += 1
438
- continue
439
- result.append(words[i])
440
- i += 1
441
- return " ".join(result)
442
-
443
- # ══════════════════════════════════════════════════════════════════
444
- # STUTTER REMOVAL ← UPGRADED (phonetic similarity + crossfade)
445
- # ══════════════════════════════════════════════════════════════════
446
- def _remove_stutters(self, audio: np.ndarray, sr: int, segments: list):
447
- """
448
- UPGRADE: Phonetic near-match detection in addition to exact repeats.
449
- e.g. "the" / "tha", "and" / "an", "I" / "I" all caught.
450
-
451
- Uses jellyfish.jaro_winkler_similarity if available;
452
- falls back to plain edit-distance ratio, then exact match only.
453
-
454
- Confidence gating applied here too (same thresholds as filler removal).
455
- Crossfade used on all splices.
456
- """
457
- try:
458
- if len(segments) < 2:
459
- return audio, 0
460
-
461
- # Choose similarity function
462
- sim_fn = self._word_similarity_fn()
463
-
464
- cuts = []
465
- stutters_found = 0
466
- i = 0
467
-
468
- while i < len(segments):
469
- seg_i = segments[i]
470
- word = re.sub(r'[^a-z]', '', seg_i.get('word', '').lower())
471
-
472
- if not word:
473
- i += 1
474
- continue
475
-
476
- # Confidence gate on the anchor word
477
- if not self._passes_confidence_gate(seg_i):
478
- i += 1
479
- continue
480
-
481
- # Look ahead for consecutive near-matches
482
- j = i + 1
483
- while j < len(segments):
484
- seg_j = segments[j]
485
- next_word = re.sub(r'[^a-z]', '', seg_j.get('word', '').lower())
486
-
487
- if not next_word:
488
- j += 1
489
- continue
490
-
491
- similarity = sim_fn(word, next_word)
492
- if similarity >= 0.88: # β‰₯88% similar = stutter
493
- cuts.append((seg_i['start'], seg_i['end']))
494
- stutters_found += 1
495
- i = j
496
- j += 1
497
- else:
498
- break
499
-
500
- i += 1
501
-
502
- if not cuts:
503
- return audio, 0
504
-
505
- out = self._build_with_crossfade(audio, cuts, sr, fill_tone=True)
506
- print(f"[Denoiser] βœ… Removed {stutters_found} stutters")
507
- return out, stutters_found
508
- except Exception as e:
509
- logger.warning(f"Stutter removal failed: {e}")
510
- return audio, 0
511
-
512
- @staticmethod
513
- def _word_similarity_fn():
514
- """Return best available string-similarity function."""
515
- try:
516
- import jellyfish
517
- return jellyfish.jaro_winkler_similarity
518
- except ImportError:
519
- pass
520
- try:
521
- import editdistance
522
- def _ed_ratio(a, b):
523
- if not a and not b:
524
- return 1.0
525
- dist = editdistance.eval(a, b)
526
- return 1.0 - dist / max(len(a), len(b))
527
- return _ed_ratio
528
- except ImportError:
529
- pass
530
- # Plain exact match as last resort
531
- return lambda a, b: 1.0 if a == b else 0.0
532
-
533
- @staticmethod
534
- def _passes_confidence_gate(seg: dict) -> bool:
535
- """Return True if Whisper confidence is acceptable (or fields absent)."""
536
- avg_logprob = seg.get('avg_logprob', None)
537
- no_speech_prob = seg.get('no_speech_prob', None)
538
- if avg_logprob is not None and avg_logprob < FILLER_MIN_LOGPROB:
539
- return False
540
- if no_speech_prob is not None and no_speech_prob > FILLER_MAX_NO_SPEECH:
541
- return False
542
- return True
543
-
544
- # ══════════════════════════════════════════════════════════════════
545
- # BREATH REDUCTION
546
- # ══════════════════════════════════════════════════════════════════
547
- def _reduce_breaths(self, audio: np.ndarray, sr: int) -> np.ndarray:
548
- """Non-stationary spectral gating β€” catches short broadband breath bursts."""
549
- try:
550
- import noisereduce as nr
551
- cleaned = nr.reduce_noise(
552
- y=audio, sr=sr,
553
- stationary=False,
554
- prop_decrease=0.60,
555
- freq_mask_smooth_hz=400,
556
- time_mask_smooth_ms=40,
557
- ).astype(np.float32)
558
- print("[Denoiser] βœ… Breath reduction done")
559
- return cleaned
560
- except Exception as e:
561
- logger.warning(f"Breath reduction failed: {e}")
562
- return audio
563
-
564
- # ══════════════════════════════════════════════════════════════════
565
- # MOUTH SOUND REDUCTION
566
- # ══════════════════════════════════════════════════════════════════
567
- def _reduce_mouth_sounds(self, audio: np.ndarray, sr: int):
568
- """
569
- Suppress very short, very high-amplitude transients (clicks/pops).
570
- Threshold at 6.0 std to avoid removing real consonants (p, b, t).
571
- """
572
- try:
573
- result = audio.copy()
574
- win = int(sr * 0.003) # 3ms window
575
- hop = win // 2
576
- rms_arr = np.array([
577
- float(np.sqrt(np.mean(audio[i:i+win]**2)))
578
- for i in range(0, len(audio) - win, hop)
579
- ])
580
-
581
- if len(rms_arr) == 0:
582
- return audio, 0
583
-
584
- threshold = float(np.mean(rms_arr)) + 6.0 * float(np.std(rms_arr))
585
- n_removed = 0
586
-
587
- for idx, rms in enumerate(rms_arr):
588
- if rms > threshold:
589
- start = idx * hop
590
- end = min(start + win, len(result))
591
- result[start:end] *= np.linspace(1, 0, end - start)
592
- n_removed += 1
593
-
594
- if n_removed:
595
- print(f"[Denoiser] βœ… Suppressed {n_removed} mouth sound transients")
596
- return result.astype(np.float32), n_removed
597
- except Exception as e:
598
- logger.warning(f"Mouth sound reduction failed: {e}")
599
- return audio, 0
600
-
601
- # ══════════════════════════════════════════════════════════════════
602
- # LONG SILENCE REMOVAL ← UPGRADED (adaptive threshold)
603
- # ══════════════════════════════════════════════════════════════════
604
- def _remove_long_silences(self, audio: np.ndarray, sr: int,
605
- max_silence_sec: float = 1.5,
606
- keep_pause_sec: float = 0.4) -> tuple:
607
- """
608
- UPGRADE: Adaptive silence threshold.
609
- Old code used a hardcoded RMS=0.008 β€” worked in quiet studios only.
610
- New: threshold = 15th-percentile of per-frame RMS values.
611
- This self-calibrates to the recording's actual noise floor,
612
- so it works equally well in noisy rooms and near-silent studios.
613
-
614
- Silences replaced with room tone + crossfade.
615
- """
616
- try:
617
- frame_len = int(sr * 0.02) # 20ms frames
618
-
619
- # ── Compute per-frame RMS ─────────────────────────────────
620
- n_frames = (len(audio) - frame_len) // frame_len
621
- rms_frames = np.array([
622
- float(np.sqrt(np.mean(audio[i*frame_len:(i+1)*frame_len]**2)))
623
- for i in range(n_frames)
624
- ])
625
-
626
- if len(rms_frames) == 0:
627
- return audio, 0.0
628
-
629
- # ── Adaptive threshold: 15th percentile of RMS ───────────
630
- threshold = float(np.percentile(rms_frames, 15))
631
- # Clamp: never go below 0.001 (avoids mis-classifying very quiet speech)
632
- threshold = max(threshold, 0.001)
633
- print(f"[Denoiser] Adaptive silence threshold: RMS={threshold:.5f}")
634
-
635
- max_sil_frames = int(max_silence_sec / 0.02)
636
- keep_frames = int(keep_pause_sec / 0.02)
637
-
638
- kept = []
639
- silence_count = 0
640
- total_removed = 0
641
- in_long_sil = False
642
-
643
- for i in range(n_frames):
644
- frame = audio[i*frame_len:(i+1)*frame_len]
645
- rms = rms_frames[i]
646
-
647
- if rms < threshold:
648
- silence_count += 1
649
- if silence_count <= max_sil_frames:
650
- kept.append(frame)
651
- else:
652
- total_removed += frame_len
653
- in_long_sil = True
654
- else:
655
- if in_long_sil:
656
- pad = self._fill_with_room_tone(keep_frames * frame_len)
657
- kept.append(pad)
658
- in_long_sil = False
659
- silence_count = 0
660
- kept.append(frame)
661
-
662
- # Tail of audio
663
- tail_start = n_frames * frame_len
664
- if tail_start < len(audio):
665
- kept.append(audio[tail_start:])
666
-
667
- if not kept:
668
- return audio, 0.0
669
-
670
- # Crossfade each frame join for smooth output
671
- result = kept[0]
672
- for seg in kept[1:]:
673
- result = self._crossfade_join(result, seg, fade_ms=5.0, sr=sr)
674
-
675
- removed_sec = total_removed / sr
676
- if removed_sec > 0:
677
- print(f"[Denoiser] βœ… Removed {removed_sec:.1f}s of long silences")
678
- return result.astype(np.float32), removed_sec
679
- except Exception as e:
680
- logger.warning(f"Silence removal failed: {e}")
681
- return audio, 0.0
682
-
683
- # ══════════════════════════════════════════════════════════════════
684
- # NORMALIZATION
685
- # ══════════════════════════════════════════════════════════════════
686
- def _normalise(self, audio: np.ndarray, sr: int) -> np.ndarray:
687
- try:
688
- import pyloudnorm as pyln
689
- meter = pyln.Meter(sr)
690
- loudness = meter.integrated_loudness(audio)
691
- if np.isfinite(loudness) and loudness < 0:
692
- audio = pyln.normalize.loudness(audio, loudness, TARGET_LOUDNESS)
693
- print(f"[Denoiser] βœ… Normalized: {loudness:.1f} β†’ {TARGET_LOUDNESS} LUFS")
694
- except Exception:
695
- rms = np.sqrt(np.mean(audio**2))
696
- if rms > 1e-9:
697
- target_rms = 10 ** (TARGET_LOUDNESS / 20.0)
698
- audio = audio * (target_rms / rms)
699
- return np.clip(audio, -1.0, 1.0).astype(np.float32)
700
-
701
- # ══════════════════════════════════════════════════════════════════
702
- # HELPERS
703
- # ══════════════════════════════════════════════════════════════════
704
- def _to_wav(self, src: str, dst: str, target_sr: int):
705
- result = subprocess.run([
706
- "ffmpeg", "-y", "-i", src,
707
- "-acodec", "pcm_s24le", "-ar", str(target_sr), dst
708
- ], capture_output=True)
709
- if result.returncode != 0:
710
- stderr = result.stderr.decode(errors='replace')
711
- logger.warning(f"ffmpeg non-zero exit: {stderr[-400:]}")
712
- # Fallback: soundfile passthrough
713
- data, sr = sf.read(src, always_2d=True)
714
- sf.write(dst, data, sr, format="WAV", subtype="PCM_24")
715
-
716
- def _resample(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
717
- if orig_sr == target_sr:
718
- return audio
719
- try:
720
- import librosa
721
- return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
722
- except Exception:
723
- length = int(len(audio) * target_sr / orig_sr)
724
- return np.interp(
725
- np.linspace(0, len(audio), length),
726
- np.arange(len(audio)), audio
727
- ).astype(np.float32)