# NOTE: removed non-Python page residue ("Spaces:" / "Running" / "Running" —
# Hugging Face Spaces UI header captured by the extraction, not source code).
"""
Department 1 — Professional Audio Enhancer

Matches CleanVoice feature-for-feature using FREE local models:
- Background noise removal — DeepFilterNet (SOTA free model), noisereduce fallback
- Filler word removal — word-level timestamps + room tone fill
- Stutter removal — repeated-word detection + cut (catches triple+ repeats)
- Long silence removal — energy-based VAD (keeps natural pauses)
- Breath sound reduction — spectral gating (noisereduce non-stationary)
- Mouth sound reduction — amplitude z-score transient suppression (tuned threshold)
- Room tone fill — captures room noise, fills cuts naturally
- Audio normalization — pyloudnorm -18 LUFS
- CD quality output — 48000 Hz PCM_24 (matches DeepFilterNet native SR)

FIXES APPLIED:
- TARGET_SR set to 48000 to match DeepFilterNet natively (no double resampling)
- Mouth sound threshold raised 4.5 -> 6.0 std (was removing real consonants p/b/t)
- Duplicate _remove_background_noise fixed (was silently overwriting first def)
- Wiener filter added as Priority 2 fallback (artifact-free)
- noisereduce kept as gentle last resort only
- Room tone fallback: uses first 100ms if audio too short
- Stutter detection fixed: now catches triple+ repeats (I I I was -> I was)
- Filler removal: also returns cleaned transcript text
- Normalise RMS fallback formula corrected
"""
| import os | |
| import re | |
| import time | |
| import subprocess | |
| import tempfile | |
| import numpy as np | |
| import soundfile as sf | |
| import logging | |
| logger = logging.getLogger(__name__) | |
# DeepFilterNet's native sample rate is 48 kHz; keeping TARGET_SR equal to it
# avoids a double resample. On hosts without DeepFilterNet (e.g. HF Spaces,
# which lacks a Rust compiler) the Wiener/noisereduce fallbacks work at any rate.
TARGET_SR = 48000  # Hz
TARGET_LOUDNESS = -18.0  # LUFS target for loudness normalization

# Filler words to cut from audio/transcript (English + Telugu + Hindi).
# NOTE: multi-word entries ("you know", "i mean") can only match in
# clean_transcript_fillers(); Whisper word segments arrive one word at a time.
FILLER_WORDS = {
    "um", "umm", "ummm", "uh", "uhh", "uhhh",
    "hmm", "hm", "hmmm",  # FIX: removed duplicate "hmm" literal
    "er", "err", "errr",
    "eh", "ahh", "ah",
    "like", "basically", "literally",
    "you know", "i mean", "so",
    "right", "okay", "ok",
    # Telugu
    "ante", "ane", "mane", "arey", "enti",
    # Hindi
    "matlab", "yani", "bas", "acha",
}
| class Denoiser: | |
| def __init__(self): | |
| self._df_model = None | |
| self._df_state = None | |
| self._df_loaded = False | |
| self._room_tone = None # captured room noise sample | |
| print("[Denoiser] β Professional Audio Enhancer ready") | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # MAIN ENTRY POINT | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
    def process(self, audio_path: str, out_dir: str,
                remove_fillers: bool = True,
                remove_silences: bool = True,
                remove_breaths: bool = True,
                remove_mouth_sounds: bool = True,
                remove_stutters: bool = True,
                word_segments: list = None) -> dict:
        """
        Run the full enhancement pipeline on one input file.

        Stages, in order: decode to WAV, room-tone capture, background noise
        removal, mouth-sound suppression, breath reduction, filler-word
        removal, stutter removal, long-silence removal, loudness
        normalization, 24-bit WAV export.

        Args:
            audio_path: input media file (anything ffmpeg can decode).
            out_dir: directory for the intermediate and final WAV files.
            remove_fillers / remove_silences / remove_breaths /
            remove_mouth_sounds / remove_stutters: per-stage toggles.
            word_segments: list of {'word': str, 'start': float, 'end': float}
                from Whisper word-level timestamps; filler and stutter
                removal are skipped when this is None or empty.

        Returns:
            {'audio_path': path of the denoised 24-bit WAV,
             'stats': dict of per-stage counters and total processing time}.
        """
        t0 = time.time()
        stats = {}
        print("[Denoiser] βΆ Starting professional enhancement pipeline...")
        # 0. Decode/resample to a standard WAV at TARGET_SR.
        wav_in = os.path.join(out_dir, "stage0_input.wav")
        self._to_wav(audio_path, wav_in, TARGET_SR)
        audio, sr = sf.read(wav_in, always_2d=True)
        n_ch = audio.shape[1]
        duration = len(audio) / sr
        print(f"[Denoiser] Input: {sr}Hz, {n_ch}ch, {duration:.1f}s")
        # All processing happens on a mono float32 mixdown.
        mono = audio.mean(axis=1).astype(np.float32)
        # 1. Capture room tone BEFORE denoising (denoising would erase it).
        self._room_tone = self._capture_room_tone(mono, sr)
        # 2. Background noise removal (DeepFilterNet / Wiener / noisereduce).
        mono, noise_method = self._remove_background_noise(mono, sr)
        stats['noise_method'] = noise_method
        # 3. Mouth-sound (click/pop) suppression.
        if remove_mouth_sounds:
            mono, n_clicks = self._reduce_mouth_sounds(mono, sr)
            stats['mouth_sounds_removed'] = n_clicks
        # 4. Breath reduction via spectral gating.
        if remove_breaths:
            mono = self._reduce_breaths(mono, sr)
            stats['breaths_reduced'] = True
        # 5. Filler-word removal (requires word-level timestamps).
        stats['fillers_removed'] = 0
        if remove_fillers and word_segments:
            mono, n_fillers = self._remove_fillers(mono, sr, word_segments)
            stats['fillers_removed'] = n_fillers
        # 6. Stutter removal (requires word-level timestamps).
        stats['stutters_removed'] = 0
        if remove_stutters and word_segments:
            mono, n_stutters = self._remove_stutters(mono, sr, word_segments)
            stats['stutters_removed'] = n_stutters
        # 7. Long-silence removal.
        stats['silences_removed_sec'] = 0.0
        if remove_silences:
            mono, sil_sec = self._remove_long_silences(mono, sr)
            stats['silences_removed_sec'] = round(sil_sec, 2)
        # 8. Loudness normalization to TARGET_LOUDNESS.
        mono = self._normalise(mono, sr)
        # 9. Save; a stereo input gets the mono result duplicated to 2 channels
        #    (true stereo separation is not preserved by this pipeline).
        out_audio = np.stack([mono, mono], axis=1) if n_ch == 2 else mono
        out_path = os.path.join(out_dir, "denoised.wav")
        sf.write(out_path, out_audio, sr, subtype="PCM_24")
        stats['processing_sec'] = round(time.time() - t0, 2)
        print(f"[Denoiser] β Done in {stats['processing_sec']}s | {stats}")
        return {'audio_path': out_path, 'stats': stats}
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # ROOM TONE CAPTURE | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _capture_room_tone(self, audio: np.ndarray, sr: int, | |
| sample_sec: float = 0.5) -> np.ndarray: | |
| """ | |
| Find the quietest 0.5s section of audio = room tone. | |
| FIX: Falls back to first 100ms if audio is too short. | |
| """ | |
| try: | |
| frame = int(sr * sample_sec) | |
| # FIX: Robust fallback for short audio | |
| if len(audio) < frame * 2: | |
| fallback_len = min(int(sr * 0.1), len(audio)) # first 100ms | |
| print("[Denoiser] Short audio β using first 100ms as room tone") | |
| return audio[:fallback_len].copy().astype(np.float32) | |
| best_rms = float('inf') | |
| best_start = 0 | |
| step = sr | |
| for i in range(0, len(audio) - frame, step): | |
| chunk = audio[i:i + frame] | |
| rms = float(np.sqrt(np.mean(chunk ** 2))) | |
| if rms < best_rms: | |
| best_rms = rms | |
| best_start = i | |
| room = audio[best_start: best_start + frame].copy() | |
| print(f"[Denoiser] Room tone captured: RMS={best_rms:.5f}") | |
| return room | |
| except Exception as e: | |
| logger.warning(f"Room tone capture failed: {e}") | |
| return np.zeros(int(sr * sample_sec), dtype=np.float32) | |
| def _fill_with_room_tone(self, length: int) -> np.ndarray: | |
| """Tile room tone to fill a gap of `length` samples.""" | |
| if self._room_tone is None or len(self._room_tone) == 0: | |
| return np.zeros(length, dtype=np.float32) | |
| reps = length // len(self._room_tone) + 1 | |
| tiled = np.tile(self._room_tone, reps)[:length] | |
| # Fade in/out to avoid clicks | |
| fade = min(int(0.01 * len(tiled)), 64) | |
| if fade > 0: | |
| tiled[:fade] *= np.linspace(0, 1, fade) | |
| tiled[-fade:] *= np.linspace(1, 0, fade) | |
| return tiled.astype(np.float32) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # BACKGROUND NOISE REMOVAL | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
    def _remove_background_noise(self, audio, sr):
        """
        Remove steady background noise, trying backends in quality order:
        DeepFilterNet, then a scipy Wiener filter, then gentle noisereduce.

        Returns (cleaned_audio, method_name); if every backend fails the
        input is returned untouched with method "none".
        """
        # Priority 1: DeepFilterNet (neural, best quality; needs df + torch).
        try:
            result = self._deepfilter(audio, sr)
            print("[Denoiser] β DeepFilterNet noise removal done")
            return result, "DeepFilterNet"
        except Exception as e:
            logger.warning(f"[Denoiser] DeepFilterNet unavailable ({e})")
        # Priority 2: frame-wise Wiener filter (scipy only, no compilation).
        # NOTE(review): despite its name, _rnnoise() implements a Wiener
        # filter, not RNNoise.
        try:
            result = self._rnnoise(audio, sr)
            print("[Denoiser] β Wiener filter noise removal done")
            return result, "Wiener filter"
        except Exception as e:
            logger.warning(f"[Denoiser] Wiener filter failed ({e})")
        # Priority 3 (last resort): noisereduce with mild settings.
        # IMPORTANT: keep prop_decrease LOW (0.50-0.60) to avoid "musical
        # noise" artifacts; aggressive multi-pass gating makes them worse.
        try:
            import noisereduce as nr
            # Single gentle pass to avoid musical-noise artifacts.
            cleaned = nr.reduce_noise(
                y=audio, sr=sr,
                stationary=False,  # non-stationary handles both noise types
                prop_decrease=0.55,  # gentle -> avoids buzzing artifacts
                freq_mask_smooth_hz=1000,  # heavy smoothing = less musical noise
                time_mask_smooth_ms=100,  # heavy smoothing = less musical noise
                n_std_thresh_stationary=2.0,  # higher = less aggressive
            ).astype(np.float32)
            print("[Denoiser] β noisereduce (gentle, artifact-free) done")
            return cleaned, "noisereduce"
        except Exception as e:
            logger.warning(f"noisereduce failed: {e}")
            return audio, "none"
| def _rnnoise(self, audio, sr): | |
| """ | |
| Wiener filter via scipy β no compilation needed, works on HF Spaces. | |
| Much cleaner than noisereduce for voice β no musical artifacts. | |
| """ | |
| from scipy.signal import wiener | |
| # Wiener filter works best on short frames | |
| frame_size = int(sr * 0.02) # 20ms frames | |
| result = np.zeros_like(audio) | |
| for i in range(0, len(audio) - frame_size, frame_size): | |
| frame = audio[i:i + frame_size] | |
| result[i:i + frame_size] = wiener(frame, mysize=7) | |
| # Handle last chunk | |
| remainder = len(audio) % frame_size | |
| if remainder: | |
| result[-remainder:] = wiener(audio[-remainder:], mysize=7) | |
| return result.astype(np.float32) | |
    def _deepfilter(self, audio, sr):
        """
        Denoise `audio` with DeepFilterNet; the model is loaded once on
        first call and cached on the instance.

        The input is resampled to the model's native rate when it differs
        from `sr` (TARGET_SR is 48 kHz to match it, so this is usually a
        no-op) and the result is resampled back to `sr`.

        Raises: ImportError (or any df/torch error) when DeepFilterNet is
        not installed — the caller treats that as "backend unavailable".
        """
        if not self._df_loaded:
            from df.enhance import enhance, init_df
            self._df_model, self._df_state, _ = init_df()
            self._df_loaded = True
        # NOTE(review): redundant re-import — needed only because the name
        # `enhance` from the branch above is function-local on later calls.
        from df.enhance import enhance
        import torch
        df_sr = self._df_state.sr()
        # TARGET_SR matches DeepFilterNet's native 48 kHz, so these two
        # resamples are skipped in the common case.
        a = self._resample(audio, sr, df_sr) if sr != df_sr else audio
        t = torch.from_numpy(a).unsqueeze(0)
        out = enhance(self._df_model, self._df_state, t)
        res = out.squeeze().numpy().astype(np.float32)
        return self._resample(res, df_sr, sr) if df_sr != sr else res
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # FILLER WORD REMOVAL + ROOM TONE FILL | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _remove_fillers(self, audio, sr, segments): | |
| """ | |
| Cut filler words using word-level timestamps. | |
| Fills gaps with room tone for natural sound. | |
| """ | |
| try: | |
| cuts = [] | |
| for seg in segments: | |
| word = seg.get('word', '').strip().lower() | |
| word = re.sub(r'[^a-z\s]', '', word).strip() | |
| if word in FILLER_WORDS: | |
| cuts.append((seg['start'], seg['end'], word)) | |
| if not cuts: | |
| return audio, 0 | |
| result = [] | |
| prev = 0.0 | |
| for start, end, word in sorted(cuts, key=lambda x: x[0]): | |
| keep_end = int(start * sr) | |
| keep_sta = int(prev * sr) | |
| if keep_sta < keep_end: | |
| result.append(audio[keep_sta:keep_end]) | |
| gap_len = int((end - start) * sr) | |
| if gap_len > 0: | |
| result.append(self._fill_with_room_tone(gap_len)) | |
| prev = end | |
| remain_start = int(prev * sr) | |
| if remain_start < len(audio): | |
| result.append(audio[remain_start:]) | |
| out = np.concatenate(result) if result else audio | |
| print(f"[Denoiser] β Removed {len(cuts)} filler words: {[c[2] for c in cuts[:5]]}") | |
| return out.astype(np.float32), len(cuts) | |
| except Exception as e: | |
| logger.warning(f"Filler removal failed: {e}") | |
| return audio, 0 | |
| def clean_transcript_fillers(self, transcript: str) -> str: | |
| """ | |
| FIX (NEW): Also remove filler words from the transcript TEXT, | |
| so the displayed text matches the cleaned audio. | |
| """ | |
| words = transcript.split() | |
| result = [] | |
| i = 0 | |
| while i < len(words): | |
| word = re.sub(r'[^a-z\s]', '', words[i].lower()).strip() | |
| # Check two-word fillers first ("you know", "i mean") | |
| if i + 1 < len(words): | |
| two = word + " " + re.sub(r'[^a-z\s]', '', words[i+1].lower()).strip() | |
| if two in FILLER_WORDS: | |
| i += 2 | |
| continue | |
| if word in FILLER_WORDS: | |
| i += 1 | |
| continue | |
| result.append(words[i]) | |
| i += 1 | |
| return " ".join(result) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # STUTTER REMOVAL β FIXED | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
    def _remove_stutters(self, audio, sr, segments):
        """
        Remove stuttered word repetitions ("I I I was" -> "I was") using
        word-level timestamps.

        For each word, every immediately-following identical word (compared
        lowercase, letters only) marks the EARLIER copy for removal, so
        only the last copy of a run survives — this is what makes triple+
        repeats work. Cut spans are replaced with room tone.

        Returns (audio, repeats_removed); on failure the input is returned
        unchanged with a count of 0.
        """
        try:
            if len(segments) < 2:
                return audio, 0
            cuts = []
            stutters_found = 0
            i = 0
            while i < len(segments):
                # Normalize: lowercase, letters only ("I," -> "i").
                word = re.sub(r'[^a-z]', '', segments[i].get('word', '').strip().lower())
                if not word:
                    i += 1
                    continue
                # Look ahead for ALL consecutive repeats, not just one.
                j = i + 1
                while j < len(segments):
                    next_word = re.sub(r'[^a-z]', '', segments[j].get('word', '').strip().lower())
                    if next_word == word:
                        # Cut the earlier copy; slide i onto the newer one so
                        # the final copy of the run is the one that survives.
                        cuts.append((segments[i]['start'], segments[i]['end']))
                        stutters_found += 1
                        i = j
                        j += 1
                    else:
                        break  # run of repeats ended
                i += 1
            if not cuts:
                return audio, 0
            # Splice: keep audio between cuts, fill each cut with room tone.
            result = []
            prev = 0.0
            for start, end in sorted(cuts, key=lambda x: x[0]):
                keep_sta = int(prev * sr)
                keep_end = int(start * sr)
                if keep_sta < keep_end:
                    result.append(audio[keep_sta:keep_end])
                gap_len = int((end - start) * sr)
                if gap_len > 0:
                    result.append(self._fill_with_room_tone(gap_len))
                prev = end
            remain = int(prev * sr)
            if remain < len(audio):
                result.append(audio[remain:])
            out = np.concatenate(result) if result else audio
            print(f"[Denoiser] β Removed {stutters_found} stutters")
            return out.astype(np.float32), stutters_found
        except Exception as e:
            logger.warning(f"Stutter removal failed: {e}")
            return audio, 0
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # BREATH REDUCTION | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _reduce_breaths(self, audio, sr): | |
| """ | |
| Breaths = short broadband bursts between speech. | |
| Non-stationary spectral gating catches them well. | |
| """ | |
| try: | |
| import noisereduce as nr | |
| cleaned = nr.reduce_noise( | |
| y=audio, sr=sr, | |
| stationary=False, | |
| prop_decrease=0.90, # increased from 0.60 | |
| freq_mask_smooth_hz=400, | |
| time_mask_smooth_ms=40, | |
| n_std_thresh_stationary=1.0, | |
| ).astype(np.float32) | |
| print("[Denoiser] β Breath reduction done") | |
| return cleaned | |
| except Exception as e: | |
| logger.warning(f"Breath reduction failed: {e}") | |
| return audio | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # MOUTH SOUND REDUCTION β FIXED THRESHOLD | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _reduce_mouth_sounds(self, audio, sr): | |
| """ | |
| Mouth clicks/pops = very short, very high amplitude transients. | |
| FIX: Threshold raised from 4.5β6.0 std to avoid removing | |
| real consonants like p, b, t which have similar transient energy. | |
| """ | |
| try: | |
| result = audio.copy() | |
| win = int(sr * 0.003) # 3ms window | |
| hop = win // 2 | |
| rms_arr = [] | |
| for i in range(0, len(audio) - win, hop): | |
| rms_arr.append(float(np.sqrt(np.mean(audio[i:i+win]**2)))) | |
| if not rms_arr: | |
| return audio, 0 | |
| rms_arr = np.array(rms_arr) | |
| mean_rms = float(np.mean(rms_arr)) | |
| std_rms = float(np.std(rms_arr)) | |
| # FIX: was 4.5 β too sensitive, removed real speech consonants | |
| threshold = mean_rms + 6.0 * std_rms | |
| n_removed = 0 | |
| for idx, rms in enumerate(rms_arr): | |
| if rms > threshold: | |
| start = idx * hop | |
| end = min(start + win, len(result)) | |
| fade = np.linspace(1, 0, end - start) | |
| result[start:end] *= fade | |
| n_removed += 1 | |
| if n_removed: | |
| print(f"[Denoiser] β Suppressed {n_removed} mouth sound transients") | |
| return result.astype(np.float32), n_removed | |
| except Exception as e: | |
| logger.warning(f"Mouth sound reduction failed: {e}") | |
| return audio, 0 | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # LONG SILENCE REMOVAL | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _remove_long_silences(self, audio, sr, | |
| max_silence_sec=1.5, | |
| keep_pause_sec=0.4): | |
| """ | |
| Shorten silences longer than max_silence_sec. | |
| Keeps keep_pause_sec worth of silence for natural pacing. | |
| """ | |
| try: | |
| frame_len = int(sr * 0.02) | |
| max_sil_frames = int(max_silence_sec / 0.02) | |
| keep_frames = int(keep_pause_sec / 0.02) | |
| threshold = 0.008 | |
| kept = [] | |
| silence_count = 0 | |
| total_removed = 0 | |
| in_long_sil = False | |
| for i in range(0, len(audio) - frame_len, frame_len): | |
| frame = audio[i:i + frame_len] | |
| rms = float(np.sqrt(np.mean(frame**2))) | |
| if rms < threshold: | |
| silence_count += 1 | |
| if silence_count <= max_sil_frames: | |
| kept.append(frame) | |
| else: | |
| total_removed += frame_len | |
| in_long_sil = True | |
| else: | |
| if in_long_sil: | |
| pad = self._fill_with_room_tone(keep_frames * frame_len) | |
| kept.append(pad) | |
| in_long_sil = False | |
| silence_count = 0 | |
| kept.append(frame) | |
| result = np.concatenate(kept) if kept else audio | |
| removed_sec = total_removed / sr | |
| if removed_sec > 0: | |
| print(f"[Denoiser] β Removed {removed_sec:.1f}s of long silences") | |
| return result.astype(np.float32), removed_sec | |
| except Exception as e: | |
| logger.warning(f"Silence removal failed: {e}") | |
| return audio, 0.0 | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # NORMALIZATION β FIXED RMS FALLBACK | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _normalise(self, audio, sr): | |
| try: | |
| import pyloudnorm as pyln | |
| meter = pyln.Meter(sr) | |
| loudness = meter.integrated_loudness(audio) | |
| if np.isfinite(loudness) and loudness < 0: | |
| audio = pyln.normalize.loudness(audio, loudness, TARGET_LOUDNESS) | |
| print(f"[Denoiser] β Normalized: {loudness:.1f} β {TARGET_LOUDNESS} LUFS") | |
| except Exception: | |
| # FIX: Corrected RMS fallback formula | |
| # Old: audio * (10 ** (TARGET_LOUDNESS / 20.0) / rms) β wrong | |
| # New: scale so RMS matches target linear amplitude | |
| rms = np.sqrt(np.mean(audio**2)) | |
| if rms > 1e-9: | |
| target_rms = 10 ** (TARGET_LOUDNESS / 20.0) # β 0.126 | |
| audio = audio * (target_rms / rms) # correct ratio | |
| return np.clip(audio, -1.0, 1.0).astype(np.float32) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # HELPERS | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
    def _to_wav(self, src, dst, target_sr):
        """
        Decode `src` to a 24-bit PCM WAV at `target_sr` via ffmpeg.

        If ffmpeg fails (missing binary, undecodable input), falls back to
        soundfile — which copies the data at its ORIGINAL sample rate.
        Callers re-read the actual rate from the written file, so the
        fallback's lack of resampling is tolerated downstream.
        """
        result = subprocess.run([
            "ffmpeg", "-y", "-i", src,
            "-acodec", "pcm_s24le", "-ar", str(target_sr), dst
        ], capture_output=True)
        if result.returncode != 0:
            # Fallback: no resampling here; dst keeps the source's rate.
            data, sr = sf.read(src, always_2d=True)
            sf.write(dst, data, sr, subtype="PCM_24")
| def _resample(self, audio, orig_sr, target_sr): | |
| try: | |
| import librosa | |
| return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) | |
| except Exception: | |
| length = int(len(audio) * target_sr / orig_sr) | |
| return np.interp( | |
| np.linspace(0, len(audio), length), | |
| np.arange(len(audio)), audio | |
| ).astype(np.float32) |