Spaces:
Sleeping
Sleeping
File size: 3,376 Bytes
"""Shared audio preprocessing utilities."""
import numpy as np
import librosa
# -- Constants -----------------------------------------------------------------
SAMPLE_RATE = 16_000  # all models normalized to 16 kHz
WINDOW_SECONDS = 1.0  # inference window size
SILENCE_RMS_THRESHOLD = 0.001  # skip silent frames (low for phone speaker playback)
HOP_LENGTH = 512  # hop_length passed to the STFT / mel spectrogram calls
N_FFT = 1024  # n_fft passed to the STFT / mel spectrogram calls
N_MELS = 128  # default number of mel bands
N_MFCC = 40  # number of MFCC coefficients requested from librosa
def resample(audio_np: np.ndarray, from_sr: int, to_sr: int) -> np.ndarray:
    """Convert *audio_np* from sample rate *from_sr* to *to_sr* via librosa.

    When both rates are equal no work is done and the very same array object
    is handed back (not a copy).
    """
    if from_sr != to_sr:
        return librosa.resample(audio_np, orig_sr=from_sr, target_sr=to_sr)
    return audio_np
def extract_mfcc_features(
    audio_np: np.ndarray,
    sr: int,
    n_mels: int = N_MELS,
) -> np.ndarray:
    """Build a 1-D feature vector of time-averaged spectral descriptors.

    Five librosa feature matrices are computed, each is averaged along
    axis 1, and the means are concatenated in this fixed order:
    MFCC, chroma, mel spectrogram, spectral contrast, tonnetz.
    The order matches the foduucom/baby-cry-classification training code
    and must not be changed.

    ``n_mels`` can be overridden when the SVC model was trained with a
    different mel band count; it only affects the mel spectrogram segment.
    """
    # MFCCs - N_MFCC coefficients per frame.
    mfcc_frames = librosa.feature.mfcc(y=audio_np, sr=sr, n_mfcc=N_MFCC)

    # Magnitude STFT, shared by the chroma and spectral-contrast features.
    magnitude = np.abs(
        librosa.stft(audio_np, n_fft=N_FFT, hop_length=HOP_LENGTH)
    )

    # Chroma - 12 bins.
    chroma_frames = librosa.feature.chroma_stft(S=magnitude, sr=sr)

    # Mel spectrogram - n_mels bands.
    mel_frames = librosa.feature.melspectrogram(
        y=audio_np,
        sr=sr,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=n_mels,
    )

    # Spectral contrast - 7 rows (n_bands=6 plus one).
    contrast_frames = librosa.feature.spectral_contrast(
        S=magnitude,
        sr=sr,
        n_bands=6,
        fmin=200.0,
    )

    # Tonnetz - 6 dims, computed on the harmonic component only.
    harmonic_part = librosa.effects.harmonic(audio_np)
    tonnetz_frames = librosa.feature.tonnetz(y=harmonic_part, sr=sr)

    # Order: mfcc, chroma, mel, contrast, tonnetz (matches foduucom training).
    ordered_features = (
        mfcc_frames,
        chroma_frames,
        mel_frames,
        contrast_frames,
        tonnetz_frames,
    )
    return np.concatenate(
        [np.mean(frames, axis=1) for frames in ordered_features]
    )
def extract_mel_spectrogram(audio_np: np.ndarray, sr: int) -> np.ndarray:
    """Compute a dB-scaled mel spectrogram as a float32 array.

    The power mel spectrogram (N_MELS bands, i.e. shape (128, T) with the
    module defaults) is converted to decibels relative to its own maximum
    value, so the result depends on the loudest bin of the given window.
    """
    power_spec = librosa.feature.melspectrogram(
        y=audio_np,
        sr=sr,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
    )
    db_spec = librosa.power_to_db(power_spec, ref=np.max)
    return db_spec.astype(np.float32)
def is_silent(audio_np: np.ndarray) -> bool:
    """Return True when audio RMS is below the silence threshold.

    Args:
        audio_np: Audio samples. Integer PCM input is accepted; it is
            promoted to float64 before squaring so the squares cannot
            wrap around in a narrow integer dtype.

    Returns:
        A plain Python ``bool``. An empty window is reported as silent,
        since there is nothing to run inference on.
    """
    samples = np.asarray(audio_np, dtype=np.float64)
    if samples.size == 0:
        # np.mean of an empty array is NaN (plus a RuntimeWarning), and
        # NaN < threshold is False, which would flag "no audio" as audible.
        return True
    rms = np.sqrt(np.mean(samples ** 2))
    # bool() converts numpy.bool_ so the annotation is honoured.
    return bool(rms < SILENCE_RMS_THRESHOLD)
def compute_rms(audio_np: np.ndarray) -> float:
    """Return the RMS energy of the audio window.

    Args:
        audio_np: Audio samples. Integer PCM input is accepted; it is
            promoted to float64 before squaring so the squares cannot
            wrap around in a narrow integer dtype such as int16.

    Returns:
        The root-mean-square value as a Python ``float``; ``0.0`` for an
        empty window (instead of NaN with a RuntimeWarning).
    """
    samples = np.asarray(audio_np, dtype=np.float64)
    if samples.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(samples ** 2)))
def normalize_audio(audio_np: np.ndarray) -> np.ndarray:
    """Peak-normalize audio to [-1, 1].

    Crucial when playing cry samples through a phone speaker -> laptop mic,
    since the captured signal can be very quiet and models perform poorly
    on low-amplitude inputs.

    Args:
        audio_np: Audio samples.

    Returns:
        ``audio_np`` divided by its absolute peak. The input array itself is
        returned unchanged when it is empty or when its peak is below 1e-6
        (effectively silent), to avoid dividing by ~0 and amplifying noise.
    """
    if audio_np.size == 0:
        # np.max raises ValueError on a zero-size array; nothing to scale.
        return audio_np
    peak = np.max(np.abs(audio_np))
    if peak < 1e-6:
        return audio_np
    return audio_np / peak
|