Spaces:

grungecoder
/

tot-talk

Sleeping

File size: 3,376 Bytes

ea2601f

"""Shared audio preprocessing utilities."""

import numpy as np
import librosa

# ── Constants ──────────────────────────────────────────────────────────────────
# Shared audio preprocessing parameters.
SAMPLE_RATE = 16_000          # all models normalized to 16 kHz
WINDOW_SECONDS = 1.0          # inference window size
SILENCE_RMS_THRESHOLD = 0.001 # skip silent frames (low for phone speaker playback)
# Passed as ``hop_length`` to the librosa STFT / mel-spectrogram calls below.
HOP_LENGTH = 512
# Passed as ``n_fft`` to the librosa STFT / mel-spectrogram calls below.
N_FFT = 1024
# Default number of mel bands for the mel-spectrogram helpers.
N_MELS = 128
# Number of MFCC coefficients requested in extract_mfcc_features.
N_MFCC = 40


def resample(audio_np: np.ndarray, from_sr: int, to_sr: int) -> np.ndarray:
    """Convert *audio_np* from sample rate *from_sr* to *to_sr*.

    When the two rates are equal the input object itself is returned
    (no copy is made); otherwise the work is delegated to
    ``librosa.resample``.
    """
    if from_sr != to_sr:
        return librosa.resample(audio_np, orig_sr=from_sr, target_sr=to_sr)
    # Rates already agree: hand the caller's array back untouched.
    return audio_np


def extract_mfcc_features(
    audio_np: np.ndarray,
    sr: int,
    n_mels: int = N_MELS,
) -> np.ndarray:
    """Build a 1-D feature vector of time-averaged spectral descriptors.

    Five per-frame feature matrices are computed with librosa, each is
    averaged over its frame axis, and the results are concatenated in this
    fixed order (which mirrors the foduucom/baby-cry-classification training
    code):

    1. MFCCs (``N_MFCC`` = 40 coefficients)
    2. chroma (12 bins)
    3. mel spectrogram (``n_mels`` bands)
    4. spectral contrast (7 rows for ``n_bands=6``)
    5. tonnetz (6 dims)

    ``n_mels`` may be overridden when the SVC model was trained with a
    different mel band count.
    """
    mfcc_frames = librosa.feature.mfcc(y=audio_np, sr=sr, n_mfcc=N_MFCC)

    # One magnitude STFT is shared by the chroma and spectral-contrast features.
    magnitude = np.abs(librosa.stft(audio_np, n_fft=N_FFT, hop_length=HOP_LENGTH))

    chroma_frames = librosa.feature.chroma_stft(S=magnitude, sr=sr)

    mel_frames = librosa.feature.melspectrogram(
        y=audio_np, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=n_mels,
    )

    contrast_frames = librosa.feature.spectral_contrast(
        S=magnitude, sr=sr, n_bands=6, fmin=200.0,
    )

    # Tonnetz is computed on the harmonic component of the signal only.
    harmonic_part = librosa.effects.harmonic(audio_np)
    tonnetz_frames = librosa.feature.tonnetz(y=harmonic_part, sr=sr)

    # Do not reorder: the downstream classifier expects exactly this layout.
    ordered_frames = (
        mfcc_frames,
        chroma_frames,
        mel_frames,
        contrast_frames,
        tonnetz_frames,
    )
    return np.concatenate([np.mean(frames, axis=1) for frames in ordered_frames])


def extract_mel_spectrogram(audio_np: np.ndarray, sr: int) -> np.ndarray:
    """Compute a dB-scaled mel spectrogram, shape (128, T), dtype float32.

    The power mel spectrogram is built with the module's ``N_FFT``,
    ``HOP_LENGTH`` and ``N_MELS`` settings, then converted to decibels
    relative to its own maximum (``ref=np.max``).
    """
    power_spec = librosa.feature.melspectrogram(
        y=audio_np,
        sr=sr,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
    )
    db_spec = librosa.power_to_db(power_spec, ref=np.max)
    return db_spec.astype(np.float32)


def is_silent(audio_np: np.ndarray) -> bool:
    """Return True when the audio RMS is below ``SILENCE_RMS_THRESHOLD``.

    An empty window contains no signal and is reported as silent (the mean
    of an empty array would otherwise be NaN, which never compares below the
    threshold). The RMS is computed in float64 so integer PCM input cannot
    overflow when squared.
    """
    samples = np.asarray(audio_np, dtype=np.float64)
    if samples.size == 0:
        return True
    rms = np.sqrt(np.mean(np.square(samples)))
    # Convert numpy's bool_ to a plain Python bool to honor the annotation.
    return bool(rms < SILENCE_RMS_THRESHOLD)


def compute_rms(audio_np: np.ndarray) -> float:
    """Return the RMS energy of the audio window as a Python float.

    The computation is done in float64 so that integer PCM input (e.g.
    int16) does not overflow when squared. An empty window has no energy
    and yields ``0.0`` instead of NaN.
    """
    samples = np.asarray(audio_np, dtype=np.float64)
    if samples.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(np.square(samples))))


def normalize_audio(audio_np: np.ndarray) -> np.ndarray:
    """Peak-normalize audio to [-1, 1].

    Crucial when playing cry samples through a phone speaker → laptop mic,
    since the captured signal can be very quiet and models perform poorly
    on low-amplitude inputs.

    The input is returned unchanged when it is empty (there is no peak to
    scale by) or when its peak is below 1e-6 (avoids dividing by ~zero and
    amplifying pure noise).
    """
    # np.max raises ValueError on a zero-size array, so guard first.
    if audio_np.size == 0:
        return audio_np
    peak = np.max(np.abs(audio_np))
    if peak < 1e-6:
        return audio_np
    return audio_np / peak