Spaces:

bbmb
/

flange-ml-api

Sleeping

File size: 8,894 Bytes

"""
Feature extraction — 82-dimensional physics-informed vector.
EXACTLY matches Cell 4 of final_project_saurav_silwal.ipynb.

Group 1: Relative PSD in 20 log-spaced bins (50 Hz – 8 kHz)    → 20 dims
Group 2: MFCC mean/std + delta MFCC mean/std (13 coeffs each)  → 52 dims
Group 3: Physics features (centroid mean/std, bandwidth, rolloff,
         ZCR, peak freq, decay τ, energy ratio, RMS, Q-factor)  → 10 dims
Total                                                            → 82 dims
"""

import warnings
import numpy as np
import librosa
from scipy.signal import welch

from config import SR, N_MELS, N_FFT, HOP_LENGTH_MEL, SPEC_TIME_FRAMES

# ── Constants matching notebook ────────────────────────────────────────────
# Group 1 (relative PSD): number of log-spaced bands and their range in Hz.
N_PSD_BINS     = 20
PSD_FMIN       = 50.0
PSD_FMAX       = 8000.0
# Welch segment length and overlap in samples; relative_psd_log_bins clamps
# both to the signal length for short inputs.
WELCH_NPERSEG  = 2048
WELCH_NOVERLAP = 1024

# Group 2 (MFCC statistics): coefficient count, FFT size and hop in samples.
N_MFCC         = 13
MFCC_NFFT      = 2048
MFCC_HOP       = 512

# Group 3 (physics features).
# DECAY_FIT_MS: length in ms of the post-peak segment used for the decay fit.
# EARLY_LATE_FRAC: fraction of the window used as the "early" and "late" chunk.
DECAY_FIT_MS       = 200
EARLY_LATE_FRAC    = 0.20
PEAK_SAMPLE_IN_WIN = int(0.020 * SR)   # 20 ms pre-peak, in samples (960 if SR = 48 kHz — confirm in config)

# Mel filterbank frequency range passed to librosa by extract_mel_spectrogram.
FMIN_MEL = 0
FMAX_MEL = SR // 2   # Nyquist of the configured SR (24 000 Hz if SR = 48 kHz — confirm in config)


# ── Group 1: Relative PSD ─────────────────────────────────────────────────

def relative_psd_log_bins(y, sr=SR, n_bins=N_PSD_BINS,
                          f_min=PSD_FMIN, f_max=PSD_FMAX):
    """Summarise the Welch PSD of ``y`` as relative power in log-spaced bands.

    Args:
        y: 1-D signal.
        sr: Sampling rate in Hz, passed to ``scipy.signal.welch`` as ``fs``.
        n_bins: Number of log-spaced bands.
        f_min: Lower edge of the first band in Hz.
        f_max: Upper edge of the last band in Hz (exclusive).

    Returns:
        Tuple ``(bins, f, pxx)``: ``bins`` is a float32 array of length
        ``n_bins`` that sums to 1 (left un-normalised when the total power is
        not above 1e-20); ``f`` and ``pxx`` are the raw Welch frequencies and
        PSD values, returned so callers can reuse them.
    """
    n_samples = len(y)
    # Clamp segment/overlap so signals shorter than the Welch window still work.
    freqs, power = welch(
        y,
        fs=sr,
        nperseg=min(WELCH_NPERSEG, n_samples),
        noverlap=min(WELCH_NOVERLAP, n_samples // 2),
    )

    band_edges = np.logspace(np.log10(f_min), np.log10(f_max), n_bins + 1)
    # Each band is half-open: [lower edge, upper edge).
    band_power = np.array(
        [power[(freqs >= lo) & (freqs < hi)].sum()
         for lo, hi in zip(band_edges[:-1], band_edges[1:])],
        dtype=np.float32,
    )

    norm = band_power.sum()
    if norm > 1e-20:
        band_power /= norm
    return band_power, freqs, power


# ── Group 2: MFCC + delta statistics ──────────────────────────────────────

def mfcc_stats(y, sr=SR):
    """Return per-coefficient statistics of the MFCCs and their deltas.

    Layout of the returned float32 vector (4 * N_MFCC values):
    MFCC mean, MFCC std, delta-MFCC mean, delta-MFCC std — each taken over
    the time axis.

    Args:
        y: 1-D signal.
        sr: Sampling rate in Hz.
    """
    # librosa warnings are deliberately silenced for this call.
    # NOTE(review): librosa.feature.delta runs with its default window; very
    # short inputs may not yield enough frames for it — confirm with callers.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        coeffs = librosa.feature.mfcc(
            y=y, sr=sr, n_mfcc=N_MFCC,
            n_fft=MFCC_NFFT, hop_length=MFCC_HOP,
        )
        deltas = librosa.feature.delta(coeffs)

    parts = []
    for frames in (coeffs, deltas):
        parts.append(frames.mean(axis=1))
        parts.append(frames.std(axis=1))
    return np.concatenate(parts).astype(np.float32)


# ── Group 3: Physics features ──────────────────────────────────────────────

def peak_frequency(f, pxx):
    """Return the frequency at which ``pxx`` is largest.

    Args:
        f: Frequencies, aligned index-for-index with ``pxx``.
        pxx: PSD values.

    Returns:
        ``f`` at the argmax of ``pxx`` as a Python float, or 0.0 when the
        largest PSD value is not positive.
    """
    no_power = pxx.max() <= 0
    return 0.0 if no_power else float(f[pxx.argmax()])


def q_factor(f, pxx):
    """Estimate Q as peak frequency divided by the -3 dB bandwidth.

    The bandwidth is measured between the nearest samples on each side of the
    PSD peak whose level (in dB) is no longer within 3 dB of the peak; if no
    such sample exists on a side, the array edge is used. The bandwidth is
    floored at 1.0 to avoid dividing by a vanishing width.

    Args:
        f: Frequencies, aligned index-for-index with ``pxx``.
        pxx: PSD values.

    Returns:
        The Q estimate as a Python float, or 0.0 when the largest PSD value is
        not positive.
    """
    if pxx.max() <= 0:
        return 0.0

    level_db = 10 * np.log10(pxx + 1e-20)
    peak = int(np.argmax(level_db))
    cutoff = level_db[peak] - 3.0
    # Written as a negated ">=" so NaN levels count as "outside the band".
    outside = ~(level_db >= cutoff)

    # Closest out-of-band sample at or below the peak index, else index 0.
    left_hits = np.flatnonzero(outside[:peak + 1])
    left = int(left_hits[-1]) if left_hits.size else 0

    # Closest out-of-band sample at or above the peak index, else last index.
    right_hits = np.flatnonzero(outside[peak:])
    right = (peak + int(right_hits[0])) if right_hits.size else (len(level_db) - 1)

    width = max(f[right] - f[left], 1.0)
    return float(f[peak] / width)


def decay_tau(y, peak_sample=PEAK_SAMPLE_IN_WIN, sr=SR, fit_ms=DECAY_FIT_MS):
    """Fit an exponential decay to the post-peak envelope and return τ (seconds).

    The envelope is a 5 ms moving average of ``|y|`` over up to ``fit_ms``
    milliseconds starting at ``peak_sample``. A straight line is fitted to the
    log-envelope up to the last sample still above 5 % of the envelope
    maximum; τ is ``-1 / slope``.

    Args:
        y: 1-D signal.
        peak_sample: Index in ``y`` where the fit segment starts.
        sr: Sampling rate in Hz.
        fit_ms: Maximum length of the fit segment in milliseconds.

    Returns:
        τ as a Python float, or NaN when the segment has fewer than 100
        samples, the envelope is essentially silent, fewer than 50 samples are
        above the 5 % floor, the fitted slope is not negative, or τ falls
        outside [0.001, 10.0].
    """
    fit_len = int(fit_ms * sr / 1000)
    stop = min(peak_sample + fit_len, len(y))
    tail = y[peak_sample:stop]
    if len(tail) < 100:
        return np.nan

    # Smooth the rectified signal with a 5 ms boxcar to get the envelope.
    win = max(1, int(0.005 * sr))
    kernel = np.ones(win) / win
    envelope = np.convolve(np.abs(tail), kernel, mode='same')
    env_peak = envelope.max()
    if env_peak < 1e-8:
        return np.nan

    above_floor = np.flatnonzero(envelope > 0.05 * env_peak)
    if above_floor.size < 50:
        return np.nan

    # Fit only up to the last sample that is still above the 5 % floor.
    n_active = above_floor[-1] + 1
    # Small offset relative to the peak keeps the log finite.
    log_env = np.log(envelope[:n_active] + env_peak * 1e-4)
    t = np.arange(n_active) / sr
    slope = np.polyfit(t, log_env, 1)[0]
    if slope >= 0:
        return np.nan

    tau = -1.0 / slope
    if 0.001 <= tau <= 10.0:
        return float(tau)
    return np.nan


def energy_ratio(y, frac=EARLY_LATE_FRAC):
    """Return late-to-early RMS ratio of the window.

    The first and last ``frac`` of the samples are compared; the value is the
    ratio of their RMS amplitudes (not squared energies). Tight flanges still
    ringing → high ratio.

    Args:
        y: 1-D signal.
        frac: Fraction of ``len(y)`` used for each of the two chunks.

    Returns:
        ``rms_late / (rms_early + 1e-12)`` as a Python float, or NaN when the
        chunk would contain no samples.
    """
    n_chunk = int(frac * len(y))
    if n_chunk < 1:
        # With a zero-length chunk, y[:0] is empty and y[-0:] is the WHOLE
        # signal; the result was NaN only by accident (plus a RuntimeWarning).
        # Return the same NaN explicitly.
        return np.nan
    e_early = np.sqrt(np.mean(y[:n_chunk] ** 2))
    e_late  = np.sqrt(np.mean(y[-n_chunk:] ** 2))
    # The tiny constant guards against division by zero for a silent start.
    return float(e_late / (e_early + 1e-12))


# ── Master 82-dim extractor ───────────────────────────────────────────────

def extract_features(y: np.ndarray, sr: int = SR) -> np.ndarray:
    """Return the concatenated feature vector for one hit window.

    Layout: relative PSD bins, then MFCC/delta-MFCC statistics, then the ten
    physics features in the order listed in the ``physics`` array below.

    Args:
        y: 1-D signal of one hit window.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        1-D array of features. The decay-τ entry may be NaN; callers are
        expected to impute it afterwards (see ``impute_nans``).
    """
    psd_bins, f_psd, pxx = relative_psd_log_bins(y, sr)    # 20

    cepstral = mfcc_stats(y, sr)                            # 52

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        sc   = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        sb   = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
        sr85 = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)[0]
        zcr  = librosa.feature.zero_crossing_rate(y)[0]

    # decay_tau works in samples, so it must see the caller's sampling rate;
    # the peak offset is rescaled from the SR-based default and is exactly
    # PEAK_SAMPLE_IN_WIN when sr == SR.
    peak_sample = (PEAK_SAMPLE_IN_WIN * sr) // SR

    physics = np.array([
        sc.mean(),  sc.std(),           # 2: centroid mean/std
        sb.mean(),                      # 1: bandwidth mean
        sr85.mean(),                    # 1: rolloff 85%
        zcr.mean(),                     # 1: zero-crossing rate
        peak_frequency(f_psd, pxx),     # 1: dominant freq
        decay_tau(y, peak_sample=peak_sample, sr=sr),  # 1: τ (NaN → imputed after)
        energy_ratio(y),                # 1: E_late / E_early
        float(np.sqrt(np.mean(y ** 2))),# 1: RMS energy
        q_factor(f_psd, pxx),           # 1: Q-factor
    ], dtype=np.float32)               # 10 total

    return np.concatenate([psd_bins, cepstral, physics])


def impute_nans(X: np.ndarray, y_labels: np.ndarray, n_classes: int = 3) -> np.ndarray:
    """Fill NaNs column by column with the median of the row's class.

    Medians are computed from the originally observed (non-NaN) values only,
    so the result does not depend on the order in which classes are
    processed. If a class has no observed value in a column, the column's
    global median over all observed values is used instead.

    Args:
        X: 2-D float feature matrix (rows are samples). Not modified.
        y_labels: Class label per row; classes ``0 .. n_classes - 1`` are
            imputed.
        n_classes: Number of classes to iterate over.

    Returns:
        A copy of ``X`` with NaNs filled. Entries stay NaN when the whole
        column is NaN, or when the row's label is outside
        ``range(n_classes)``.
    """
    X = X.copy()
    y_labels = np.asarray(y_labels)
    missing = np.isnan(X)

    for c in np.flatnonzero(missing.any(axis=0)):
        col_missing = missing[:, c]
        observed = ~col_missing
        # Fallback taken once, before any fill, so imputed values from one
        # class never leak into another class's fallback median.
        global_median = float(np.median(X[observed, c])) if observed.any() else np.nan

        for cls in range(n_classes):
            cls_mask = (y_labels == cls)
            fill_mask = cls_mask & col_missing
            if not fill_mask.any():
                continue  # nothing to impute for this class in this column
            cls_observed = cls_mask & observed
            if cls_observed.any():
                fill_val = float(np.median(X[cls_observed, c]))
            else:
                fill_val = global_median
            X[fill_mask, c] = fill_val
    return X


# ── Feature name list ─────────────────────────────────────────────────────

def _build_feature_names() -> list[str]:
    """Build the feature names in the same order as ``extract_features`` output.

    PSD bands are named after their rounded lower/upper edge in Hz, followed
    by the MFCC and delta-MFCC statistics, then the ten physics features.
    """
    band_edges = np.logspace(np.log10(PSD_FMIN), np.log10(PSD_FMAX), N_PSD_BINS + 1)
    labels = [
        f'psd_{lo:.0f}_{hi:.0f}Hz'
        for lo, hi in zip(band_edges[:-1], band_edges[1:])
    ]

    # Same block order as mfcc_stats: MFCC mean, MFCC std, delta mean, delta std.
    for prefix in ('mfcc', 'dmfcc'):
        for stat in ('mean', 'std'):
            labels.extend(f'{prefix}{k:02d}_{stat}' for k in range(N_MFCC))

    labels.extend([
        'spec_centroid_mean', 'spec_centroid_std',
        'spec_bandwidth_mean', 'spec_rolloff85_mean',
        'zero_cross_rate_mean', 'peak_frequency',
        'decay_tau', 'energy_ratio', 'rms_energy', 'q_factor',
    ])
    return labels


# Built once at import time; the assert guards the expected vector length.
FEATURE_NAMES: list[str] = _build_feature_names()
assert len(FEATURE_NAMES) == 82


# ── Mel spectrogram (CNN / BiLSTM input) — matches notebook Cell 5 ────────

def extract_mel_spectrogram(y: np.ndarray, sr: int = SR,
                             n_mels: int = N_MELS,
                             n_fft: int = N_FFT,
                             hop_length: int = HOP_LENGTH_MEL,
                             target_frames: int = SPEC_TIME_FRAMES) -> np.ndarray:
    """Convert one hit into a standardized log-mel spectrogram.

    Args:
        y: 1-D signal of one hit window.
        sr: Sampling rate of ``y`` in Hz.
        n_mels: Number of mel bands.
        n_fft: FFT size in samples.
        hop_length: Hop between frames in samples.
        target_frames: Number of time frames in the output.

    Returns:
        Array of shape ``(n_mels, target_frames)``: the power mel spectrogram
        in dB relative to its maximum, padded at the end with its minimum
        value or centre-cropped to ``target_frames``, then shifted and scaled
        to zero mean and (near) unit standard deviation over the whole array.
    """
    # Never ask for mel bands above the Nyquist frequency of the actual input:
    # FMAX_MEL is derived from the module-level SR, which can exceed sr / 2
    # when a lower-rate signal is passed. Unchanged when sr == SR.
    fmax = min(FMAX_MEL, sr // 2)
    mel    = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length,
        fmin=FMIN_MEL, fmax=fmax, power=2.0,
    )
    mel_db = librosa.power_to_db(mel, ref=np.max).astype(np.float32)

    n_frames = mel_db.shape[1]
    if n_frames < target_frames:
        # Pad on the right with the quietest value so padding reads as silence.
        pad_val = float(mel_db.min())
        mel_db  = np.pad(mel_db, ((0, 0), (0, target_frames - n_frames)),
                         mode='constant', constant_values=pad_val)
    elif n_frames > target_frames:
        # Keep the central target_frames frames.
        start  = (n_frames - target_frames) // 2
        mel_db = mel_db[:, start:start + target_frames]

    # per-sample standardize; the epsilon avoids division by zero on flat input
    mu, sigma = mel_db.mean(), mel_db.std()
    return (mel_db - mu) / (sigma + 1e-6)   # shape (n_mels, target_frames)